diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 56ed3bcb016d8be1c7b4ee524b907a7253fc6eba..488b291d6df1061f95cccd0f89a492046eb4aa08 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -141,6 +141,7 @@ const std::map OpenCLRuntime::program_map_ = { {"addn", "addn.cl"}, {"batch_norm", "batch_norm.cl"}, + {"conv_2d", "conv_2d.cl"}, {"conv_2d_1x1", "conv_2d_1x1.cl"}, {"conv_2d_3x3", "conv_2d_3x3.cl"}, {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"}, diff --git a/mace/kernels/neon/pooling_neon.cc b/mace/kernels/neon/pooling_neon.cc index 0f9162349c03398c5301b152a9c87048431caa5f..76868335d12500623cc08fff5d0cfae70761cff9 100644 --- a/mace/kernels/neon/pooling_neon.cc +++ b/mace/kernels/neon/pooling_neon.cc @@ -58,19 +58,27 @@ void PoolingFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor) { + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape(4); + filter_shape[0] = input_tensor->shape()[1]; + filter_shape[1] = input_tensor->shape()[1]; + filter_shape[2] = kernels_[0]; + filter_shape[3] = kernels_[1]; + + kernels::CalcPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), this->dilations_, + strides_, this->padding_, output_shape.data(), + paddings.data()); + output_tensor->Resize(output_shape); + const float *input = input_tensor->data(); float *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); - const index_t *output_shape = output_tensor->shape().data(); - int paddings[2]; - std::vector filter_shape = {input_shape[1], input_shape[0], - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); #ifdef __COPY_MAKE_PADDING Tensor padded_input; - ConstructInputWithPadding(input_tensor, paddings, &padded_input); + ConstructInputWithPadding(input_tensor, paddings.data(), &padded_input); input = padded_input.data(); input_shape = padded_input.shape().data(); #endif @@ -80,17 +88,17 @@ void PoolingFunctor::operator()( // kernel_size: 2x2, strides: 2x2 if (pooling_type_ == MAX) { // MAX_POOL_2x2s2x2 #ifdef __COPY_MAKE_PADDING - PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape); + PoolingMaxNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape, - paddings); + PoolingMaxNeonK2x2S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } else { // AVG_POOL_2x2s2x2 #ifdef __COPY_MAKE_PADDING - PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape); + PoolingAvgNeonK2x2S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape, - paddings); + PoolingAvgNeonK2x2S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } } else if (kernels_[0] == 3 && kernels_[1] == 3 && strides_[0] == 2 && @@ -98,17 +106,17 @@ void PoolingFunctor::operator()( // kernel_size: 3x3, strides: 2x2 if (pooling_type_ == MAX) { // MAX_POOL_3x3s2x2 #ifdef __COPY_MAKE_PADDING - PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape); + PoolingMaxNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape, - paddings); + PoolingMaxNeonK3x3S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } else { // AVG_POOL_3x3s2x2 #ifdef __COPY_MAKE_PADDING - PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape); + PoolingAvgNeonK3x3S2x2Padded(input, input_shape, output, output_shape.data()); #else - PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape, - paddings); + PoolingAvgNeonK3x3S2x2(input, input_shape, output, output_shape.data(), + paddings.data()); #endif } } else { // not implement yet diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h index 13d99c41100665bd5a6fb0572070277a0c7d6e82..499c8164ddc3a0c5158c97e70c6f6ec55f0ccd87 100644 --- a/mace/kernels/opencl/cl/common.h +++ b/mace/kernels/opencl/cl/common.h @@ -18,4 +18,7 @@ #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE) #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE) + +__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + #endif // MACE_KERNELS_OPENCL_CL_COMMON_H_ diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl new file mode 100644 index 0000000000000000000000000000000000000000..e5ddb3d78fd5d5176123e9b0ab5e4e460e035314 --- /dev/null +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -0,0 +1,148 @@ +#include + +__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */ +#ifdef BIAS + __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int in_ch_blks, + __private const int out_height, + __private const int out_width, + __private const int filter_height, + __private const int filter_width, + __private const int padding_top, + __private const int padding_left) { + const int out_ch_blk = get_global_id(0); + const int out_w_blk = get_global_id(1); + const int out_w_blks = get_global_size(1); + const int out_hb = get_global_id(2); + const int rounded_in_ch = in_ch_blks * 4; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef BIAS + DATA_TYPE4 out0 = + READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; +#endif + +#if STRIDE == 1 + int in_width0 = out_w_blk - padding_left; + int in_width1 = in_width0 + out_w_blks; + int in_width2 = in_width1 + out_w_blks; + int in_width3 = in_width2 + out_w_blks; + const int height_idx = (out_hb % out_height) - padding_top; +#else + int in_width0 = out_w_blk * 2 - padding_left; + int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; + int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; + int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; + const int height_idx = (out_hb % out_height) * 2 - padding_top; +#endif + + const int batch_idx = (out_hb / out_height) * in_height; + + DATA_TYPE4 in0, in1, in2, in3; + DATA_TYPE4 weights0, weights1, weights2, weights3; + int in_idx, in_width_idx; + // Unrolling this loop hurt perfmance + for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { + for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) { + + int in_hb_value = height_idx + hb_idx; + in_hb_value = select(in_hb_value + batch_idx, + -1, + (in_hb_value < 0 || in_hb_value >= in_height)); + + for (short width_idx = 0; width_idx < filter_width; ++width_idx) { + in_idx = in_ch_blk * in_width; + int in_width_value; +#define READ_INPUT(i) \ + in_width_value = in_width##i + width_idx; \ + in_width_value = select(in_idx + in_width_value, \ + -1, \ + (in_width_value < 0 || in_width_value >= in_width)); \ + in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value)); + + READ_INPUT(0); + READ_INPUT(1); + READ_INPUT(2); + READ_INPUT(3); + +#undef READ_INPUT + + int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch; + weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); + weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); + weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); + weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); + + // Will prefetch L2 improve performance? How to pretch image data? + + // Interleaving load and mul does not improve performance as expected + out0 += in0.x * weights0; + out0 += in0.y * weights1; + out0 += in0.z * weights2; + out0 += in0.w * weights3; + + out1 += in1.x * weights0; + out1 += in1.y * weights1; + out1 += in1.z * weights2; + out1 += in1.w * weights3; + + out2 += in2.x * weights0; + out2 += in2.y * weights1; + out2 += in2.z * weights2; + out2 += in2.w * weights3; + + out3 += in3.x * weights0; + out3 += in3.y * weights1; + out3 += in3.z * weights2; + out3 += in3.w * weights3; + + } + } + } + +#ifdef FUSED_RELU + // TODO relux + out0 = fmax(out0, 0); + out1 = fmax(out1, 0); + out2 = fmax(out2, 0); + out3 = fmax(out3, 0); +#endif + + const int out_x_base = out_ch_blk * out_width; + int w = out_w_blk; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out0); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out1); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out2); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out3); + +} diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 98847ab0f78f90e460a93d13ab31b781b123a696..08bf04d3c883e12f5970cd82a9394620b2649e51 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -59,15 +59,13 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] // Unrolling this loop hurt perfmance for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { + int in_hb_value = height_idx + hb_idx; + in_hb_value = select(in_hb_value + batch_idx, + -1, + (in_hb_value < 0 || in_hb_value >= in_height)); for (short width_idx = 0; width_idx < 3; ++width_idx) { in_idx = in_ch_blk * in_width; - - int in_hb_value = height_idx + hb_idx; - in_hb_value = select(in_hb_value + batch_idx, - -1, - (in_hb_value < 0 || in_hb_value >= in_height)); - int in_width_value; #define READ_INPUT(i) \ in_width_value = in_width##i + width_idx; \ diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index bc987dddb96ec5202fb3882fc279d4480c077f38..bd2763fc8c7eb68c06e09b4822ccc5025807a151 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -1,193 +1,87 @@ #include -VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s1(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,4) row00 = vload4(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,2) row01 = vload2(0, input_ptr + 4); - VEC_DATA_TYPE(DATA_TYPE,4) row10 = vload4(0, input_ptr + in_width); - VEC_DATA_TYPE(DATA_TYPE,2) row11 = vload2(0, input_ptr + in_width + 4); - VEC_DATA_TYPE(DATA_TYPE,4) row20 = vload4(0, input_ptr + in_width * 2); - VEC_DATA_TYPE(DATA_TYPE,2) row21 = vload2(0, input_ptr + in_width * 2 + 4); - - VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row01.s0, row00.s3, row01.s01); - VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row11.s0, row10.s3, row11.s01); - VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01212323); - VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row21.s0, row20.s3, row21.s01); - - VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); - VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); - - VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), - (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); - res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); - - return res; -} - -VEC_DATA_TYPE(DATA_TYPE,4) vec_pooling_3_s2(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,8) row00 = vload8(0, input_ptr); - DATA_TYPE row01 = *(input_ptr + 8); - VEC_DATA_TYPE(DATA_TYPE,8) row10 = vload8(0, input_ptr + in_width); - DATA_TYPE row11 = *(input_ptr + in_width + 8); - VEC_DATA_TYPE(DATA_TYPE,8) row20 = vload8(0, input_ptr + in_width * 2); - DATA_TYPE row21 = *(input_ptr + in_width * 2 + 8); - - VEC_DATA_TYPE(DATA_TYPE,8) data00 = (VEC_DATA_TYPE(DATA_TYPE,8))(row00.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data01 = (VEC_DATA_TYPE(DATA_TYPE,4))(row00.s667, row01); - VEC_DATA_TYPE(DATA_TYPE,8) data10 = (VEC_DATA_TYPE(DATA_TYPE,8))(row10.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data11 = (VEC_DATA_TYPE(DATA_TYPE,4))(row10.s667, row11); - VEC_DATA_TYPE(DATA_TYPE,8) data20 = (VEC_DATA_TYPE(DATA_TYPE,8))(row20.s01223445); - VEC_DATA_TYPE(DATA_TYPE,4) data21 = (VEC_DATA_TYPE(DATA_TYPE,4))(row20.s667, row21); - - VEC_DATA_TYPE(DATA_TYPE,8) left = fmax(fmax(data00, data10), data20); - VEC_DATA_TYPE(DATA_TYPE,4) right = fmax(fmax(data01, data11), data21); - - VEC_DATA_TYPE(DATA_TYPE,4) res = fmax((VEC_DATA_TYPE(DATA_TYPE,4))(left.s036, right.s1), - (VEC_DATA_TYPE(DATA_TYPE,4))(left.s147, right.s2)); - res = fmax(res, (VEC_DATA_TYPE(DATA_TYPE,4))(left.s25, right.s03)); - - return res; -} - -DATA_TYPE inner_pooling_3(const DATA_TYPE *input_ptr, const int in_width) { - VEC_DATA_TYPE(DATA_TYPE,3) row0 = vload3(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,3) row1 = vload3(0, input_ptr + in_width); - VEC_DATA_TYPE(DATA_TYPE,3) row2 = vload3(0, input_ptr + in_width * 2); - - VEC_DATA_TYPE(DATA_TYPE,3) data = fmax(fmax(row0, row1), row2); - - DATA_TYPE res = fmax(fmax(data.s0, data.s1), data.s2); - return res; -} - -// Supported data type: half/float -__kernel void pooling3(__global const DATA_TYPE *input, /* n, c, h, w */ - __private const int in_height, - __private const int in_width, - __private const int out_chan_num, - __private const int out_height, - __private const int out_width, - __private const int stride, - __global DATA_TYPE *output) { - int batch = get_global_id(0); - int out_chan_blk = get_global_id(1); - int out_pixel_blk = get_global_id(2); - - const int round_out_width = (out_width + 3) / 4; - const int out_pixel_height = out_pixel_blk / round_out_width; - const int out_pixel_width = out_pixel_blk % round_out_width; - - const int out_chan_begin = out_chan_blk * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - const int in_pixel_begin = out_pixel_height * stride * in_width + out_pixel_width * stride * 4; - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int in_offset = batch * out_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; - DATA_TYPE *output_base = output + out_offset + out_pixel_begin; - - const int pixels = out_pixel_end - out_pixel_begin; - - for (int i = out_chan_begin; i < out_chan_end; ++i) { - const DATA_TYPE *input_ptr = input_base + i * in_pixel; - DATA_TYPE *output_ptr = output_base + i * out_pixel; - if (pixels == 4) { - VEC_DATA_TYPE(DATA_TYPE,4) res; -#ifdef STRIDE_1 - res = vec_pooling_3_s1(input_ptr, in_width); +#ifdef FP16 +#define MIN_VALUE -USHRT_MAX #else - res = vec_pooling_3_s2(input_ptr, in_width); +#define MIN_VALUE -FLT_MAX #endif - vstore4(res, 0, output_ptr); - } else { - for (int p = 0; p < pixels; ++p) { - output_ptr[p] = inner_pooling_3(input_ptr, in_width); - input_ptr += stride; - } - } - } -} -int calculate_avg_block_size(const int pos_h, - const int pos_w, - const int pool_size, - const int pad_h, - const int pad_w, - const int h_size, - const int w_size) { - const int h_start = max(0, pos_h - pad_h); - const int w_start = max(0, pos_w - pad_w); - const int h_end = min(pos_h + pool_size - pad_h, h_size); - const int w_end = min(pos_w + pool_size - pad_w, w_size); +inline int calculate_avg_block_size(const int pool_size, + const int pos_h, + const int pos_w, + const int h_size, + const int w_size) { + const int h_start = max(0, pos_h); + const int w_start = max(0, pos_w); + const int h_end = min(pos_h + pool_size, h_size); + const int w_end = min(pos_w + pool_size, w_size); return (h_end - h_start) * (w_end - w_start); } // Supported data type: half/float -__kernel void poolingn(__global const DATA_TYPE *input, /* n, c, h, w */ - __private const int in_height, - __private const int in_width, - __private const int out_chan_num, - __private const int out_height, - __private const int out_width, - __private const int stride, - __private const int pad_h, - __private const int pad_w, - __private const int pooling_size, - __global DATA_TYPE *output) { - int batch = get_global_id(0); - int out_chan_idx = get_global_id(1); - int out_pixel_idx = get_global_id(2); - - const int out_pixel_height = out_pixel_idx / out_width; - const int out_pixel_width = out_pixel_idx % out_width; - - const int out_chan_begin = out_chan_idx * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int in_pixel_idx = out_pixel_height * stride * in_width - + out_pixel_width * stride; - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - - const int in_offset = batch * out_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - const DATA_TYPE *input_base = input + in_offset + in_pixel_idx; - DATA_TYPE *output_base = output + out_offset + out_pixel_idx; - - const int block_size = calculate_avg_block_size( - out_pixel_height * stride, - out_pixel_width * stride, - pooling_size, - pad_h/2, - pad_w/2, - in_height - pad_h, - in_width - pad_w); - for (int i = out_chan_begin; i < out_chan_end; ++i) { - VEC_DATA_TYPE(DATA_TYPE,8) sum8 = 0.0f; - DATA_TYPE sum1 = 0.0f; - DATA_TYPE *output_ptr = output_base + i * out_pixel; - for (int y = 0; y < pooling_size; ++y) { - const DATA_TYPE *input_ptr = input_base + i * in_pixel + y * in_width; - int x = 0; - for (; x < (pooling_size-8); x += 8) { - VEC_DATA_TYPE(DATA_TYPE,8) data = vload8(0, input_ptr); - sum8 += data; - input_ptr += 8; - } - for (; x < pooling_size; ++x) { - sum1 += *input_ptr; - input_ptr++; +__kernel void pooling(__read_only image2d_t input, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int pad_top, + __private const int pad_left, + __private const int stride, + __private const int pooling_size, + __write_only image2d_t output) { + const int out_chan_idx = get_global_id(0); + const int out_width_idx = get_global_id(1); + const int out_width = get_global_size(1); + const int out_hb_idx = get_global_id(2); + + const int batch_idx = (out_hb_idx / out_height) * in_height; + const int in_height_start = (out_hb_idx % out_height) * stride - pad_top; + const int in_width_start = out_width_idx * stride - pad_left; + const int in_channel_offset = out_chan_idx * in_width; + + +#ifdef POOL_AVG + DATA_TYPE4 res = 0; + for (int height = 0; height < pooling_size; ++height) { + int in_height_idx = in_height_start + height; + in_height_idx = select(batch_idx + in_height_idx, + -1, + (in_height_idx < 0 || in_height_idx >= in_height)); + for (int width = 0; width < pooling_size; ++width) { + int in_width_idx = in_width_start + width; + in_width_idx = select(in_channel_offset + in_width_idx, + -1, + (in_width_idx < 0 || in_width_idx >= in_width)); + + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx)); + res = res + in; + } + } + const int block_size = calculate_avg_block_size(pooling_size, + in_height_start, in_width_start, + in_height, in_width); + res /= block_size; +#else + DATA_TYPE4 res = (DATA_TYPE4)(MIN_VALUE); + for (int height = 0; height < pooling_size; ++height) { + int in_height_idx = in_height_start + height; + in_height_idx = select(batch_idx + in_height_idx, + -1, + (in_height_idx < 0 || in_height_idx >= in_height)); + if (in_height_idx != -1) { + for (int width = 0; width < pooling_size; ++width) { + int in_width_idx = in_width_start + width; + in_width_idx = select(in_channel_offset + in_width_idx, + -1, + (in_width_idx < 0 || in_width_idx >= in_width)); + + if (in_width_idx != -1) { + DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(in_width_idx, in_height_idx)); + res = fmax(res, in); + } } } - VEC_DATA_TYPE(DATA_TYPE,4) sum4 = sum8.s0123 + sum8.s4567; - VEC_DATA_TYPE(DATA_TYPE,2) sum2 = sum4.s01 + sum4.s23; - - *output_ptr = (sum2.s0 + sum2.s1 + sum1) / block_size; } +#endif + + WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res); } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index cb48be92c1112a8148be742f8a029ef37d37e716..c40481543796215c80f4367e8e5f01a59b32c3be 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -28,6 +28,11 @@ extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, const int *padding, const DataType dt, Tensor *output); +extern void Conv2dOpencl(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output); + template void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, @@ -47,17 +52,13 @@ void Conv2dFunctor::operator()(const Tensor *input, index_t kernel_h = filter->dim(0); index_t kernel_w = filter->dim(1); - if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || - strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || - selector[kernel_h - 1][strides_[0] - 1] == nullptr) { + if (!input->is_image() || strides_[0] != strides_[1] || + strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1) { LOG(WARNING) << "OpenCL conv2d kernel with " << "filter" << kernel_h << "x" << kernel_w << "," << " stride " << strides_[0] << "x" << strides_[1] << " is not implemented yet, using slow version"; - // TODO(heliangliang) The CPU/NEON kernel should map the buffer - Conv2dFunctor(strides_, paddings_, dilations_)( - input, filter, bias, output); - return; + MACE_NOT_IMPLEMENTED; } std::vector output_shape(4); @@ -66,16 +67,18 @@ void Conv2dFunctor::operator()(const Tensor *input, input->shape().data(), filter->shape().data(), dilations_, strides_, paddings_, output_shape.data(), paddings.data()); - if (input->is_image()) { - std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); - output->ResizeImage(output_shape, output_image_shape); + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + + if (kernel_h == kernel_w && kernel_h <= 5 && + selector[kernel_h - 1][strides_[0] - 1] != nullptr) { + auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; + conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum::value, output); } else { - output->Resize(output_shape); + Conv2dOpencl(input, filter, bias, false, strides_[0], paddings.data(), DataTypeToEnum::value, output); } - auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum::value, output); } template diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 24bf90a1178961665ec1cf65935809d8409987bd..7b7453ad53a4af2921cdea07f9a983b51865d848 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -38,7 +38,6 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, auto program = runtime->program(); auto conv_2d_kernel = runtime->BuildKernel("conv_2d_3x3", "conv_2d_3x3", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); uint32_t idx = 0; conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc new file mode 100644 index 0000000000000000000000000000000000000000..e46ecbcaca06e811de44b5a29e08abb1e3418906 --- /dev/null +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -0,0 +1,73 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/common.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace kernels { + +void Conv2dOpencl(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool fused_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t input_channels = input->dim(3); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t input_channel_blocks = RoundUpDiv4(input_channels); + const index_t width_blocks = RoundUpDiv4(width); + + std::set built_options; + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (fused_relu) { + built_options.emplace("-DFUSED_RELU"); + } + + auto runtime = OpenCLRuntime::Get(); + auto program = runtime->program(); + + auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); + + uint32_t idx = 0; + conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); + conv_2d_kernel.setArg(idx++, *(static_cast(filter->buffer()))); + if (bias != nullptr) { + conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + } + conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); + conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); + conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); + conv_2d_kernel.setArg(idx++, static_cast(height)); + conv_2d_kernel.setArg(idx++, static_cast(width)); + conv_2d_kernel.setArg(idx++, static_cast(filter->dim(0))); + conv_2d_kernel.setArg(idx++, static_cast(filter->dim(1))); + conv_2d_kernel.setArg(idx++, padding[0] / 2); + conv_2d_kernel.setArg(idx++, padding[1] / 2); + + auto command_queue = runtime->command_queue(); + cl_int error; + error = command_queue.enqueueNDRangeKernel( + conv_2d_kernel, cl::NullRange, + cl::NDRange(static_cast(channel_blocks), static_cast(width_blocks), + static_cast(height * batch)), + cl::NDRange(16, 16, 4), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + MACE_CHECK(error == CL_SUCCESS, error); + +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index fb9216f767dfd2770a6ccfc405283e51dea2ffe5..349c619574e425aea00b4521194c3ae04649942f 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -10,131 +10,94 @@ namespace mace { namespace kernels { -static void Pooling3(const Tensor *input, - const int *stride, - const PoolingType type, - Tensor *output) { - if (type != MAX) { - MACE_NOT_IMPLEMENTED; - } +static void Pooling(const Tensor *input, + const int *stride, + const int *paddings, + const int pooling_size, + const PoolingType type, + const DataType dt, + Tensor *output) { index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t out_height = output->dim(2); - index_t out_width = output->dim(3); + index_t out_height = output->dim(1); + index_t out_width = output->dim(2); + index_t channels = output->dim(3); - index_t channel_blk = (channels + 3) / 4; - const index_t pixel_width = (out_width + 3) / 4 ; + index_t channel_blocks = (channels + 3) / 4; const uint32_t gws[3] = { - static_cast(batch), - static_cast(channel_blk), - static_cast(pixel_width * out_height), + static_cast(channel_blocks), + static_cast(out_width), + static_cast(batch * out_height), }; auto runtime = OpenCLRuntime::Get(); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype())); - built_options.emplace(stride[0] == 1 ? "-DSTRIDE_1" : ""); - auto pooling_kernel = runtime->BuildKernel("pooling", "pooling3", built_options); + if (type == MAX && input->dtype() == output->dtype()) { + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + built_options.emplace(dt == DT_HALF ? "-DFP16" : ""); + } else { + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + } + if (type == AVG) { + built_options.emplace("-DPOOL_AVG"); + } + auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); - const uint32_t lws[3] = {1, 8, 128}; + uint32_t lws[3]; + lws[0] = std::min(channel_blocks, kwg_size); + lws[1] = std::min(out_width, kwg_size / lws[0]); + lws[2] = std::min(out_height * batch, kwg_size / (lws[0] * lws[1])); uint32_t idx = 0; - pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); + pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); + pooling_kernel.setArg(idx++, static_cast(input->dim(1))); pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - pooling_kernel.setArg(idx++, static_cast(input->dim(3))); - pooling_kernel.setArg(idx++, static_cast(channels)); pooling_kernel.setArg(idx++, static_cast(out_height)); - pooling_kernel.setArg(idx++, static_cast(out_width)); + pooling_kernel.setArg(idx++, paddings[0] / 2); + pooling_kernel.setArg(idx++, paddings[1] / 2); pooling_kernel.setArg(idx++, stride[0]); - pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); + pooling_kernel.setArg(idx++, pooling_size); + pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); cl_int error = runtime->command_queue().enqueueNDRangeKernel( pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); + MACE_CHECK(error == CL_SUCCESS) << error; } -static void PoolingN(const Tensor *input, - const int *stride, - const int *paddings, - const int pooling_size, - const PoolingType type, - Tensor *output) { - if (type != AVG) { - MACE_NOT_IMPLEMENTED; - } - index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t out_height = output->dim(2); - index_t out_width = output->dim(3); - - index_t channel_blk = (channels + 3) / 4; - const uint32_t gws[3] = { - static_cast(batch), - static_cast(channel_blk), - static_cast(out_height * out_width), +template +void PoolingFunctor::operator()(const Tensor *input, + Tensor *output) { + MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1) << "Pooling opencl kernel not support dilation yet"; + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape = { + kernels_[0], kernels_[1], + input->dim(3), input->dim(3) }; - auto runtime = OpenCLRuntime::Get(); - std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype())); - auto pooling_kernel = runtime->BuildKernel("pooling", "poolingn", built_options); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), filter_shape.data(), + dilations_, strides_, this->padding_, + output_shape.data(), paddings.data()); - const uint32_t lws[3] = {1, 8, 128}; + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); - uint32_t idx = 0; - pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); - pooling_kernel.setArg(idx++, static_cast(input->dim(2))); - pooling_kernel.setArg(idx++, static_cast(input->dim(3))); - pooling_kernel.setArg(idx++, static_cast(channels)); - pooling_kernel.setArg(idx++, static_cast(out_height)); - pooling_kernel.setArg(idx++, static_cast(out_width)); - pooling_kernel.setArg(idx++, stride[0]); - pooling_kernel.setArg(idx++, paddings[0]); - pooling_kernel.setArg(idx++, paddings[1]); - pooling_kernel.setArg(idx++, pooling_size); - pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); + Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_, + DataTypeToEnum::value, output); - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - pooling_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); -} - -template <> -void PoolingFunctor::operator()(const Tensor *input, - Tensor *output) { - int paddings[2]; - std::vector filter_shape = {input->dim(1), input->dim(0), - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input->shape().data(), filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); -#define POOLING_HELPER \ - switch(kernels_[0]) { \ - case 3: \ - Pooling3(input, strides_, pooling_type_, output); \ - break; \ - default: \ - PoolingN(input, strides_, paddings, kernels_[0], \ - pooling_type_, output); \ - break; \ - } - - if (paddings[0] > 0 || paddings[1] > 0) { - Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum::v()); - ConstructInputWithPadding(input, paddings, &padded_input, pooling_type_ == MAX); - input = &padded_input; - POOLING_HELPER - } else { - POOLING_HELPER - } -#undef POOLING_HELPER } +template +struct PoolingFunctor; +template +struct PoolingFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 11c05e47c5eb0faef5b3febe3ee63c4d1864d5c6..0a1960a4e7e891d6c71d1841cf672b3a48a83fdb 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -18,36 +18,66 @@ enum PoolingType { namespace kernels { -template -struct PoolingFunctor { - PoolingFunctor(const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding, - const int *dilations) +struct PoolingFunctorBase { + PoolingFunctorBase(const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding padding, + const int *dilations) : pooling_type_(pooling_type), kernels_(kernels), strides_(strides), padding_(padding), dilations_(dilations) {} + const PoolingType pooling_type_; + const int *kernels_; + const int *strides_; + const Padding padding_; + const int *dilations_; +}; + +template +struct PoolingFunctor : PoolingFunctorBase { + PoolingFunctor(const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding padding, + const int *dilations) + : PoolingFunctorBase(pooling_type, kernels, + strides, padding, + dilations) {} + void operator()(const Tensor *input_tensor, Tensor *output_tensor) { + + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape = { + kernels_[0], kernels_[1], + input_tensor->dim(3), input_tensor->dim(3) + }; + + kernels::CalcNHWCPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), + dilations_, strides_, this->padding_, + output_shape.data(), paddings.data()); + output_tensor->Resize(output_shape); + Tensor::MappingGuard in_guard(input_tensor); Tensor::MappingGuard out_guard(output_tensor); const T *input = input_tensor->data(); T *output = output_tensor->mutable_data(); const index_t *input_shape = input_tensor->shape().data(); - const index_t *output_shape = output_tensor->shape().data(); index_t batch = output_shape[0]; - index_t channels = output_shape[1]; - index_t height = output_shape[2]; - index_t width = output_shape[3]; + index_t height = output_shape[1]; + index_t width = output_shape[2]; + index_t channels = output_shape[3]; index_t out_image_size = height * width; - index_t input_channels = input_shape[1]; - index_t input_height = input_shape[2]; - index_t input_width = input_shape[3]; + index_t input_height = input_shape[1]; + index_t input_width = input_shape[2]; + index_t input_channels = input_shape[3]; index_t in_image_size = input_height * input_width; int kernel_h = kernels_[0]; @@ -59,11 +89,6 @@ struct PoolingFunctor { int dilation_h = dilations_[0]; int dilation_w = dilations_[1]; - int paddings[2]; - std::vector filter_shape = {input_shape[1], input_shape[0], - kernels_[0], kernels_[1]}; - kernels::CalPaddingSize(input_shape, filter_shape.data(), this->dilations_, - strides_, this->padding_, paddings); // The left-upper most offset of the padded input int padded_h_start = 0 - paddings[0] / 2; int padded_w_start = 0 - paddings[1] / 2; @@ -71,25 +96,24 @@ struct PoolingFunctor { if (pooling_type_ == MAX) { #pragma omp parallel for collapse(2) for (int b = 0; b < batch; ++b) { - for (int c = 0; c < channels; ++c) { - index_t out_offset = (b * channels + c) * out_image_size; - index_t in_offset = (b * input_channels + c) * in_image_size; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - T max = std::numeric_limits::lowest(); + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + index_t in_offset = b * in_image_size * input_channels + c; + T res = std::numeric_limits::lowest(); for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { int inh = padded_h_start + h * stride_h + dilation_h * kh; int inw = padded_w_start + w * stride_w + dilation_w * kw; if (inh >= 0 && inh < input_height && inw >= 0 && inw < input_width) { - index_t input_offset = in_offset + inh * input_width + inw; - max = std::max(max, input[input_offset]); + index_t input_offset = in_offset + (inh * input_width + inw) * input_channels; + res = std::max(res, input[input_offset]); } } } - output[out_offset] = max; - out_offset += 1; + *output = res; + output++; } } } @@ -97,11 +121,10 @@ struct PoolingFunctor { } else if (pooling_type_ == AVG) { #pragma omp parallel for collapse(2) for (int b = 0; b < batch; ++b) { - for (int c = 0; c < channels; ++c) { - index_t out_offset = (b * channels + c) * out_image_size; - index_t in_offset = (b * input_channels + c) * in_image_size; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + index_t in_offset = b * in_image_size * input_channels + c; T sum = 0; int block_size = 0; for (int kh = 0; kh < kernel_h; ++kh) { @@ -110,14 +133,14 @@ struct PoolingFunctor { int inw = padded_w_start + w * stride_w + dilation_w * kw; if (inh >= 0 && inh < input_height && inw >= 0 && inw < input_width) { - index_t input_offset = in_offset + inh * input_width + inw; + index_t input_offset = in_offset + (inh * input_width + inw) * input_channels; sum += input[input_offset]; block_size += 1; } } } - output[out_offset] = sum / block_size; - out_offset += 1; + *output = sum / block_size; + output++; } } } @@ -125,22 +148,26 @@ struct PoolingFunctor { } } - const PoolingType pooling_type_; - const int *kernels_; - const int *strides_; - const Padding padding_; - const int *dilations_; }; -template <> +template<> void PoolingFunctor::operator()( const Tensor *input_tensor, Tensor *output_tensor); -template <> -void PoolingFunctor::operator()( - const Tensor *input_tensor, - Tensor *output_tensor); +template +struct PoolingFunctor : PoolingFunctorBase { + PoolingFunctor(const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding padding, + const int *dilations) + : PoolingFunctorBase(pooling_type, kernels, + strides, padding, + dilations) {} + void operator()(const Tensor *input_tensor, + Tensor *output_tensor); +}; } // namespace kernels } // namespace mace diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index 3836a7ae90291dbbfb80da20cf78a1bb1c79d87e..43092084d3f75cacf48ecf9dc9dd3fd3861f557d 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -118,14 +118,13 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector .Input("B2IOutput") .Output("I2BOutput") .AddIntArg("buffer_type", type) - .AddIntArg("T", DataTypeToEnum::value) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); // Check - ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2); + ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-3); } TEST(BufferToImageTest, ArgFloatToHalfSmall) { diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index b4fd374b578d3b1eef058f495d331c2182619246..711bf3891211451429fc3ad0e80e1f55611a4b70 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -558,18 +558,20 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { } template -static void TestHalfComplexConvNxNS12(const std::vector &shape) { +static void TestHalfComplexConvNxNS12(const std::vector &input_shape, + const std::vector &filter_shape) { testing::internal::LogToStderr(); - auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, - Padding type) { - srand(time(NULL)); + srand(time(NULL)); + auto func = [&](int stride_h, int stride_w, Padding padding) { // generate random input index_t batch = 3 + (rand() % 10); - index_t height = shape[0]; - index_t width = shape[1]; - index_t input_channels = shape[2] + (rand() % 10); - index_t output_channels = shape[3] + (rand() % 10); + index_t height = input_shape[0]; + index_t width = input_shape[1]; + index_t kernel_h = filter_shape[0]; + index_t kernel_w = filter_shape[1]; + index_t input_channels = filter_shape[2] + (rand() % 10); + index_t output_channels = filter_shape[3] + (rand() % 10); // Construct graph OpsTestNet net; OpDefBuilder("Conv2D", "Conv2dTest") @@ -578,7 +580,7 @@ static void TestHalfComplexConvNxNS12(const std::vector &shape) { .Input("Bias") .Output("Output") .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) + .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); @@ -611,7 +613,7 @@ static void TestHalfComplexConvNxNS12(const std::vector &shape) { .Input("BiasImage") .Output("OutputImage") .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) + .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) .AddIntArg("T", static_cast(DataType::DT_HALF)) .Finalize(net.NewOperatorDef()); @@ -620,20 +622,46 @@ static void TestHalfComplexConvNxNS12(const std::vector &shape) { ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.2); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.5); }; - for (int kernel_size : {1, 3}) { - for (int stride : {1, 2}) { - func(kernel_size, kernel_size, stride, stride, VALID); - } + for (int stride : {1, 2}) { + func(stride, stride, VALID); + func(stride, stride, SAME); } } -TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) { - TestHalfComplexConvNxNS12({32, 32, 32, 64}); +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) { + TestHalfComplexConvNxNS12({32, 32}, + {1, 1, 32, 64}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) { + TestHalfComplexConvNxNS12({32, 32}, + {3, 3, 32, 64}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) { + TestHalfComplexConvNxNS12({32, 32}, + {15, 1, 256, 2}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) { + TestHalfComplexConvNxNS12({32, 32}, + {1, 15, 256, 2}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) { + TestHalfComplexConvNxNS12({32, 32}, + {7, 7, 3, 64}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) { + TestHalfComplexConvNxNS12({107, 113}, + {1, 1, 5, 7}); } -TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConvNxNS12) { - TestHalfComplexConvNxNS12({107, 113, 5, 7}); +TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) { + TestHalfComplexConvNxNS12({107, 113}, + {3, 3, 5, 7}); } diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index dc058d71e85bcfdd286922fccd58e5d6fef8bc1f..17031378f7e93ac6924f794ec352d3009181179d 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -10,6 +10,10 @@ REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling") .TypeConstraint("T") .Build(), PoolingOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); #if __ARM_NEON REGISTER_NEON_OPERATOR(OpKeyBuilder("Pooling") @@ -22,5 +26,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling") .TypeConstraint("T") .Build(), PoolingOp); +REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Pooling") + .TypeConstraint("T") + .Build(), + PoolingOp); } // namespace mace diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h index f62992f53ed44abae64383e300b873433e9b0216..bbc653ab75d627a412d5fcdfaf5c67772658f24f 100644 --- a/mace/ops/pooling.h +++ b/mace/ops/pooling.h @@ -27,21 +27,6 @@ class PoolingOp : public ConvPool2dOpBase { const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); - std::vector output_shape(4); - std::vector paddings(2); - std::vector filter_shape(4); - // TODO(chenghui): is it kind of a hack? - filter_shape[0] = input->shape()[1]; - filter_shape[1] = input->shape()[0]; - filter_shape[2] = kernels_[0]; - filter_shape[3] = kernels_[1]; - - kernels::CalcPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), this->dilations_.data(), - this->strides_.data(), this->padding_, output_shape.data(), - paddings.data()); - output->Resize(output_shape); - functor_(input, output); return true; }; diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index bf2b182467426292a2fef53dddd0a0d9d3b09dfc..dcda06b75483e6e0e01cfe16594991d72171d2bf 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -28,48 +28,20 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Add input data net.AddInputFromArray( - "Input", {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); // Run net.RunOp(); // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 7, 13, 15, 21, 23, 29, 31}); + CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(PoolingOpTest, AVG_VALID) { - // Construct graph - auto &net = test_net(); - OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("pooling_type", PoolingType::AVG) - .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray( - "Input", {1, 2, 4, 4}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}); - - // Run - net.RunOp(); - - // Check - auto expected = CreateTensor( - {1, 2, 2, 2}, {2.5, 4.5, 10.5, 12.5, 18.5, 20.5, 26.5, 28.5}); - - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); -} TEST_F(PoolingOpTest, MAX_SAME) { // Construct graph @@ -85,14 +57,14 @@ TEST_F(PoolingOpTest, MAX_SAME) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input", {1, 1, 3, 3}, - {0, 1, 2, 3, 4, 5, 6, 7, 8}); + net.AddInputFromArray("Input", {1, 3, 3, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8}); // Run net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 2, 2}, {4, 5, 7, 8}); + auto expected = CreateTensor({1, 2, 2, 1}, {4, 5, 7, 8}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -112,14 +84,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { // Add input data net.AddInputFromArray( - "Input", {1, 1, 4, 4}, + "Input", {1, 4, 4, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // Run net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 2, 2}, {10, 11, 14, 15}); + auto expected = CreateTensor({1, 2, 2, 1}, {10, 11, 14, 15}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -139,42 +111,57 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { // Add input data net.AddInputFromArray( - "Input", {1, 1, 2, 9}, + "Input", {1, 2, 9, 1}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); // Run - net.RunOp(DeviceType::NEON); + net.RunOp(); // Check - auto expected = CreateTensor({1, 1, 1, 5}, {10, 12, 14, 16, 17}); + auto expected = CreateTensor({1, 1, 5, 1}, {10, 12, 14, 16, 17}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } - -template +template static void SimpleMaxPooling3S2() { // Construct graph OpsTestNet net; - OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( - "Input", {1, 1, 3, 9}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + "Input", {1, 3, 9, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); - // Run - net.RunOp(D); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + OpDefBuilder("Pooling", "PoolingTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + // Run + OpDefBuilder("Pooling", "PoolingTest") + .Input("Input") + .Output("Output") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + } // Check - auto expected = CreateTensor({1, 1, 1, 4}, {20, 22, 24, 26}); + auto expected = CreateTensor({1, 1, 4, 1}, {20, 22, 24, 26}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -182,15 +169,15 @@ static void SimpleMaxPooling3S2() { TEST_F(PoolingOpTest, CPUSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } -TEST_F(PoolingOpTest, NEONSimpleMaxPooling3S2) { - SimpleMaxPooling3S2(); -} + TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } -template -static void AlignedMaxPooling3S2(Padding padding) { +template +static void MaxPooling3S2(const std::vector &input_shape, + const std::vector strides, + Padding padding) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") @@ -198,22 +185,35 @@ static void AlignedMaxPooling3S2(Padding padding) { .Output("Output") .AddIntArg("pooling_type", PoolingType::MAX) .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 128, 64, 64}); - // Run - net.RunOp(D); + net.AddRandomInput("Input", input_shape); + + // run on cpu + net.RunOp(); Tensor expected; expected.Copy(*net.GetOutput("Output")); - // Run on cpu - net.RunOp(); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + OpDefBuilder("Pooling", "PoolingTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + net.RunOp(D); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - ExpectTensorNear(*net.GetOutput("Output"), expected, 0.001); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); } // TODO(chenghui) : there is a bug. @@ -223,152 +223,158 @@ static void AlignedMaxPooling3S2(Padding padding) { //} TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) { - AlignedMaxPooling3S2(Padding::VALID); - AlignedMaxPooling3S2(Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); +} + +TEST_F(PoolingOpTest, OPENCLHalfAlignedMaxPooling3S2) { + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 64, 32, 32}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 64, 32, 32}, {2, 2}, Padding::SAME); } -template -static void UnalignedMaxPooling3S2(Padding padding) { +TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::VALID); + MaxPooling3S2({3, 41, 43, 47}, {1, 1}, Padding::SAME); + MaxPooling3S2({3, 41, 43, 47}, {2, 2}, Padding::SAME); +} + +TEST_F(PoolingOpTest, AVG_VALID) { // Construct graph - OpsTestNet net; + auto &net = test_net(); OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", padding) + .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::AVG) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 113, 43, 47}); - // Run - net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + net.AddInputFromArray( + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); - // Run on cpu + // Run net.RunOp(); - ExpectTensorNear(*net.GetOutput("Output"), expected, 0.001); -} - -// TODO(chenghui) : there is a bug. -//TEST_F(PoolingOpTest, NEONUnalignedMaxPooling3S2) { -// UnalignedMaxPooling3S2(); -//} + // Check + auto expected = CreateTensor( + {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); -TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) { - UnalignedMaxPooling3S2(Padding::VALID); - UnalignedMaxPooling3S2(Padding::SAME); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -template +template static void SimpleAvgPoolingTest() { // Construct graph OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 2, 8, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") + .Input("InputImage") + .Output("OutputImage") .AddIntArg("pooling_type", PoolingType::AVG) .AddIntsArg("kernels", {2, 2}) .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddInputFromArray( - "Input", {1, 1, 2, 8}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // Run net.RunOp(D); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); // Check - auto expected = CreateTensor({1, 1, 1, 4}, {4.5, 6.5, 8.5, 10.5}); + auto expected = CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(PoolingOpTest, NEONSimpleAvgPooling) { - SimpleAvgPoolingTest(); -} - TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { SimpleAvgPoolingTest(); } -template -static void AlignedAvgPoolingTest(Padding padding) { +template +static void AvgPoolingTest(const std::vector &shape, + const std::vector &kernels, + const std::vector &strides, + Padding padding) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") .Input("Input") .Output("Output") .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", {4, 4}) - .AddIntsArg("strides", {4, 4}) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {3, 128, 15, 15}); - // Run - net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + net.AddRandomInput("Input", shape); - // Run on cpu + // run on cpu net.RunOp(); + Tensor expected; + expected.Copy(*net.GetOutput("Output")); - ExpectTensorNear(*net.GetOutput("Output"), expected, 1e-5); -} - -TEST_F(PoolingOpTest, NEONAlignedAvgPooling) { - AlignedAvgPoolingTest(Padding::VALID); - AlignedAvgPoolingTest(Padding::SAME); -} - -TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { - AlignedAvgPoolingTest(Padding::VALID); - AlignedAvgPoolingTest(Padding::SAME); -} - -template -static void UnAlignedAvgPoolingTest(Padding padding) { - // Construct graph - OpsTestNet net; + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") + .Input("InputImage") + .Output("OutputImage") .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", {7, 7}) - .AddIntsArg("strides", {7, 7}) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); - - // Add input data - net.AddRandomInput("Input", {3, 128, 31, 37}); - // Run net.RunOp(D); - Tensor expected; - expected.Copy(*net.GetOutput("Output")); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - // Run on cpu - net.RunOp(); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.01); +} - ExpectTensorNear(*net.GetOutput("Output"), expected, 1e-5); +TEST_F(PoolingOpTest, OPENCLAlignedAvgPooling) { + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID); + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); } -TEST_F(PoolingOpTest, NEONUnAlignedAvgPooling) { - UnAlignedAvgPoolingTest(Padding::VALID); - UnAlignedAvgPoolingTest(Padding::SAME); +TEST_F(PoolingOpTest, OPENCLHalfAlignedAvgPooling) { + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::VALID); + AvgPoolingTest({3, 15, 15, 128}, {4, 4}, {4, 4}, Padding::SAME); +} + +TEST_F(PoolingOpTest, OPENCLAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID); + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME); +} + +TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::VALID); + AvgPoolingTest({3, 64, 64, 128}, {16, 16}, {16, 16}, Padding::SAME); } TEST_F(PoolingOpTest, OPENCLUnAlignedAvgPooling) { - UnAlignedAvgPoolingTest(Padding::VALID); - UnAlignedAvgPoolingTest(Padding::SAME); + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::VALID); + AvgPoolingTest({3, 31, 37, 128}, {2, 2}, {2, 2}, Padding::SAME); } + +TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) { + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::VALID); + AvgPoolingTest({3, 31, 37, 128}, {8, 8}, {8, 8}, Padding::SAME); +} +