From b20196877f3aaab3a4bbf91d9de00f3646f19b7a Mon Sep 17 00:00:00 2001 From: liutuo Date: Tue, 15 May 2018 16:54:35 +0800 Subject: [PATCH] add winograd 6x6 kernel --- mace/kernels/buffer_to_image.h | 12 +- mace/kernels/image_to_buffer.h | 12 +- mace/kernels/opencl/buffer_to_image.cc | 15 +- mace/kernels/opencl/cl/buffer_to_image.cl | 333 ++++++++- mace/kernels/opencl/cl/winograd_transform.cl | 673 +++++++++++++++++- mace/kernels/opencl/helper.cc | 22 +- mace/kernels/opencl/helper.h | 6 +- mace/kernels/opencl/image_to_buffer.cc | 12 +- mace/kernels/opencl/winograd_transform.cc | 141 +++- mace/kernels/winograd_transform.h | 34 +- mace/ops/buffer_to_image.h | 3 +- mace/ops/conv_2d_benchmark.cc | 5 + mace/ops/image_to_buffer.h | 3 +- mace/ops/ops_test_util.h | 33 +- mace/ops/winograd_convolution_benchmark.cc | 134 ++++ mace/ops/winograd_convolution_test.cc | 168 +++-- mace/ops/winograd_inverse_transform.h | 3 +- mace/ops/winograd_transform.h | 4 +- mace/ops/winograd_transform_benchmark.cc | 223 +++++- .../tools/converter_tool/base_converter.py | 1 + .../tools/converter_tool/transformer.py | 54 +- 21 files changed, 1710 insertions(+), 181 deletions(-) create mode 100644 mace/ops/winograd_convolution_benchmark.cc diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h index a93af90b..fedf8190 100644 --- a/mace/kernels/buffer_to_image.h +++ b/mace/kernels/buffer_to_image.h @@ -25,14 +25,17 @@ namespace mace { namespace kernels { struct BufferToImageFunctorBase { - BufferToImageFunctorBase() - : kernel_error_(nullptr) {} + explicit BufferToImageFunctorBase(const int wino_blk_size) + : kernel_error_(nullptr), + wino_blk_size_(wino_blk_size) {} std::unique_ptr kernel_error_; + const int wino_blk_size_; }; template struct BufferToImageFunctor : BufferToImageFunctorBase { - BufferToImageFunctor() {} + explicit BufferToImageFunctor(const int wino_blk_size) + : BufferToImageFunctorBase(wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, @@ -48,7 +51,8 @@ struct BufferToImageFunctor : BufferToImageFunctorBase { template struct BufferToImageFunctor : BufferToImageFunctorBase { - BufferToImageFunctor() {} + explicit BufferToImageFunctor(const int wino_blk_size) + : BufferToImageFunctorBase(wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, diff --git a/mace/kernels/image_to_buffer.h b/mace/kernels/image_to_buffer.h index 22ce6af5..77388da7 100644 --- a/mace/kernels/image_to_buffer.h +++ b/mace/kernels/image_to_buffer.h @@ -25,14 +25,17 @@ namespace mace { namespace kernels { struct ImageToBufferFunctorBase { - ImageToBufferFunctorBase() - : kernel_error_(nullptr) {} + explicit ImageToBufferFunctorBase(const int wino_blk_size) + : kernel_error_(nullptr), + wino_blk_size_(wino_blk_size) {} std::unique_ptr kernel_error_; + const int wino_blk_size_; }; template struct ImageToBufferFunctor : ImageToBufferFunctorBase { - ImageToBufferFunctor() {} + explicit ImageToBufferFunctor(const int wino_blk_size) + : ImageToBufferFunctorBase(wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, @@ -48,7 +51,8 @@ struct ImageToBufferFunctor : ImageToBufferFunctorBase { template struct ImageToBufferFunctor : ImageToBufferFunctorBase { - ImageToBufferFunctor() {} + explicit ImageToBufferFunctor(const int wino_blk_size) + : ImageToBufferFunctorBase(wino_blk_size) {} MaceStatus operator()(const Tensor *input, const BufferType type, Tensor *output, diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 14cc9312..394c48ff 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -26,9 +26,10 @@ MaceStatus BufferToImageFunctor::operator()( Tensor *image, StatsFuture *future) { std::vector image_shape; - CalImage2DShape(buffer->shape(), type, &image_shape); + CalImage2DShape(buffer->shape(), type, &image_shape, wino_blk_size_); if (type == WINOGRAD_FILTER) { - std::vector new_shape = CalWinogradShape(buffer->shape(), type); + std::vector new_shape = + CalWinogradShape(buffer->shape(), type, wino_blk_size_); MACE_RETURN_IF_ERROR(image->ResizeImage(new_shape, image_shape)); } else { MACE_RETURN_IF_ERROR(image->ResizeImage(buffer->shape(), image_shape)); @@ -62,10 +63,14 @@ MaceStatus BufferToImageFunctor::operator()( case WEIGHT_WIDTH: kernel_name = "weight_width_buffer_to_image"; break; - case WINOGRAD_FILTER: - gws[1] /= 16; - kernel_name = "winograd_filter_buffer_to_image"; + case WINOGRAD_FILTER: { + std::stringstream ss_tmp; + gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2); + ss_tmp << "winograd_filter_buffer_to_image_" + << wino_blk_size_ << "x" << wino_blk_size_; + kernel_name = ss_tmp.str(); break; + } } auto runtime = OpenCLRuntime::Global(); diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index 4efab52a..c4ef4988 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -617,7 +617,7 @@ __kernel void weight_width_image_to_buffer(KERNEL_ERROR_PARAMS } // only support 3x3 now -__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS +__kernel void winograd_filter_buffer_to_image_2x2(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global const DATA_TYPE *input, //Oc, Ic, H, W __private const int input_offset, @@ -724,7 +724,7 @@ __kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS } // only support 3x3 now -__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS +__kernel void winograd_filter_image_to_buffer_2x2(KERNEL_ERROR_PARAMS GLOBAL_WORK_GROUP_SIZE_DIM2 __global DATA_TYPE *output, //Oc, Ic, H, W __private const int height, @@ -765,3 +765,332 @@ __kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS offset += height * width; } } + +// only support 3x3 now +__kernel void winograd_filter_buffer_to_image_6x6(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global const DATA_TYPE *input, //Oc, Ic, H, W + __private const int input_offset, + __private const int in_channels, + __private const int height, + __private const int width, + __write_only image2d_t output) { + int w = get_global_id(0); + int h = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || h >= global_size_dim1) { + return; + } + const int out_channels = global_size_dim1; +#else + const int out_channels = get_global_size(1); +#endif + + const int out_channel_idx = h; + const int in_channel_idx = w << 2; + const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width; + const int length = min((in_channels - in_channel_idx) * 9, 36); + DATA_TYPE in[36] = {0}; + DATA_TYPE4 tt0, tt1, t1; + DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3], tu6[3], tu7[3]; + + const float a = -0.222222222f; + const float b = 0.011111111f; + const float c = 0.005555556f; + +#pragma unroll + for (short i = 0; i < length; ++i) { + in[i] = *(input + offset + i); + } + + tu0[0] = (DATA_TYPE4)(in[0], in[9], in[18], in[27]); + t1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]); + tu7[0] = (DATA_TYPE4)(in[6], in[15], in[24], in[33]); + + tt0 = tu0[0] + tu7[0]; + tt1 = t1; + tu1[0] = mad(tt0 + tt1, a, 0); + tu2[0] = mad(tt0 - tt1, a, 0); + tt0 = mad(tu7[0], 4, tu0[0]); + tt1 = mad(t1, 2, 0); + tu3[0] = mad(tt0 + tt1, b, 0); + tu4[0] = mad(tt0 - tt1, b, 0); + tt0 = mad(tu0[0], 4, tu7[0]); + tt1 = mad(t1, 2, 0); + tu5[0] = mad(tt0 + tt1, c, 0); + tu6[0] = mad(tt0 - tt1, c, 0); + + tu0[1] = (DATA_TYPE4)(in[1], in[10], in[19], in[28]); + t1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]); + tu7[1] = (DATA_TYPE4)(in[7], in[16], in[25], in[34]); + + tt0 = tu0[1] + tu7[1]; + tt1 = t1; + tu1[1] = mad(tt0 + tt1, a, 0); + tu2[1] = mad(tt0 - tt1, a, 0); + + tt0 = mad(tu7[1], 4, tu0[1]); + tt1 = mad(t1, 2, 0); + tu3[1] = mad(tt0 + tt1, b, 0); + tu4[1] = mad(tt0 - tt1, b, 0); + + tt0 = mad(tu0[1], 4, tu7[1]); + tt1 = mad(t1, 2, 0); + tu5[1] = mad(tt0 + tt1, c, 0); + tu6[1] = mad(tt0 - tt1, c, 0); + + tu0[2] = (DATA_TYPE4)(in[2], in[11], in[20], in[29]); + t1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]); + tu7[2] = (DATA_TYPE4)(in[8], in[17], in[26], in[35]); + + tt0 = tu0[2] + tu7[2]; + tt1 = t1; + tu1[2] = mad(tt0 + tt1, a, 0); + tu2[2] = mad(tt0 - tt1, a, 0); + + tt0 = mad(tu7[2], 4, tu0[2]); + tt1 = mad(t1, 2, 0); + tu3[2] = mad(tt0 + tt1, b, 0); + tu4[2] = mad(tt0 - tt1, b, 0); + + tt0 = mad(tu0[2], 4, tu7[2]); + tt1 = mad(t1, 2, 0); + tu5[2] = mad(tt0 + tt1, c, 0); + tu6[2] = mad(tt0 - tt1, c, 0); + +#define PROCESS(i) \ + t1 = tu##i[0]; \ + WRITE_IMAGET(output, (int2)(w, h), t1); \ + h += out_channels; \ + tt0 = tu##i[0] + tu##i[2]; \ + tt1 = tu##i[1]; \ + t1 = mad(tt0 + tt1, a, 0); \ + WRITE_IMAGET(output, (int2)(w, h), t1); \ + h += out_channels; \ + t1 = mad(tt0 - tt1, a, 0); \ + WRITE_IMAGET(output, (int2)(w, h), t1); \ + h += out_channels; \ + tt0 = mad(tu##i[2], 4, tu##i[0]); \ + tt1 = mad(tu##i[1], 2, 0); \ + t1 = mad(tt0 + tt1, b, 0); \ + WRITE_IMAGET(output, (int2)(w, h), t1); \ + h += out_channels; \ + t1 = mad(tt0 - tt1, b, 0); \ + WRITE_IMAGET(output, (int2)(w, h), t1); \ + h += out_channels; \ + tt0 = mad(tu##i[0], 4, tu##i[2]); \ + tt1 = mad(tu##i[1], 2, 0); \ + t1 = mad(tt0 + tt1, c, 0); \ + WRITE_IMAGET(output, (int2)(w, h), t1); \ + h += out_channels; \ + t1 = mad(tt0 - tt1, c, 0); \ + WRITE_IMAGET(output, (int2)(w, h), t1); \ + h += out_channels; \ + t1 = tu##i[2]; \ + WRITE_IMAGET(output, (int2)(w, h), t1); \ + h += out_channels; \ + +PROCESS(0); +PROCESS(1); +PROCESS(2); +PROCESS(3); +PROCESS(4); +PROCESS(5); +PROCESS(6); +PROCESS(7); + +#undef PROCESS + +} +__kernel void winograd_filter_image_to_buffer_6x6(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global DATA_TYPE *output, //Oc, Ic, H, W + __private const int height, + __private const int width, + __private const int channel, + __read_only image2d_t input) { + const int w = get_global_id(0); + const int h = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || h >= global_size_dim1) { + return; + } +#endif + + const int width_idx = w << 2; + const int size = width - width_idx; + int offset = h * width + width_idx; + + int2 coord = (int2)(w, h); + DATA_TYPE4 values; + for (short i = 0; i < 64; ++i) { + values = READ_IMAGET(input, SAMPLER, coord); + if (size < 4) { + switch (size) { + case 3: + output[offset+2] = values.z; + case 2: + output[offset+1] = values.y; + case 1: + output[offset] = values.x; + } + } else { + vstore4(values, 0, output + offset); + } + coord.y += height; + offset += height * width; + } +} + +// only support 3x3 now +__kernel void winograd_filter_buffer_to_image_4x4(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global const DATA_TYPE *input, //Oc, Ic, H, W + __private const int input_offset, + __private const int in_channels, + __private const int height, + __private const int width, + __write_only image2d_t output) { + int w = get_global_id(0); + int h = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || h >= global_size_dim1) { + return; + } + const int out_channels = global_size_dim1; +#else + const int out_channels = get_global_size(1); +#endif + + const int out_channel_idx = h; + const int in_channel_idx = w << 2; + const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width; + const int length = min((in_channels - in_channel_idx) * 9, 36); + DATA_TYPE in[36] = {0}; + DATA_TYPE4 tt0, tt1, tt2; + DATA_TYPE4 tu0[3], tu1[3], tu2[3], tu3[3], tu4[3], tu5[3]; + const float a = 0.25f; + const float b = -0.166666667f; + const float c = 0.041666667f; + +#pragma unroll + for (short i = 0; i < length; ++i) { + in[i] = *(input + offset + i); + } + + tt0 = (DATA_TYPE4)(in[0], in[9], in[18], in[27]); + tt1 = (DATA_TYPE4)(in[3], in[12], in[21], in[30]); + tt2 = (DATA_TYPE4)(in[6], in[15], in[24], in[33]); + + tu0[0] = mad(tt0, a, 0); + tu1[0] = mad((tt0 + tt1 + tt2), b, 0); + tu2[0] = mad((tt0 - tt1 + tt2), b, 0); + tt0 = mad(tt2, 4, tt0); + tu3[0] = mad(mad(tt1, 2, tt0), c, 0); + tu4[0] = mad(mad(tt1, -2, tt0), c, 0); + + tu5[0] = tt2; + + tt0 = (DATA_TYPE4)(in[1], in[10], in[19], in[28]); + tt1 = (DATA_TYPE4)(in[4], in[13], in[22], in[31]); + tt2 = (DATA_TYPE4)(in[7], in[16], in[25], in[34]); + + tu0[1] = mad(tt0, a, 0); + tu1[1] = mad((tt0 + tt1 + tt2), b, 0); + tu2[1] = mad((tt0 - tt1 + tt2), b, 0); + tt0 = mad(tt2, 4, tt0); + tu3[1] = mad(mad(tt1, 2, tt0), c, 0); + tu4[1] = mad(mad(tt1, -2, tt0), c, 0); + + tu5[1] = tt2; + + tt0 = (DATA_TYPE4)(in[2], in[11], in[20], in[29]); + tt1 = (DATA_TYPE4)(in[5], in[14], in[23], in[32]); + tt2 = (DATA_TYPE4)(in[8], in[17], in[26], in[35]); + + tu0[2] = mad(tt0, a, 0); + tu1[2] = mad((tt0 + tt1 + tt2), b, 0); + tu2[2] = mad((tt0 - tt1 + tt2), b, 0); + tt0 = mad(tt2, 4, tt0); + tu3[2] = mad(mad(tt1, 2, tt0), c, 0); + tu4[2] = mad(mad(tt1, -2, tt0), c, 0); + + tu5[2] = tt2; + +#define PROCESS(i) \ + tt2 = mad(tu##i[0], a, 0); \ + WRITE_IMAGET(output, (int2)(w, h), tt2); \ + h += out_channels; \ + tt0 = tu##i[1]; \ + tt1 = tu##i[0] + tu##i[2]; \ + tt2 = mad((tt0 + tt1), b, 0); \ + WRITE_IMAGET(output, (int2)(w, h), tt2); \ + h += out_channels; \ + tt2 = mad(tt1 - tt0, b, 0); \ + WRITE_IMAGET(output, (int2)(w, h), tt2); \ + h += out_channels; \ + tt0 = mad(tu##i[2], 4, tu##i[0]); \ + tt1 = 2 * tu##i[1]; \ + tt2 = mad(tt0 + tt1, c, 0); \ + WRITE_IMAGET(output, (int2)(w, h), tt2); \ + h += out_channels; \ + tt2 = mad(tt0 - tt1, c, 0); \ + WRITE_IMAGET(output, (int2)(w, h), tt2); \ + h += out_channels; \ + tt2 = tu##i[2]; \ + WRITE_IMAGET(output, (int2)(w, h), tt2); \ + h += out_channels; \ + + PROCESS(0); + PROCESS(1); + PROCESS(2); + PROCESS(3); + PROCESS(4); + PROCESS(5); + +#undef PROCESS + +} +__kernel void winograd_filter_image_to_buffer_4x4(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __global DATA_TYPE *output, //Oc, Ic, H, W + __private const int height, + __private const int width, + __private const int channel, + __read_only image2d_t input) { + const int w = get_global_id(0); + const int h = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (w >= global_size_dim0 || h >= global_size_dim1) { + return; + } +#endif + + const int width_idx = w << 2; + const int size = width - width_idx; + int offset = h * width + width_idx; + + int2 coord = (int2)(w, h); + DATA_TYPE4 values; + for (short i = 0; i < 36; ++i) { + values = READ_IMAGET(input, SAMPLER, coord); + if (size < 4) { + switch (size) { + case 3: + output[offset+2] = values.z; + case 2: + output[offset+1] = values.y; + case 1: + output[offset] = values.x; + } + } else { + vstore4(values, 0, output + offset); + } + coord.y += height; + offset += height * width; + } +} \ No newline at end of file diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl index 4201fd63..833b7191 100644 --- a/mace/kernels/opencl/cl/winograd_transform.cl +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -8,7 +8,9 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS __private const int in_width, __private const int in_channel, __private const int round_hw, + __private const float round_hw_r, __private const int round_w, + __private const float round_w_r, __private const int padding_top, __private const int padding_left) { int out_width_idx = get_global_id(0); @@ -23,10 +25,12 @@ __kernel void winograd_transform_2x2(KERNEL_ERROR_PARAMS const int chan_blk_size = get_global_size(1); #endif - const int batch_idx = out_width_idx / round_hw; - const int t_idx = out_width_idx % round_hw; - const int height_idx = ((t_idx / round_w) << 1) - padding_top; - const int width_idx = ((t_idx % round_w) << 1) - padding_left; + const int batch_idx = out_width_idx * round_hw_r; + const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); + const int n_round_w = t_idx * round_w_r; + const int mod_round_w = mad24(n_round_w, -round_w, t_idx); + const int height_idx = (n_round_w << 1) - padding_top; + const int width_idx = (mod_round_w << 1) - padding_left; const int nh_idx = mad24(batch_idx, in_height, height_idx); const int wc_idx = mad24(chan_blk_idx, in_width, width_idx); @@ -126,7 +130,9 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS __private const int out_height, __private const int out_width, __private const int round_hw, + __private const float round_hw_r, __private const int round_w, + __private const float round_w_r, __private const float relux_max_limit) { const int width_idx = get_global_id(0); const int height_idx = get_global_id(1); @@ -143,10 +149,12 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS int width = width_idx; int height = height_idx; - const int batch = width_idx / round_hw; - int t = width_idx % round_hw; - const int out_height_idx = (t / round_w) << 1; - const int out_width_idx = (t % round_w) << 1; + const int batch = width_idx * round_hw_r; + int t = mad24(batch, -round_hw, width_idx); + const int n_round_w = t * round_w_r; + const int mod_round_w = mad24(n_round_w, -round_w, t); + const int out_height_idx = n_round_w << 1; + const int out_width_idx = mod_round_w << 1; const int out_chan_idx = height_idx; const int coord_x = mad24(out_chan_idx, out_width, out_width_idx); const int coord_y = mad24(batch, out_height, out_height_idx); @@ -226,3 +234,652 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS } } + +__kernel void winograd_transform_6x6(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int in_channel, + __private const int round_hw, + __private const float round_hw_r, + __private const int round_w, + __private const float round_w_r, + __private const int padding_top, + __private const int padding_left) { + int out_width_idx_i = get_global_id(0); + int chan_blk_idx_i = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (out_width_idx_i >= global_size_dim0 || chan_blk_idx_i >= global_size_dim1) { + return; + } + const int chan_blk_size = global_size_dim1 >> 3; +#else + const int chan_blk_size = get_global_size(1) >> 3; +#endif + __local DATA_TYPE4 in[8][8]; + + int out_width_idx = out_width_idx_i >> 3; + int chan_blk_idx = chan_blk_idx_i >> 3; + int i = mad24(out_width_idx, -8, out_width_idx_i); + int j = mad24(chan_blk_idx, -8, chan_blk_idx_i); + const int batch_idx = out_width_idx / round_hw; + const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); + const int n_round_w = t_idx / round_w; + const int mod_round_w = mad24(n_round_w, -round_w, t_idx); + const int height_idx = mad24(n_round_w, 6, -padding_top); + const int width_idx = mad24(mod_round_w, 6, -padding_left); + + const int nh_idx = mad24(batch_idx, in_height, height_idx); + const int wc_idx = mad24(chan_blk_idx, in_width, width_idx); + + int y = select(nh_idx + j, -1, height_idx + j < 0 || height_idx + j >= in_height); + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + in[j][i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + barrier(CLK_LOCAL_MEM_FENCE); + + DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7; + DATA_TYPE4 tmp; + + if (j == 0) { + tmp = 0.5f * (in[1][i] + in[5][i]) - 2.5f * in[3][i]; + tt2 = 1.5f * in[5][i] + tmp; + tt4 = 1.5f * in[1][i] + tmp; + tt0 = in[1][i] - 4.25f * in[3][i] + in[5][i]; + tt1 = in[2][i] - 4.25f * in[4][i] + in[6][i]; + tmp = in[2][i] - 5 * in[4][i]; + tt3 = in[6][i] + 0.25f * tmp; + tt5 = in[6][i] + 3 * in[2][i] + tmp; + tt6 = 5.25f * (in[4][i] - in[2][i]) + in[0][i] - in[6][i]; + tt7 = 5.25f * (in[3][i] - in[5][i]) + in[7][i] - in[1][i]; + in[0][i] = tt6; + in[1][i] = tt1 + tt0; + in[2][i] = tt1 - tt0; + in[3][i] = tt3 + tt2; + in[4][i] = tt3 - tt2; + in[5][i] = tt5 + tt4; + in[6][i] = tt5 - tt4; + in[7][i] = tt7; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(j == 0) { + tmp = 0.5f * (in[i][1] + in[i][5]) - 2.5f * in[i][3]; + tt2 = 1.5f * in[i][5] + tmp; + tt4 = 1.5f * in[i][1] + tmp; + tt0 = in[i][1] - 4.25f * in[i][3] + in[i][5]; + tt1 = in[i][2] - 4.25f * in[i][4] + in[i][6]; + tmp = in[i][2] - 5 * in[i][4]; + tt3 = in[i][6] + 0.25f * tmp; + tt5 = in[i][6] + 3 * in[i][2] + tmp; + tt6 = 5.25f * (in[i][4] - in[i][2]) + in[i][0] - in[i][6]; + tt7 = 5.25f * (in[i][3] - in[i][5]) + in[i][7] - in[i][1]; + in[i][0] = tt6; + in[i][1] = tt1 + tt0; + in[i][2] = tt1 - tt0; + in[i][3] = tt3 + tt2; + in[i][4] = tt3 - tt2; + in[i][5] = tt5 + tt4; + in[i][6] = tt5 - tt4; + in[i][7] = tt7; + } + barrier(CLK_LOCAL_MEM_FENCE); + + chan_blk_idx += mul24(mad24(j, 8, i), chan_blk_size); + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in[j][i]); + +} + +__kernel void winograd_inverse_transform_6x6(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __read_only image2d_t input, +#ifdef BIAS + __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif + __write_only image2d_t output, + __private const int out_height, + __private const int out_width, + __private const int round_hw, + __private const float round_hw_r, + __private const int round_w, + __private const float round_w_r, + __private const float relux_max_limit) { + const int width_idx = get_global_id(0); + const int height_idx = get_global_id(1); +#ifndef NON_UNIFORM_WORK_GROUP + if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { + return; + } + const int out_channel = global_size_dim1; +#else + const int out_channel = get_global_size(1); +#endif + + DATA_TYPE4 in0[8], in1[8], in2[8], in3[8], in4[8], in5[8], in6[8], in7[8]; + + DATA_TYPE4 tv0[8], tv1[8], tv2[8], tv3[8], tv4[8], tv5[8]; + const int width = width_idx; + const int height = height_idx; + + const int batch = width / round_hw; + const int t = mad24(batch, -round_hw, width); + const int n_round_w = t / round_w; + const int mod_round_w = mad24(n_round_w, -round_w, t); + const int out_height_idx = mul24(n_round_w, 6); + const int out_width_idx = mul24(mod_round_w, 6); + const int out_chan_idx = height; + const int coord_x = mad24(out_chan_idx, out_width, out_width_idx); + const int coord_y = mad24(batch, out_height, out_height_idx); + int h = height_idx; +#pragma unroll + for (short i = 0; i < 8; ++i) { + in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 8; ++i) { + in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 8; ++i) { + in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 8; ++i) { + in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 8; ++i) { + in4[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 8; ++i) { + in5[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 8; ++i) { + in6[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 8; ++i) { + in7[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } + DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, d0, d7; +#define PROCESS_IN(i) \ + d0 = in0[i];\ + d7 = in7[i];\ + tt0 = in1[i] + in2[i];\ + tt1 = in1[i] - in2[i];\ + tt2 = in3[i] + in4[i];\ + tt3 = in3[i] - in4[i];\ + tt3 = tt3 + tt3;\ + tt4 = in5[i] + in6[i];\ + tt4 = tt4 + tt4;\ + tt5 = in5[i] - in6[i];\ + tt0 = tt0 + tt2 + tt4;\ + tt1 = tt1 + tt3 + tt5;\ + tv0[i] = tt0 + tt4 * 15 + d0;\ + tv1[i] = tt1 + tt5 * 15;\ + tv2[i] = tt0 + 3 * (tt2 + tt4);\ + tv3[i] = tt1 + 3 * (tt3 + tt5);\ + tv4[i] = tt0 + tt2 * 15;\ + tv5[i] = tt1 + tt3 * 15 + d7;\ + + PROCESS_IN(0); + PROCESS_IN(1); + PROCESS_IN(2); + PROCESS_IN(3); + PROCESS_IN(4); + PROCESS_IN(5); + PROCESS_IN(6); + PROCESS_IN(7); +#undef PROCESS_IN + +#define PROCESS_SND(i) \ + d0 = tv##i[0];\ + d7 = tv##i[7];\ + tt0 = tv##i[1] + tv##i[2];\ + tt1 = tv##i[1] - tv##i[2];\ + tt2 = tv##i[3] + tv##i[4];\ + tt3 = tv##i[3] - tv##i[4];\ + tt3 = tt3 + tt3;\ + tt4 = tv##i[5] + tv##i[6];\ + tt4 = tt4 + tt4;\ + tt5 = tv##i[5] - tv##i[6];\ + tt0 = tt0 + tt2 + tt4;\ + tt1 = tt1 + tt3 + tt5;\ + in##i[0] = tt0 + tt4 * 15 + d0;\ + in##i[1] = tt1 + tt5 * 15;\ + in##i[2] = tt0 + (tt2 + tt4) * 3;\ + in##i[3] = tt1 + (tt3 + tt5) * 3;\ + in##i[4] = tt0 + tt2 * 15;\ + in##i[5] = tt1 + tt3 * 15 + d7; + + PROCESS_SND(0); + PROCESS_SND(1); + PROCESS_SND(2); + PROCESS_SND(3); + PROCESS_SND(4); + PROCESS_SND(5); +#undef PROCESS_SND + +#ifdef BIAS + const DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(out_chan_idx, 0)); +#pragma unroll + for (short i = 0; i < 6; ++i) { + in0[i] += bias_value; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in1[i] += bias_value; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in2[i] += bias_value; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in3[i] += bias_value; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in4[i] += bias_value; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in5[i] += bias_value; + } +#endif + +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) +#pragma unroll + for (short i = 0; i < 6; ++i) { + in0[i] = do_activation(in0[i], relux_max_limit); + in1[i] = do_activation(in1[i], relux_max_limit); + in2[i] = do_activation(in2[i], relux_max_limit); + in3[i] = do_activation(in3[i], relux_max_limit); + in4[i] = do_activation(in4[i], relux_max_limit); + in5[i] = do_activation(in5[i], relux_max_limit); + } +#endif + const int num = min(6, out_width - out_width_idx); + const int h_num = out_height - out_height_idx; + if(h_num < 1) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y), in0[i]); + } + if(h_num < 2) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 1), in1[i]); + } + if(h_num < 3) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 2), in2[i]); + } + if(h_num < 4) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 3), in3[i]); + } + if(h_num < 5) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 4), in4[i]); + } + if(h_num < 6) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 5), in5[i]); + } +} + +__kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int in_channel, + __private const int round_hw, + __private const float round_hw_r, + __private const int round_w, + __private const float round_w_r, + __private const int padding_top, + __private const int padding_left) { + int out_width_idx = get_global_id(0); + int chan_blk_idx = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) { + return; + } + const int chan_blk_size = global_size_dim1; +#else + const int chan_blk_size = get_global_size(1); +#endif + + const int batch_idx = out_width_idx * round_hw_r; + const int t_idx = mad24(batch_idx, -round_hw, out_width_idx); + const int n_round_w = t_idx * round_w_r; + const int mod_round_w = mad24(n_round_w, -round_w, t_idx); + const int height_idx = (n_round_w << 2) - padding_top; + const int width_idx = (mod_round_w << 2) - padding_left; + + const int nh_idx = mad24(batch_idx, in_height, height_idx); + const int wc_idx = mad24(chan_blk_idx, in_width, width_idx); + + DATA_TYPE4 in0[6], in1[6], in2[6], in3[6], in4[6], in5[6]; + DATA_TYPE4 tv0[6], tv1[6], tv2[6], tv3[6], tv4[6], tv5[6]; + + int y = select(nh_idx, -1, height_idx < 0 || height_idx >= in_height); +#pragma unroll + for (short i = 0; i < 6; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + in0[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + y = select(nh_idx + 1, -1, height_idx + 1 < 0 || height_idx + 1 >= in_height); +#pragma unroll + for (short i = 0; i < 6; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + in1[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + y = select(nh_idx + 2, -1, height_idx + 2 < 0 || height_idx + 2 >= in_height); +#pragma unroll + for (short i = 0; i < 6; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + in2[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + y = select(nh_idx + 3, -1, height_idx + 3 < 0 || height_idx + 3 >= in_height); +#pragma unroll + for (short i = 0; i < 6; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + in3[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + y = select(nh_idx + 4, -1, height_idx + 4 < 0 || height_idx + 4 >= in_height); +#pragma unroll + for (short i = 0; i < 6; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + in4[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + y = select(nh_idx + 5, -1, height_idx + 5 < 0 || height_idx + 5 >= in_height); +#pragma unroll + for (short i = 0; i < 6; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + in5[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5; +#define PROCESS_IN(i) \ + tt0 = in2[i] - 4 * in0[i]; \ + tt1 = in3[i] - 4 * in1[i]; \ + tt2 = in4[i] - 4 * in2[i]; \ + tt3 = in5[i] - 4 * in3[i]; \ + tt4 = in3[i] - in1[i]; \ + tt4 = tt4 + tt4; \ + tt5 = in4[i] - in2[i]; \ + tv0[i] = tt2 - tt0; \ + tv1[i] = tt2 + tt1; \ + tv2[i] = tt2 - tt1; \ + tv3[i] = tt5 + tt4; \ + tv4[i] = tt5 - tt4; \ + tv5[i] = tt3 - tt1; + + PROCESS_IN(0); + PROCESS_IN(1); + PROCESS_IN(2); + PROCESS_IN(3); + PROCESS_IN(4); + PROCESS_IN(5); + +#undef PROCESS_IN + +#define PROCESS_SND(i) \ + tt0 = tv##i[2] - 4 * tv##i[0]; \ + tt1 = tv##i[3] - 4 * tv##i[1]; \ + tt2 = tv##i[4] - 4 * tv##i[2]; \ + tt3 = tv##i[5] - 4 * tv##i[3]; \ + tt4 = tv##i[3] - tv##i[1]; \ + tt4 = tt4 + tt4; \ + tt5 = tv##i[4] - tv##i[2]; \ + in##i[0] = tt2 - tt0; \ + in##i[1] = tt2 + tt1; \ + in##i[2] = tt2 - tt1; \ + in##i[3] = tt5 + tt4; \ + in##i[4] = tt5 - tt4; \ + in##i[5] = tt3 - tt1; + + PROCESS_SND(0); + PROCESS_SND(1); + PROCESS_SND(2); + PROCESS_SND(3); + PROCESS_SND(4); + PROCESS_SND(5); + +#undef PROCESS_SND + +#pragma unroll + for (short i = 0; i < 6; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in0[i]); + chan_blk_idx += chan_blk_size; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in1[i]); + chan_blk_idx += chan_blk_size; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in2[i]); + chan_blk_idx += chan_blk_size; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in3[i]); + chan_blk_idx += chan_blk_size; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in4[i]); + chan_blk_idx += chan_blk_size; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in5[i]); + chan_blk_idx += chan_blk_size; + } +} + +__kernel void winograd_inverse_transform_4x4(KERNEL_ERROR_PARAMS + GLOBAL_WORK_GROUP_SIZE_DIM2 + __read_only image2d_t input, +#ifdef BIAS + __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif + __write_only image2d_t output, + __private const int out_height, + __private const int out_width, + __private const int round_hw, + __private const float round_hw_r, + __private const int round_w, + __private const float round_w_r, + __private const float relux_max_limit) { + const int width_idx = get_global_id(0); + const int height_idx = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) { + return; + } + const int out_channel = global_size_dim1; +#else + const int out_channel = get_global_size(1); +#endif + + const int batch = width_idx * round_hw_r; + int h = mad24(batch, -round_hw, width_idx); + int n_round_w = h * round_w_r; + int mod_round_w = mad24(n_round_w, -round_w, h); + const int out_height_idx = n_round_w << 2; + const int out_width_idx = mod_round_w << 2; + const int coord_x = mad24(height_idx, out_width, out_width_idx); + const int coord_y = mad24(batch, out_height, out_height_idx); + +#ifdef BIAS + DATA_TYPE4 bias_value = + READ_IMAGET(bias, SAMPLER, (int2)(height_idx, 0)); +#endif + + DATA_TYPE4 out0[4], out1[4], out2[4], out3[4]; + DATA_TYPE4 in0[6], in1[6], in2[6], in3[6], in4[6], in5[6]; + h = height_idx; +#pragma unroll + for (short i = 0; i < 6; ++i) { + in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in4[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } +#pragma unroll + for (short i = 0; i < 6; ++i) { + in5[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h)); + h += out_channel; + } + + DATA_TYPE4 tt0, tt1, tt2, tt3, d0, d5; +#define PROCESS_IN(i) \ + d0 = in0[i]; \ + d5 = in5[i]; \ + tt0 = in1[i] + in2[i]; \ + tt1 = in1[i] - in2[i]; \ + tt2 = in3[i] + in4[i]; \ + tt3 = in3[i] - in4[i]; \ + tt3 = tt3 + tt3; \ + in0[i] = d0 + tt0 + tt2; \ + in1[i] = tt3 + tt1; \ + in2[i] = tt2 * 4 + tt0; \ + in3[i] = tt3 * 4 + tt1 + d5; + + PROCESS_IN(0); + PROCESS_IN(1); + PROCESS_IN(2); + PROCESS_IN(3); + PROCESS_IN(4); + PROCESS_IN(5); + +#undef PROCESS_IN + +#define PROCESS_SND(i) \ + d0 = in##i[0]; \ + d5 = in##i[5]; \ + tt0 = in##i[1] + in##i[2]; \ + tt1 = in##i[1] - in##i[2]; \ + tt2 = in##i[3] + in##i[4]; \ + tt3 = in##i[3] - in##i[4]; \ + tt3 = tt3 + tt3; \ + out##i[0] = d0 + tt0 + tt2; \ + out##i[1] = tt3 + tt1; \ + out##i[2] = tt2 * 4 + tt0; \ + out##i[3] = tt3 * 4 + tt1 + d5; + + PROCESS_SND(0); + PROCESS_SND(1); + PROCESS_SND(2); + PROCESS_SND(3); +#undef PROCESS_SND + +#ifdef BIAS + out0[0] += bias_value; + out0[1] += bias_value; + out0[2] += bias_value; + out0[3] += bias_value; + out1[0] += bias_value; + out1[1] += bias_value; + out1[2] += bias_value; + out1[3] += bias_value; + out2[0] += bias_value; + out2[1] += bias_value; + out2[2] += bias_value; + out2[3] += bias_value; + out3[0] += bias_value; + out3[1] += bias_value; + out3[2] += bias_value; + out3[3] += bias_value; +#endif + +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID) + out0[0] = do_activation(out0[0], relux_max_limit); + out0[1] = do_activation(out0[1], relux_max_limit); + out0[2] = do_activation(out0[2], relux_max_limit); + out0[3] = do_activation(out0[3], relux_max_limit); + out1[0] = do_activation(out1[0], relux_max_limit); + out1[1] = do_activation(out1[1], relux_max_limit); + out1[2] = do_activation(out1[2], relux_max_limit); + out1[3] = do_activation(out1[3], relux_max_limit); + out2[0] = do_activation(out2[0], relux_max_limit); + out2[1] = do_activation(out2[1], relux_max_limit); + out2[2] = do_activation(out2[2], relux_max_limit); + out2[3] = do_activation(out2[3], relux_max_limit); + out3[0] = do_activation(out3[0], relux_max_limit); + out3[1] = do_activation(out3[1], relux_max_limit); + out3[2] = do_activation(out3[2], relux_max_limit); + out3[3] = do_activation(out3[3], relux_max_limit); +#endif + + const int num = min(4, out_width - out_width_idx); + const int h_num = out_height - out_height_idx; + if(h_num < 1) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y), out0[i]); + } + if(h_num < 2) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 1), out1[i]); + } + if(h_num < 3) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 2), out2[i]); + } + if(h_num < 4) return; +#pragma unroll + for (int i = 0; i < num; ++i) { + WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 3), out3[i]); + } +} \ No newline at end of file diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index f01cd0a6..d157db82 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -66,13 +66,15 @@ void CalArgImageShape(const std::vector &shape, // [ (Ic + 3) / 4, 16 * Oc] void CalWinogradFilterImageShape( const std::vector &shape, /* Oc, Ic, H, W*/ - std::vector *image_shape) { + std::vector *image_shape, + const int blk_size) { MACE_CHECK(shape.size() == 4); image_shape->resize(2); (*image_shape)[0] = RoundUpDiv4(shape[1]); - (*image_shape)[1] = (shape[0] << 4); + (*image_shape)[1] = (shape[0] * (blk_size + 2) * (blk_size + 2)); } + // [W * C, N * RoundUp<4>(H)] void CalInOutHeightImageShape(const std::vector &shape, /* NHWC */ std::vector *image_shape) { @@ -120,7 +122,8 @@ void CalWeightWidthImageShape(const std::vector &shape, /* OIHW */ void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, - std::vector *image_shape) { + std::vector *image_shape, + const int wino_block_size) { MACE_CHECK_NOTNULL(image_shape); switch (type) { case CONV2D_FILTER: @@ -142,7 +145,7 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ CalInOutWidthImageShape(shape, image_shape); break; case WINOGRAD_FILTER: - CalWinogradFilterImageShape(shape, image_shape); + CalWinogradFilterImageShape(shape, image_shape, wino_block_size); break; case WEIGHT_HEIGHT: CalWeightHeightImageShape(shape, image_shape); @@ -156,12 +159,15 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ } std::vector CalWinogradShape(const std::vector &shape, - const BufferType type) { + const BufferType type, + const int wino_blk_size) { if (type == WINOGRAD_FILTER) { - return {16, shape[0], shape[1]}; + return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[0], shape[1]}; } else if (type == IN_OUT_HEIGHT) { - index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2); - return {16, shape[3], out_width}; + index_t out_width = + shape[0] * ((shape[1] + wino_blk_size - 1) / wino_blk_size) * + ((shape[2] + wino_blk_size - 1) / wino_blk_size); + return {(wino_blk_size + 2) * (wino_blk_size + 2), shape[3], out_width}; } else { LOG(FATAL) << "Mace not supported yet."; return std::vector(); diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index ca9eef17..0cc236a0 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -46,10 +46,12 @@ enum BufferType { void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, - std::vector *image_shape); + std::vector *image_shape, + const int wino_blk_size = 2); std::vector CalWinogradShape(const std::vector &shape, - const BufferType type); + const BufferType type, + const int wino_blk_size = 2); std::string DtToCLCMDDt(const DataType dt); diff --git a/mace/kernels/opencl/image_to_buffer.cc b/mace/kernels/opencl/image_to_buffer.cc index e22c6e31..f293189b 100644 --- a/mace/kernels/opencl/image_to_buffer.cc +++ b/mace/kernels/opencl/image_to_buffer.cc @@ -26,7 +26,7 @@ MaceStatus ImageToBufferFunctor::operator()( Tensor *buffer, StatsFuture *future) { std::vector image_shape; - CalImage2DShape(image->shape(), type, &image_shape); + CalImage2DShape(image->shape(), type, &image_shape, wino_blk_size_); MACE_RETURN_IF_ERROR(buffer->Resize(image->shape())); uint32_t gws[2] = {static_cast(image_shape[0]), @@ -45,10 +45,14 @@ MaceStatus ImageToBufferFunctor::operator()( case IN_OUT_HEIGHT: kernel_name = "in_out_height_image_to_buffer"; break; - case WINOGRAD_FILTER: - gws[1] /= 16; - kernel_name = "winograd_filter_image_to_buffer"; + case WINOGRAD_FILTER: { + std::stringstream ss_tmp; + gws[1] /= (wino_blk_size_ + 2) * (wino_blk_size_ + 2); + ss_tmp << "winograd_filter_image_to_buffer_" + << wino_blk_size_ << "x" << wino_blk_size_; + kernel_name = ss_tmp.str(); break; + } case WEIGHT_HEIGHT: kernel_name = "weight_height_image_to_buffer"; break; diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 1dfe5f27..60d1426b 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -27,10 +27,24 @@ MaceStatus WinogradTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); + std::string obfuscated_kernel_name; std::set built_options; - built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name); + if (wino_blk_size_ == 6) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_transform_6x6"); + built_options.emplace("-Dwinograd_transform_6x6=" + + obfuscated_kernel_name); + } else if (wino_blk_size_ == 4) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_transform_4x4"); + built_options.emplace("-Dwinograd_transform_4x4=" + + obfuscated_kernel_name); + } else { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); + built_options.emplace("-Dwinograd_transform_2x2=" + + obfuscated_kernel_name); + } built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + @@ -66,15 +80,28 @@ MaceStatus WinogradTransformFunctor::operator()( paddings_.data(), dilations_.data(), strides_.data(), RoundType::FLOOR, output_shape.data()); } - const index_t round_h = (output_shape[1] + 1) / 2; - const index_t round_w = (output_shape[2] + 1) / 2; + const index_t round_h = + (output_shape[1] + wino_blk_size_ - 1) / wino_blk_size_; + const index_t round_w = + (output_shape[2] + wino_blk_size_ - 1) / wino_blk_size_; const index_t out_width = input_tensor->dim(0) * round_h * round_w; - const uint32_t gws[2] = { - static_cast(out_width), - static_cast(RoundUpDiv4(input_tensor->dim(3)))}; + const float round_hw_r = 1.f / static_cast(round_h * round_w); + const float round_w_r = 1.f / static_cast(round_w); + const index_t blk_sqr = (wino_blk_size_ + 2) * (wino_blk_size_ + 2); + + uint32_t gws[2]; + if (wino_blk_size_ == 6) { + gws[0] = static_cast(out_width) * (wino_blk_size_ + 2); + gws[1] = + static_cast(RoundUpDiv4(input_tensor->dim(3))) * + (wino_blk_size_ + 2); + } else { + gws[0] = static_cast(out_width); + gws[1] = static_cast(RoundUpDiv4(input_tensor->dim(3))); + } if (!IsVecEqual(input_shape_, input_tensor->shape())) { - output_shape = {16, input_tensor->dim(3), out_width}; + output_shape = {blk_sqr, input_tensor->dim(3), out_width}; std::vector image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); @@ -94,24 +121,66 @@ MaceStatus WinogradTransformFunctor::operator()( kernel_.setArg(idx++, static_cast(input_tensor->dim(2))); kernel_.setArg(idx++, static_cast(input_tensor->dim(3))); kernel_.setArg(idx++, static_cast(round_h * round_w)); + kernel_.setArg(idx++, round_hw_r); kernel_.setArg(idx++, static_cast(round_w)); + kernel_.setArg(idx++, round_w_r); kernel_.setArg(idx++, static_cast(paddings[0] / 2)); kernel_.setArg(idx++, static_cast(paddings[1] / 2)); input_shape_ = input_tensor->shape(); } - const std::vector lws = {kwg_size_ / 8, 8, 0}; - std::string tuning_key = Concat("winograd_transform_kernel", - output_tensor->dim(0), output_tensor->dim(1), - output_tensor->dim(2)); - TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + if (wino_blk_size_ == 6) { + const std::vector lws = + {static_cast(wino_blk_size_ + 2), + static_cast(wino_blk_size_ + 2), 0}; + cl::Event event; + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } else { + std::vector roundup_gws(2, 0); + roundup_gws[0] = RoundUp(gws[0], lws[0]); + roundup_gws[1] = RoundUp(gws[1], lws[1]); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + } - if (runtime->IsOutOfRangeCheckEnabled()) { - kernel_error_->Map(nullptr); - char *kerror_code = kernel_error_->mutable_data(); - MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; - kernel_error_->UnMap(); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + + if (future != nullptr) { + future->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + } else { + const std::vector lws = {kwg_size_ / 8, 8, 0}; + std::string tuning_key = Concat("winograd_transform_kernel", + output_tensor->dim(0), + output_tensor->dim(1), + output_tensor->dim(2)); + TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); + + if (runtime->IsOutOfRangeCheckEnabled()) { + kernel_error_->Map(nullptr); + char *kerror_code = kernel_error_->mutable_data(); + MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; + kernel_error_->UnMap(); + } } return MACE_SUCCESS; @@ -126,11 +195,25 @@ MaceStatus WinogradInverseTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); if (kernel_.get() == nullptr) { - std::string obfuscated_kernel_name = - MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); + std::string obfuscated_kernel_name; std::set built_options; - built_options.emplace("-Dwinograd_inverse_transform_2x2=" + - obfuscated_kernel_name); + if (wino_blk_size_ == 6) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_6x6"); + built_options.emplace("-Dwinograd_inverse_transform_6x6=" + + obfuscated_kernel_name); + } else if (wino_blk_size_ == 4) { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_4x4"); + built_options.emplace("-Dwinograd_inverse_transform_4x4=" + + obfuscated_kernel_name); + } else { + obfuscated_kernel_name = + MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); + built_options.emplace("-Dwinograd_inverse_transform_2x2=" + + obfuscated_kernel_name); + } + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + @@ -187,8 +270,12 @@ MaceStatus WinogradInverseTransformFunctor::operator()( CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape); MACE_RETURN_IF_ERROR(output_tensor->ResizeImage(output_shape, image_shape)); - const uint32_t round_h = (height_ + 1) / 2; - const uint32_t round_w = (width_ + 1) / 2; + const index_t round_h = (height_ + wino_blk_size_ - 1) / wino_blk_size_; + const index_t round_w = (width_ + wino_blk_size_ - 1) / wino_blk_size_; + + const float round_hw_r = 1.f / static_cast(round_h * round_w); + const float round_w_r = 1.f / static_cast(round_w); + uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { kernel_.setArg(idx++, @@ -210,12 +297,13 @@ MaceStatus WinogradInverseTransformFunctor::operator()( kernel_.setArg(idx++, static_cast(output_shape[1])); kernel_.setArg(idx++, static_cast(output_shape[2])); kernel_.setArg(idx++, static_cast(round_h * round_w)); + kernel_.setArg(idx++, round_hw_r); kernel_.setArg(idx++, static_cast(round_w)); + kernel_.setArg(idx++, round_w_r); kernel_.setArg(idx++, relux_max_limit_); input_shape_ = input_tensor->shape(); } - const std::vector lws = {kwg_size_ / 8, 8, 0}; std::string tuning_key = Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), @@ -229,7 +317,6 @@ MaceStatus WinogradInverseTransformFunctor::operator()( MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; kernel_error_->UnMap(); } - return MACE_SUCCESS; } diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index 0cdde365..d3f544d6 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -32,23 +32,27 @@ namespace kernels { struct WinogradTransformFunctorBase { WinogradTransformFunctorBase(const Padding &padding_type, - const std::vector &paddings) + const std::vector &paddings, + const int block_size) : strides_({1, 1}), dilations_({1, 1}), padding_type_(padding_type), - paddings_(paddings) {} + paddings_(paddings), + wino_blk_size_(block_size) {} const std::vector strides_; // [stride_h, stride_w] const std::vector dilations_; // [dilation_h, dilation_w] Padding padding_type_; std::vector paddings_; + const int wino_blk_size_; }; template struct WinogradTransformFunctor : WinogradTransformFunctorBase { WinogradTransformFunctor(const Padding &padding_type, - const std::vector &paddings) - : WinogradTransformFunctorBase(padding_type, paddings) {} + const std::vector &paddings, + const int block_size) + : WinogradTransformFunctorBase(padding_type, paddings, block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -66,8 +70,9 @@ template struct WinogradTransformFunctor : WinogradTransformFunctorBase { WinogradTransformFunctor(const Padding &padding_type, - const std::vector &paddings) - : WinogradTransformFunctorBase(padding_type, paddings) {} + const std::vector &paddings, + const int block_size) + : WinogradTransformFunctorBase(padding_type, paddings, block_size) {} MaceStatus operator()(const Tensor *input, Tensor *output, @@ -85,16 +90,19 @@ struct WinogradInverseTransformFunctorBase { const int height, const int width, const ActivationType activation, - const float relux_max_limit) + const float relux_max_limit, + const int block_size) : batch_(batch), height_(height), width_(width), activation_(activation), - relux_max_limit_(relux_max_limit) {} + relux_max_limit_(relux_max_limit), + wino_blk_size_(block_size) {} const int batch_; const int height_; const int width_; + const int wino_blk_size_; const ActivationType activation_; const float relux_max_limit_; }; @@ -105,9 +113,10 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { const int height, const int width, const ActivationType activation, - const float relux_max_limit) + const float relux_max_limit, + const int block_size) : WinogradInverseTransformFunctorBase( - batch, height, width, activation, relux_max_limit) {} + batch, height, width, activation, relux_max_limit, block_size) {} MaceStatus operator()(const Tensor *input, const Tensor *bias, @@ -130,9 +139,10 @@ struct WinogradInverseTransformFunctor const int height, const int width, const ActivationType activation, - const float relux_max_limit) + const float relux_max_limit, + const int block_size) : WinogradInverseTransformFunctorBase( - batch, height, width, activation, relux_max_limit) {} + batch, height, width, activation, relux_max_limit, block_size) {} MaceStatus operator()(const Tensor *input, const Tensor *bias, diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h index 6d1d0395..7c59c822 100644 --- a/mace/ops/buffer_to_image.h +++ b/mace/ops/buffer_to_image.h @@ -25,7 +25,8 @@ template class BufferToImageOp : public Operator { public: BufferToImageOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + : Operator(op_def, ws), + functor_(OperatorBase::GetOptionalArg("wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input_tensor = this->Input(INPUT); diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 63e8869a..56feeddc 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -175,6 +175,11 @@ MACE_BM_CONV_2D(1, 160, 17, 17, 7, 1, 1, 1, SAME, 192); MACE_BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2); MACE_BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2); + +MACE_BM_CONV_2D(1, 3, 128, 128, 3, 3, 1, 1, SAME, 16); +MACE_BM_CONV_2D(1, 3, 256, 256, 3, 3, 1, 1, SAME, 16); +MACE_BM_CONV_2D(1, 3, 64, 64, 3, 3, 1, 1, SAME, 16); + } // namespace test } // namespace ops } // namespace mace diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h index 9d742033..c1b9b0b8 100644 --- a/mace/ops/image_to_buffer.h +++ b/mace/ops/image_to_buffer.h @@ -25,7 +25,8 @@ template class ImageToBufferOp : public Operator { public: ImageToBufferOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws) {} + : Operator(op_def, ws), + functor_(OperatorBase::GetOptionalArg("wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 29c8d6af..e348ba1f 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -339,6 +339,11 @@ class OpsTestNet { return &op_defs_[op_defs_.size() - 1]; } + OperatorDef *AddNewOperatorDef() { + op_defs_.emplace_back(OperatorDef()); + return &op_defs_[op_defs_.size() - 1]; + } + Workspace *ws() { return &ws_; } bool Setup(DeviceType device) { @@ -630,15 +635,17 @@ template void BufferToImage(OpsTestNet *net, const std::string &input_name, const std::string &output_name, - const kernels::BufferType type) { + const kernels::BufferType type, + const int wino_block_size = 2) { MACE_CHECK_NOTNULL(net); OpDefBuilder("BufferToImage", "BufferToImageTest") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net->NewOperatorDef()); + .Input(input_name) + .Output(output_name) + .AddIntArg("buffer_type", type) + .AddIntArg("wino_block_size", wino_block_size) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net->NewOperatorDef()); // Run net->RunOp(D); @@ -650,15 +657,17 @@ template void ImageToBuffer(OpsTestNet *net, const std::string &input_name, const std::string &output_name, - const kernels::BufferType type) { + const kernels::BufferType type, + const int wino_block_size = 2) { MACE_CHECK_NOTNULL(net); OpDefBuilder("ImageToBuffer", "ImageToBufferTest") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net->NewOperatorDef()); + .Input(input_name) + .Output(output_name) + .AddIntArg("buffer_type", type) + .AddIntArg("wino_block_size", wino_block_size) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net->NewOperatorDef()); // Run net->RunOp(D); diff --git a/mace/ops/winograd_convolution_benchmark.cc b/mace/ops/winograd_convolution_benchmark.cc new file mode 100644 index 00000000..5bb15603 --- /dev/null +++ b/mace/ops/winograd_convolution_benchmark.cc @@ -0,0 +1,134 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +namespace ops { +namespace test { + +namespace { +template +void BMWinogradConvolution( + int iters, int batch, int height, int width, + int in_channels, int out_channels, int block_size) { + mace::testing::StopTiming(); + OpsTestNet net; + net.AddRandomInput("Input", {batch, height, width, in_channels}); + + net.AddRandomInput("Filter", {out_channels, in_channels, 3, 3}); + net.AddRandomInput("Bias", {out_channels}); + + BufferToImage(&net, "Input", "InputImage", + kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(&net, "Filter", "FilterImage", + kernels::BufferType::CONV2D_FILTER); + BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + // Winograd convolution + // transform filter + BufferToImage(&net, "Filter", "WinoFilter", + kernels::BufferType::WINOGRAD_FILTER, block_size); + // transform input + OpDefBuilder("WinogradTransform", "WinogradTransformTest") + .Input("InputImage") + .Output("WinoInput") + .AddIntArg("padding", Padding::SAME) + .AddIntArg("wino_block_size", block_size) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.AddNewOperatorDef()); + + // MatMul + OpDefBuilder("MatMul", "MatMulTest") + .Input("WinoFilter") + .Input("WinoInput") + .Output("WinoGemm") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.AddNewOperatorDef()); + + // Inverse transform + OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") + .Input("WinoGemm") + .Input("BiasImage") + .AddIntArg("batch", batch) + .AddIntArg("height", height) + .AddIntArg("width", width) + .AddIntArg("wino_block_size", block_size) + .Output("OutputImage") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.AddNewOperatorDef()); + net.Setup(D); + // Warm-up + for (int i = 0; i < 5; ++i) { + net.Run(); + } + net.Sync(); + mace::testing::StartTiming(); + while (iters--) { + net.Run(); + } + net.Sync(); +} +} // namespace + +#define MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, TYPE, DEVICE) \ + static void MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##\ + TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * IC * H * W; \ + const int64_t macc = \ + static_cast(iters) * N * OC * H * W * (3 * 3 * IC + 1); \ + mace::testing::MaccProcessed(macc); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMWinogradConvolution(iters, N, H, W, IC, OC, M); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_WINOGRAD_CONV_##N##_##H##_##W##_##IC##_##OC##_##M##_##TYPE##_##DEVICE) + +#define MACE_BM_WINOGRAD_CONV(N, H, W, IC, OC, M) \ + MACE_BM_WINOGRAD_CONV_MACRO(N, H, W, IC, OC, M, half, GPU); + + +MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 2); +MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 2); +MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 2); +MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 4); +MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 4); +MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 4); +MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 6); +MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 6); +MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 6); + +MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 2); +MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 4); +MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 6); + +MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 2); +MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 4); +MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 6); + +MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 2); +MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 4); +MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 6); + +MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 2); +MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 4); +MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 6); + +} // namespace test +} // namespace ops +} // namespace mace diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index 68890a91..ae87a818 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -25,27 +25,6 @@ namespace test { class WinogradConvlutionTest : public OpsTestBase {}; namespace { -void TransposeFilter(const std::vector &input, - const std::vector &input_shape, - std::vector *output) { - MACE_CHECK_NOTNULL(output); - output->resize(input.size()); - - const float *input_ptr = input.data(); - for (index_t h = 0; h < input_shape[0]; ++h) { - for (index_t w = 0; w < input_shape[1]; ++w) { - for (index_t oc = 0; oc < input_shape[2]; ++oc) { - for (index_t ic = 0; ic < input_shape[3]; ++ic) { - int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) * - input_shape[1] + - w; - (*output)[offset] = *input_ptr; - ++input_ptr; - } - } - } - } -} template void WinogradConvolution(const index_t batch, @@ -53,7 +32,8 @@ void WinogradConvolution(const index_t batch, const index_t width, const index_t in_channels, const index_t out_channels, - const Padding padding) { + const Padding padding, + const int block_size) { // srand(time(NULL)); // Construct graph @@ -91,13 +71,13 @@ void WinogradConvolution(const index_t batch, // Winograd convolution // transform filter BufferToImage(&net, "Filter", "WinoFilter", - kernels::BufferType::WINOGRAD_FILTER); - + kernels::BufferType::WINOGRAD_FILTER, block_size); // transform input OpDefBuilder("WinogradTransform", "WinogradTransformTest") .Input("InputImage") .Output("WinoInput") .AddIntArg("padding", padding) + .AddIntArg("wino_block_size", block_size) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); @@ -121,6 +101,7 @@ void WinogradConvolution(const index_t batch, .AddIntArg("batch", batch) .AddIntArg("height", output_shape[1]) .AddIntArg("width", output_shape[2]) + .AddIntArg("wino_block_size", block_size) .Output("WinoOutputImage") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); @@ -139,22 +120,67 @@ void WinogradConvolution(const index_t batch, } } // namespace -TEST_F(WinogradConvlutionTest, AlignedConvolution) { - WinogradConvolution(1, 32, 32, 32, 16, - Padding::VALID); - WinogradConvolution(1, 32, 32, 32, 16, Padding::SAME); +TEST_F(WinogradConvlutionTest, AlignedConvolutionM2) { + WinogradConvolution(1, 32, 32, 3, 3, + Padding::VALID, 2); + WinogradConvolution(1, 32, 32, 3, 3, + Padding::SAME, 2); +} + +TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2) { + WinogradConvolution(1, 61, 67, 31, 37, + Padding::VALID, 2); + WinogradConvolution(1, 61, 67, 37, 31, + Padding::SAME, 2); +} + +TEST_F(WinogradConvlutionTest, BatchConvolutionM2) { + WinogradConvolution(3, 64, 64, 32, 32, + Padding::VALID, 2); + WinogradConvolution(5, 61, 67, 37, 31, + Padding::SAME, 2); +} + +TEST_F(WinogradConvlutionTest, AlignedConvolutionM6) { + WinogradConvolution(1, 32, 32, 3, 3, + Padding::VALID, 6); + WinogradConvolution(1, 32, 32, 3, 3, + Padding::SAME, 6); +} + +TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6) { + WinogradConvolution(1, 61, 67, 31, 37, + Padding::VALID, 6); + WinogradConvolution(1, 61, 67, 37, 31, + Padding::SAME, 6); +} + +TEST_F(WinogradConvlutionTest, BatchConvolutionM6) { + WinogradConvolution(3, 64, 64, 32, 32, + Padding::VALID, 6); + WinogradConvolution(5, 61, 67, 37, 31, + Padding::SAME, 6); +} + +TEST_F(WinogradConvlutionTest, AlignedConvolutionM4) { + WinogradConvolution(1, 32, 32, 3, 3, + Padding::VALID, 4); + WinogradConvolution(1, 32, 32, 3, 3, + Padding::SAME, 4); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolution) { +TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4) { WinogradConvolution(1, 61, 67, 31, 37, - Padding::VALID); - WinogradConvolution(1, 61, 67, 37, 31, Padding::SAME); + Padding::VALID, 4); + WinogradConvolution(1, 61, 67, 37, 31, + Padding::SAME, 4); } -TEST_F(WinogradConvlutionTest, BatchConvolution) { +TEST_F(WinogradConvlutionTest, BatchConvolutionM4) { WinogradConvolution(3, 64, 64, 32, 32, - Padding::VALID); - WinogradConvolution(5, 61, 67, 37, 31, Padding::SAME); + Padding::VALID, 4); + WinogradConvolution(5, 61, 67, 37, 31, + Padding::SAME, 4); } namespace { @@ -164,7 +190,8 @@ void WinogradConvolutionWithPad(const index_t batch, const index_t width, const index_t in_channels, const index_t out_channels, - const int padding) { + const int padding, + const int block_size) { // srand(time(NULL)); // Construct graph @@ -202,14 +229,14 @@ void WinogradConvolutionWithPad(const index_t batch, // Winograd convolution // transform filter BufferToImage(&net, "Filter", "WinoFilter", - kernels::BufferType::WINOGRAD_FILTER); - + kernels::BufferType::WINOGRAD_FILTER, block_size); // transform input OpDefBuilder("WinogradTransform", "WinogradTransformTest") .Input("InputImage") .Output("WinoInput") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .AddIntsArg("padding_values", {padding, padding}) + .AddIntArg("wino_block_size", block_size) .Finalize(net.NewOperatorDef()); // Run on opencl @@ -232,6 +259,7 @@ void WinogradConvolutionWithPad(const index_t batch, .AddIntArg("batch", batch) .AddIntArg("height", output_shape[1]) .AddIntArg("width", output_shape[2]) + .AddIntArg("wino_block_size", block_size) .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Output("WinoOutputImage") .Finalize(net.NewOperatorDef()); @@ -250,19 +278,67 @@ void WinogradConvolutionWithPad(const index_t batch, } } // namespace -TEST_F(WinogradConvlutionTest, AlignedConvolutionWithPad) { - WinogradConvolutionWithPad(1, 32, 32, 32, 16, 1); - WinogradConvolutionWithPad(1, 32, 32, 32, 16, 2); +TEST_F(WinogradConvlutionTest, AlignedConvolutionM2WithPad) { + WinogradConvolutionWithPad(1, 32, 32, 32, 16, + 1, 2); + WinogradConvolutionWithPad(1, 32, 32, 32, 16, + 2, 2); +} + +TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2WithPad) { + WinogradConvolutionWithPad(1, 61, 67, 31, 37, + 1, 2); + WinogradConvolutionWithPad(1, 61, 67, 37, 31, + 2, 2); +} + +TEST_F(WinogradConvlutionTest, BatchConvolutionWithM2Pad) { + WinogradConvolutionWithPad(3, 64, 64, 32, 32, + 1, 2); + WinogradConvolutionWithPad(5, 61, 67, 37, 31, + 2, 2); +} + +TEST_F(WinogradConvlutionTest, AlignedConvolutionM6WithPad) { + WinogradConvolutionWithPad(1, 32, 32, 32, 16, + 1, 6); + WinogradConvolutionWithPad(1, 32, 32, 32, 16, + 2, 6); +} + +TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6WithPad) { + WinogradConvolutionWithPad(1, 61, 67, 31, 37, + 1, 6); + WinogradConvolutionWithPad(1, 61, 67, 37, 31, + 2, 6); +} + +TEST_F(WinogradConvlutionTest, BatchConvolutionWithM6Pad) { + WinogradConvolutionWithPad(3, 64, 64, 32, 32, + 1, 6); +// WinogradConvolutionWithPad(5, 61, 67, 37, 31, +// 2, 6); +} + +TEST_F(WinogradConvlutionTest, AlignedConvolutionM4WithPad) { + WinogradConvolutionWithPad(1, 32, 32, 32, 16, + 1, 4); + WinogradConvolutionWithPad(1, 32, 32, 32, 16, + 2, 4); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolutionWithPad) { - WinogradConvolutionWithPad(1, 61, 67, 31, 37, 1); - WinogradConvolutionWithPad(1, 61, 67, 37, 31, 2); +TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4WithPad) { + WinogradConvolutionWithPad(1, 61, 67, 31, 37, + 1, 4); + WinogradConvolutionWithPad(1, 61, 67, 37, 31, + 2, 4); } -TEST_F(WinogradConvlutionTest, BatchConvolutionWithPad) { - WinogradConvolutionWithPad(3, 64, 64, 32, 32, 1); - WinogradConvolutionWithPad(5, 61, 67, 37, 31, 2); +TEST_F(WinogradConvlutionTest, BatchConvolutionWithM4Pad) { + WinogradConvolutionWithPad(3, 64, 64, 32, 32, + 1, 4); + WinogradConvolutionWithPad(5, 61, 67, 37, 31, + 2, 4); } } // namespace test diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h index d08ce0f6..2dfa2f50 100644 --- a/mace/ops/winograd_inverse_transform.h +++ b/mace/ops/winograd_inverse_transform.h @@ -36,7 +36,8 @@ class WinogradInverseTransformOp : public Operator { kernels::StringToActivationType( OperatorBase::GetOptionalArg("activation", "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} + OperatorBase::GetOptionalArg("max_limit", 0.0f), + OperatorBase::GetOptionalArg("wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input_tensor = this->Input(INPUT); diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h index 90bb5501..db874287 100644 --- a/mace/ops/winograd_transform.h +++ b/mace/ops/winograd_transform.h @@ -30,7 +30,9 @@ class WinogradTransformOp : public Operator { : Operator(op_def, ws), functor_(static_cast(OperatorBase::GetOptionalArg( "padding", static_cast(VALID))), - OperatorBase::GetRepeatedArgs("padding_values")) {} + OperatorBase::GetRepeatedArgs("padding_values"), + OperatorBase::GetOptionalArg( + "wino_block_size", 2)) {} MaceStatus Run(StatsFuture *future) override { const Tensor *input_tensor = this->Input(INPUT); diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc index 658b0ea2..1f1bb2db 100644 --- a/mace/ops/winograd_transform_benchmark.cc +++ b/mace/ops/winograd_transform_benchmark.cc @@ -23,7 +23,7 @@ namespace test { namespace { template void BMWinogradTransform( - int iters, int batch, int height, int width, int channels) { + int iters, int batch, int height, int width, int channels, int block_size) { mace::testing::StopTiming(); OpsTestNet net; @@ -35,50 +35,60 @@ void BMWinogradTransform( .Input("InputImage") .Output("OutputImage") .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .AddIntArg("block_size", block_size) .Finalize(net.NewOperatorDef()); + net.Setup(D); // Warm-up for (int i = 0; i < 5; ++i) { - net.RunOp(D); + net.Run(); } net.Sync(); mace::testing::StartTiming(); while (iters--) { - net.RunOp(D); + net.Run(); } net.Sync(); } } // namespace -#define MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ - static void \ - MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ +#define MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \ + static void MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\ + DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::MaccProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BMWinogradTransform(iters, N, H, W, C); \ + BMWinogradTransform(iters, N, H, W, C, M); \ } \ MACE_BENCHMARK( \ - MACE_BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + MACE_BM_WINO_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##DEVICE) -#define MACE_BM_WINOGRAD_TRANSFORM(N, H, W, C) \ - MACE_BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, GPU); +#define MACE_BM_WINO_TRANSFORM(N, H, W, C, M) \ + MACE_BM_WINO_TRANSFORM_MACRO(N, H, W, C, M, half, GPU); -MACE_BM_WINOGRAD_TRANSFORM(1, 16, 16, 128); -MACE_BM_WINOGRAD_TRANSFORM(1, 64, 64, 128); -MACE_BM_WINOGRAD_TRANSFORM(1, 128, 128, 128); +MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 2); +MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 2); +MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 2); +MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 4); +MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 4); +MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 4); +MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 6); +MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 6); +MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 6); namespace { template void BMWinogradInverseTransform( - int iters, int batch, int height, int width, int channels) { + int iters, int batch, int height, int width, int channels, int block_size) { mace::testing::StopTiming(); - index_t p = batch * ((height + 1) / 2) * ((width + 1) / 2); + index_t p = batch * ((height + block_size - 1) / block_size) * + ((width + block_size - 1) / block_size); OpsTestNet net; - net.AddRandomInput("Input", {16, channels, p, 1}); + net.AddRandomInput("Input", {(block_size + 2) * + (block_size + 2), channels, p, 1}); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_HEIGHT); @@ -87,42 +97,191 @@ void BMWinogradInverseTransform( .AddIntArg("batch", batch) .AddIntArg("height", height) .AddIntArg("width", width) + .AddIntArg("block_size", block_size) .Output("OutputImage") .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); + net.Setup(D); // Warm-up for (int i = 0; i < 5; ++i) { - net.RunOp(D); + net.Run(); } net.Sync(); mace::testing::StartTiming(); while (iters--) { - net.RunOp(D); + net.Run(); } net.Sync(); } } // namespace -#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ - static void \ - MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(\ - int iters) { \ - const int64_t tot = static_cast(iters) * N * C * H * W; \ - mace::testing::MaccProcessed(tot); \ +#define MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, TYPE, DEVICE) \ + static void \ + MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_\ + ##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMWinogradInverseTransform(iters, N, H, W, C, M); \ + } \ + MACE_BENCHMARK( \ + MACE_BM_WINO_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##M##_##TYPE##_##\ + DEVICE) + +#define MACE_BM_WINO_INVERSE_TRANSFORM(N, H, W, C, M) \ + MACE_BM_WINO_INVERSE_TRANSFORM_MACRO(N, H, W, C, M, half, GPU); + +MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 2); +MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 2); +MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 2); + +MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 4); +MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 4); +MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 4); + +MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 6); +MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 6); +MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 6); + + +namespace { +template +void WinoFilterBufferToImage(int iters, + int out_channel, int in_channel, + int height, int width, int wino_block_size) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + net.AddRandomInput("Input", + {out_channel, in_channel, height, width}); + + OpDefBuilder("BufferToImage", "BufferToImageTest") + .Input("Input") + .Output("Output") + .AddIntArg("buffer_type", kernels::BufferType::WINOGRAD_FILTER) + .AddIntArg("wino_block_size", wino_block_size) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Warm-up + net.Setup(D); + for (int i = 0; i < 5; ++i) { + net.Run(); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.Run(); + } + net.Sync(); +} +} // namespace + +#define MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, TYPE, DEVICE) \ + static void MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE(\ + int iters) { \ + const int64_t tot = static_cast(iters) * O * I * H * W; \ + mace::testing::MaccProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + WinoFilterBufferToImage(iters, O, I, H, W, M); \ + } \ + MACE_BENCHMARK(\ + MACE_BM_WINO_B2I_##O##_##I##_##H##_##W##_##M##_##TYPE##_##DEVICE) + +#define MACE_BM_WINO_B2I(O, I, H, W, M) \ + MACE_BM_WINO_B2I_MACRO(O, I, H, W, M, half, GPU); + +MACE_BM_WINO_B2I(16, 3, 3, 3, 2); +MACE_BM_WINO_B2I(16, 3, 3, 3, 4); +MACE_BM_WINO_B2I(16, 3, 3, 3, 6); + +MACE_BM_WINO_B2I(32, 3, 3, 3, 2); +MACE_BM_WINO_B2I(32, 3, 3, 3, 4); +MACE_BM_WINO_B2I(32, 3, 3, 3, 6); +MACE_BM_WINO_B2I(128, 3, 3, 3, 2); +MACE_BM_WINO_B2I(128, 3, 3, 3, 4); +MACE_BM_WINO_B2I(128, 3, 3, 3, 6); +MACE_BM_WINO_B2I(256, 3, 3, 3, 2); +MACE_BM_WINO_B2I(256, 3, 3, 3, 4); +MACE_BM_WINO_B2I(256, 3, 3, 3, 6); + + +namespace { +template +void WinoMatMulBenchmark( + int iters, int out_channels, int in_channels, + int height, int width, int block_size) { + mace::testing::StopTiming(); + + OpsTestNet net; + const int batch = (block_size + 2) * (block_size + 2); + const index_t round_h = (height + block_size - 1) / block_size; + const index_t round_w = (width + block_size - 1) / block_size; + const index_t out_width = round_h * round_w; + // Add input data + net.AddRandomInput("A", {batch, out_channels, in_channels, 1}); + net.AddRandomInput("B", {batch, in_channels, out_width, 1}); + + if (D == DeviceType::GPU) { + BufferToImage(&net, "A", "AImage", kernels::BufferType::IN_OUT_WIDTH); + BufferToImage(&net, "B", "BImage", + kernels::BufferType::IN_OUT_HEIGHT); + + OpDefBuilder("MatMul", "MatMulBM") + .Input("AImage") + .Input("BImage") + .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("MatMul", "MatMulBM") + .Input("A") + .Input("B") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + net.Setup(D); + // Warm-up + for (int i = 0; i < 5; ++i) { + net.Run(); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.Run(); + } + net.Sync(); +} +} // namespace + +#define MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, TYPE, DEVICE) \ + static void MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##\ + DEVICE(int iters) { \ + const int64_t macc = static_cast(iters) * OC * IC * H * W; \ + const int64_t tot = static_cast(iters) * OC * (IC * H + H * W); \ + mace::testing::MaccProcessed(macc); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - BMWinogradInverseTransform(iters, N, H, W, C); \ + WinoMatMulBenchmark(iters, OC, IC, H, W, M); \ } \ - MACE_BENCHMARK( \ - MACE_BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + MACE_BENCHMARK(\ + MACE_BM_WINO_MATMUL_##OC##_##IC##_##H##_##W##_##M##_##TYPE##_##DEVICE) -#define MACE_BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \ - MACE_BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, half, GPU); +#define MACE_BM_WINO_MATMUL(OC, IC, H, W, M) \ + MACE_BM_WINO_MATMUL_MACRO(OC, IC, H, W, M, half, GPU); -MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32); -MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32); -MACE_BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32); +MACE_BM_WINO_MATMUL(16, 3, 128, 128, 2); +MACE_BM_WINO_MATMUL(16, 3, 128, 128, 4); +MACE_BM_WINO_MATMUL(16, 3, 128, 128, 6); +MACE_BM_WINO_MATMUL(32, 3, 256, 256, 2); +MACE_BM_WINO_MATMUL(32, 3, 256, 256, 4); +MACE_BM_WINO_MATMUL(32, 3, 256, 256, 6); } // namespace test } // namespace ops diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 0a56239e..1d81fe2f 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -140,6 +140,7 @@ class MaceKeyword(object): mace_winograd_filter_transformed = 'is_filter_transformed' mace_device = 'device' mace_value_str = 'value' + mace_wino_block_size = 'wino_block_size' class TransformerRule(Enum): diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 008281a5..3116cdc1 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -31,6 +31,7 @@ from mace.python.tools.converter_tool.base_converter import TransformerRule from mace.python.tools.convert_util import mace_check OPENCL_IMAGE_MAX_SIZE = 16384 +DEFAULT_GPU_WINO_BLK_SIZE = 4 class OpenCLBufferType(enum.Enum): @@ -111,6 +112,7 @@ class Transformer(base_converter.ConverterInterface): self._option = option self._model = model + self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE self._ops = {} self._consts = {} @@ -482,21 +484,36 @@ class Transformer(base_converter.ConverterInterface): if filter_height != 3 or filter_width != 3 or strides[0] > 1 \ or strides[1] > 1 or dilations[0] > 1 or dilations[1] > 1: return False - width = batch * ((out_height + 1) / 2) * ((out_width + 1) / 2) - return (16 * in_channels < OPENCL_IMAGE_MAX_SIZE) and \ - (16 * out_channels < OPENCL_IMAGE_MAX_SIZE) and \ - (width < OPENCL_IMAGE_MAX_SIZE) + self._gpu_wino_blk = DEFAULT_GPU_WINO_BLK_SIZE + block_size = self._gpu_wino_blk + blk_sqr = (block_size + 2) * (block_size + 2) + width =\ + batch * ((out_height + block_size - 1) / block_size) *\ + ((out_width + block_size - 1) / block_size) + if blk_sqr * in_channels > OPENCL_IMAGE_MAX_SIZE \ + or blk_sqr * out_channels > OPENCL_IMAGE_MAX_SIZE \ + or width > OPENCL_IMAGE_MAX_SIZE: + self._gpu_wino_blk = 2 + block_size = self._gpu_wino_blk + blk_sqr = (block_size + 2) * (block_size + 2) + width = \ + batch * ((out_height + block_size - 1) / block_size) * \ + ((out_width + block_size - 1) / block_size) + return (blk_sqr * in_channels <= OPENCL_IMAGE_MAX_SIZE) and \ + (blk_sqr * out_channels <= OPENCL_IMAGE_MAX_SIZE) and \ + (width <= OPENCL_IMAGE_MAX_SIZE) def transform_gpu_winograd(self): """Only gpu needs winograd transform.""" net = self._model filter_format = self.filter_format() - if self._option.device == DeviceType.GPU.value: for op in net.op: if op.type == MaceOp.Conv2D.name \ and self.check_if_gpu_use_winograd_conv(op): print("Transform gpu winograd %s(%s)" % (op.name, op.type)) + block_size = self._gpu_wino_blk + blk_sqr = (block_size + 2) * (block_size + 2) output_shape = op.output_shape[0].dims filter = self._consts[op.input[1]] filter_shape = filter.dims @@ -515,10 +532,15 @@ class Transformer(base_converter.ConverterInterface): wt_op.input.extend([op.input[0]]) wt_op.output.extend([wt_op.name]) wt_output_shape = wt_op.output_shape.add() - wt_output_width = batch * ( - (out_height + 1) / 2) * ((out_width + 1) / 2) + wt_output_width =\ + batch * ((out_height + block_size - 1) / block_size) *\ + ((out_width + block_size - 1) / block_size) wt_output_shape.dims.extend( - [16, in_channels, wt_output_width]) + [blk_sqr, in_channels, wt_output_width]) + + blk_size_arg = wt_op.arg.add() + blk_size_arg.name = MaceKeyword.mace_wino_block_size + blk_size_arg.i = block_size if ConverterUtil.get_arg(op, MaceKeyword.mace_padding_str) \ @@ -543,7 +565,7 @@ class Transformer(base_converter.ConverterInterface): matmul_op.output.extend([matmul_op.name]) matmul_output_shape = matmul_op.output_shape.add() matmul_output_shape.dims.extend( - [16, out_channels, wt_output_width]) + [blk_sqr, out_channels, wt_output_width]) arg = matmul_op.arg.add() arg.name = MaceKeyword.mace_winograd_filter_transformed @@ -570,6 +592,9 @@ class Transformer(base_converter.ConverterInterface): width_arg = iwt_op.arg.add() width_arg.name = 'width' width_arg.i = out_width + blk_size_arg = iwt_op.arg.add() + blk_size_arg.name = MaceKeyword.mace_wino_block_size + blk_size_arg.i = block_size ConverterUtil.add_data_format_arg(iwt_op, data_format) filter_data = np.array(filter.float_data).reshape( @@ -872,6 +897,13 @@ class Transformer(base_converter.ConverterInterface): arg.name = MaceKeyword.mace_mode arg.i = 0 + if input_type == OpenCLBufferType.WINOGRAD_FILTER: + blk_sqr = op.output_shape[0].dims[0] + wino_blk = int(np.sqrt(blk_sqr)) - 2 + wino_arg = op_def.arg.add() + wino_arg.name = MaceKeyword.mace_wino_block_size + wino_arg.i = wino_blk + op.input[input_idx] = output_name def transform_buffer_image(self): @@ -1002,8 +1034,8 @@ class Transformer(base_converter.ConverterInterface): def transform_global_conv_to_fc(self): """Transform global conv to fc should be placed after transposing input/output and filter""" - if self._option.device == DeviceType.GPU.value: - return False + # if self._option.device == DeviceType.GPU.value: + # return False net = self._model for op in net.op: -- GitLab