diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 7554fbba99a176071ba7815ef3db1a08a8efaf92..a0296d8722dbf09a5d55d707308de82b06383321 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -77,6 +77,9 @@ extern void Register_Pooling(OperatorRegistry *op_registry); extern void Register_ResizeBilinear(OperatorRegistry *op_registry); extern void Register_Softmax(OperatorRegistry *op_registry); extern void Register_SpaceToBatchND(OperatorRegistry *op_registry); +extern void Register_MatMul(OperatorRegistry *op_registry); +extern void Register_WinogradTransform(OperatorRegistry *op_registry); +extern void Register_WinogradInverseTransform(OperatorRegistry *op_registry); OperatorRegistry::OperatorRegistry() { Register_Activation(this); @@ -97,6 +100,9 @@ OperatorRegistry::OperatorRegistry() { Register_ResizeBilinear(this); Register_Softmax(this); Register_SpaceToBatchND(this); + Register_MatMul(this); + Register_WinogradTransform(this); + Register_WinogradInverseTransform(this); } } // namespace mace diff --git a/mace/core/registry.h b/mace/core/registry.h index 5c82ef2e4b60ce3645e6c5708a2b7442f9e8a85e..5a233bcd88815a12e533049dae552b4b93434d9c 100644 --- a/mace/core/registry.h +++ b/mace/core/registry.h @@ -19,7 +19,7 @@ class Registry { void Register(const SrcType &key, Creator creator) { VLOG(2) << "Registering: " << key; std::lock_guard lock(register_mutex_); - MACE_CHECK(registry_.count(key) == 0, "Key already registered."); + MACE_CHECK(registry_.count(key) == 0, "Key already registered: ", key); registry_[key] = creator; } diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..d88e3843888d786291e34ecbfaafdff5cbe5d788 --- /dev/null +++ b/mace/kernels/matmul.h @@ -0,0 +1,66 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_KERNELS_MATMUL_H_ +#define MACE_KERNELS_MATMUL_H_ + +#include "mace/core/future.h" +#include "mace/core/tensor.h" + +namespace mace { +namespace kernels { + + +template +struct MatMulFunctor { + void operator()(const Tensor *A, + const Tensor *B, + Tensor *C, + StatsFuture *future) { + + std::vector c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; + C->Resize(c_shape); + const index_t N = C->dim(0); + const index_t height = C->dim(1); + const index_t width = C->dim(2); + const index_t K = A->dim(2); + Tensor::MappingGuard guarda(A); + Tensor::MappingGuard guardb(B); + Tensor::MappingGuard guardc(C); + const T *a_ptr_base = A->data(); + const T *b_ptr_base = B->data(); + T *c_ptr = C->mutable_data(); + for (int i = 0; i < N; ++i) { + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + const T *a_ptr = a_ptr_base + h * K; + const T *b_ptr = b_ptr_base + w; + *c_ptr = 0; + for (int k = 0; k < K; ++k) { + *c_ptr += *a_ptr * *b_ptr; + a_ptr++; + b_ptr += width; + } + c_ptr++; + } + } + a_ptr_base += height * K; + b_ptr_base += K * width; + } + } +}; + + +template +struct MatMulFunctor { + void operator()(const Tensor *A, + const Tensor *B, + Tensor *C, + StatsFuture *future); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_MATMUL_H_ diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index 5575a0b1b70868e18a859131065ad4b498b27e43..17688e9cc3e0083bfae3a5fba1b3e384f2543be1 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -63,7 +63,7 @@ void ActivationFunctor::operator()(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 261efde071ee3b200c3a35290e685b43297ec956..c16741d62421252cc888fbda6ff1a9d497a10ec9 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -17,7 +17,6 @@ static void AddN(const std::vector &input_tensors, if (input_tensors.size() > 4) { MACE_NOT_IMPLEMENTED; } - output->ResizeLike(input_tensors[0]); const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -49,7 +48,7 @@ static void AddN(const std::vector &input_tensors, static_cast(width_pixels), static_cast(batch_height_pixels) }; - std::vector lws = {64, 16, 1}; + const std::vector lws = {64, 16, 1}; std::stringstream ss; ss << "addn_opencl_kernel_" << output->dim(0) << "_" @@ -82,7 +81,7 @@ void AddNFunctor::operator()( std::vector output_shape = input_tensors[0]->shape(); std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); output_tensor->ResizeImage(output_shape, output_image_shape); AddN(input_tensors, output_tensor, future); diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 02ab76a85eedcdeb735c69937a326522fcf6b273..5942dcfbfa5f7ed104286ed47107c5fbb0733f9a 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -83,7 +83,7 @@ void BatchNormFunctor::operator()(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index ae81d32fdd85a90c78d3a2a91265465d66d7a0e9..980354985ad06166bfc829c68c48a0e7c060a781 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -18,13 +18,21 @@ void BufferToImageFunctor::operator()(Tensor *buffer, std::vector image_shape; if (!i2b_) { CalImage2DShape(buffer->shape(), type, image_shape); - image->ResizeImage(buffer->shape(), image_shape); + if(type == WINOGRAD_FILTER) { + std::vector new_shape = + CalWinogradShape(buffer->shape(), type); + image->ResizeImage(new_shape, image_shape); + } else { + image->ResizeImage(buffer->shape(), image_shape); + } buffer->MarkUnused(); } else { image_shape = image->image_shape(); buffer->Resize(image->shape()); } + size_t gws[2] = {image_shape[0], + image_shape[1]}; string kernel_name; switch (type) { case CONV2D_FILTER: @@ -33,12 +41,23 @@ void BufferToImageFunctor::operator()(Tensor *buffer, case DW_CONV2D_FILTER: kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image"; break; - case IN_OUT: + case IN_OUT_CHANNEL: kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image"; break; case ARGUMENT: kernel_name = i2b_ ? "arg_image_to_buffer" : "arg_buffer_to_image"; break; + case IN_OUT_HEIGHT: + kernel_name = i2b_ ? "in_out_height_image_to_buffer" : "in_out_height_buffer_to_image"; + break; + case IN_OUT_WIDTH: + MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now"; + kernel_name = "in_out_width_buffer_to_image"; + break; + case WINOGRAD_FILTER: + gws[1] /= 16; + kernel_name = i2b_ ? "winograd_filter_image_to_buffer" : "winograd_filter_buffer_to_image"; + break; } string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; @@ -68,16 +87,13 @@ void BufferToImageFunctor::operator()(Tensor *buffer, } b2f_kernel.setArg(idx++, *(static_cast(image->buffer()))); - const size_t gws[3] = {image_shape[0], - image_shape[1], - 1}; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(b2f_kernel); - const std::vector lws = {16, 64, 1}; + const std::vector lws = {16, 64}; cl::Event event; cl_int error = runtime->command_queue().enqueueNDRangeKernel( b2f_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), + cl::NDRange(gws[0], gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index 2ac05209871215e82c7fd7c0861067627948a82d..f95029c0300a1e44a93036136e33e8f77c393bad 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -233,3 +233,212 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ vstore4(values, 0, output + offset); } } + + +__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc + __private const int height, + __private const int width, + __private const int channels, + __write_only image2d_t output) { + int w = get_global_id(0); + int h = get_global_id(1); + const int wc = width * channels; + const int height_blks = (height + 3) / 4; + const int batch_idx = h / height_blks; + const int height_idx = (h % height_blks) << 2; + const int width_idx = w % width; + const int channel_idx = w / width; + int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + + channel_idx; + + int size = height - height_idx; + size = size >= 4 ? 0 : size; + DATA_TYPE4 values = 0; + switch(size) { + case 0: + values.w = *(input + offset + wc * 3); + case 3: + values.z = *(input + offset + wc * 2); + case 2: + values.y = *(input + offset + wc); + case 1: + values.x = *(input + offset); + } + int2 coord = (int2)(w, h); + WRITE_IMAGET(output, coord, values); +} + +__kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc + __private const int height, + __private const int width, + __private const int channels, + __read_only image2d_t input) { + int w = get_global_id(0); + int h = get_global_id(1); + const int height_blks = (height + 3) / 4; + const int batch_idx = h / height_blks; + const int height_idx = (h % height_blks) << 2; + const int width_idx = w % width; + const int channel_idx = w / width; + int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + + channel_idx; + + int2 coord = (int2)(w, h); + DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord); + output[offset] = values.x; + if (height_idx + 1 >= height) return; + offset += width * channels; + output[offset] = values.y; + if (height_idx + 2 >= height) return; + offset += width * channels; + output[offset] = values.z; + if (height_idx + 3 >= height) return; + offset += width * channels; + output[offset] = values.w; +} + + +__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ + __private const int height, + __private const int width, + __private const int channels, + __write_only image2d_t output) { + int w = get_global_id(0); + int h = get_global_id(1); + const int batch_idx = h / height; + const int height_idx = h % height; + const int width_idx = (w % width) << 2; + const int channel_idx = w / width; + const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + + channel_idx; + + int size = width - width_idx; + size = size >= 4 ? 0 : size; + DATA_TYPE4 values = 0; + switch(size) { + case 0: + values.w = *(input + offset + channels * 3); + case 3: + values.z = *(input + offset + channels * 2); + case 2: + values.y = *(input + offset + channels); + case 1: + values.x = *(input + offset); + } + int2 coord = (int2)(w, h); + WRITE_IMAGET(output, coord, values); +} + +// only support 3x3 now +__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W + __private const int in_channels, + __private const int height, + __private const int width, + __write_only image2d_t output) { + int w = get_global_id(0); + int h = get_global_id(1); + const int out_channels = get_global_size(1); + const int out_channel_idx = h; + const int in_channel_idx = w << 2; + const int offset = (out_channel_idx * in_channels + in_channel_idx) * height * width; + const int length = min((in_channels - in_channel_idx) * 9, 36); + DATA_TYPE in[36] = {0}; + DATA_TYPE4 tt; + DATA_TYPE4 tu0[4], tu1[4], tu2[4], tu3[4]; + +#pragma unroll + for (short i = 0; i < length; ++i) { + in[i] = *(input + offset + i); + } + tt = ((DATA_TYPE4)(in[0], in[9], in[18], in[27]) + + (DATA_TYPE4)(in[6], in[15], in[24], in[33])) / 2; + tu1[0] = tt + ((DATA_TYPE4)(in[3], in[12], in[21], in[30]) / 2); + tu2[0] = tt - ((DATA_TYPE4)(in[3], in[12], in[21], in[30]) / 2); + tt = ((DATA_TYPE4)(in[1], in[10], in[19], in[28]) + + (DATA_TYPE4)(in[7], in[16], in[25], in[34])) / 2; + tu1[1] = tt + ((DATA_TYPE4)(in[4], in[13], in[22], in[31]) / 2); + tu2[1] = tt - ((DATA_TYPE4)(in[4], in[13], in[22], in[31]) / 2); + tt = ((DATA_TYPE4)(in[2], in[11], in[20], in[29]) + + (DATA_TYPE4)(in[8], in[17], in[26], in[35])) / 2; + tu1[2] = tt + ((DATA_TYPE4)(in[5], in[14], in[23], in[32]) / 2); + tu2[2] = tt - ((DATA_TYPE4)(in[5], in[14], in[23], in[32]) / 2); + tu0[0] = (DATA_TYPE4)(in[0], in[9], in[18], in[27]); + tu0[1] = (DATA_TYPE4)(in[1], in[10], in[19], in[28]); + tu0[2] = (DATA_TYPE4)(in[2], in[11], in[20], in[29]); + tu3[0] = (DATA_TYPE4)(in[6], in[15], in[24], in[33]); + tu3[1] = (DATA_TYPE4)(in[7], in[16], in[25], in[34]); + tu3[2] = (DATA_TYPE4)(in[8], in[17], in[26], in[35]); + + tt = (tu0[0] + tu0[2]) / 2; + tu0[3] = tu0[2]; + tu0[2] = tt - tu0[1] / 2; + tu0[1] = tt + tu0[1] / 2; + tt = (tu1[0] + tu1[2]) / 2; + tu1[3] = tu1[2]; + tu1[2] = tt - tu1[1] / 2; + tu1[1] = tt + tu1[1] / 2; + tt = (tu2[0] + tu2[2]) / 2; + tu2[3] = tu2[2]; + tu2[2] = tt - tu2[1] / 2; + tu2[1] = tt + tu2[1] / 2; + tt = (tu3[0] + tu3[2]) / 2; + tu3[3] = tu3[2]; + tu3[2] = tt - tu3[1] / 2; + tu3[1] = tt + tu3[1] / 2; + + int2 coord = (int2)(w, h); +#pragma unroll + for (short i = 0; i < 4; ++i) { + WRITE_IMAGET(output, coord, tu0[i]); + coord.y += out_channels; + } +#pragma unroll + for (short i = 0; i < 4; ++i) { + WRITE_IMAGET(output, coord, tu1[i]); + coord.y += out_channels; + } +#pragma unroll + for (short i = 0; i < 4; ++i) { + WRITE_IMAGET(output, coord, tu2[i]); + coord.y += out_channels; + } +#pragma unroll + for (short i = 0; i < 4; ++i) { + WRITE_IMAGET(output, coord, tu3[i]); + coord.y += out_channels; + } +} + +// only support 3x3 now +__kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc, Ic, H, W + __private const int height, + __private const int width, + __private const int channel, + __read_only image2d_t input) { + const int w = get_global_id(0); + const int h = get_global_id(1); + const int width_idx = w << 2; + const int size = width - width_idx; + int offset = h * width + width_idx; + + int2 coord = (int2)(w, h); + DATA_TYPE4 values; + for (short i = 0; i < 16; ++i) { + values = READ_IMAGET(input, SAMPLER, coord); + if (size < 4) { + switch (size) { + case 3: + output[offset+2] = values.z; + case 2: + output[offset+1] = values.y; + case 1: + output[offset] = values.x; + } + } else { + vstore4(values, 0, output + offset); + } + + coord.y += height; + offset += height * width; + } +} diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl new file mode 100644 index 0000000000000000000000000000000000000000..cb71f21d1c78951b8c8de3c17e252cad2394dd3d --- /dev/null +++ b/mace/kernels/opencl/cl/matmul.cl @@ -0,0 +1,50 @@ +#include + +// C = A * B +__kernel void matmul(__read_only image2d_t A, + __read_only image2d_t B, + __write_only image2d_t C, + __private const int M, + __private const int N, + __private const int K, + __private const int height_blocks, + __private const int k_blocks) { + const int gx = get_global_id(0) << 2; + const int hb = get_global_id(1); + const int batch = hb / height_blocks; + const int ty = (hb % height_blocks); + const int gy = mad24(batch, height_blocks, ty); + const int bm = mad24(batch, M, ty << 2); + const int bk = mul24(batch, k_blocks); + + float4 a0, a1, a2, a3; + float4 b0, b1, b2, b3; + float4 c0 = 0, c1 = 0, c2 = 0, c3 = 0; + + for (short pos = 0; pos < k_blocks; pos += 1) { + a0 = READ_IMAGET(A, SAMPLER, (int2)(pos, (bm))); + a1 = READ_IMAGET(A, SAMPLER, (int2)(pos, (bm + 1))); + a2 = READ_IMAGET(A, SAMPLER, (int2)(pos, (bm + 2))); + a3 = READ_IMAGET(A, SAMPLER, (int2)(pos, (bm + 3))); + + b0 = READ_IMAGET(B, SAMPLER, (int2)(gx, (bk + pos))); + b1 = READ_IMAGET(B, SAMPLER, (int2)(gx + 1, (bk + pos))); + b2 = READ_IMAGET(B, SAMPLER, (int2)(gx + 2, (bk + pos))); + b3 = READ_IMAGET(B, SAMPLER, (int2)(gx + 3, (bk + pos))); + + c0 += (DATA_TYPE4)(dot(a0, b0), dot(a1, b0), dot(a2, b0), dot(a3, b0)); + + c1 += (DATA_TYPE4)(dot(a0, b1), dot(a1, b1), dot(a2, b1), dot(a3, b1)); + + c2 += (DATA_TYPE4)(dot(a0, b2), dot(a1, b2), dot(a2, b2), dot(a3, b2)); + + c3 += (DATA_TYPE4)(dot(a0, b3), dot(a1, b3), dot(a2, b3), dot(a3, b3)); + } + WRITE_IMAGET(C, (int2)(gx, gy), c0); + if ((gx + 1) >= N) return; + WRITE_IMAGET(C, (int2)(gx + 1, gy), c1); + if ((gx + 2) >= N) return; + WRITE_IMAGET(C, (int2)(gx + 2, gy), c2); + if ((gx + 3) >= N) return; + WRITE_IMAGET(C, (int2)(gx + 3, gy), c3); +} diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl new file mode 100644 index 0000000000000000000000000000000000000000..daecd39f9d5e4e45f835166e86b1daba8428574b --- /dev/null +++ b/mace/kernels/opencl/cl/winograd_transform.cl @@ -0,0 +1,210 @@ +#include + +__kernel void winograd_transform_2x2(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int in_channel, + __private const int round_hw, + __private const int round_w, + __private const int padding_top, + __private const int padding_left) { + int out_width_idx = get_global_id(0); + int chan_blk_idx = get_global_id(1); + const int chan_blk_size = get_global_size(1); + + const int batch_idx = out_width_idx / round_hw; + const int t_idx = out_width_idx % round_hw; + const int height_idx = ((t_idx / round_w) << 1) - padding_top; + const int width_idx = ((t_idx % round_w) << 1) - padding_left; + + const int nh_idx = mad24(batch_idx, in_height, height_idx); + const int wc_idx = mad24(chan_blk_idx, in_width, width_idx); + + DATA_TYPE4 input0[4]; + DATA_TYPE4 input1[4]; + DATA_TYPE4 input2[4]; + DATA_TYPE4 input3[4]; + + DATA_TYPE4 tv0[4]; + DATA_TYPE4 tv1[4]; + DATA_TYPE4 tv2[4]; + DATA_TYPE4 tv3[4]; + + int y = select(nh_idx, -1, height_idx < 0 || height_idx >= in_height); +#pragma unroll + for (short i = 0; i < 4; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + input0[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + y = select(nh_idx + 1, -1, height_idx + 1 < 0 || height_idx + 1 >= in_height); +#pragma unroll + for (short i = 0; i < 4; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + input1[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + y = select(nh_idx + 2, -1, height_idx + 2 < 0 || height_idx + 2 >= in_height); +#pragma unroll + for (short i = 0; i < 4; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + input2[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + y = select(nh_idx + 3, -1, height_idx + 3 < 0 || height_idx + 3 >= in_height); +#pragma unroll + for (short i = 0; i < 4; ++i) { + int x = width_idx + i; + x = select(wc_idx + i, -1, x < 0 || x >= in_width); + input3[i] = READ_IMAGET(input, SAMPLER, (int2)(x, y)); + } + +#pragma unroll + for (short i = 0; i < 4; ++i) { + tv0[i] = input0[i] - input2[i]; + tv1[i] = input1[i] + input2[i]; + tv2[i] = input2[i] - input1[i]; + tv3[i] = input1[i] - input3[i]; + } + input0[0] = tv0[0] - tv0[2]; + input0[1] = tv0[1] + tv0[2]; + input0[2] = tv0[2] - tv0[1]; + input0[3] = tv0[1] - tv0[3]; + input1[0] = tv1[0] - tv1[2]; + input1[1] = tv1[1] + tv1[2]; + input1[2] = tv1[2] - tv1[1]; + input1[3] = tv1[1] - tv1[3]; + input2[0] = tv2[0] - tv2[2]; + input2[1] = tv2[1] + tv2[2]; + input2[2] = tv2[2] - tv2[1]; + input2[3] = tv2[1] - tv2[3]; + input3[0] = tv3[0] - tv3[2]; + input3[1] = tv3[1] + tv3[2]; + input3[2] = tv3[2] - tv3[1]; + input3[3] = tv3[1] - tv3[3]; + +#pragma unroll + for (short i = 0; i < 4; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input0[i]); + chan_blk_idx += chan_blk_size; + } +#pragma unroll + for (short i = 0; i < 4; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input1[i]); + chan_blk_idx += chan_blk_size; + } +#pragma unroll + for (short i = 0; i < 4; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input2[i]); + chan_blk_idx += chan_blk_size; + } +#pragma unroll + for (short i = 0; i < 4; ++i) { + WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input3[i]); + chan_blk_idx += chan_blk_size; + } +} + +__kernel void winograd_inverse_transform_2x2(__read_only image2d_t input, +#ifdef BIAS + __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif + __write_only image2d_t output, + __private const int out_height, + __private const int out_width, + __private const int round_hw, + __private const int round_w, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha) { + const int width_idx = get_global_id(0); + const int height_idx = get_global_id(1); + const int out_channel = get_global_size(1); + int width = width_idx; + int height = height_idx; + + const int batch = width_idx / round_hw; + int t = width_idx % round_hw; + const int out_height_idx = (t / round_w) << 1; + const int out_width_idx = (t % round_w) << 1; + const int out_chan_idx = height_idx; + const int coord_x = mad24(out_chan_idx, out_width, out_width_idx); + const int coord_y = mad24(batch, out_height, out_height_idx); + +#ifdef BIAS + DATA_TYPE4 bias_value = + READ_IMAGET(bias, SAMPLER, (int2)(out_chan_idx, 0)); +#endif + + DATA_TYPE4 in0[4], in1[4], in2[4], in3[4]; + +#pragma unroll + for (short i = 0; i < 4; ++i) { + in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width, height)); + height += out_channel; + } +#pragma unroll + for (short i = 0; i < 4; ++i) { + in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, height)); + height += out_channel; + } +#pragma unroll + for (short i = 0; i < 4; ++i) { + in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, height)); + height += out_channel; + } +#pragma unroll + for (short i = 0; i < 4; ++i) { + in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, height)); + height += out_channel; + } + + in0[0] = in0[0] + in1[0] + in2[0]; + in0[1] = in0[1] + in1[1] + in2[1]; + in0[2] = in0[2] + in1[2] + in2[2]; + in0[3] = in0[3] + in1[3] + in2[3]; + + in0[0] = in0[0] + in0[1] + in0[2]; + in0[1] = in0[1] - in0[2] - in0[3]; + + in1[0] = in1[0] - in2[0] - in3[0]; + in1[1] = in1[1] - in2[1] - in3[1]; + in1[2] = in1[2] - in2[2] - in3[2]; + in1[3] = in1[3] - in2[3] - in3[3]; + + in1[0] = in1[0] + in1[1] + in1[2]; + in1[1] = in1[1] - in1[2] - in1[3]; + +#ifdef BIAS + in0[0] += bias_value; + in0[1] += bias_value; + in1[0] += bias_value; + in1[1] += bias_value; +#endif + + +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID) + in0[0] = do_activation(in0[0], relux_max_limit, prelu_alpha); + in0[1] = do_activation(in0[1], relux_max_limit, prelu_alpha); + in1[0] = do_activation(in1[0], relux_max_limit, prelu_alpha); + in1[1] = do_activation(in1[1], relux_max_limit, prelu_alpha); +#endif + + WRITE_IMAGET(output, (int2)(coord_x, coord_y), in0[0]); + + t = 0; + if (out_width_idx + 1 < out_width) { + WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y), in0[1]); + t += 1; + } + if (out_height_idx + 1 < out_height) { + WRITE_IMAGET(output, (int2)(coord_x, coord_y + 1), in1[0]); + t += 1; + } + if (t == 2) { + WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y + 1), in1[1]); + } + + + +} diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index b47a096efd2d2472e50b510e722e7142740fb332..48f14bea8f313e19d483fd3c1b6e57f483382413 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -50,7 +50,7 @@ static void Concat2(const Tensor *input0, static_cast(width), static_cast(batch * height), }; - std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << "concat_opencl_kernel_" << output->dim(0) << "_" @@ -85,7 +85,7 @@ void ConcatFunctor::operator()(const std::vectordim(axis_); } std::vector image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, image_shape); + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); output->ResizeImage(output_shape, image_shape); switch (inputs_count) { diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 072a0abf0ad16ad3ec7e09d57c7ce90b268cc33b..03883d3a6fac483298fcfbc15d39564ad5ae8c06 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -109,7 +109,7 @@ void Conv2dFunctor::operator()(const Tensor *input, paddings_, output_shape.data(), paddings.data()); std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); output->ResizeImage(output_shape, output_image_shape); if (kernel_h == kernel_w && kernel_h <= 5 && diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index d62fdf56535372d7fa98da2dad16395656c078bb..d96f80ceea72699b4e762ae6cacb48dc5c3e4eb8 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - std::vector lws = {8, 15, 8, 1}; + const std::vector lws = {8, 15, 8, 1}; std::string tuning_key = Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 3875403862fd97e58f6e6279e0d4e6a92ab9a96c..853dfe0a008080f60dcbdfca6c1bb1bcc0534f27 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - std::vector lws = {4, 15, 8, 1}; + const std::vector lws = {4, 15, 8, 1}; std::string tuning_key = Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 264add718bdf2fc48bc11ac3b0f00a5cebbe07bd..1bff90f65ba8e08a46cf1f283e969f26a4e72a19 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -97,7 +97,7 @@ void Conv2dOpencl(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index 4397b50814fc27d398793ca03108c8b685d6ea0a..ef4f3b4f40740cf1e25bd55b1b57ae4508a532c4 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -106,7 +106,7 @@ void DepthwiseConv2d(const Tensor *input, // NHWC const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, batch, height, width, channels, multiplier); TuningOrRun3DKernel(dw_conv2d_kernel, tuning_key, gws, lws, future); @@ -150,7 +150,7 @@ void DepthwiseConv2dFunctor::operator()( padding_, output_shape.data(), paddings.data()); std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); output->ResizeImage(output_shape, output_image_shape); DepthwiseConv2d(input, filter, bias, strides_[0], paddings.data(), dilations_, diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 56c157a6228421b787dfe580081922571f3a2cac..84e102e08b3f3d78e18a07a85809b59df5b49021 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -45,6 +45,34 @@ void CalArgImageShape(const std::vector &shape, image_shape[1] = 1; } +// Only support 3x3 now +// [ (Ic + 3) / 4, 16 * Oc] +void CalWinogradFilterImageShape(const std::vector &shape, /* Oc, Ic, H, W*/ + std::vector &image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape.resize(2); + image_shape[0] = RoundUpDiv4(shape[1]); + image_shape[1] = (shape[0] << 4); +} + +// [W * C, N * RoundUp<4>(H)] +void CalInOutHeightImageShape(const std::vector &shape, /* NHWC */ + std::vector &image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape.resize(2); + image_shape[0] = shape[2] * shape[3]; + image_shape[1] = shape[0] * RoundUpDiv4(shape[1]); +} + +// [RoundUp<4>(W) * C, N * H] +void CalInOutWidthImageShape(const std::vector &shape, /* NHWC */ + std::vector &image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape.resize(2); + image_shape[0] = RoundUpDiv4(shape[2]) * shape[3]; + image_shape[1] = shape[0] * shape[1]; +} + void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, std::vector &image_shape) { @@ -55,13 +83,39 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ case DW_CONV2D_FILTER: CalDepthwiseConv2dFilterImageShape(shape, image_shape); break; - case IN_OUT: + case IN_OUT_CHANNEL: CalInOutputImageShape(shape, image_shape); break; case ARGUMENT: CalArgImageShape(shape, image_shape); break; - default:LOG(FATAL) << "Mace not supported yet."; + case IN_OUT_HEIGHT: + CalInOutHeightImageShape(shape, image_shape); + break; + case IN_OUT_WIDTH: + CalInOutWidthImageShape(shape, image_shape); + break; + case WINOGRAD_FILTER: + CalWinogradFilterImageShape(shape, image_shape); + break; + default: + LOG(FATAL) << "Mace not supported yet."; + } +} + + +std::vector CalWinogradShape(const std::vector &shape, + const BufferType type) { + if (type == WINOGRAD_FILTER) { + return {16, shape[0], shape[1], 1}; + }else if (type == IN_OUT_HEIGHT) { + index_t out_width = shape[0] * + ((shape[1] - 1) / 2) * + ((shape[2] - 1) / 2); + return {16, shape[3], out_width, 1}; + } else { + LOG(FATAL) << "Mace not supported yet."; + return std::vector(); } } @@ -104,7 +158,7 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { void TuningOrRun3DKernel(cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, - std::vector &lws, + const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); @@ -201,7 +255,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel, void TuningOrRun2DKernel(cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, - std::vector &lws, + const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index dc40514f095800d2e22aec4e3c27046a416f5a6b..8927859263c4e43310c073d4abcae36195bbd8c0 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -18,15 +18,21 @@ const float kMaxKernelExeTime = 1000.0; // microseconds enum BufferType { CONV2D_FILTER = 0, - DW_CONV2D_FILTER = 1, - IN_OUT = 2, - ARGUMENT = 3 + IN_OUT_CHANNEL = 1, + ARGUMENT = 2, + IN_OUT_HEIGHT = 3, + IN_OUT_WIDTH = 4, + WINOGRAD_FILTER = 5, + DW_CONV2D_FILTER = 6, }; void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, std::vector &image_shape); +std::vector CalWinogradShape(const std::vector &shape, + const BufferType type); + std::string DtToCLCMDDt(const DataType dt); std::string DtToUpstreamCLCMDDt(const DataType dt); @@ -38,14 +44,14 @@ std::string DtToUpstreamCLDt(const DataType dt); void TuningOrRun3DKernel(cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, - std::vector &lws, + const std::vector &lws, StatsFuture *future); void TuningOrRun2DKernel(cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, - std::vector &lws, + const std::vector &lws, StatsFuture *future); inline void SetFuture(StatsFuture *future, const cl::Event &event) { diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc new file mode 100644 index 0000000000000000000000000000000000000000..4406308eace8e424c2b5a2d6cf4f449d797f2ab2 --- /dev/null +++ b/mace/kernels/opencl/matmul.cc @@ -0,0 +1,75 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/kernels/matmul.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" + +namespace mace { +namespace kernels { + +template +void MatMulFunctor::operator()( + const Tensor *A, + const Tensor *B, + Tensor *C, + StatsFuture *future) { + + std::vector c_shape = {A->dim(0), A->dim(1), B->dim(2), 1}; + std::vector c_image_shape; + CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape); + C->ResizeImage(c_shape, c_image_shape); + + const index_t batch = C->dim(0); + const index_t height = C->dim(1); + const index_t width = C->dim(2); + + const index_t height_blocks = RoundUpDiv4(height); + const index_t width_blocks = RoundUpDiv4(width); + + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + auto dt = DataTypeToEnum::value; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); + built_options.emplace("-Dmatmul=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + auto matmul_kernel = runtime->BuildKernel("matmul", kernel_name, built_options); + + uint32_t idx = 0; + matmul_kernel.setArg(idx++, + *(static_cast(A->buffer()))); + matmul_kernel.setArg(idx++, + *(static_cast(B->buffer()))); + matmul_kernel.setArg(idx++, *(static_cast(C->buffer()))); + matmul_kernel.setArg(idx++, static_cast(height)); + matmul_kernel.setArg(idx++, static_cast(width)); + matmul_kernel.setArg(idx++, static_cast(A->dim(2))); + matmul_kernel.setArg(idx++, static_cast(height_blocks)); + matmul_kernel.setArg(idx++, static_cast(RoundUpDiv4(A->dim(2)))); + + const uint32_t gws[2] = { + static_cast(width_blocks), + static_cast(height_blocks * batch), + }; + const std::vector lws = {16, 64, 1}; + std::stringstream ss; + ss << "matmul_opencl_kernel_" + << C->dim(0) << "_" + << C->dim(1) << "_" + << C->dim(2) << "_" + << C->dim(3); + TuningOrRun2DKernel(matmul_kernel, ss.str(), gws, lws, future); + +}; + +template +struct MatMulFunctor; + +template +struct MatMulFunctor; + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index b147c15ad1e34def84560c4fd81da2988d1b8c89..248bf6a7a30953e6fce662f017859a97f6b44527 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -92,7 +92,7 @@ void PoolingFunctor::operator()(const Tensor *input, output_shape.data(), paddings.data()); std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); output->ResizeImage(output_shape, output_image_shape); Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_, diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index f8d3aed2a3cb232aafe54d9713dd8efd7635bddb..97550999b1a98b77abab858e7d8d21436802c4d8 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -28,7 +28,7 @@ void ResizeBilinearFunctor::operator()( std::vector output_shape {batch, out_height, out_width, channels}; if (input->is_image()) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); output->ResizeImage(output_shape, output_image_shape); } else { output->Resize(output_shape); @@ -59,7 +59,7 @@ void ResizeBilinearFunctor::operator()( const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(out_width), static_cast(out_height * batch)}; - std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index e47a4f8956397424475dd14026b205a0b698485c..25e99661afe7d3afc6dbc407a5c9b7260c986ea9 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -41,7 +41,7 @@ void SoftmaxFunctor::operator()(const Tensor *logits, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << "softmax_opencl_kernel_" << output->dim(0) << "_" diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index 8ef3f7c45e4c9bd61c0d02aa6e7d0e0dfdb75d82..744fe0273084751a403dbe28553d93042e784c93 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -21,7 +21,7 @@ void SpaceToBatchFunctor::operator()(Tensor *space_tensor Tensor *batch_tensor, StatsFuture *future) { std::vector output_image_shape; - CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape); const char *kernel_name = nullptr; if (b2s_) { space_tensor->ResizeImage(output_shape, output_image_shape); @@ -61,7 +61,7 @@ void SpaceToBatchFunctor::operator()(Tensor *space_tensor const uint32_t gws[3] = {chan_blk, static_cast(batch_tensor->dim(2)), static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - std::vector lws = {8, 16, 8, 1}; + const std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << kernel_name << "_" << batch_tensor->dim(0) << "_" diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..273806061f3bd068725451cc88787c59bc79bce5 --- /dev/null +++ b/mace/kernels/opencl/winograd_transform.cc @@ -0,0 +1,148 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/kernels/winograd_transform.h" +#include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" + +namespace mace { +namespace kernels { + +template +void WinogradTransformFunctor::operator()(const Tensor *input_tensor, + Tensor *output_tensor, + StatsFuture *future) { + std::vector output_shape(4); + std::vector filter_shape = {3, 3, input_tensor->dim(3), 1}; + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), dilations_.data(), + strides_.data(), paddings_, output_shape.data(), paddings.data()); + + const index_t round_h = (output_shape[1] + 1) / 2; + const index_t round_w = (output_shape[2] + 1) / 2; + const index_t out_width = input_tensor->dim(0) * round_h * round_w; + output_shape = {16, input_tensor->dim(3), out_width, 1}; + std::vector image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape); + output_tensor->ResizeImage(output_shape, image_shape); + + string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); + std::set built_options; + built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + auto runtime = OpenCLRuntime::Global(); + auto wino_kernel = runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options); + + uint32_t idx = 0; + wino_kernel.setArg(idx++, *(static_cast(input_tensor->buffer()))); + wino_kernel.setArg(idx++, *(static_cast(output_tensor->buffer()))); + wino_kernel.setArg(idx++, static_cast(input_tensor->dim(1))); + wino_kernel.setArg(idx++, static_cast(input_tensor->dim(2))); + wino_kernel.setArg(idx++, static_cast(input_tensor->dim(3))); + wino_kernel.setArg(idx++, static_cast(round_h * round_w)); + wino_kernel.setArg(idx++, static_cast(round_w)); + wino_kernel.setArg(idx++, static_cast(paddings[0] / 2)); + wino_kernel.setArg(idx++, static_cast(paddings[1] / 2)); + + const uint32_t gws[2] = {static_cast(out_width), + static_cast(RoundUpDiv4(input_tensor->dim(3)))}; + const std::vector lws = {128, 8, 1}; + std::stringstream ss; + ss << "winograd_transform_kernel_" + << input_tensor->dim(0) << "_" + << input_tensor->dim(1) << "_" + << input_tensor->dim(2) << "_" + << input_tensor->dim(3); + TuningOrRun2DKernel(wino_kernel, ss.str(), gws, lws, future); +} + +template +void WinogradInverseTransformFunctor::operator()(const Tensor *input_tensor, + const Tensor *bias, + Tensor *output_tensor, + StatsFuture *future) { + std::vector output_shape = {batch_, height_, width_, input_tensor->dim(1)}; + std::vector image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); + output_tensor->ResizeImage(output_shape, image_shape); + + string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); + std::set built_options; + built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + switch (activation_) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + defeult: + LOG(FATAL) << "Unknown activation type: " << activation_; + } + + auto runtime = OpenCLRuntime::Global(); + auto wino_kernel = runtime->BuildKernel("winograd_transform", + obfuscated_kernel_name, + built_options); + + const uint32_t round_h = (height_ + 1) / 2; + const uint32_t round_w = (width_ + 1) / 2; + uint32_t idx = 0; + wino_kernel.setArg(idx++, *(static_cast(input_tensor->buffer()))); + if (bias != nullptr) { + wino_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + } + wino_kernel.setArg(idx++, *(static_cast(output_tensor->buffer()))); + wino_kernel.setArg(idx++, static_cast(output_shape[1])); + wino_kernel.setArg(idx++, static_cast(output_shape[2])); + wino_kernel.setArg(idx++, static_cast(round_h * round_w)); + wino_kernel.setArg(idx++, static_cast(round_w)); + wino_kernel.setArg(idx++, relux_max_limit_); + wino_kernel.setArg(idx++, prelu_alpha_); + + const uint32_t gws[2] = {static_cast(input_tensor->dim(2)), + static_cast(RoundUpDiv4(input_tensor->dim(1)))}; + const std::vector lws = {128, 8, 1}; + + std::stringstream ss; + ss << "winograd_inverse_transform_kernel_" + << input_tensor->dim(0) << "_" + << input_tensor->dim(1) << "_" + << input_tensor->dim(2) << "_" + << input_tensor->dim(3); + TuningOrRun2DKernel(wino_kernel, ss.str(), gws, lws, future); +} + +template +struct WinogradTransformFunctor; +template +struct WinogradTransformFunctor; + +template +struct WinogradInverseTransformFunctor; +template +struct WinogradInverseTransformFunctor; + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..a71bda24b120f3eab77171dd2836c606151b6486 --- /dev/null +++ b/mace/kernels/winograd_transform.h @@ -0,0 +1,108 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_ +#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_ + +#include "mace/core/future.h" +#include "mace/core/tensor.h" +#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/kernels/activation.h" + +namespace mace { +namespace kernels { + +struct WinogradTransformFunctorBase { + WinogradTransformFunctorBase(const Padding &paddings) + : strides_({1, 1}), dilations_({1, 1}), paddings_(paddings) {} + + const std::vector strides_; // [stride_h, stride_w] + const std::vector dilations_; // [dilation_h, dilation_w] + Padding paddings_; +}; + +template +struct WinogradTransformFunctor : WinogradTransformFunctorBase { + WinogradTransformFunctor(const Padding &paddings) + : WinogradTransformFunctorBase(paddings) {} + + void operator()(const Tensor *input, + Tensor *output, + StatsFuture *future) { + MACE_NOT_IMPLEMENTED; + } + +}; + +template +struct WinogradTransformFunctor : WinogradTransformFunctorBase { + WinogradTransformFunctor(const Padding &paddings) + : WinogradTransformFunctorBase(paddings) {} + + void operator()(const Tensor *input, + Tensor *output, + StatsFuture *future); +}; + +struct WinogradInverseTransformFunctorBase { + WinogradInverseTransformFunctorBase(const int batch, + const int height, + const int width, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : batch_(batch), + height_(height), + width_(width), + activation_(activation), + relux_max_limit_(relux_max_limit), + prelu_alpha_(prelu_alpha) {} + + const int batch_; + const int height_; + const int width_; + const ActivationType activation_; + const float relux_max_limit_; + const float prelu_alpha_; +}; + +template +struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { + WinogradInverseTransformFunctor(const int batch, + const int height, + const int width, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit, prelu_alpha) {} + + void operator()(const Tensor *input, + const Tensor *bias, + Tensor *output, + StatsFuture *future) { + MACE_NOT_IMPLEMENTED; + } + +}; + +template +struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase { + WinogradInverseTransformFunctor(const int batch, + const int height, + const int width, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit, prelu_alpha) {} + + void operator()(const Tensor *input, + const Tensor *bias, + Tensor *output, + StatsFuture *future); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_WINOGRAD_TRANSFORM_H_ diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index 63d0cf7fa3dd6545d98db7a3c834ac065268eead..8010bc24dea8effe2750826e9d1c2bc8bb99fe9e 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -20,7 +20,7 @@ static void ReluBenchmark( if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "ReluBM") .Input("InputImage") @@ -79,7 +79,7 @@ static void ReluxBenchmark( if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "ReluxBM") .Input("InputImage") @@ -140,7 +140,7 @@ static void PreluBenchmark( if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "PreluBM") .Input("InputImage") @@ -201,7 +201,7 @@ static void TanhBenchmark( if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "TanhBM") .Input("InputImage") @@ -260,7 +260,7 @@ static void SigmoidBenchmark( if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "SigmoidBM") .Input("InputImage") diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index e99579ab4bd7be3bc9f4af17351174284ef53acd..2fd1078c88ef3151268c2ff548a281bfb1bf3b3e 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -20,7 +20,7 @@ void TestSimpleRelu() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "ReluTest") .Input("InputImage") @@ -33,7 +33,7 @@ void TestSimpleRelu() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Activation", "ReluTest") .Input("Input") @@ -70,7 +70,7 @@ void TestUnalignedSimpleRelu() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "ReluTest") .Input("InputImage") @@ -83,7 +83,7 @@ void TestUnalignedSimpleRelu() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Activation", "ReluTest") .Input("Input") @@ -125,7 +125,7 @@ void TestSimpleRelux() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "ReluxTest") .Input("InputImage") @@ -139,7 +139,7 @@ void TestSimpleRelux() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Activation", "ReluxTest") .Input("Input") @@ -179,7 +179,7 @@ void TestSimpleReluRelux() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "ReluxTest") .Input("InputImage") @@ -193,7 +193,7 @@ void TestSimpleReluRelux() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Activation", "ReluxTest") .Input("Input") @@ -237,7 +237,7 @@ void TestSimplePrelu() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "PreluTest") .Input("InputImage") @@ -251,7 +251,7 @@ void TestSimplePrelu() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Activation", "PreluTest") .Input("Input") @@ -293,7 +293,7 @@ void TestSimpleTanh() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "TanhTest") .Input("InputImage") @@ -306,7 +306,7 @@ void TestSimpleTanh() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Activation", "TanhTest") .Input("Input") @@ -348,7 +348,7 @@ void TestSimpleSigmoid() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Activation", "SigmoidTest") .Input("InputImage") @@ -361,7 +361,7 @@ void TestSimpleSigmoid() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Activation", "SigmoidTest") .Input("Input") diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index 41fb6e9e96a2385288b37650b3882a93aa4d26b8..7e9d9856be72fa95fbc968b8c056d7a4caf52d5d 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -23,7 +23,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { for (int i = 0; i < inputs; ++i) { BufferToImage(net, internal::MakeString("Input", i).c_str(), internal::MakeString("InputImage", i).c_str(), - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } OpDefBuilder op_def_builder("AddN", "AddNBM"); for (int i = 0; i < inputs; ++i) { diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 5f9bd2bfe7cce685eca883e6c2159312ca0dd41f..691b15712b4f72f074486c86a75fd95ee5d08d7e 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -104,7 +104,7 @@ void RandomTest() { for (int i = 0; i < input_num; ++i) { BufferToImage(net, "Input" + ToString(i), "InputImage" + ToString(i), - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } auto op_def_cl = OpDefBuilder("AddN", "AddNTest"); @@ -119,7 +119,7 @@ void RandomTest() { net.RunOp(D); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.1); } diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index 976bc2419f621c8ddf5cc088ca01e4d44eff02e3..abfe85a6cb8cd65a553a1610164b61aae570e2a1 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -24,7 +24,7 @@ static void BatchNorm( if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 595635e7a6f7bfdbe119746b42a2799f6c07da2c..a312df78a10feed11087bb00e2ac3e67e9ee564c 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -23,7 +23,7 @@ void Simple() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -47,7 +47,7 @@ void Simple() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("BatchNorm", "BatchNormTest") .Input("Input") @@ -204,7 +204,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -234,7 +234,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { net.Sync(); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } @@ -276,7 +276,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -307,7 +307,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { net.Sync(); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.5); } @@ -349,7 +349,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -379,7 +379,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { net.Sync(); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } @@ -421,7 +421,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -452,7 +452,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { net.Sync(); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.5); } } diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index 93df21f931979ba75040f5d3d7002ee4b674ab03..02da45ca126f2cb02962a67af7c887d5642e6036 100644 --- a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -15,7 +15,7 @@ static void BMBatchToSpace( OpsTestNet net; net.AddRandomInput("Input", {batch, height, width, channels}); BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputImage") .Output("OutputImage") diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index 917c28a15898d2d4e4cf8fafd427de4a03bcc378..09f96267940903cd6438a82882247d3e32e92961 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -20,7 +20,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("BiasAdd", "BiasAddBM") diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index ce83ebd72909d0d06bc5755b30537e2b3ddc3c93..91bc96e4553c53c1f611fd5a16b3012eb8f7e4cf 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -20,7 +20,7 @@ void BiasAddSimple() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -34,7 +34,7 @@ void BiasAddSimple() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("BiasAdd", "BiasAddTest") .Input("Input") @@ -90,7 +90,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -105,7 +105,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) { net.Sync(); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } @@ -140,7 +140,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -155,7 +155,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) { net.Sync(); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } } diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index f77bbde0201646e8e69575f45bf3b1b0a44c6571..760103d0f40886e4c2b34fecb6a94ed2dee37fdc 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -55,23 +55,23 @@ TEST(BufferToImageTest, ArgLarge) { } TEST(BufferToImageTest, InputSmallSingleChannel) { - TestBidirectionTransform(kernels::IN_OUT, {1, 2, 3, 1}); + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {1, 2, 3, 1}); } TEST(BufferToImageTest, InputSmallMultipleChannel) { - TestBidirectionTransform(kernels::IN_OUT, {1, 2, 3, 3}); + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {1, 2, 3, 3}); } TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) { - TestBidirectionTransform(kernels::IN_OUT, {3, 2, 3, 3}); + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {3, 2, 3, 3}); } TEST(BufferToImageTest, InputMedia) { - TestBidirectionTransform(kernels::IN_OUT, {3, 13, 17, 128}); + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {3, 13, 17, 128}); } TEST(BufferToImageTest, InputLarge) { - TestBidirectionTransform(kernels::IN_OUT, {3, 64, 64, 256}); + TestBidirectionTransform(kernels::IN_OUT_CHANNEL, {3, 64, 64, 256}); } TEST(BufferToImageTest, Filter1x1Small) { @@ -124,7 +124,7 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector net.RunOp(D); // Check - ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-3); + ExpectTensorNear(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2); } TEST(BufferToImageTest, ArgFloatToHalfSmall) { diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index 6a3dda02f7448b968456ba334a1167349c4ef6e4..11d7de4b1055d6eefb6bc47203791c39f03570c3 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -61,9 +61,9 @@ static void OpenclConcatHelper(int iters, net.AddRandomInput("Input1", shape1); BufferToImage(net, "Input0", "InputImage0", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Input1", "InputImage1", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Concat", "ConcatBM") .Input("InputImage0") .Input("InputImage1") diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index 49d55d2a830e59b62e81ce9592b7327ad3a7a219..dff64dbf3f94d2bea70f2194360ac0ace7df90ce 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -153,7 +153,7 @@ void OpenclRandomTest(const std::vector> &shapes, concat_axis_size += shapes[i][axis]; net.AddRandomInput(input_name, shapes[i]); BufferToImage(net, input_name, image_name, - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } auto builder = OpDefBuilder("Concat", "ConcatTest"); @@ -170,7 +170,7 @@ void OpenclRandomTest(const std::vector> &shapes, net.RunOp(DeviceType::OPENCL); ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); // Check auto output = net.GetOutput("Output"); diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 42d187b37bc26bb42c3cde79f9791d333c1801e4..b02eb17e63d5e1e3cf126b6be0284bdb9f73d954 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -34,7 +34,7 @@ static void Conv2d(int iters, if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -97,17 +97,21 @@ static void Conv2d(int iters, // ICNet BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half); -// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 +//// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half); -// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 +//// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half); BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half); -// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 +//// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half); BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half); BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half); +BM_CONV_2D(1, 128, 16, 16, 3, 3, 1, VALID, 32, half); +BM_CONV_2D(1, 128, 64, 64, 3, 3, 1, VALID, 32, half); +BM_CONV_2D(1, 128, 128, 128, 3, 3, 1, VALID, 32, half); + // Test RGB <-> YUV // BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); // BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index a12842e2862573813e9f2c72d86742a58039335f..877da76da3ae1d45c9fcca0abdc6e54426091401 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -100,7 +100,7 @@ void TestNHWCSimple3x3VALID() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -120,7 +120,7 @@ void TestNHWCSimple3x3VALID() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Conv2D", "Conv2dTest") @@ -157,7 +157,7 @@ void TestNHWCSimple3x3SAME() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -177,7 +177,7 @@ void TestNHWCSimple3x3SAME() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Conv2D", "Conv2dTest") @@ -262,7 +262,7 @@ void TestNHWCSimple3x3WithoutBias() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); @@ -279,7 +279,7 @@ void TestNHWCSimple3x3WithoutBias() { net.RunOp(D); // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -369,7 +369,7 @@ static void TestNHWCCombined3x3() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -389,7 +389,7 @@ static void TestNHWCCombined3x3() { net.RunOp(D); ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -442,7 +442,7 @@ void TestConv1x1() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -461,7 +461,7 @@ void TestConv1x1() { net.RunOp(D); ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -533,7 +533,7 @@ static void TestComplexConvNxNS12(const std::vector &shape) { // run on gpu BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -553,7 +553,7 @@ static void TestComplexConvNxNS12(const std::vector &shape) { net.RunOp(D); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; @@ -626,7 +626,7 @@ static void TestHalfComplexConvNxNS12(const std::vector &input_shape, // run on gpu BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -646,7 +646,7 @@ static void TestHalfComplexConvNxNS12(const std::vector &input_shape, net.RunOp(D); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.5); }; @@ -758,7 +758,7 @@ static void TestDilationConvNxN(const std::vector &shape, const int dil expected.Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -775,7 +775,7 @@ static void TestDilationConvNxN(const std::vector &shape, const int dil // Run on device net.RunOp(D); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 43950a498b1d337e60692dcbe05e90268e38b0cb..f2cabdee9254773f3791a81b212610a7f8a878c4 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -26,7 +26,7 @@ void SimpleValidTest() { net.AddInputFromArray("Bias", {2}, {.1f, .2f}); if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::DW_CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -46,7 +46,7 @@ void SimpleValidTest() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") @@ -129,7 +129,7 @@ void ComplexValidTest() { {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::DW_CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -149,7 +149,7 @@ void ComplexValidTest() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") @@ -239,7 +239,7 @@ void TestNxNS12(const index_t height, const index_t width) { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::DW_CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -259,7 +259,7 @@ void TestNxNS12(const index_t height, const index_t width) { // Transfer output ImageToBuffer(net, "OutputImage", "DeviceOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") .Input("Input") diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv_2d_benchmark.cc index 5acc514f9fe0a15510bcb467e150991bc874aa85..3a963dd0408afc4069ce654b4a5002d3d8252504 100644 --- a/mace/ops/depthwise_conv_2d_benchmark.cc +++ b/mace/ops/depthwise_conv_2d_benchmark.cc @@ -34,7 +34,7 @@ static void DepthwiseConv2d(int iters, if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::DW_CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc index 5a04c48dd8f2000c9a33b175ec5c67f4c4aebe81..9915bee4128f1e3766a91070d1cae48e044f459f 100644 --- a/mace/ops/folded_batch_norm.cc +++ b/mace/ops/folded_batch_norm.cc @@ -7,10 +7,11 @@ namespace mace { void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), + REGISTER_OPERATOR(op_registry, + OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), FoldedBatchNormOp); #if MACE_ENABLE_NEON @@ -21,16 +22,18 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) { FoldedBatchNormOp); #endif // MACE_ENABLE_NEON - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), + REGISTER_OPERATOR(op_registry, + OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), FoldedBatchNormOp); - REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), + REGISTER_OPERATOR(op_registry, + OpKeyBuilder("FoldedBatchNorm") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), FoldedBatchNormOp); } diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index 5ee0a9473b196cc2747ae748dbdd37cbcbc5a5c3..45bd6736f94809feafdd2c0c1dbeac90254d4c67 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -38,7 +38,7 @@ void Simple() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -55,7 +55,7 @@ void Simple() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") .Input("Input") @@ -204,7 +204,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -222,7 +222,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { net.Sync(); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } @@ -259,7 +259,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -278,7 +278,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { net.Sync(); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.5); } @@ -315,7 +315,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -332,7 +332,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { net.RunOp(DeviceType::OPENCL); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-2); } @@ -369,7 +369,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { // Run on opencl BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Scale", "ScaleImage", kernels::BufferType::ARGUMENT); BufferToImage(net, "Offset", "OffsetImage", @@ -387,7 +387,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { net.RunOp(DeviceType::OPENCL); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.5); } } diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc index bdc4c3cf5aab19d80aace3963ed79de2f4e52a9c..87d99b9e1dc401b5948a6734fb4925aaa5870d9e 100644 --- a/mace/ops/fused_conv_2d_test.cc +++ b/mace/ops/fused_conv_2d_test.cc @@ -24,7 +24,7 @@ void TestNHWCSimple3x3VALID() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -44,7 +44,7 @@ void TestNHWCSimple3x3VALID() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("FusedConv2D", "FusedConv2dTest") @@ -81,7 +81,7 @@ void TestNHWCSimple3x3SAME() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -101,7 +101,7 @@ void TestNHWCSimple3x3SAME() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("FusedConv2D", "FusedConv2dTest") @@ -149,7 +149,7 @@ void TestNHWCSimple3x3WithoutBias() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); @@ -166,7 +166,7 @@ void TestNHWCSimple3x3WithoutBias() { net.RunOp(D); // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("FusedConv2D", "FusedConv2dTest") .Input("Input") @@ -218,7 +218,7 @@ void TestConv1x1() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -237,7 +237,7 @@ void TestConv1x1() { net.RunOp(D); ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("FusedConv2D", "FusedConv2dTest") .Input("Input") @@ -309,7 +309,7 @@ static void TestComplexConvNxNS12(const std::vector &shape) { // run on gpu BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -329,7 +329,7 @@ static void TestComplexConvNxNS12(const std::vector &shape) { net.RunOp(D); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; @@ -395,7 +395,7 @@ static void TestHalfComplexConvNxNS12(const std::vector &shape) { // run on gpu BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -415,7 +415,7 @@ static void TestHalfComplexConvNxNS12(const std::vector &shape) { net.RunOp(D); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.2); }; @@ -473,7 +473,7 @@ static void TestGeneralConvNxNS12(const std::vector &image_shape, // run on gpu BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", @@ -493,7 +493,7 @@ static void TestGeneralConvNxNS12(const std::vector &image_shape, net.RunOp(D); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; @@ -550,7 +550,7 @@ static void TestAtrousConvNxN(const std::vector &shape, const int dilat expected.Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -567,7 +567,7 @@ static void TestAtrousConvNxN(const std::vector &shape, const int dilat // Run on device net.RunOp(D); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; @@ -632,7 +632,7 @@ static void TestGeneralHalfAtrousConv(const std::vector &image_shape, expected.Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -649,7 +649,7 @@ static void TestGeneralHalfAtrousConv(const std::vector &image_shape, // Run on device net.RunOp(D); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.7); }; diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc new file mode 100644 index 0000000000000000000000000000000000000000..37731fd7c3704b07a9da974f6aa0cd6884c465ee --- /dev/null +++ b/mace/ops/matmul.cc @@ -0,0 +1,29 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/matmul.h" + +namespace mace { + +void Register_MatMul(OperatorRegistry *op_registry) { + REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") + .Device(DeviceType::CPU) + .TypeConstraint("T") + .Build(), + MatMulOp); + + REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + MatMulOp); + + REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + MatMulOp); +} + +} // namespace mace diff --git a/mace/ops/matmul.h b/mace/ops/matmul.h new file mode 100644 index 0000000000000000000000000000000000000000..6cfdfe999b4d563bb2f4f9ff2fb3e7556e3e3e21 --- /dev/null +++ b/mace/ops/matmul.h @@ -0,0 +1,40 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_OPS_MATMUL_H_ +#define MACE_OPS_MATMUL_H_ + +#include "mace/core/operator.h" +#include "mace/kernels/matmul.h" + +namespace mace { + +template +class MatMulOp : public Operator { + public: + MatMulOp(const OperatorDef &operator_def, Workspace *ws) + : Operator(operator_def, ws) {} + + bool Run(StatsFuture *future) override { + const Tensor *A = this->Input(0); + const Tensor *B = this->Input(1); + Tensor *C = this->Output(0); + MACE_CHECK(A->dim_size() == 4 && 4 == B->dim_size()) + << "The dimension of A and B should be 4"; + MACE_CHECK(A->dim(0) == B->dim(0)) << "A and B must have same batch size"; + MACE_CHECK(A->dim(2) == B->dim(1)) + << "the number of A's column " << A->dim(2) + << " must be equal to B's row " << B->dim(1); + + functor_(A, B, C, future); + return true; + } + + private: + kernels::MatMulFunctor functor_; +}; + +} // namespace mace + +#endif // MACE_OPS_MATMUL_H_ diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..864767999c80e8dfd1ce5d047317f9d85432c1dc --- /dev/null +++ b/mace/ops/matmul_benchmark.cc @@ -0,0 +1,70 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +template +static void MatMulBenchmark( + int iters, int batch, int height, int channels, int out_width) { + mace::testing::StopTiming(); + + OpsTestNet net; + + // Add input data + net.AddRandomInput("A", {batch, height, channels, 1}); + net.AddRandomInput("B", {batch, channels, out_width, 1}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "A", "AImage", + kernels::BufferType::IN_OUT_WIDTH); + BufferToImage(net, "B", "BImage", + kernels::BufferType::IN_OUT_HEIGHT); + + OpDefBuilder("MatMul", "MatMulBM") + .Input("AImage") + .Input("BImage") + .Output("Output") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("MatMul", "MatMulBM") + .Input("A") + .Input("B") + .Output("Output") + .Finalize(net.NewOperatorDef()); + } + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_MATMUL_MACRO(N, H, C, W, TYPE, DEVICE) \ + static void BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE(int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + MatMulBenchmark(iters, N, H, C, W); \ + } \ + BENCHMARK(BM_MATMUL_##N##_##H##_##C##_##W##_##TYPE##_##DEVICE) + +#define BM_MATMUL(N, H, C, W, TYPE) \ + BM_MATMUL_MACRO(N, H, C, W, TYPE, OPENCL); + +BM_MATMUL(16, 32, 128, 49, half); +BM_MATMUL(16, 32, 128, 961, half); +BM_MATMUL(16, 32, 128, 3969, half); +} // namespace mace diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6c801dfbba04a84fc50276d5d6e95c5541dbb7e --- /dev/null +++ b/mace/ops/matmul_test.cc @@ -0,0 +1,181 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include +#include "mace/core/operator.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { + +class MatMulOpTest : public OpsTestBase {}; + +template +void Simple(const std::vector &A_shape, + const std::vector &A_value, + const std::vector &B_shape, + const std::vector &B_value, + const std::vector &C_shape, + const std::vector &C_value) { + OpsTestNet net; + + // Add input data + net.AddInputFromArray("A", A_shape, A_value); + net.AddInputFromArray("B", B_shape, B_value); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "A", "AImage", + kernels::BufferType::IN_OUT_WIDTH); + BufferToImage(net, "B", "BImage", + kernels::BufferType::IN_OUT_HEIGHT); + + OpDefBuilder("MatMul", "MatMulTest") + .Input("AImage") + .Input("BImage") + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT_HEIGHT); + } else { + OpDefBuilder("MatMul", "MatMulTest") + .Input("A") + .Input("B") + .Output("Output") + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + // Check + auto expected = + CreateTensor(C_shape, C_value); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(MatMulOpTest, SimpleCPU) { + Simple({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, + {1, 3, 2, 1}, {1, 2, 3, 4, 5, 6}, + {1, 2, 2, 1}, {22, 28, 49, 64}); + Simple({1, 5, 5, 1}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + {1, 5, 5, 1}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + {1, 5, 5, 1}, + {215, 230, 245, 260, 275, 490, 530, 570, 610, 650, + 765, 830, 895, 960, 1025, 1040, 1130, 1220, 1310, 1400, + 1315, 1430, 1545, 1660, 1775}); +} + + +TEST_F(MatMulOpTest, SimpleCPUWithBatch) { + Simple({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, + {2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, + {2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64}); +} + +TEST_F(MatMulOpTest, SimpleOPENCL) { + Simple({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, + {1, 3, 2, 1}, {1, 2, 3, 4, 5, 6}, + {1, 2, 2, 1}, {22, 28, 49, 64}); + Simple({1, 5, 5, 1}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + {1, 5, 5, 1}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, + {1, 5, 5, 1}, + {215, 230, 245, 260, 275, 490, 530, 570, 610, 650, + 765, 830, 895, 960, 1025, 1040, 1130, 1220, 1310, 1400, + 1315, 1430, 1545, 1660, 1775}); +} + +TEST_F(MatMulOpTest, SimpleGPUWithBatch) { + Simple({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, + {2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}, + {2, 2, 2, 1}, {22, 28, 49, 64, 22, 28, 49, 64}); +} + +template +void Complex(const index_t batch, + const index_t height, + const index_t channels, + const index_t out_width) { + srand(time(NULL)); + + // Construct graph + OpsTestNet net; + OpDefBuilder("MatMul", "MatMulTest") + .Input("A") + .Input("B") + .Output("Output") + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddRandomInput( + "A", {batch, height, channels, 1}); + net.AddRandomInput( + "B", {batch, channels, out_width, 1}); + + // run cpu + net.RunOp(); + + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // Run on opencl + BufferToImage(net, "A", "AImage", + kernels::BufferType::IN_OUT_WIDTH); + BufferToImage(net, "B", "BImage", + kernels::BufferType::IN_OUT_HEIGHT); + + OpDefBuilder("MatMul", "MatMulTest") + .Input("AImage") + .Input("BImage") + .Output("OutputImage") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Run on opencl + net.RunOp(DeviceType::OPENCL); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", + kernels::BufferType::IN_OUT_HEIGHT); + if (DataTypeToEnum::value == DataType::DT_HALF) { + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-1); + } else { + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-4); + } +} + +TEST_F(MatMulOpTest, OPENCLAlignedWithoutBatch) { + Complex(1, 64, 128, 32); + Complex(1, 64, 32, 128); +} +TEST_F(MatMulOpTest, OPENCLUnAlignedWithoutBatch) { + Complex(1, 31, 113, 61); + Complex(1, 113, 31, 73); +} +TEST_F(MatMulOpTest, OPENCLUnAlignedWithBatch) { + Complex(2, 3, 3, 3); + Complex(16, 31, 61, 67); + Complex(31, 31, 61, 67); +} +TEST_F(MatMulOpTest, OPENCLHalfAlignedWithoutBatch) { + Complex(1, 64, 128, 32); + Complex(1, 64, 32, 128); +} +TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) { + Complex(2, 31, 113, 61); + Complex(16, 32, 64, 64); + Complex(31, 31, 61, 67); +} + +} diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index c802c126667027513ef0337e339c5dc007fd4282..bf4cff8bc121ddcd1dbbb484859eabed7396599b 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -134,7 +134,7 @@ static void SimpleMaxPooling3S2() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") .Input("InputImage") .Output("OutputImage") @@ -146,7 +146,7 @@ static void SimpleMaxPooling3S2() { .Finalize(net.NewOperatorDef()); net.RunOp(D); ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { // Run OpDefBuilder("Pooling", "PoolingTest") @@ -198,7 +198,7 @@ static void MaxPooling3S2(const std::vector &input_shape, Tensor expected; expected.Copy(*net.GetOutput("Output")); - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") .Input("InputImage") .Output("OutputImage") @@ -211,7 +211,7 @@ static void MaxPooling3S2(const std::vector &input_shape, .Finalize(net.NewOperatorDef()); net.RunOp(D); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); } @@ -283,7 +283,7 @@ static void SimpleAvgPoolingTest() { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") .Input("InputImage") .Output("OutputImage") @@ -296,7 +296,7 @@ static void SimpleAvgPoolingTest() { // Run net.RunOp(D); ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); // Check auto expected = CreateTensor({1, 1, 4, 1}, {4.5, 6.5, 8.5, 10.5}); @@ -333,7 +333,7 @@ static void AvgPoolingTest(const std::vector &shape, Tensor expected; expected.Copy(*net.GetOutput("Output")); - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") .Input("InputImage") .Output("OutputImage") @@ -346,7 +346,7 @@ static void AvgPoolingTest(const std::vector &shape, .Finalize(net.NewOperatorDef()); net.RunOp(D); ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.01); } diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index 46b9612394c9269aae98e402a41546d51cdc582e..01ffda0e686527b5a6f30a24b02c853ebe56d5ce 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -27,7 +27,7 @@ static void ResizeBilinearBenchmark(int iters, {output_height, output_width}); if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark") .Input("InputImage") .Input("OutSize") diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index 06b715a0b8ff581a17f664f8e780668b63efdc56..129a627a320ba627ae79a83eac67406eb44bd3f7 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -92,7 +92,7 @@ void TestRandomResizeBilinear() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("ResizeBilinear", "ResizeBilinearTest") .Input("InputImage") @@ -104,7 +104,7 @@ void TestRandomResizeBilinear() { net.RunOp(D); ImageToBuffer(net, "OutputImage", "DeviceOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { // TODO support NEON } diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc index 030af807ca9d186551752116763c3ee7598ab9e6..267074a7c1130ce6063d0b2a937395005c082b04 100644 --- a/mace/ops/softmax_benchmark.cc +++ b/mace/ops/softmax_benchmark.cc @@ -20,7 +20,7 @@ static void SoftmaxBenchmark( if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Softmax", "SoftmaxBM") .Input("InputImage") diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index b4f321a6d86ec0c7b7db7633a7ad6d43ada0d916..af8e3afc4bd4a192c0e8011671ea26c75f2734b8 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -18,7 +18,7 @@ void Simple() { if (D == DeviceType::OPENCL) { BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Softmax", "SoftmaxTest") .Input("InputImage") @@ -30,7 +30,7 @@ void Simple() { // Transfer output ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Softmax", "SoftmaxTest") .Input("Input") @@ -72,7 +72,7 @@ void Complex(const std::vector &logits_shape) { expected.Copy(*net.GetOutput("Output")); BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Softmax", "SoftmaxTest") .Input("InputImage") @@ -84,7 +84,7 @@ void Complex(const std::vector &logits_shape) { // Transfer output ImageToBuffer(net, "OutputImage", "OPENCLOutput", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1e-5); } diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc index a2fea8dc9fd9c87eee7cabc4b4c332284e85c466..9b3e4d1cb68178406c418c76505e1d2a90a8eb69 100644 --- a/mace/ops/space_to_batch_benchmark.cc +++ b/mace/ops/space_to_batch_benchmark.cc @@ -16,7 +16,7 @@ static void BMSpaceToBatch( net.AddRandomInput("Input", {batch, height, width, channels}); BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputImage") .Output("OutputImage") diff --git a/mace/ops/space_to_batch_test.cc b/mace/ops/space_to_batch_test.cc index bebbafeff042a85c2ae3d1d2581cdd584544df6d..56d3761108405465f94291aaedd21d8843544daf 100644 --- a/mace/ops/space_to_batch_test.cc +++ b/mace/ops/space_to_batch_test.cc @@ -18,7 +18,7 @@ void RunSpaceToBatch(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("SpaceToBatchND", "SpaceToBatchNDTest") .Input("InputImage") .Output("OutputImage") @@ -30,7 +30,7 @@ void RunSpaceToBatch(const std::vector &input_shape, net.RunOp(D); ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); // Check ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-8); } @@ -46,7 +46,7 @@ void RunBatchToSpace(const std::vector &input_shape, net.AddInputFromArray("Input", input_shape, input_data); BufferToImage(net, "Input", "InputImage", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("BatchToSpaceND", "BatchToSpaceNDTest") .Input("InputImage") .Output("OutputImage") @@ -58,7 +58,7 @@ void RunBatchToSpace(const std::vector &input_shape, net.RunOp(D); ImageToBuffer(net, "OutputImage", "Output", - kernels::BufferType::IN_OUT); + kernels::BufferType::IN_OUT_CHANNEL); // Check ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-8); } diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..364aec6b482169f5e99574160cffd2833a88bce7 --- /dev/null +++ b/mace/ops/winograd_convolution_test.cc @@ -0,0 +1,143 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include +#include "mace/core/operator.h" +#include "mace/ops/ops_test_util.h" +#include "mace/kernels/conv_pool_2d_util.h" + +namespace mace { + +class WinogradConvlutionTest : public OpsTestBase {}; + +void TransposeFilter(const std::vector &input, + const std::vector &input_shape, + std::vector &output) { + output.resize(input.size()); + + const float *input_ptr = input.data(); + for (index_t h = 0; h < input_shape[0]; ++h) { + for (index_t w = 0; w < input_shape[1]; ++w) { + for (index_t ic = 0; ic < input_shape[2]; ++ic) { + for (index_t oc = 0; oc < input_shape[3]; ++oc) { + int offset = ((oc * input_shape[2] + ic) * input_shape[0] + h) * input_shape[1] + w; + output[offset] = *input_ptr; + ++input_ptr; + } + } + } + } +} + +template +void WinogradConvolution(const index_t batch, + const index_t height, + const index_t width, + const index_t in_channels, + const index_t out_channels, + const Padding padding) { + srand(time(NULL)); + + // Construct graph + OpsTestNet net; + // Add input data + std::vector filter_data; + std::vector filter_shape = {3, 3, in_channels, out_channels}; + GenerateRandomRealTypeData(filter_shape, filter_data); + net.AddRandomInput("Input", {batch, height, width, in_channels}); + net.AddInputFromArray("Filter", filter_shape, filter_data); + net.AddRandomInput("Bias", {out_channels}); + + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT_CHANNEL); + BufferToImage(net, "Filter", "FilterImage", + kernels::BufferType::CONV2D_FILTER); + BufferToImage(net, "Bias", "BiasImage", + kernels::BufferType::ARGUMENT); + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "ConvOutput", + kernels::BufferType::IN_OUT_CHANNEL); + Tensor expected; + expected.Copy(*net.GetOutput("ConvOutput")); + auto output_shape = expected.shape(); + + // Winograd convolution + // transform filter + std::vector wino_filter_data; + TransposeFilter(filter_data, filter_shape, wino_filter_data); + net.AddInputFromArray("WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data); + BufferToImage(net, "WinoFilterData", "WinoFilter", kernels::BufferType::WINOGRAD_FILTER); + + // transform input + OpDefBuilder("WinogradTransform", "WinogradTransformTest") + .Input("InputImage") + .Output("WinoInput") + .AddIntArg("padding", padding) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Run on opencl + net.RunOp(D); + + // MatMul + OpDefBuilder("MatMul", "MatMulTest") + .Input("WinoFilter") + .Input("WinoInput") + .Output("WinoGemm") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run on opencl + net.RunOp(D); + + // Inverse transform + OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") + .Input("WinoGemm") + .Input("BiasImage") + .AddIntArg("batch", batch) + .AddIntArg("height", output_shape[1]) + .AddIntArg("width", output_shape[2]) + .Output("WinoOutputImage") + .Finalize(net.NewOperatorDef()); + + // Run on opencl + net.RunOp(D); + net.Sync(); + + ImageToBuffer(net, "WinoOutputImage", "WinoOutput", + kernels::BufferType::IN_OUT_CHANNEL); + if (DataTypeToEnum::value == DataType::DT_HALF) { + ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-1); + } else { + ExpectTensorNear(expected, *net.GetOutput("WinoOutput"), 1e-4); + } +} + +TEST_F(WinogradConvlutionTest, AlignedConvolution) { + WinogradConvolution(1, 32, 32, 32, 16, Padding::VALID); + WinogradConvolution(1, 32, 32, 32, 16, Padding::SAME); +} + +TEST_F(WinogradConvlutionTest, UnAlignedConvolution) { + WinogradConvolution(1, 61, 67, 31, 37, Padding::VALID); + WinogradConvolution(1, 61, 67, 37, 31, Padding::SAME); +} + +TEST_F(WinogradConvlutionTest, BatchConvolution) { + WinogradConvolution(3, 64, 64, 32, 32, Padding::VALID); + WinogradConvolution(5, 61, 67, 37, 31, Padding::SAME); +} + +} diff --git a/mace/ops/winograd_inverse_transform.cc b/mace/ops/winograd_inverse_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..4f81a1b05929448a431436f7db841f8b2b77f457 --- /dev/null +++ b/mace/ops/winograd_inverse_transform.cc @@ -0,0 +1,22 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/winograd_inverse_transform.h" + +namespace mace { + +void Register_WinogradInverseTransform(OperatorRegistry *op_registry) { + REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + WinogradInverseTransformOp); + REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + WinogradInverseTransformOp); +} + +} // namespace mace diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..4c20769f1fd461f393c1c57e58bc5f089197ed7c --- /dev/null +++ b/mace/ops/winograd_inverse_transform.h @@ -0,0 +1,48 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_ +#define MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_ + +#include + +#include "mace/core/operator.h" +#include "mace/kernels/winograd_transform.h" +#include "mace/kernels/activation.h" + +namespace mace { + +template +class WinogradInverseTransformOp : public Operator { + public: + WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws) + : Operator(op_def, ws), + functor_(OperatorBase::GetSingleArgument("batch", 1), + OperatorBase::GetSingleArgument("height", 0), + OperatorBase::GetSingleArgument("width", 0), + kernels::StringToActivationType( + OperatorBase::GetSingleArgument("activation", + "NOOP")), + OperatorBase::GetSingleArgument("max_limit", 0.0f), + OperatorBase::GetSingleArgument("alpha", 0.0f)) {} + + bool Run(StatsFuture *future) override { + const Tensor *input_tensor = this->Input(INPUT); + const Tensor *bias = this->InputSize() == 2 ? this->Input(BIAS) : nullptr; + Tensor *output_tensor = this->Output(OUTPUT); + functor_(input_tensor, bias, output_tensor, future); + return true; + } + + private: + kernels::WinogradInverseTransformFunctor functor_; + + protected: + OP_INPUT_TAGS(INPUT, BIAS); + OP_OUTPUT_TAGS(OUTPUT); +}; + +} // namespace mace + +#endif // MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_ diff --git a/mace/ops/winograd_transform.cc b/mace/ops/winograd_transform.cc new file mode 100644 index 0000000000000000000000000000000000000000..369a62188800e8873b9548c0abec69d0f8d8e8f5 --- /dev/null +++ b/mace/ops/winograd_transform.cc @@ -0,0 +1,22 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/winograd_transform.h" + +namespace mace { + +void Register_WinogradTransform(OperatorRegistry *op_registry) { + REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + WinogradTransformOp); + REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + WinogradTransformOp); +} + +} // namespace mace diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..f2cc5f10130da155558f5a67b7b7a813ef3d7f80 --- /dev/null +++ b/mace/ops/winograd_transform.h @@ -0,0 +1,41 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_OPS_WINOGRAD_TRANSFORM_H_ +#define MACE_OPS_WINOGRAD_TRANSFORM_H_ + +#include + +#include "mace/core/operator.h" +#include "mace/kernels/winograd_transform.h" + +namespace mace { + +template +class WinogradTransformOp : public Operator { + public: + WinogradTransformOp(const OperatorDef &op_def, Workspace *ws) + : Operator(op_def, ws), + functor_(static_cast(OperatorBase::GetSingleArgument( + "padding", static_cast(VALID)))) {} + + bool Run(StatsFuture *future) override { + const Tensor *input_tensor = this->Input(INPUT); + Tensor *output_tensor = this->Output(OUTPUT); + + functor_(input_tensor, output_tensor, future); + return true; + } + + private: + kernels::WinogradTransformFunctor functor_; + + protected: + OP_INPUT_TAGS(INPUT); + OP_OUTPUT_TAGS(OUTPUT); +}; + +} // namespace mace + +#endif // MACE_OPS_WINOGRAD_TRANSFORM_H_ diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc new file mode 100644 index 0000000000000000000000000000000000000000..28d73b2a5cc96f3ca41dea871251a3400b48fb5b --- /dev/null +++ b/mace/ops/winograd_transform_benchmark.cc @@ -0,0 +1,110 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/operator.h" +#include "mace/core/testing/test_benchmark.h" +#include "mace/ops/ops_test_util.h" + +namespace mace { +template +static void BMWinogradTransform( + int iters, int batch, int height, int width, int channels) { + mace::testing::StopTiming(); + + OpsTestNet net; + net.AddRandomInput("Input", {batch, height, width, channels}); + + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT_CHANNEL); + OpDefBuilder("WinogradTransform", "WinogradTransformTest") + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ + static void \ + BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMWinogradTransform(iters, N, H, W, C); \ + } \ + BENCHMARK( \ + BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + +#define BM_WINOGRAD_TRANSFORM(N, H, W, C, TYPE) \ + BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, OPENCL); + +BM_WINOGRAD_TRANSFORM(1, 16, 16, 128, half); +BM_WINOGRAD_TRANSFORM(1, 64, 64, 128, half); +BM_WINOGRAD_TRANSFORM(1, 128, 128, 128, half); + +template +static void BMWinogradInverseTransform( + int iters, int batch, int height, int width, int channels) { + mace::testing::StopTiming(); + + index_t p = batch * ((height + 1) / 2) * ((width + 1) / 2); + OpsTestNet net; + net.AddRandomInput("Input", {16, channels, p, 1}); + + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT_HEIGHT); + OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest") + .Input("InputImage") + .AddIntArg("batch", batch) + .AddIntArg("height", height) + .AddIntArg("width", width) + .Output("OutputImage") + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Warm-up + for (int i = 0; i < 5; ++i) { + net.RunOp(D); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + net.RunOp(D); + } + net.Sync(); +} + +#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE) \ + static void \ + BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * C * H * W; \ + mace::testing::ItemsProcessed(tot); \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + BMWinogradInverseTransform(iters, N, H, W, C); \ + } \ + BENCHMARK( \ + BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + +#define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C, TYPE) \ + BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, OPENCL); + +BM_WINOGRAD_INVERSE_TRANSFORM(1, 14, 14, 32, half); +BM_WINOGRAD_INVERSE_TRANSFORM(1, 62, 62, 32, half); +BM_WINOGRAD_INVERSE_TRANSFORM(1, 126, 126, 32, half); + +} // namespace mace \ No newline at end of file diff --git a/mace/proto/BUILD b/mace/proto/BUILD index 5222b06bda6e1681b15ac7f60317376c5d34fa3d..8649197b94508615dd395a991bccfe5205042804 100644 --- a/mace/proto/BUILD +++ b/mace/proto/BUILD @@ -10,15 +10,6 @@ licenses(["notice"]) # Apache 2.0 load("@com_google_protobuf//:protobuf.bzl", "py_proto_library") -py_proto_library( - name = "mace_py", - srcs = ["mace.proto"], - default_runtime = "@com_google_protobuf//:protobuf_python", - protoc = "@com_google_protobuf//:protoc", - srcs_version = "PY2AND3", - deps = ["@com_google_protobuf//:protobuf_python"], -) - py_proto_library( name = "caffe_py", srcs = ["caffe.proto"], diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index 369152819afb67c554c8c057777fc91d9b3e1349..dd0023e02116248bc7cf1f340cf37d3c1a510182 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -41,7 +41,7 @@ class Tuner { template RetType TuneOrRun( const std::string param_key, - std::vector &default_param, + const std::vector &default_param, const std::function>()> ¶m_generator, const std::function &, Timer *, std::vector *)> &func,