diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 063e4f4c270c59db13314c12b0919e73bc84c395..05297479074e41bf0b96c1ae369c81a668f3d80c 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -186,7 +186,7 @@ bool Run(MaceEngine *engine, return true; } -DEFINE_string(device, "CPU", "Device [CPU|OPENCL]"); +DEFINE_string(device, "CPU", "Device [CPU|NEON|OPENCL]"); DEFINE_string(input_node, "input_node0,input_node1", "input nodes, separated by comma"); DEFINE_string(output_node, "output_node0,output_node1", @@ -264,6 +264,8 @@ int Main(int argc, char **argv) { DeviceType device_type = CPU; if (FLAGS_device == "OPENCL") { device_type = OPENCL; + } else if (FLAGS_device == "NEON") { + device_type = NEON; } // config runtime @@ -271,7 +273,7 @@ int Main(int argc, char **argv) { mace::ConfigOpenCLRuntime( static_cast(FLAGS_gpu_perf_hint), static_cast(FLAGS_gpu_priority_hint)); - } else if (device_type == CPU) { + } else if (device_type == CPU || device_type == NEON) { mace::ConfigOmpThreadsAndAffinity( FLAGS_omp_num_threads, static_cast(FLAGS_cpu_power_option)); diff --git a/mace/core/buffer.h b/mace/core/buffer.h index e6d433e62dae922d62a4f554fb87b1fcfb2cb08f..b655fdc4b10857a7a446b1b7ffc859bf535427e9 100644 --- a/mace/core/buffer.h +++ b/mace/core/buffer.h @@ -41,6 +41,8 @@ class BufferBase { virtual bool OnHost() const = 0; + virtual void Clear() = 0; + virtual index_t offset() const { return 0; } template @@ -158,6 +160,12 @@ class Buffer : public BufferBase { bool OnHost() const { return allocator_->OnHost(); } + void Clear() { + if (buf_ != nullptr) { + memset(buf_, 0, size_); + } + } + private: Allocator *allocator_; void *buf_; @@ -242,6 +250,10 @@ class Image : public BufferBase { bool OnHost() const { return allocator_->OnHost(); } + void Clear() { + MACE_NOT_IMPLEMENTED; + } + private: Allocator *allocator_; std::vector shape_; @@ -322,6 +334,10 @@ class BufferSlice : public BufferBase { bool OnHost() const { return buffer_->OnHost(); } + void Clear() { + MACE_NOT_IMPLEMENTED; + } + private: BufferBase *buffer_; void *mapped_buf_; diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 60eabfdaa0a8de900690e2ef87e9719ead07d639..029c99f1667d6ddac71b778e76672710674d0787 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -93,10 +93,9 @@ extern void Register_Slice(OperatorRegistry *op_registry); extern void Register_Softmax(OperatorRegistry *op_registry); extern void Register_SpaceToBatchND(OperatorRegistry *op_registry); extern void Register_SpaceToDepth(OperatorRegistry *op_registry); +extern void Register_Transpose(OperatorRegistry *op_registry); extern void Register_WinogradInverseTransform(OperatorRegistry *op_registry); extern void Register_WinogradTransform(OperatorRegistry *op_registry); - - } // namespace ops OperatorRegistry::OperatorRegistry() { @@ -130,6 +129,7 @@ OperatorRegistry::OperatorRegistry() { ops::Register_Softmax(this); ops::Register_SpaceToBatchND(this); ops::Register_SpaceToDepth(this); + ops::Register_Transpose(this); ops::Register_WinogradInverseTransform(this); ops::Register_WinogradTransform(this); } diff --git a/mace/core/tensor.h b/mace/core/tensor.h index 53ac3c2e34e65f72d75a5aa518a48f0eeab3ed28..29571d96140c436eb71bfe64128842e297899ee8 100644 --- a/mace/core/tensor.h +++ b/mace/core/tensor.h @@ -146,21 +146,26 @@ class Tensor { template inline const T *data() const { - MACE_CHECK(buffer_ != nullptr, "buffer is null"); + MACE_CHECK_NOTNULL(buffer_); return 
buffer_->data(); } inline void *raw_mutable_data() { - MACE_CHECK(buffer_ != nullptr, "buffer is null"); + MACE_CHECK_NOTNULL(buffer_); return buffer_->raw_mutable_data(); } template inline T *mutable_data() { - MACE_CHECK(buffer_ != nullptr, "buffer is null"); + MACE_CHECK_NOTNULL(buffer_); return static_cast(buffer_->raw_mutable_data()); } + inline void Clear() { + MACE_CHECK_NOTNULL(buffer_); + buffer_->Clear(); + } + inline void Reshape(const std::vector &shape) { shape_ = shape; MACE_CHECK(raw_size() <= buffer_->size()); @@ -258,22 +263,19 @@ class Tensor { inline void DebugPrint() const { using namespace numerical_chars; // NOLINT(build/namespaces) std::stringstream os; + os << "Tensor " << name_ << " size: ["; for (index_t i : shape_) { os << i << ", "; } + os << "], content:\n"; - os.str(""); - os.clear(); - MappingGuard guard(this); for (int i = 0; i < size(); ++i) { - if (i != 0 && i % shape_[3] == 0) { + if (i != 0 && i % shape_.back() == 0) { os << "\n"; } CASES(dtype_, (os << (this->data()[i]) << ", ")); } - LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " << dim(2) - << ", " << dim(3) << "], content:\n" - << os.str(); + LOG(INFO) << os.str(); } class MappingGuard { diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index 1aabb5de70177036c054e4a2f675a0c2abfa42cb..227c99737c1fc766c6a8fe0944ce6ea5b84cacc3 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -21,6 +21,7 @@ Tensor *Workspace::CreateTensor(const std::string &name, VLOG(3) << "Creating Tensor " << name; tensor_map_[name] = std::move(std::unique_ptr(new Tensor(alloc, type))); + tensor_map_[name]->SetSourceOpName(name); } return GetTensor(name); } diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index 8add74d8c979cc5fc01f8401664146c1eb823714..877fa040624b1eae2d48569b8cd23379951495d8 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -11,13 +11,21 @@ load("//mace:mace.bzl", "if_android", "if_neon_enabled", "if_openmp_enabled") cc_library( name = "kernels", - srcs = glob([ - "*.cc", - "opencl/*.cc", - ]), + srcs = glob( + [ + "*.cc", + "opencl/*.cc", + "arm/*.cc", + ], + exclude = [ + "*_test.cc", + "arm/*_test.cc", + ], + ), hdrs = glob([ "*.h", "opencl/*.h", + "arm/*.h", ]), copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]), linkopts = if_android(["-lm"]), @@ -28,14 +36,20 @@ cc_library( ) cc_test( - name = "kernel_test", + name = "kernels_test", testonly = 1, - srcs = glob(["test/*.cc"]), - linkopts = if_android(["-pie"]), + srcs = glob( + [ + "*_test.cc", + "arm/*_test.cc", + ], + ), + copts = if_openmp_enabled(["-fopenmp"]) + if_neon_enabled(["-DMACE_ENABLE_NEON"]), + linkopts = ["-fopenmp"], linkstatic = 1, deps = [ ":kernels", - "//mace/core", + "@gtest//:gtest", "@gtest//:gtest_main", ], ) diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 88840910a586346f4f962e594e02fa6e2e8179d0..5130ccdcb1675e46a653e30634e379f1b0898769 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -134,11 +134,20 @@ class ActivationFunctor { }; template <> -void ActivationFunctor::operator()( - const Tensor *input, - const Tensor *alpha, - Tensor *output, - StatsFuture *future); +class ActivationFunctor { + public: + ActivationFunctor(ActivationType type, float relux_max_limit) + : activation_(type), relux_max_limit_(relux_max_limit) {} + + void operator()(const Tensor *input, + const Tensor *alpha, + Tensor *output, + StatsFuture *future); + + private: + ActivationType activation_; + float relux_max_limit_; 
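+
+  // Illustrative usage only (a sketch, not part of this diff; assumes the
+  // DeviceType::NEON enum used elsewhere in this change): the NEON
+  // specialization keeps the generic functor's call signature, so call
+  // sites need no changes:
+  //   ActivationFunctor<DeviceType::NEON, float> relu(RELUX, 6.f);
+  //   relu(&input, /*alpha=*/nullptr, &output, &future);
+  // The definition lives in mace/kernels/arm/activation.cc below and
+  // dispatches to DoActivation / PReLUActivation.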
+}; template class ActivationFunctor { diff --git a/mace/kernels/arm/activation.cc b/mace/kernels/arm/activation.cc new file mode 100644 index 0000000000000000000000000000000000000000..30f21e03c5333292fd1e44aa71684a2e2cfcbfc7 --- /dev/null +++ b/mace/kernels/arm/activation.cc @@ -0,0 +1,32 @@ +// +// Copyright (c) 2018 XiaoMi All rights reserved. +// + +#include "mace/kernels/activation.h" + +namespace mace { +namespace kernels { + +void ActivationFunctor::operator()( + const Tensor *input, + const Tensor *alpha, + Tensor *output, + StatsFuture *future) { + const float *input_ptr = input->data(); + float *output_ptr = output->mutable_data(); + if (activation_ == PRELU) { + MACE_CHECK_NOTNULL(alpha); + const float *alpha_ptr = alpha->data(); + PReLUActivation(input_ptr, output->size(), input->dim(1), alpha_ptr, + output_ptr); + } else { + DoActivation(input_ptr, output_ptr, output->size(), activation_, + relux_max_limit_); + } +} + +} // namespace kernels +} // namespace mace + + + diff --git a/mace/kernels/arm/batch_norm.cc b/mace/kernels/arm/batch_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..76aa17d023c0f5cb7adf5c2bef402c097944ef24 --- /dev/null +++ b/mace/kernels/arm/batch_norm.cc @@ -0,0 +1,73 @@ +// +// Copyright (c) 2018 XiaoMi All rights reserved. +// + +#include "mace/kernels/batch_norm.h" + +namespace mace { +namespace kernels { + +void BatchNormFunctor::operator()( + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const float epsilon, + Tensor *output, + StatsFuture *future) { + // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . + // The calculation formula for inference is + // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + + // ( \offset - \frac { \scale * mean } { + // \sqrt{var+\variance_epsilon} } + // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } + // new_offset = \offset - mean * common_val; + // Y = new_scale * X + new_offset; + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t height = input->dim(2); + const index_t width = input->dim(3); + + const float *input_ptr = input->data(); + const float *scale_ptr = scale->data(); + const float *offset_ptr = offset->data(); + float *output_ptr = output->mutable_data(); + + std::vector new_scale; + std::vector new_offset; + if (!folded_constant_) { + new_scale.resize(channels); + new_offset.resize(channels); + const float *mean_ptr = mean->data(); + const float *var_ptr = var->data(); +#pragma omp parallel for + for (index_t c = 0; c < channels; ++c) { + new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon); + new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; + } + } + + const float *scale_data = folded_constant_ ? scale_ptr : new_scale.data(); + const float *offset_data = folded_constant_ ? 
offset_ptr : new_offset.data(); + + index_t channel_size = height * width; + index_t batch_size = channels * channel_size; + + // NEON is slower, so stick to the trivial implementation +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + index_t offset = b * batch_size + c * channel_size; + for (index_t hw = 0; hw < height * width; ++hw) { + output_ptr[offset + hw] = + scale_data[c] * input_ptr[offset + hw] + offset_data[c]; + } + } + } + DoActivation(output_ptr, output_ptr, output->size(), activation_, + relux_max_limit_); +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/conv_2d.cc b/mace/kernels/arm/conv_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..ccd17e9a01c76982749c16ce458907d7ddce24da --- /dev/null +++ b/mace/kernels/arm/conv_2d.cc @@ -0,0 +1,345 @@ +// +// Copyright (c) 2018 XiaoMi All rights reserved. +// + +#include "mace/kernels/conv_2d.h" +#include "mace/kernels/arm/conv_winograd.h" + +// Winograd consistently outperformed the direct NEON implementation in benchmarks +#define USE_WINOGRAD 1 + +namespace mace { +namespace kernels { + +namespace { + +void Conv2dNCHW(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + const int filter_height, + const int filter_width, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + float *output) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; ++m) { + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + index_t out_offset = + ((b * out_channels + m) * out_height + h) * out_width + w; + for (index_t c = 0; c < in_channels; ++c) { + for (index_t kh = 0; kh < filter_height; ++kh) { + for (index_t kw = 0; kw < filter_width; ++kw) { + index_t ih = h * stride_h + kh * dilation_h; + index_t iw = w * stride_w + kw * dilation_w; + index_t in_offset = + ((b * in_channels + c) * in_height + ih) * in_width + iw; + index_t filter_offset = + (((m * in_channels) + c) * filter_height + kh) * filter_width + + kw; + output[out_offset] += input[in_offset] * filter[filter_offset]; + } + } + } + } + } + } + } +} + +} // namespace + +extern void Conv2dNeonK1x1S1(const float *input, + const float *filter, + const index_t batch, + const index_t height, + const index_t width, + const index_t in_channels, + const index_t out_channels, + float *output); + +extern void Conv2dNeonK3x3S1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output); + +extern void Conv2dNeonK3x3S2(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output); + +void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future) { + MACE_CHECK_NOTNULL(input); + MACE_CHECK_NOTNULL(filter); + MACE_CHECK_NOTNULL(output); + + std::vector<index_t> output_shape(4); + std::vector<int> paddings(2); + if (paddings_.empty()) { 
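+    // When no explicit padding is given, derive both the output shape and
+    // the total padding from the padding policy. A worked example, assuming
+    // TensorFlow-style SAME semantics: in_height 7, filter 3, stride 1,
+    // dilation 1 => out_height 7 and total vertical padding
+    // (7 - 1) * 1 + (3 - 1) * 1 + 1 - 7 = 2, split further below into
+    // pad_top 1 / pad_bottom 1.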
CalcNCHWPaddingAndOutputSize(input->shape().data(), + filter->shape().data(), + dilations_, + strides_, + padding_type_, + output_shape.data(), + paddings.data()); + } else { + paddings = paddings_; + CalcNCHWOutputSize(input->shape().data(), filter->shape().data(), + paddings_.data(), dilations_, strides_, RoundType::FLOOR, + output_shape.data()); + } + output->Resize(output_shape); + output->Clear(); + + index_t batch = output->dim(0); + index_t channels = output->dim(1); + index_t height = output->dim(2); + index_t width = output->dim(3); + + index_t input_batch = input->dim(0); + index_t input_channels = input->dim(1); + index_t input_height = input->dim(2); + index_t input_width = input->dim(3); + + index_t filter_h = filter->dim(2); + index_t filter_w = filter->dim(3); + MACE_CHECK(filter->dim(0) == channels, filter->dim(0), " != ", channels); + MACE_CHECK(filter->dim(1) == input_channels, filter->dim(1), " != ", + input_channels); + + index_t stride_h = strides_[0]; + index_t stride_w = strides_[1]; + + index_t dilation_h = dilations_[0]; + index_t dilation_w = dilations_[1]; + + MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); + + index_t padded_input_height = input_height + paddings[0]; + index_t padded_input_width = input_width + paddings[1]; + index_t extra_input_height = padded_input_height; + index_t extra_input_width = padded_input_width; + index_t extra_output_height = height; + index_t extra_output_width = width; + + int pad_top = paddings[0] >> 1; + int pad_bottom = paddings[0] - pad_top; + int pad_left = paddings[1] >> 1; + int pad_right = paddings[1] - pad_left; + + std::function conv_func; + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto bias_data = bias == nullptr ? nullptr : bias->data(); + auto output_data = output->mutable_data(); + memset(output_data, 0, sizeof(float) * batch * channels * height * width); + + if (USE_WINOGRAD && filter_h == 3 && filter_w == 3 && stride_h == 1 + && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + extra_output_height = RoundUp(height, 2); + extra_input_height = std::max(padded_input_height, extra_output_height + 2); + extra_output_width = RoundUp(width, 2); + extra_input_width = std::max(padded_input_width, extra_output_width + 2); + if (extra_input_height != padded_input_height) { + pad_bottom += (extra_input_height - padded_input_height); + } + if (extra_input_width != padded_input_width) { + pad_right += (extra_input_width - padded_input_width); + } + + index_t tile_height_count = (extra_output_height + 1) / 2; + index_t tile_width_count = (extra_output_width + 1) / 2; + index_t tile_count = tile_height_count * tile_width_count; + transformed_input_.Resize({16, batch, input_channels, tile_count}); + transformed_filter_.Resize({16, channels, input_channels}); + transformed_output_.Resize({16, batch, channels, tile_count}); + + conv_func = [=](const float *pad_input, float *pad_output) { + WinoGradConv3x3s1(pad_input, + filter_data, + batch, + extra_input_height, + extra_input_width, + input_channels, + channels, + transformed_input_.mutable_data(), + transformed_filter_.mutable_data(), + transformed_output_.mutable_data(), + is_filter_transformed_, + pad_output); + is_filter_transformed_ = true; + }; + } else if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + extra_output_height = RoundUp(height, 2); + extra_input_height = std::max(padded_input_height, extra_output_height + 2); + extra_output_width = 
RoundUp(width, 4); + extra_input_width = std::max(padded_input_width, extra_output_width + 2); + if (extra_input_height != padded_input_height) { + pad_bottom += (extra_input_height - padded_input_height); + } + if (extra_input_width != padded_input_width) { + pad_right += (extra_input_width - padded_input_width); + } + + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK3x3S1(pad_input, + filter_data, + batch, + extra_input_height, + extra_input_width, + input_channels, + extra_output_height, + extra_output_width, + channels, + pad_output); + }; + } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 + && dilation_h == 1 && dilation_w == 1) { + extra_output_height = height; + extra_input_height = + std::max(padded_input_height, (extra_output_height - 1) * 2 + 3); + extra_output_width = RoundUp(width, 4); + extra_input_width = + std::max(padded_input_width, (extra_output_width - 1) * 2 + 3); + if (extra_input_height != padded_input_height) { + pad_bottom += (extra_input_height - padded_input_height); + } + if (extra_input_width != padded_input_width) { + pad_right += (extra_input_width - padded_input_width); + } + + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK3x3S2(pad_input, + filter_data, + batch, + extra_input_height, + extra_input_width, + input_channels, + extra_output_height, + extra_output_width, + channels, + pad_output); + }; + } else if (filter_h == 1 && filter_w == 1 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK1x1S1(input_data, + filter_data, + batch, + height, + width, + input_channels, + channels, + output_data); + }; + } else { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNCHW(pad_input, + filter_data, + batch, + extra_input_height, + extra_input_width, + input_channels, + extra_output_height, + extra_output_width, + channels, + filter_h, + filter_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + pad_output); + }; + } + + const Tensor *pad_input_ptr = input; + // Keep this alive during kernel execution + if (extra_input_height != input_height || extra_input_width != input_width) { + ConstructNCHWInputWithSpecificPadding(input, + pad_top, + pad_bottom, + pad_left, + pad_right, + &padded_input_); + pad_input_ptr = &padded_input_; + } + const float *pad_input_data = pad_input_ptr->data(); + + Tensor *pad_output_ptr = output; + // Keep this alive during kernel execution + if (extra_output_height != height || extra_output_width != width) { + std::vector extra_output_shape + {batch, channels, extra_output_height, extra_output_width}; + padded_output_.Resize(extra_output_shape); + pad_output_ptr = &padded_output_; + } + float *pad_output_data = pad_output_ptr->mutable_data(); + + conv_func(pad_input_data, pad_output_data); + + // unpack output + if (extra_output_height != height || extra_output_width != width) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + for (index_t h = 0; h < height; ++h) { + memcpy( + output_data + b * channels * height * width + c * height * width + + h * width, + pad_output_data + + b * channels * extra_output_height * extra_output_width + + c * extra_output_height * extra_output_width + + h * extra_output_width, + sizeof(float) * width); + } + } + } + } + + if (bias_data != nullptr) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for 
(index_t c = 0; c < channels; ++c) { + for (index_t i = 0; i < height * width; ++i) { + output_data[(b * channels + c) * height * width + i] += bias_data[c]; + } + } + } + } + + DoActivation(output_data, output_data, output->size(), activation_, + relux_max_limit_); +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/conv_2d_neon_1x1.cc b/mace/kernels/arm/conv_2d_neon_1x1.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb08b06189927160981ed21448f7d5672df24bab --- /dev/null +++ b/mace/kernels/arm/conv_2d_neon_1x1.cc @@ -0,0 +1,35 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#include +#endif + +#include "mace/core/types.h" +#include "mace/kernels/gemm.h" + +namespace mace { +namespace kernels { + +void Conv2dNeonK1x1S1(const float *input, + const float *filter, + const index_t batch, + const index_t height, + const index_t width, + const index_t in_channels, + const index_t out_channels, + float *output) { + for (index_t b = 0; b < batch; ++b) { + Gemm(filter, + input + b * in_channels * height * width, + 1, + out_channels, + in_channels, + height * width, + output + b * out_channels * height * width); + } +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/conv_2d_neon_3x3.cc b/mace/kernels/arm/conv_2d_neon_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..cedf44eca2907efebb137cf47d91cd8cd6689d86 --- /dev/null +++ b/mace/kernels/arm/conv_2d_neon_3x3.cc @@ -0,0 +1,406 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#include +#endif + +#include "mace/core/types.h" + +namespace mace { +namespace kernels { + +// Ho = 2, Wo = 4, Co = 2 +void Conv2dNeonK3x3S1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output) { + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; m += 2) { + if (m + 1 < out_channels) { + float *out_ptr0_base = output + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = + output + b * out_batch_size + (m + 1) * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + float *out_ptr0 = out_ptr0_base; + float *out_ptr1 = out_ptr1_base; + + const float *in_ptr0 = input + b * in_batch_size + c * in_image_size; + const float *in_ptr1 = + input + b * in_batch_size + c * in_image_size + 1 * in_width; + const float *in_ptr2 = + input + b * in_batch_size + c * in_image_size + 2 * in_width; + const float *in_ptr3 = + input + b * in_batch_size + c * in_image_size + 3 * in_width; + const float *filter_ptr0 = filter + m * in_channels * 9 + c * 9; + const float *filter_ptr1 = filter + (m + 1) * in_channels * 9 + c * 9; + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + // load filter (4 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + float32x4_t vf10, vf11, vf12; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf02 = vld1q_f32(filter_ptr0 + 6); + + vf10 = 
vld1q_f32(filter_ptr1); + vf11 = vld1q_f32(filter_ptr1 + 3); + vf12 = vld1q_f32(filter_ptr1 + 6); + + + for (index_t h = 0; h + 1 < out_height; h += 2) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input (4 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02; // reg count: 14 + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + float32x4_t vi30, vi31, vi32; + float32x4_t vo20, vo30; // tmp use + + // output (4 outch x 2 height x 4 width): vo_outch_height + float32x4_t vo00, vo01; + float32x4_t vo10, vo11; + + // load input + vi00 = vld1q_f32(in_ptr0); + vo00 = vld1q_f32(in_ptr0 + 4); // reuse vo00: vi0n + vi10 = vld1q_f32(in_ptr1); + vo10 = vld1q_f32(in_ptr1 + 4); + vi20 = vld1q_f32(in_ptr2); + vo20 = vld1q_f32(in_ptr2 + 4); + vi30 = vld1q_f32(in_ptr3); + vo30 = vld1q_f32(in_ptr3 + 4); + + vi01 = vextq_f32(vi00, vo00, 1); + vi02 = vextq_f32(vi00, vo00, 2); + vi11 = vextq_f32(vi10, vo10, 1); + vi12 = vextq_f32(vi10, vo10, 2); + vi21 = vextq_f32(vi20, vo20, 1); + vi22 = vextq_f32(vi20, vo20, 2); + vi31 = vextq_f32(vi30, vo30, 1); + vi32 = vextq_f32(vi30, vo30, 2); + + // load ouptut + vo00 = vld1q_f32(out_ptr0); + vo01 = vld1q_f32(out_ptr0 + out_width); + vo10 = vld1q_f32(out_ptr1); + vo11 = vld1q_f32(out_ptr1 + out_width); + + // outch 0, height 0 + vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18 + vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); + vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); + vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); + vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0); + vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1); + vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); + vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); + vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); + vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0); + vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1); + vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2); + + // outch 1, height 0 + vo10 = vfmaq_laneq_f32(vo10, vi00, vf10, 0); + vo10 = vfmaq_laneq_f32(vo10, vi01, vf10, 1); + vo10 = vfmaq_laneq_f32(vo10, vi02, vf10, 2); + vo10 = vfmaq_laneq_f32(vo10, vi10, vf11, 0); + vo10 = vfmaq_laneq_f32(vo10, vi11, vf11, 1); + vo10 = vfmaq_laneq_f32(vo10, vi12, vf11, 2); + vo10 = vfmaq_laneq_f32(vo10, vi20, vf12, 0); + vo10 = vfmaq_laneq_f32(vo10, vi21, vf12, 1); + vo10 = vfmaq_laneq_f32(vo10, vi22, vf12, 2); + + // outch 1, height 1 + vo11 = vfmaq_laneq_f32(vo11, vi10, vf10, 0); + vo11 = vfmaq_laneq_f32(vo11, vi11, vf10, 1); + vo11 = vfmaq_laneq_f32(vo11, vi12, vf10, 2); + vo11 = vfmaq_laneq_f32(vo11, vi20, vf11, 0); + vo11 = vfmaq_laneq_f32(vo11, vi21, vf11, 1); + vo11 = vfmaq_laneq_f32(vo11, vi22, vf11, 2); + vo11 = vfmaq_laneq_f32(vo11, vi30, vf12, 0); + vo11 = vfmaq_laneq_f32(vo11, vi31, vf12, 1); + vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2); + + vst1q_f32(out_ptr0, vo00); + vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr1, vo10); + vst1q_f32(out_ptr1 + out_width, vo11); + + in_ptr0 += 4; + in_ptr1 += 4; + in_ptr2 += 4; + in_ptr3 += 4; + + out_ptr0 += 4; + out_ptr1 += 4; + } // w + + in_ptr0 += 2 + in_width; + in_ptr1 += 2 + in_width; + in_ptr2 += 2 + in_width; + in_ptr3 += 2 + in_width; + + out_ptr0 += out_width; + out_ptr1 += out_width; + } // h +#else + for (index_t io = 0; io < 2; 
++io) { + for (index_t ih = 0; ih < out_height; ++ih) { + for (index_t iw = 0; iw < out_width; ++iw) { + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + out_ptr0[io * out_image_size + ih * out_width + iw] += + in_ptr0[(ih + i) * in_width + (iw + j)] + * filter_ptr0[io * in_channels * 9 + i * 3 + j]; + } + } + } + } + } // for +#endif + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float + *out_ptr0_base = output + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + float *out_ptr0 = out_ptr0_base; + + const float + *in_ptr0 = input + b * in_batch_size + c * in_image_size; + const float *in_ptr1 = + input + b * in_batch_size + c * in_image_size + 1 * in_width; + const float *in_ptr2 = + input + b * in_batch_size + c * in_image_size + 2 * in_width; + const float *in_ptr3 = + input + b * in_batch_size + c * in_image_size + 3 * in_width; + const float *filter_ptr0 = filter + mm * in_channels * 9 + c * 9; + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr0); + vf01 = vld1q_f32(filter_ptr0 + 3); + vf02 = vld1q_f32(filter_ptr0 + 6); + + for (index_t h = 0; h + 1 < out_height; h += 2) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input (4 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02, vi0n; + float32x4_t vi10, vi11, vi12, vi1n; + float32x4_t vi20, vi21, vi22, vi2n; + float32x4_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 4 width): vo_outch_height + float32x4_t vo00, vo01; + + // load input + vi00 = vld1q_f32(in_ptr0); + vi0n = vld1q_f32(in_ptr0 + 4); + vi10 = vld1q_f32(in_ptr1); + vi1n = vld1q_f32(in_ptr1 + 4); + vi20 = vld1q_f32(in_ptr2); + vi2n = vld1q_f32(in_ptr2 + 4); + vi30 = vld1q_f32(in_ptr3); + vi3n = vld1q_f32(in_ptr3 + 4); + + vi01 = vextq_f32(vi00, vi0n, 1); + vi02 = vextq_f32(vi00, vi0n, 2); + vi11 = vextq_f32(vi10, vi1n, 1); + vi12 = vextq_f32(vi10, vi1n, 2); + vi21 = vextq_f32(vi20, vi2n, 1); + vi22 = vextq_f32(vi20, vi2n, 2); + vi31 = vextq_f32(vi30, vi3n, 1); + vi32 = vextq_f32(vi30, vi3n, 2); + + // load ouptut + vo00 = vld1q_f32(out_ptr0); + vo01 = vld1q_f32(out_ptr0 + out_width); + + // outch 0, height 0 + vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); + vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); + vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); + vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0); + vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1); + vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); + vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); + vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); + vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0); + vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1); + vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2); + + vst1q_f32(out_ptr0, vo00); + vst1q_f32(out_ptr0 + out_width, vo01); + + + in_ptr0 += 4; + in_ptr1 += 4; + in_ptr2 += 4; + in_ptr3 += 4; + + out_ptr0 += 4; + } // w + + in_ptr0 += 2 + in_width; + in_ptr1 += 2 + in_width; + in_ptr2 += 2 + in_width; + in_ptr3 += 2 + in_width; + + out_ptr0 += out_width; + } // h +#else + for (index_t ih = 0; ih < out_height; ++ih) { + 
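+            // Scalar fallback for the tail output channel (a reading aid,
+            // mirroring the NEON path above): each output pixel accumulates
+            // the nine 3x3 taps directly,
+            //   out[ih][iw] += sum over i, j in [0, 3) of
+            //                  in[ih + i][iw + j] * filter[i * 3 + j]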
for (index_t iw = 0; iw < out_width; ++iw) { + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + out_ptr0[ih * out_width + iw] += + in_ptr0[(ih + i) * in_width + (iw + j)] + * filter_ptr0[i * 3 + j]; + } + } + } + } +#endif + } // c + } // mm + } // if + } // m + } // b +} + +void Conv2dNeonK3x3S2(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + float *output) { + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; ++m) { + for (index_t c = 0; c < in_channels; ++c) { + const float *in_base = input + b * in_batch_size + c * in_image_size; + const float + *filter_ptr = filter + m * in_channels * 9 + c * 9; + float *out_base = output + b * out_batch_size + m * out_image_size; + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr); + vf01 = vld1q_f32(filter_ptr + 3); + vf02 = vld1q_f32(filter_ptr + 6); + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + float32x4x2_t vi0, vi1, vi2; + float32x4_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02; + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 4 width): vo + float32x4_t vo; + + // load input + index_t in_h = h * 2; + index_t in_w = w * 2; + index_t in_offset = in_h * in_width + in_w; + vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f32(in_base + in_offset + in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + + vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f32(in_base + in_offset + in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + + // load ouptut + index_t out_offset = h * out_width + w; + vo = vld1q_f32(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f32(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f32(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); + vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); + vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); + vo = vfmaq_laneq_f32(vo, vi20, vf02, 0); + vo = vfmaq_laneq_f32(vo, vi21, vf02, 1); + vo = vfmaq_laneq_f32(vo, vi22, vf02, 2); + + vst1q_f32(out_base + out_offset, vo); + } // w + } // h +#else + for (index_t ih = 0; ih < out_height; ++ih) { + for (index_t iw = 0; iw < out_width; ++iw) { + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + out_base[ih * out_width + iw] += + in_base[(ih * 2 + i) * in_width + (iw * 2 + j)] + * filter_ptr[i * 3 + j]; + } + } + } + } +#endif + } // c + } // m + } // b +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/conv_winograd.cc 
b/mace/kernels/arm/conv_winograd.cc new file mode 100644 index 0000000000000000000000000000000000000000..c0509689f832d702ed23afdbd29c5b53abf42773 --- /dev/null +++ b/mace/kernels/arm/conv_winograd.cc @@ -0,0 +1,404 @@ +// +// Copyright (c) 2018 XiaoMi All rights reserved. +// + +#include +#include + +#include "mace/kernels/arm/conv_winograd.h" +#include "mace/kernels/gemm.h" +#include "mace/utils/utils.h" + +namespace mace { +namespace kernels { + +namespace { +// NCHW => TNCB (T: in tile pixels, B: tile indices) +void TransformInput(const float *input, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t tile_count, + float *output) { + const index_t stride = batch * in_channels * tile_count; + const index_t in_height_width = in_height * in_width; + +#pragma omp parallel for + for (index_t nc = 0; nc < batch * in_channels; ++nc) { + index_t tile_index = nc * tile_count; + for (index_t h = 0; h < in_height - 2; h += 2) { + for (index_t w = 0; w < in_width - 2; w += 2) { + float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, + d15; + float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + // load tile data + const index_t tile_offset = nc * in_height_width + h * in_width + w; + d0 = input[tile_offset]; + d1 = input[tile_offset + 1]; + d2 = input[tile_offset + 2]; + d3 = input[tile_offset + 3]; + + d4 = input[tile_offset + in_width]; + d5 = input[tile_offset + in_width + 1]; + d6 = input[tile_offset + in_width + 2]; + d7 = input[tile_offset + in_width + 3]; + + d8 = input[tile_offset + 2 * in_width]; + d9 = input[tile_offset + 2 * in_width + 1]; + d10 = input[tile_offset + 2 * in_width + 2]; + d11 = input[tile_offset + 2 * in_width + 3]; + + d12 = input[tile_offset + 3 * in_width]; + d13 = input[tile_offset + 3 * in_width + 1]; + d14 = input[tile_offset + 3 * in_width + 2]; + d15 = input[tile_offset + 3 * in_width + 3]; + + // s = BT * d * B + s0 = (d0 - d8) - (d2 - d10); + s1 = (d1 - d9) + (d2 - d10); + s2 = (d2 - d10) - (d1 - d9); + s3 = (d1 - d9) - (d3 - d11); + s4 = (d4 + d8) - (d6 + d10); + s5 = (d5 + d9) + (d6 + d10); + s6 = (d6 + d10) - (d5 + d9); + s7 = (d5 + d9) - (d7 + d11); + s8 = (d8 - d4) - (d10 - d6); + s9 = (d9 - d5) + (d10 - d6); + s10 = (d10 - d6) - (d9 - d5); + s11 = (d9 - d5) - (d11 - d7); + s12 = (d4 - d12) - (d6 - d14); + s13 = (d5 - d13) + (d6 - d14); + s14 = (d6 - d14) - (d5 - d13); + s15 = (d5 - d13) - (d7 - d15); + + // store output + output[tile_index + 0 * stride] = s0; + output[tile_index + 1 * stride] = s1; + output[tile_index + 2 * stride] = s2; + output[tile_index + 3 * stride] = s3; + + output[tile_index + 4 * stride] = s4; + output[tile_index + 5 * stride] = s5; + output[tile_index + 6 * stride] = s6; + output[tile_index + 7 * stride] = s7; + + output[tile_index + 8 * stride] = s8; + output[tile_index + 9 * stride] = s9; + output[tile_index + 10 * stride] = s10; + output[tile_index + 11 * stride] = s11; + + output[tile_index + 12 * stride] = s12; + output[tile_index + 13 * stride] = s13; + output[tile_index + 14 * stride] = s14; + output[tile_index + 15 * stride] = s15; + + ++tile_index; + } + } + } +} + +// OCHW => TOC +// no need to optimize, it will exist in converter +void TransformFilter(const float *filter, + const index_t in_channels, + const index_t out_channels, + float *output) { + const index_t stride = out_channels * in_channels; + +#pragma omp parallel for collapse(2) + for (index_t m = 0; m < out_channels; ++m) { + for (index_t c = 0; 
c < in_channels; ++c) { + float g0, g1, g2, g3, g4, g5, g6, g7, g8; + float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, + s15; + + // load filter + index_t filter_offset = (m * in_channels + c) * 9; + g0 = filter[filter_offset]; + g1 = filter[filter_offset + 1]; + g2 = filter[filter_offset + 2]; + g3 = filter[filter_offset + 3]; + g4 = filter[filter_offset + 4]; + g5 = filter[filter_offset + 5]; + g6 = filter[filter_offset + 6]; + g7 = filter[filter_offset + 7]; + g8 = filter[filter_offset + 8]; + + // s = G * g * GT + s0 = g0; + s1 = (g0 + g2 + g1) * 0.5f; + s2 = (g0 + g2 - g1) * 0.5f; + s3 = g2; + s4 = (g0 + g6 + g3) * 0.5f; + s5 = ((g0 + g6 + g3) + (g2 + g8 + g5) + (g1 + g7 + g4)) * 0.25f; + s6 = ((g0 + g6 + g3) + (g2 + g8 + g5) - (g1 + g7 + g4)) * 0.25f; + s7 = (g2 + g8 + g5) * 0.5f; + s8 = (g0 + g6 - g3) * 0.5f; + s9 = ((g0 + g6 - g3) + (g2 + g8 - g5) + (g1 + g7 - g4)) * 0.25f; + s10 = ((g0 + g6 - g3) + (g2 + g8 - g5) - (g1 + g7 - g4)) * 0.25f; + s11 = (g2 + g8 - g5) * 0.5f; + s12 = g6; + s13 = (g6 + g8 + g7) * 0.5f; + s14 = (g6 + g8 - g7) * 0.5f; + s15 = g8; + + // store output + index_t output_offset = m * in_channels + c; + output[output_offset + 0 * stride] = s0; + output[output_offset + 1 * stride] = s1; + output[output_offset + 2 * stride] = s2; + output[output_offset + 3 * stride] = s3; + + output[output_offset + 4 * stride] = s4; + output[output_offset + 5 * stride] = s5; + output[output_offset + 6 * stride] = s6; + output[output_offset + 7 * stride] = s7; + + output[output_offset + 8 * stride] = s8; + output[output_offset + 9 * stride] = s9; + output[output_offset + 10 * stride] = s10; + output[output_offset + 11 * stride] = s11; + + output[output_offset + 12 * stride] = s12; + output[output_offset + 13 * stride] = s13; + output[output_offset + 14 * stride] = s14; + output[output_offset + 15 * stride] = s15; + } + } +} + +// TOC * TNCB => TNOB +void BatchGemm(const float *input, + const float *filter, + index_t batch, + index_t in_channels, + index_t out_channels, + index_t tile_count, + float *output) { + const index_t in_stride = batch * in_channels * tile_count; + const index_t in_channels_tile_count = in_channels * tile_count; + const index_t filter_stride = out_channels * in_channels; + const index_t out_stride = batch * out_channels * tile_count; + const index_t out_channels_tile_count = out_channels * tile_count; + + if (batch == 1) { + Gemm(filter, input, 16, out_channels, in_channels, tile_count, output); + } else { + for (int i = 0; i < 16; ++i) { + for (int b = 0; b < batch; ++b) { + const float + *in_ptr = input + i * in_stride + b * in_channels_tile_count; + const float *filter_ptr = filter + i * filter_stride; + float *out_ptr = output + i * out_stride + b * out_channels_tile_count; + Gemm(filter_ptr, + in_ptr, + 1, + out_channels, /* rows */ + in_channels, /* K */ + tile_count, /* cols */ + out_ptr); + } + } + } +} + +// TNOB => ToNOB => NOHoWo +void TransformOutput(const float *input, + index_t batch, + index_t out_height, + index_t out_width, + index_t out_channels, + index_t tile_count, + float *output) { + const index_t in_stride = batch * out_channels * tile_count; + +#pragma omp parallel for + for (index_t nm = 0; nm < batch * out_channels; ++nm) { + index_t tile_offset = nm * tile_count; + for (index_t h = 0; h < out_height; h += 2) { + for (index_t w = 0; w < out_width; w += 2) { + float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, + d15; + float s0, s1, s2, s3, s4, s5, s6, s7; + float v0, v1, v2, v3; + + d0 = 
input[tile_offset + 0 * in_stride]; + d1 = input[tile_offset + 1 * in_stride]; + d2 = input[tile_offset + 2 * in_stride]; + d3 = input[tile_offset + 3 * in_stride]; + + d4 = input[tile_offset + 4 * in_stride]; + d5 = input[tile_offset + 5 * in_stride]; + d6 = input[tile_offset + 6 * in_stride]; + d7 = input[tile_offset + 7 * in_stride]; + + d8 = input[tile_offset + 8 * in_stride]; + d9 = input[tile_offset + 9 * in_stride]; + d10 = input[tile_offset + 10 * in_stride]; + d11 = input[tile_offset + 11 * in_stride]; + + d12 = input[tile_offset + 12 * in_stride]; + d13 = input[tile_offset + 13 * in_stride]; + d14 = input[tile_offset + 14 * in_stride]; + d15 = input[tile_offset + 15 * in_stride]; + + s0 = d0 + d1 + d2; + s1 = d1 - d2 - d3; + s2 = d4 + d5 + d6; + s3 = d5 - d6 - d7; + s4 = d8 + d9 + d10; + s5 = d9 - d10 - d11; + s6 = d12 + d13 + d14; + s7 = d13 - d14 - d15; + + v0 = s0 + s2 + s4; + v1 = s1 + s3 + s5; + v2 = s2 - s4 - s6; + v3 = s3 - s5 - s7; + + index_t out_offset = nm * out_height * out_width + h * out_width + w; + output[out_offset] = v0; + output[out_offset + 1] = v1; + output[out_offset + out_width] = v2; + output[out_offset + out_width + 1] = v3; + + ++tile_offset; + } + } + } +} + +void ConvRef3x3s1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_channels, + float *output) { + index_t out_height = in_height - 2; + index_t out_width = in_width - 2; + +#pragma omp parallel for collapse(4) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; ++m) { + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + index_t out_offset = + ((b * out_channels + m) * out_height + h) * out_width + w; + output[out_offset] = 0; + for (index_t c = 0; c < in_channels; ++c) { + for (index_t kh = 0; kh < 3; ++kh) { + for (index_t kw = 0; kw < 3; ++kw) { + index_t ih = h + kh; + index_t iw = w + kw; + index_t in_offset = + ((b * in_channels + c) * in_height + ih) * in_width + iw; + index_t + filter_offset = (((m * in_channels) + c) * 3 + kh) * 3 + kw; + output[out_offset] += input[in_offset] * filter[filter_offset]; + } + } + } + } + } + } + } +} +} // namespace + +void WinoGradConv3x3s1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_channels, + float *transformed_input, + float *transformed_filter, + float *transformed_output, + bool is_filter_transformed, + float *output) { + index_t out_height = in_height - 2; + index_t out_width = in_width - 2; + index_t tile_height_count = (out_height + 1) / 2; + index_t tile_width_count = (out_width + 1) / 2; + index_t tile_count = tile_height_count * tile_width_count; + + TransformInput(input, + batch, + in_height, + in_width, + in_channels, + tile_count, + transformed_input); + + // TODO(liyin): put it in model converter, but do not worry, it is fast and + // will only do once + if (!is_filter_transformed) { + TransformFilter(filter, in_channels, out_channels, transformed_filter); + } + + BatchGemm(transformed_input, + transformed_filter, + batch, + in_channels, + out_channels, + tile_count, + transformed_output); + + TransformOutput(transformed_output, + batch, + out_height, + out_width, + out_channels, + tile_count, + output); +} + +void WinoGradConv3x3s1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t 
in_width, + const index_t in_channels, + const index_t out_channels, + float *output) { + index_t out_height = in_height - 2; + index_t out_width = in_width - 2; + index_t tile_height_count = (out_height + 1) / 2; + index_t tile_width_count = (out_width + 1) / 2; + index_t tile_count = tile_height_count * tile_width_count; + + index_t transformed_input_size = 16 * batch * in_channels * tile_count; + index_t transformed_filter_size = 16 * out_channels * in_channels; + index_t transformed_output_size = 16 * batch * out_channels * tile_count; + + float *transformed_input = new float[transformed_input_size]; // TNCB + float *transformed_filter = new float[transformed_filter_size]; // TOC + float *transformed_output = new float[transformed_output_size]; + + WinoGradConv3x3s1(input, + filter, + batch, + in_height, + in_width, + in_channels, + out_channels, + transformed_input, + transformed_filter, + transformed_output, + false, + output); + + delete[]transformed_input; + delete[]transformed_filter; + delete[]transformed_output; +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/conv_winograd.h b/mace/kernels/arm/conv_winograd.h new file mode 100644 index 0000000000000000000000000000000000000000..7611d65ae5e2a57b4542df40bc4d6bef3d04538d --- /dev/null +++ b/mace/kernels/arm/conv_winograd.h @@ -0,0 +1,42 @@ +// +// Copyright (c) 2018 XiaoMi All rights reserved. +// + +#ifndef MACE_KERNELS_ARM_CONV_WINOGRAD_H_ +#define MACE_KERNELS_ARM_CONV_WINOGRAD_H_ + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#include +#endif + +#include "mace/core/types.h" + +namespace mace { +namespace kernels { + +void WinoGradConv3x3s1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_channels, + float *output); + +void WinoGradConv3x3s1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_channels, + float *transformed_input, + float *transformed_filter, + float *transformed_output, + bool is_filter_transformed, + float *output); + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_ARM_CONV_WINOGRAD_H_ diff --git a/mace/kernels/arm/conv_winograd_test.cc b/mace/kernels/arm/conv_winograd_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..52be053bd9e5cb2d57a5c2d3c1b1fce322752996 --- /dev/null +++ b/mace/kernels/arm/conv_winograd_test.cc @@ -0,0 +1,75 @@ +// +// Copyright (c) 2018 XiaoMi All rights reserved. 
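+//
+// Reference for reading this test: WinoGradConv3x3s1 implements Winograd
+// F(2x2, 3x3). Each 4x4 input tile d is transformed as BT * d * B, each
+// 3x3 filter g as G * g * GT, and each 4x4 product tile m collapses to a
+// 2x2 output tile as AT * m * A, with the standard matrices
+//   BT = [ 1  0 -1  0 ;  0  1  1  0 ;  0 -1  1  0 ;  0  1  0 -1 ]
+//   G  = [ 1  0  0 ;  0.5  0.5  0.5 ;  0.5 -0.5  0.5 ;  0  0  1 ]
+//   AT = [ 1  1  1  0 ;  0  1 -1 -1 ]
+// which match the scalar expressions in conv_winograd.cc above.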
+// + +#include <gtest/gtest.h> +#include <random> +#include <algorithm> + +#include "mace/kernels/arm/conv_winograd.h" +#include "mace/core/types.h" + +namespace mace { +namespace kernels { + +TEST(ConvWinogradTest, winograd) { + index_t batch = 1; + index_t in_height = 32; + index_t in_width = 32; + index_t in_channels = 64; + index_t out_channels = 128; + + index_t out_height = in_height - 2; + index_t out_width = in_width - 2; + index_t input_size = batch * in_channels * in_height * in_width; + index_t filter_size = 3 * 3 * in_channels * out_channels; + index_t output_size = batch * out_channels * out_height * out_width; + + float *input_data = new float[input_size]; + float *filter_data = new float[filter_size]; + float *output_data = new float[output_size]; + float *output_data_ref = new float[output_size]; + + std::random_device rd; + std::mt19937 gen(rd()); + std::normal_distribution<float> nd(0, 1); + std::generate(input_data, input_data + input_size, + [&gen, &nd] { + return std::max(-1.0f, std::min(1.0f, nd(gen))); + }); + std::generate(filter_data, filter_data + filter_size, + [&gen, &nd] { + return std::max(-1.0f, std::min(1.0f, nd(gen))); + }); + + kernels::ConvRef3x3s1(input_data, + filter_data, + batch, + in_height, + in_width, + in_channels, + out_channels, + output_data_ref); + + kernels::WinoGradConv3x3s1(input_data, + filter_data, + batch, + in_height, + in_width, + in_channels, + out_channels, + output_data); + + // test + for (index_t i = 0; i < output_size; ++i) { + EXPECT_NEAR(output_data_ref[i], output_data[i], 0.1); + } + + delete[]input_data; + delete[]filter_data; + delete[]output_data; + delete[]output_data_ref; +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/depthwise_conv2d.cc b/mace/kernels/arm/depthwise_conv2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..3920ed08b3150f07b5189058dfe5ae751cbbd8b5 --- /dev/null +++ b/mace/kernels/arm/depthwise_conv2d.cc @@ -0,0 +1,258 @@ +// +// Copyright (c) 2018 XiaoMi All rights reserved. 
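+//
+// Reading aid for the depth-multiplier indexing below: with
+// M = out_channels / in_channels, output channel m reads input channel
+// c = m / M and filter slice o = m % M. For example, in_channels 3 with
+// M = 2 gives out_channels 6, and m = 5 maps to c = 2, o = 1.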
+// + +#include "mace/kernels/depthwise_conv2d.h" +#include "mace/kernels/activation.h" + +namespace mace { +namespace kernels { + +namespace { + +void DepthwiseConv2dNCHW(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + const int filter_height, + const int filter_width, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int pad_top, + const int pad_left, + float *output) { + const index_t multiplier = out_channels / in_channels; +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; ++m) { + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + index_t out_offset = + ((b * out_channels + m) * out_height + h) * out_width + w; + index_t c = m / multiplier; + index_t o = m % multiplier; + float sum = 0; + for (index_t kh = 0; kh < filter_height; ++kh) { + for (index_t kw = 0; kw < filter_width; ++kw) { + index_t ih = h * stride_h + kh * dilation_h - pad_top; + index_t iw = w * stride_w + kw * dilation_w - pad_left; + if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { + index_t in_offset = + ((b * in_channels + c) * in_height + ih) * in_width + iw; + index_t filter_offset = + (((o * in_channels) + c) * filter_height + kh) * filter_width + + kw; + + sum += input[in_offset] * filter[filter_offset]; + } + } + } + output[out_offset] = sum; + } + } + } + } +} +} // namespace + +extern void DepthwiseConv2dNeonK3x3S1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + const int pad_top, + const int pad_left, + const int valid_h_start, + const int valid_h_stop, + const int valid_w_start, + const int valid_w_stop, + float *output); + +void DepthwiseConv2dNeonK3x3S2(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + const int pad_top, + const int pad_left, + const int valid_h_start, + const int valid_h_stop, + const int valid_w_start, + const int valid_w_stop, + float *output); + +void DepthwiseConv2dFunctor::operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future) { + MACE_CHECK_NOTNULL(input); + MACE_CHECK_NOTNULL(filter); + MACE_CHECK_NOTNULL(output); + + std::vector output_shape(4); + std::vector paddings(2); + std::vector filter_shape + {filter->dim(0) * filter->dim(1), filter->dim(1), filter->dim(2), + filter->dim(3)}; + + if (paddings_.empty()) { + CalcNCHWPaddingAndOutputSize(input->shape().data(), + filter_shape.data(), + dilations_, + strides_, + padding_type_, + output_shape.data(), + paddings.data()); + } else { + paddings = paddings_; + CalcNCHWOutputSize(input->shape().data(), filter_shape.data(), + paddings_.data(), dilations_, strides_, RoundType::FLOOR, + output_shape.data()); + } + output->Resize(output_shape); + output->Clear(); + + index_t batch = output->dim(0); + index_t channels = output->dim(1); + index_t height = output->dim(2); + index_t width = output->dim(3); + + index_t input_batch = input->dim(0); + index_t 
input_channels = input->dim(1); + index_t input_height = input->dim(2); + index_t input_width = input->dim(3); + + index_t filter_h = filter_shape[2]; + index_t filter_w = filter_shape[3]; + MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels); + MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ", + input_channels); + + index_t stride_h = strides_[0]; + index_t stride_w = strides_[1]; + + index_t dilation_h = dilations_[0]; + index_t dilation_w = dilations_[1]; + + MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); + + int pad_top = paddings[0] >> 1; + int pad_bottom = paddings[0] - pad_top; + int pad_left = paddings[1] >> 1; + int pad_right = paddings[1] - pad_left; + + int valid_h_start = pad_top == 0 ? 0 : (pad_top - 1) / stride_h + 1; + int valid_h_stop = pad_bottom == 0 + ? height + : height - ((pad_bottom - 1) / stride_h + 1); + int valid_w_start = pad_left == 0 ? 0 : (pad_left - 1) / stride_w + 1; + int valid_w_stop = pad_right == 0 + ? width + : width - ((pad_right - 1) / stride_w + 1); + + std::function conv_func; + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto bias_data = bias == nullptr ? nullptr : bias->data(); + auto output_data = output->mutable_data(); + + if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 + && dilation_h == 1 && dilation_w == 1) { + conv_func = [=](const float *input, float *output) { + DepthwiseConv2dNeonK3x3S1(input, + filter_data, + batch, + input_height, + input_width, + input_channels, + height, + width, + channels, + pad_top, + pad_left, + valid_h_start, + valid_h_stop, + valid_w_start, + valid_w_stop, + output); + }; + } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 + && dilation_h == 1 && dilation_w == 1) { + conv_func = [=](const float *input, float *output) { + DepthwiseConv2dNeonK3x3S2(input, + filter_data, + batch, + input_height, + input_width, + input_channels, + height, + width, + channels, + pad_top, + pad_left, + valid_h_start, + valid_h_stop, + valid_w_start, + valid_w_stop, + output); + }; + } else { + conv_func = [=](const float *input, float *output) { + DepthwiseConv2dNCHW(input, + filter_data, + batch, + input_height, + input_width, + input_channels, + height, + width, + channels, + filter_h, + filter_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + pad_top, + pad_left, + output); + }; + } + + conv_func(input_data, output_data); + + if (bias_data != nullptr) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + for (index_t i = 0; i < height * width; ++i) { + output_data[(b * channels + c) * height * width + i] += bias_data[c]; + } + } + } + } + + DoActivation(output_data, output_data, output->size(), activation_, + relux_max_limit_); +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ce4c68c66c42aba7d7e4044eff0ef266345d24c --- /dev/null +++ b/mace/kernels/arm/depthwise_conv2d_neon_3x3.cc @@ -0,0 +1,442 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. 
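+//
+// Reading aid: the kernels below split each output map into a border
+// region, handled per pixel by DepthwiseConv2dPixel with bounds checks,
+// and an interior region [valid_h_start, valid_h_stop) x
+// [valid_w_start, valid_w_stop) where every 3x3 window lies fully inside
+// the input, so the NEON path can load without checks. For example,
+// pad_top 1 with stride 1 gives valid_h_start = (1 - 1) / 1 + 1 = 1.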
+// + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#include +#endif + +#include "mace/core/types.h" + +namespace mace { +namespace kernels { + +namespace { +void DepthwiseConv2dPixel(const float *in_base, + const float *filter, + const index_t out_h, + const index_t out_w, + const index_t in_h_start, + const index_t in_w_start, + const index_t out_width, + const index_t in_height, + const index_t in_width, + int filter_height, + int filter_width, + float *out_base) { + float sum = 0; + for (int i = 0; i < filter_height; ++i) { + for (int j = 0; j < filter_width; ++j) { + index_t in_h = in_h_start + i; + index_t in_w = in_w_start + j; + if (in_h >= 0 && in_h < in_height && in_w >= 0 && in_w < in_width) { + sum += in_base[in_h * in_width + in_w] * filter[i * filter_width + j]; + } + } + } + out_base[out_h * out_width + out_w] = sum; +} +} // namespace + +// Ho = 2, Wo = 4, Co = 1 +void DepthwiseConv2dNeonK3x3S1(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + const int pad_top, + const int pad_left, + const int valid_h_start, + const int valid_h_stop, + const int valid_w_start, + const int valid_w_stop, + float *output) { + const index_t multiplier = out_channels / in_channels; + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; ++m) { + index_t c = m / multiplier; + index_t multi_index = m % multiplier; + const float *in_base = input + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9; + float *out_base = output + b * out_batch_size + m * out_image_size; + index_t h, w; + + // top + for (h = 0; h < valid_h_start; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr); + vf01 = vld1q_f32(filter_ptr + 3); + vf02 = vld1q_f32(filter_ptr + 6); + + for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { + // left + for (w = 0; w < valid_w_start; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + DepthwiseConv2dPixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + + for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + // input (4 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02, vi0n; + float32x4_t vi10, vi11, vi12, vi1n; + float32x4_t vi20, vi21, vi22, vi2n; + float32x4_t vi30, vi31, vi32, vi3n; + + // output (1 outch x 2 height x 4 width): vo_outch_height + float32x4_t vo00, vo01; + + // load input + index_t in_h = h - pad_top; + index_t in_w = w - pad_left; + index_t in_offset = in_h * in_width + in_w; + vi00 = vld1q_f32(in_base + in_offset); + vi0n = vld1q_f32(in_base + in_offset + 4); + vi10 = 
vld1q_f32(in_base + in_offset + in_width); + vi1n = vld1q_f32(in_base + in_offset + in_width + 4); + vi20 = vld1q_f32(in_base + in_offset + 2 * in_width); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4); + vi30 = vld1q_f32(in_base + in_offset + 3 * in_width); + vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4); + + vi01 = vextq_f32(vi00, vi0n, 1); + vi02 = vextq_f32(vi00, vi0n, 2); + vi11 = vextq_f32(vi10, vi1n, 1); + vi12 = vextq_f32(vi10, vi1n, 2); + vi21 = vextq_f32(vi20, vi2n, 1); + vi22 = vextq_f32(vi20, vi2n, 2); + vi31 = vextq_f32(vi30, vi3n, 1); + vi32 = vextq_f32(vi30, vi3n, 2); + + // load ouptut + index_t out_offset = h * out_width + w; + vo00 = vld1q_f32(out_base + out_offset); + vo01 = vld1q_f32(out_base + out_offset + out_width); + + // outch 0, height 0 + vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); + vo00 = vfmaq_laneq_f32(vo00, vi01, vf00, 1); + vo00 = vfmaq_laneq_f32(vo00, vi02, vf00, 2); + vo00 = vfmaq_laneq_f32(vo00, vi10, vf01, 0); + vo00 = vfmaq_laneq_f32(vo00, vi11, vf01, 1); + vo00 = vfmaq_laneq_f32(vo00, vi12, vf01, 2); + vo00 = vfmaq_laneq_f32(vo00, vi20, vf02, 0); + vo00 = vfmaq_laneq_f32(vo00, vi21, vf02, 1); + vo00 = vfmaq_laneq_f32(vo00, vi22, vf02, 2); + + // outch 0, height 1 + vo01 = vfmaq_laneq_f32(vo01, vi10, vf00, 0); + vo01 = vfmaq_laneq_f32(vo01, vi11, vf00, 1); + vo01 = vfmaq_laneq_f32(vo01, vi12, vf00, 2); + vo01 = vfmaq_laneq_f32(vo01, vi20, vf01, 0); + vo01 = vfmaq_laneq_f32(vo01, vi21, vf01, 1); + vo01 = vfmaq_laneq_f32(vo01, vi22, vf01, 2); + vo01 = vfmaq_laneq_f32(vo01, vi30, vf02, 0); + vo01 = vfmaq_laneq_f32(vo01, vi31, vf02, 1); + vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 2); + + vst1q_f32(out_base + out_offset, vo00); + vst1q_f32(out_base + out_offset + out_width, vo01); + } // w + + // right + for (; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + DepthwiseConv2dPixel(in_base, + filter_ptr, + h + 1, + w, + h + 1 - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } // h +#else + for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { + for (index_t iw = 0; iw < out_width; ++iw) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + ih, + iw, + ih - pad_top, + iw - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } +#endif + + // bottom + for (; h < out_height; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h - pad_top, + w - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } + } // m + } // b +} + +void DepthwiseConv2dNeonK3x3S2(const float *input, + const float *filter, + const index_t batch, + const index_t in_height, + const index_t in_width, + const index_t in_channels, + const index_t out_height, + const index_t out_width, + const index_t out_channels, + const int pad_top, + const int pad_left, + const int valid_h_start, + const int valid_h_stop, + const int valid_w_start, + const int valid_w_stop, + float *output) { + const index_t multiplier = out_channels / in_channels; + const index_t in_image_size = in_height * in_width; + const index_t out_image_size = out_height * out_width; + const index_t in_batch_size = in_channels * in_image_size; + const index_t out_batch_size = out_channels * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t m = 0; m < out_channels; ++m) { + index_t c = m / 
multiplier; + index_t multi_index = m % multiplier; + const float *in_base = input + b * in_batch_size + c * in_image_size; + const float *filter_ptr = filter + multi_index * in_channels * 9 + c * 9; + float *out_base = output + b * out_batch_size + m * out_image_size; + index_t h, w; + + // top + for (h = 0; h < valid_h_start; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } + +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + // load filter (1 outch x 3 height x 3 width): vf_outch_height + float32x4_t vf00, vf01, vf02; + vf00 = vld1q_f32(filter_ptr); + vf01 = vld1q_f32(filter_ptr + 3); + vf02 = vld1q_f32(filter_ptr + 6); + + for (h = valid_h_start; h < valid_h_stop; ++h) { + // left + for (w = 0; w < valid_w_start; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + + for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + float32x4x2_t vi0, vi1, vi2; + float32x4_t vi0n, vi1n, vi2n; + + // input (3 height x 3 slide): vi_height_slide + float32x4_t vi00, vi01, vi02; + float32x4_t vi10, vi11, vi12; + float32x4_t vi20, vi21, vi22; + + // output (1 outch x 1 height x 4 width): vo + float32x4_t vo; + + // load input + index_t in_h = h * 2 - pad_top; + index_t in_w = w * 2 - pad_left; + index_t in_offset = in_h * in_width + in_w; + vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] + vi1 = vld2q_f32(in_base + in_offset + in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + + vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] + vi1n = vld1q_f32(in_base + in_offset + in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + + // load ouptut + index_t out_offset = h * out_width + w; + vo = vld1q_f32(out_base + out_offset); + + vi00 = vi0.val[0]; // [0.2.4.6] + vi01 = vi0.val[1]; // [1.3.5.7] + vi02 = vextq_f32(vi00, vi0n, 1); // [2.4.6.8] + vi10 = vi1.val[0]; + vi11 = vi1.val[1]; + vi12 = vextq_f32(vi10, vi1n, 1); + vi20 = vi2.val[0]; + vi21 = vi2.val[1]; + vi22 = vextq_f32(vi20, vi2n, 1); + + // outch 0, height 0 + vo = vfmaq_laneq_f32(vo, vi00, vf00, 0); + vo = vfmaq_laneq_f32(vo, vi01, vf00, 1); + vo = vfmaq_laneq_f32(vo, vi02, vf00, 2); + vo = vfmaq_laneq_f32(vo, vi10, vf01, 0); + vo = vfmaq_laneq_f32(vo, vi11, vf01, 1); + vo = vfmaq_laneq_f32(vo, vi12, vf01, 2); + vo = vfmaq_laneq_f32(vo, vi20, vf02, 0); + vo = vfmaq_laneq_f32(vo, vi21, vf02, 1); + vo = vfmaq_laneq_f32(vo, vi22, vf02, 2); + + vst1q_f32(out_base + out_offset, vo); + } // w + + // right + for (; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } // h +#else + for (index_t ih = valid_h_start; ih < valid_h_stop; ++ih) { + for (index_t iw = 0; iw < out_width; ++iw) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + ih, + iw, + ih * 2 - pad_top, + iw * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } +#endif + + // bottom + for (; h < out_height; ++h) { + for (w = 0; w < out_width; ++w) { + DepthwiseConv2dPixel(in_base, + filter_ptr, + h, + w, + h * 2 - pad_top, + w * 2 - pad_left, + out_width, + in_height, + in_width, + 3, + 3, + out_base); + } + } + } // m + } // b +} + +} // namespace kernels +} // namespace mace diff --git 
a/mace/kernels/arm/pooling.cc b/mace/kernels/arm/pooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9250134315c3004c5581ec39cf6b8e611a451468
--- /dev/null
+++ b/mace/kernels/arm/pooling.cc
@@ -0,0 +1,198 @@
+//
+// Copyright (c) 2018 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/pooling.h"
+
+namespace mace {
+namespace kernels {
+
+namespace {
+
+void MaxPooling(const float *input,
+                const index_t batch,
+                const index_t in_height,
+                const index_t in_width,
+                const index_t channels,
+                const index_t out_height,
+                const index_t out_width,
+                const int filter_height,
+                const int filter_width,
+                const int stride_h,
+                const int stride_w,
+                const int dilation_h,
+                const int dilation_w,
+                const int pad_top,
+                const int pad_left,
+                float *output) {
+  const index_t in_image_size = in_height * in_width;
+  const index_t out_image_size = out_height * out_width;
+  const index_t in_batch_size = channels * in_image_size;
+  const index_t out_batch_size = channels * out_image_size;
+
+#pragma omp parallel for collapse(2)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t out_base = b * out_batch_size + c * out_image_size;
+      const index_t in_base = b * in_batch_size + c * in_image_size;
+      for (index_t h = 0; h < out_height; ++h) {
+        for (index_t w = 0; w < out_width; ++w) {
+          const index_t out_offset = out_base + h * out_width + w;
+          float res = std::numeric_limits<float>::lowest();
+          for (int fh = 0; fh < filter_height; ++fh) {
+            for (int fw = 0; fw < filter_width; ++fw) {
+              int inh = h * stride_h + dilation_h * fh - pad_top;
+              int inw = w * stride_w + dilation_w * fw - pad_left;
+              if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
+                index_t input_offset = in_base + inh * in_width + inw;
+                res = std::max(res, input[input_offset]);
+              }
+            }
+          }
+          output[out_offset] = res;
+        }
+      }
+    }
+  }
+}
+
+void AvgPooling(const float *input,
+                const index_t batch,
+                const index_t in_height,
+                const index_t in_width,
+                const index_t channels,
+                const index_t out_height,
+                const index_t out_width,
+                const int filter_height,
+                const int filter_width,
+                const int stride_h,
+                const int stride_w,
+                const int dilation_h,
+                const int dilation_w,
+                const int pad_top,
+                const int pad_left,
+                float *output) {
+  const index_t in_image_size = in_height * in_width;
+  const index_t out_image_size = out_height * out_width;
+  const index_t in_batch_size = channels * in_image_size;
+  const index_t out_batch_size = channels * out_image_size;
+
+#pragma omp parallel for collapse(2)
+  for (index_t b = 0; b < batch; ++b) {
+    for (index_t c = 0; c < channels; ++c) {
+      const index_t out_base = b * out_batch_size + c * out_image_size;
+      const index_t in_base = b * in_batch_size + c * in_image_size;
+      for (index_t h = 0; h < out_height; ++h) {
+        for (index_t w = 0; w < out_width; ++w) {
+          const index_t out_offset = out_base + h * out_width + w;
+          float res = 0;
+          int block_size = 0;
+          for (int fh = 0; fh < filter_height; ++fh) {
+            for (int fw = 0; fw < filter_width; ++fw) {
+              int inh = h * stride_h + dilation_h * fh - pad_top;
+              int inw = w * stride_w + dilation_w * fw - pad_left;
+              if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
+                index_t input_offset = in_base + inh * in_width + inw;
+                res += input[input_offset];
+                ++block_size;
+              }
+            }
+          }
+          // Divide by the number of taps that fell inside the input, so
+          // padded border positions average only over valid elements.
+          output[out_offset] = res / block_size;
+        }
+      }
+    }
+  }
+}
+}  // namespace
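The functor below sizes its output through CalcNCHWOutputSize with RoundType::CEIL, while the depthwise convolution earlier used RoundType::FLOOR; both instantiate o = round((i + p_total - k - (k - 1)(d - 1)) / s) + 1, where p_total is the summed top+bottom (or left+right) padding. A quick standalone check of the two rounding modes (the sizes are arbitrary examples, not from this patch):

#include <cmath>
#include <cstdio>

int out_size(int i, int k, int s, int d, int p_total, bool ceil_mode) {
  double x = 1.0 * (i + p_total - k - (k - 1) * (d - 1)) / s;
  return (ceil_mode ? static_cast<int>(std::ceil(x))
                    : static_cast<int>(std::floor(x))) + 1;
}

int main() {
  // 7-wide input, 3-wide window, stride 2, no dilation, total padding 1:
  // floor((7 + 1 - 3) / 2) + 1 = 3 for convolution,
  // ceil((7 + 1 - 3) / 2) + 1 = 4 for pooling.
  printf("conv: %d, pool: %d\n",
         out_size(7, 3, 2, 1, 1, false), out_size(7, 3, 2, 1, 1, true));
  return 0;
}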
+
+void PoolingFunctor<DeviceType::NEON, float>::operator()(const Tensor *input_tensor,
+                                                         Tensor *output_tensor,
+                                                         StatsFuture *future) {
+  std::vector<index_t> output_shape(4);
+  std::vector<index_t> filter_shape = {
+      input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]};
+
+  std::vector<int> paddings(2);
+  if (paddings_.empty()) {
+    kernels::CalcNCHWPaddingAndOutputSize(
+        input_tensor->shape().data(), filter_shape.data(), dilations_,
+        strides_, padding_type_, output_shape.data(), paddings.data());
+  } else {
+    paddings = paddings_;
+    CalcNCHWOutputSize(input_tensor->shape().data(), filter_shape.data(),
+                       paddings_.data(), dilations_, strides_, RoundType::CEIL,
+                       output_shape.data());
+  }
+  output_tensor->Resize(output_shape);
+
+  const float *input = input_tensor->data<float>();
+  float *output = output_tensor->mutable_data<float>();
+  const index_t *input_shape = input_tensor->shape().data();
+  index_t batch = output_shape[0];
+  index_t channels = output_shape[1];
+  index_t height = output_shape[2];
+  index_t width = output_shape[3];
+
+  index_t input_channels = input_shape[1];
+  index_t input_height = input_shape[2];
+  index_t input_width = input_shape[3];
+
+  index_t in_image_size = input_height * input_width;
+
+  int filter_h = kernels_[0];
+  int filter_w = kernels_[1];
+
+  int stride_h = strides_[0];
+  int stride_w = strides_[1];
+
+  int dilation_h = dilations_[0];
+  int dilation_w = dilations_[1];
+
+  int pad_top = paddings[0] / 2;
+  int pad_left = paddings[1] / 2;
+
+  if (pooling_type_ == PoolingType::MAX) {
+    MaxPooling(input,
+               batch,
+               input_height,
+               input_width,
+               channels,
+               height,
+               width,
+               filter_h,
+               filter_w,
+               stride_h,
+               stride_w,
+               dilation_h,
+               dilation_w,
+               pad_top,
+               pad_left,
+               output);
+  } else if (pooling_type_ == PoolingType::AVG) {
+    AvgPooling(input,
+               batch,
+               input_height,
+               input_width,
+               channels,
+               height,
+               width,
+               filter_h,
+               filter_w,
+               stride_h,
+               stride_w,
+               dilation_h,
+               dilation_w,
+               pad_top,
+               pad_left,
+               output);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+}
+
+}  // namespace kernels
+}  // namespace mace
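The NCHW softmax added next uses the standard max-subtraction trick: dividing numerator and denominator by e^max leaves the result unchanged but keeps every exponent non-positive, so exp() cannot overflow. A scalar sketch of the same computation:

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // Naive softmax on these logits overflows: std::exp(1000.f) is inf,
  // and inf / inf is NaN. Shifting by the max keeps every exponent <= 0.
  const float logits[3] = {1000.f, 1001.f, 1002.f};
  float max_val = logits[0];
  for (float v : logits) max_val = std::max(max_val, v);
  float e[3], sum = 0.f;
  for (int i = 0; i < 3; ++i) sum += (e[i] = std::exp(logits[i] - max_val));
  for (int i = 0; i < 3; ++i) printf("%f\n", e[i] / sum);  // ~0.09, 0.24, 0.67
  return 0;
}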
+// + +#include "mace/kernels/softmax.h" + +namespace mace { +namespace kernels { + +void SoftmaxFunctor::operator()(const Tensor *input, + Tensor *output, + StatsFuture *future) { + const index_t batch = input->dim(0); + const index_t class_count = input->dim(1); + const index_t class_size = input->dim(2) * input->dim(3); + + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + + for (index_t b = 0; b < batch; ++b) { + std::vector + max_val(class_size, std::numeric_limits::lowest()); + std::vector sum_val(class_size, 0.f); + + // calculate max for each class + for (index_t c = 0; c < class_count; ++c) { + const float *input_ptr = input_data + (b * class_count + c) * class_size; + for (index_t k = 0; k < class_size; ++k) { + max_val[k] = std::max(max_val[k], input_ptr[k]); + } + } + + // calculate data - max for each class +#pragma omp parallel for + for (index_t c = 0; c < class_count; ++c) { + const float *input_ptr = input_data + (b * class_count + c) * class_size; + float *output_ptr = output_data + (b * class_count + c) * class_size; + for (index_t k = 0; k < class_size; ++k) { + output_ptr[k] = ::exp(input_ptr[k] - max_val[k]); + } + } + + // calculate sum for each class + for (index_t c = 0; c < class_count; ++c) { + float *output_ptr = output_data + (b * class_count + c) * class_size; + for (index_t k = 0; k < class_size; ++k) { + sum_val[k] += output_ptr[k]; + } + } + + // calculate (data - max) / sum for each class + for (index_t c = 0; c < class_count; ++c) { + float *output_ptr = output_data + (b * class_count + c) * class_size; + for (index_t k = 0; k < class_size; ++k) { + output_ptr[k] /= sum_val[k]; + } + } + } +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 5e8ae34f9a9aaef596090cc7113c440b425021cf..0b8ae31735e25dbd6b0014878dd5d84f5db638ed 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -133,14 +133,21 @@ struct BatchNormFunctor : BatchNormFunctorBase { }; template <> -void BatchNormFunctor::operator()(const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future); +struct BatchNormFunctor : BatchNormFunctorBase { + BatchNormFunctor(const bool folded_constant, + const ActivationType activation, + const float relux_max_limit) + : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {} + void operator()(const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + const float epsilon, + Tensor *output, + StatsFuture *future); +}; + template struct BatchNormFunctor : BatchNormFunctorBase { diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 4cd05a65d527a45caf8c34486be2696511406589..6ef35d29f7a7146e8548d31ecabf00f9ce77b4ae 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -212,6 +212,12 @@ switch (w_count) { \ case 2: \ MACE_DO_CONV2D(CC, CH, 2); \ break; \ + case 3: \ + MACE_DO_CONV2D(CC, CH, 3); \ + break; \ + case 4: \ + MACE_DO_CONV2D(CC, CH, 4); \ + break; \ default: \ LOG(FATAL) << "Unsupported w tile: " << w_count; \ } @@ -242,6 +248,42 @@ switch (c_count) { \ case 4: \ MACE_CASE_H_CONV2D(4); \ break; \ + case 5: \ + MACE_CASE_H_CONV2D(5); \ + break; \ + case 6: \ + MACE_CASE_H_CONV2D(6); \ + break; \ + case 7: \ + MACE_CASE_H_CONV2D(7); \ + break; \ + case 8: \ + MACE_CASE_H_CONV2D(8); \ + break; \ + case 9: \ + 
MACE_CASE_H_CONV2D(9); \ + break; \ + case 10: \ + MACE_CASE_H_CONV2D(10); \ + break; \ + case 11: \ + MACE_CASE_H_CONV2D(11); \ + break; \ + case 12: \ + MACE_CASE_H_CONV2D(12); \ + break; \ + case 13: \ + MACE_CASE_H_CONV2D(13); \ + break; \ + case 14: \ + MACE_CASE_H_CONV2D(14); \ + break; \ + case 15: \ + MACE_CASE_H_CONV2D(15); \ + break; \ + case 16: \ + MACE_CASE_H_CONV2D(16); \ + break; \ default: \ LOG(FATAL) << "Unsupported c tile: " << c_count; \ } @@ -373,11 +415,35 @@ struct Conv2dFunctor : Conv2dFunctorBase { }; template <> -void Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); +struct Conv2dFunctor : Conv2dFunctorBase { + Conv2dFunctor(const int *strides, + const Padding &padding_type, + const std::vector &paddings, + const int *dilations, + const ActivationType activation, + const float relux_max_limit) + : Conv2dFunctorBase(strides, + padding_type, + paddings, + dilations, + activation, + relux_max_limit), + is_filter_transformed_(false) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future); + + // TODO(liyin): share tmp buffers among ops + Tensor padded_input_; + Tensor padded_output_; + Tensor transformed_input_; + Tensor transformed_filter_; + Tensor transformed_output_; + bool is_filter_transformed_; +}; template struct Conv2dFunctor : Conv2dFunctorBase { diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc index 9bbbdcf1d96852744de1e073e67e9b4a15dc2c1f..7615cfc6676254966cdb46090c37aabdaebc3bb4 100644 --- a/mace/kernels/conv_pool_2d_util.cc +++ b/mace/kernels/conv_pool_2d_util.cc @@ -9,7 +9,7 @@ namespace mace { namespace kernels { -void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW +void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, // NCHW const index_t *filter_shape, // OIHW const int *dilations, const int *strides, @@ -186,6 +186,55 @@ void CalcOutputSize(const index_t *input_shape, // NHWC output_shape[3] = filter_shape[2]; } +void CalcNCHWOutputSize(const index_t *input_shape, // NCHW + const index_t *filter_shape, // OIHW + const int *padding_size, + const int *dilations, + const int *strides, + const RoundType round_type, + index_t *output_shape) { + MACE_CHECK(dilations[0] > 0 && dilations[1] > 0, + "Invalid dilations, must >= 1"); + MACE_CHECK((dilations[0] == 1 || strides[0] == 1) && + (dilations[1] == 1 || strides[1] == 1), + "If dilations > 1, strides should be 1"); + MACE_CHECK_NOTNULL(output_shape); + MACE_CHECK_NOTNULL(padding_size); + /* + * Convolution arithmetic: + * o = floor((i + 2 * p - k - (k - 1) * (d - 1)) / s) + 1 + * Pooling arithmetic: + * o = ceil((i + 2 * p - k - (k - 1) * (d - 1)) / s) + 1 + * For details, see https://arxiv.org/pdf/1603.07285.pdf or + * http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html + */ + output_shape[0] = input_shape[0]; + if (round_type == FLOOR) { + output_shape[2] = static_cast( + std::floor(1.0 * (input_shape[2] + padding_size[0] - filter_shape[2] - + (filter_shape[2] - 1) * (dilations[0] - 1)) / + strides[0]) + + 1); + output_shape[3] = static_cast( + std::floor(1.0 * (input_shape[3] + padding_size[1] - filter_shape[3] - + (filter_shape[3] - 1) * (dilations[1] - 1)) / + strides[1]) + + 1); + } else { + output_shape[2] = static_cast( + std::ceil(1.0 * (input_shape[2] + padding_size[0] - filter_shape[2] - + (filter_shape[2] - 1) * (dilations[0] - 1)) / + strides[0]) + + 1); + 
output_shape[3] = static_cast( + std::ceil(1.0 * (input_shape[3] + padding_size[1] - filter_shape[3] - + (filter_shape[3] - 1) * (dilations[1] - 1)) / + strides[1]) + + 1); + } + output_shape[1] = filter_shape[0]; +} + void CalPaddingSize(const index_t *input_shape, // NCHW const index_t *filter_shape, // OIHW const int *dilations, @@ -230,10 +279,11 @@ void CalPaddingSize(const index_t *input_shape, // NCHW 0, (output_width - 1) * strides[1] + k_extent_width - input_shape[3]); } -void ConstructInputWithPadding(const Tensor *input_tensor, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value) { + +void ConstructNCHWInputWithPadding(const Tensor *input_tensor, + const int *paddings, + Tensor *output_tensor, + bool padding_same_value) { Tensor::MappingGuard input_mapper(input_tensor); const float *input = input_tensor->data(); const index_t *input_shape = input_tensor->shape().data(); @@ -244,7 +294,7 @@ void ConstructInputWithPadding(const Tensor *input_tensor, index_t width = input_shape[3]; std::vector output_shape( - {batch, channels, paddings[0] + height, paddings[1] + width}); + {batch, channels, paddings[0] + height, paddings[1] + width}); const index_t output_width = output_shape[3]; const int padded_top = paddings[0] / 2; @@ -268,6 +318,7 @@ void ConstructInputWithPadding(const Tensor *input_tensor, const int padded_bottom = paddings[0] - padded_top; const int padded_right = paddings[1] - padded_left; + for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { for (int k = 0; k < padded_top; ++k) { @@ -301,6 +352,51 @@ void ConstructInputWithPadding(const Tensor *input_tensor, } } +void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, + const int pad_top, + const int pad_bottom, + const int pad_left, + const int pad_right, + Tensor *output_tensor) { + Tensor::MappingGuard input_mapper(input_tensor); + const float *input = input_tensor->data(); + const index_t *input_shape = input_tensor->shape().data(); + + index_t batch = input_shape[0]; + index_t channels = input_shape[1]; + index_t height = input_shape[2]; + index_t width = input_shape[3]; + + const int pad_height = pad_top + pad_bottom; + const int pad_width = pad_left + pad_right; + std::vector output_shape( + {batch, channels, height + pad_height, width + pad_width}); + output_tensor->Resize(output_shape); + Tensor::MappingGuard padded_output_mapper(output_tensor); + float *output_data = output_tensor->mutable_data(); + + const index_t output_height = output_shape[2]; + const index_t output_width = output_shape[3]; + const index_t in_image_size = height * width; + const index_t out_image_size = output_height * output_width; + const index_t in_batch_size = channels * in_image_size; + const index_t out_batch_size = channels * out_image_size; + +#pragma omp parallel for collapse(2) + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + for (int k = 0; k < height; ++k) { + memcpy(output_data + i * out_batch_size + j * out_image_size + + (pad_top + k) * output_width + pad_left, + input + i * in_batch_size + j * in_image_size + k * width, + width * sizeof(float)); + } + // Skip the padded bottom in this channel and top in the next channel + } + } +} + + void ConstructNHWCInputWithPadding(const Tensor *input_tensor, const int *paddings, Tensor *output_tensor, diff --git a/mace/kernels/conv_pool_2d_util.h b/mace/kernels/conv_pool_2d_util.h index 45b1d8a4fa074d2a5256609691bbe2e2e0e9cc00..32dee0cc34d526e3408cce339cc8e4c37eba6dad 100644 --- 
a/mace/kernels/conv_pool_2d_util.h +++ b/mace/kernels/conv_pool_2d_util.h @@ -22,16 +22,16 @@ enum RoundType { namespace kernels { -void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW - const index_t *filter_shape, // OIHW +void CalcNCHWPaddingAndOutputSize(const index_t *input_shape, + const index_t *filter_shape, const int *dilations, const int *strides, Padding padding, index_t *output_shape, int *padding_size); -void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, // NCHW - const index_t *filter_shape, // OIHW +void CalcNHWCPaddingAndOutputSize(const index_t *input_shape, + const index_t *filter_shape, const int *dilations, const int *strides, Padding padding, @@ -46,6 +46,14 @@ void CalcOutputSize(const index_t *input_shape, // NHWC const RoundType round_type, index_t *output_shape); +void CalcNCHWOutputSize(const index_t *input_shape, + const index_t *filter_shape, + const int *padding_size, + const int *dilations, + const int *strides, + const RoundType round_type, + index_t *output_shape); + void CalPaddingSize(const index_t *input_shape, // NCHW const index_t *filter_shape, // OIHW const int *dilations, @@ -53,10 +61,15 @@ void CalPaddingSize(const index_t *input_shape, // NCHW Padding padding, int *padding_size); -void ConstructInputWithPadding(const Tensor *input, - const int *paddings, - Tensor *output_tensor, - bool padding_same_value = false); +void ConstructNCHWInputWithSpecificPadding(const Tensor *input, + const int pad_top, const int pad_bottom, + const int pad_left, const int pad_right, + Tensor *output_tensor); + +void ConstructNCHWInputWithPadding(const Tensor *input, + const int *paddings, + Tensor *output_tensor, + bool padding_same_value = false); void ConstructNHWCInputWithPadding(const Tensor *input, const int *paddings, diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 90c17b19e71553424c5f68eee1cc3bc9ffa2b279..7f4f2021f134dbcb2ffa9880a7c3e4c5a2ad7ddb 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -14,6 +14,7 @@ #include "mace/core/future.h" #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/kernels/conv_pool_2d_util.h" +#include "mace/kernels/activation.h" #include "mace/public/mace.h" namespace mace { @@ -407,12 +408,27 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { }; template <> -void DepthwiseConv2dFunctor::operator()( - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); +struct DepthwiseConv2dFunctor + : DepthwiseConv2dFunctorBase { + DepthwiseConv2dFunctor(const int *strides, + const Padding padding_type, + const std::vector &paddings, + const int *dilations, + const ActivationType activation, + const float relux_max_limit) + : DepthwiseConv2dFunctorBase(strides, + padding_type, + paddings, + dilations, + activation, + relux_max_limit) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future); +}; template struct DepthwiseConv2dFunctor diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..00be4829802ede5fadbc0244917f56fcf0dd6025 --- /dev/null +++ b/mace/kernels/gemm.cc @@ -0,0 +1,247 @@ +// +// Copyright (c) 2018 XiaoMi All rights reserved. 
+//
+
+#include <algorithm>
+#include <cstring>
+
+#include "mace/kernels/gemm.h"
+#include "mace/utils/utils.h"
+#include "mace/utils/logging.h"
+
+namespace mace {
+namespace kernels {
+
+void GemmRef(const float *A,
+             const float *B,
+             const index_t height,
+             const index_t K,
+             const index_t width,
+             float *C) {
+  memset(C, 0, sizeof(float) * height * width);
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      for (int k = 0; k < K; ++k) {
+        C[i * width + j] += A[i * K + k] * B[k * width + j];
+      }
+    }
+  }
+}
+
+namespace {
+inline void GemmBlock(const float *A,
+                      const float *B,
+                      const index_t height,
+                      const index_t K,
+                      const index_t width,
+                      const index_t stride_k,
+                      const index_t stride_w,
+                      float *C) {
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      for (int k = 0; k < K; ++k) {
+        C[i * stride_w + j] += A[i * stride_k + k] * B[k * stride_w + j];
+      }
+    }
+  }
+}
+
+// TODO(liyin): may need implement 883 since RGB
+inline void Gemm884(const float *a_ptr,
+                    const float *b_ptr,
+                    index_t stride_w,
+                    index_t stride_k,
+                    float *c_ptr) {
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+  float32x4_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14,
+      a15;
+  float32x4_t b0, b1, b2, b3, b4, b5, b6, b7;
+  float32x4_t c0, c1, c2, c3, c4, c5, c6, c7;
+
+  a0 = vld1q_f32(a_ptr);
+  a1 = vld1q_f32(a_ptr + 4);
+  a2 = vld1q_f32(a_ptr + 1 * stride_k);
+  a3 = vld1q_f32(a_ptr + 1 * stride_k + 4);
+  a4 = vld1q_f32(a_ptr + 2 * stride_k);
+  a5 = vld1q_f32(a_ptr + 2 * stride_k + 4);
+  a6 = vld1q_f32(a_ptr + 3 * stride_k);
+  a7 = vld1q_f32(a_ptr + 3 * stride_k + 4);
+  a8 = vld1q_f32(a_ptr + 4 * stride_k);
+  a9 = vld1q_f32(a_ptr + 4 * stride_k + 4);
+  a10 = vld1q_f32(a_ptr + 5 * stride_k);
+  a11 = vld1q_f32(a_ptr + 5 * stride_k + 4);
+  a12 = vld1q_f32(a_ptr + 6 * stride_k);
+  a13 = vld1q_f32(a_ptr + 6 * stride_k + 4);
+  a14 = vld1q_f32(a_ptr + 7 * stride_k);
+  a15 = vld1q_f32(a_ptr + 7 * stride_k + 4);
+
+  b0 = vld1q_f32(b_ptr);
+  b1 = vld1q_f32(b_ptr + 1 * stride_w);
+  b2 = vld1q_f32(b_ptr + 2 * stride_w);
+  b3 = vld1q_f32(b_ptr + 3 * stride_w);
+  b4 = vld1q_f32(b_ptr + 4 * stride_w);
+  b5 = vld1q_f32(b_ptr + 5 * stride_w);
+  b6 = vld1q_f32(b_ptr + 6 * stride_w);
+  b7 = vld1q_f32(b_ptr + 7 * stride_w);
+
+  c0 = vld1q_f32(c_ptr);
+  c1 = vld1q_f32(c_ptr + 1 * stride_w);
+  c2 = vld1q_f32(c_ptr + 2 * stride_w);
+  c3 = vld1q_f32(c_ptr + 3 * stride_w);
+  c4 = vld1q_f32(c_ptr + 4 * stride_w);
+  c5 = vld1q_f32(c_ptr + 5 * stride_w);
+  c6 = vld1q_f32(c_ptr + 6 * stride_w);
+  c7 = vld1q_f32(c_ptr + 7 * stride_w);
+
+#define MACE_CONV_1x1_REG_CAL(RC, RA, RAN)        \
+  c##RC = vfmaq_laneq_f32(c##RC, b0, a##RA, 0);   \
+  c##RC = vfmaq_laneq_f32(c##RC, b1, a##RA, 1);   \
+  c##RC = vfmaq_laneq_f32(c##RC, b2, a##RA, 2);   \
+  c##RC = vfmaq_laneq_f32(c##RC, b3, a##RA, 3);   \
+  c##RC = vfmaq_laneq_f32(c##RC, b4, a##RAN, 0);  \
+  c##RC = vfmaq_laneq_f32(c##RC, b5, a##RAN, 1);  \
+  c##RC = vfmaq_laneq_f32(c##RC, b6, a##RAN, 2);  \
+  c##RC = vfmaq_laneq_f32(c##RC, b7, a##RAN, 3);
+
+  MACE_CONV_1x1_REG_CAL(0, 0, 1);
+  MACE_CONV_1x1_REG_CAL(1, 2, 3);
+  MACE_CONV_1x1_REG_CAL(2, 4, 5);
+  MACE_CONV_1x1_REG_CAL(3, 6, 7);
+  MACE_CONV_1x1_REG_CAL(4, 8, 9);
+  MACE_CONV_1x1_REG_CAL(5, 10, 11);
+  MACE_CONV_1x1_REG_CAL(6, 12, 13);
+  MACE_CONV_1x1_REG_CAL(7, 14, 15);
+
+  vst1q_f32(c_ptr, c0);
+  vst1q_f32(c_ptr + 1 * stride_w, c1);
+  vst1q_f32(c_ptr + 2 * stride_w, c2);
+  vst1q_f32(c_ptr + 3 * stride_w, c3);
+  vst1q_f32(c_ptr + 4 * stride_w, c4);
+  vst1q_f32(c_ptr + 5 * stride_w, c5);
+  vst1q_f32(c_ptr + 6 * stride_w, c6);
+  vst1q_f32(c_ptr + 7 * stride_w, c7);
+
+#else
+  GemmBlock(a_ptr, b_ptr, 8, 8, 4, stride_k, stride_w, c_ptr);
+#endif
+}
+
+inline void GemmTile(const float *A,
+                     const float *B,
+                     const index_t height,
+                     const index_t K,
+                     const index_t width,
+                     const index_t stride_k,
+                     const index_t stride_w,
+                     float *C) {
+  index_t h, w, k;
+  for (h = 0; h + 7 < height; h += 8) {
+    for (w = 0; w + 3 < width; w += 4) {
+      for (k = 0; k + 7 < K; k += 8) {
+        const float *a_ptr = A + (h * stride_k + k);
+        const float *b_ptr = B + (k * stride_w + w);
+        float *c_ptr = C + (h * stride_w + w);
+        Gemm884(a_ptr, b_ptr, stride_w, stride_k, c_ptr);
+      }
+      if (k < K) {
+        const float *a_ptr = A + (h * stride_k + k);
+        const float *b_ptr = B + (k * stride_w + w);
+        float *c_ptr = C + (h * stride_w + w);
+        GemmBlock(a_ptr, b_ptr, 8, K - k, 4, stride_k, stride_w, c_ptr);
+      }
+    }
+    if (w < width) {
+      const float *a_ptr = A + h * stride_k;
+      const float *b_ptr = B + w;
+      float *c_ptr = C + (h * stride_w + w);
+      GemmBlock(a_ptr,
+                b_ptr,
+                8,
+                K,
+                width - w,
+                stride_k,
+                stride_w,
+                c_ptr);
+    }
+  }
+  if (h < height) {
+    // TODO(liyin): may use Gemm444
+    const float *a_ptr = A + (h * stride_k);
+    const float *b_ptr = B;
+    float *c_ptr = C + h * stride_w;
+    GemmBlock(a_ptr,
+              b_ptr,
+              height - h,
+              K,
+              width,
+              stride_k,
+              stride_w,
+              c_ptr);
+  }
+}
+}  // namespace
+
+void Gemm(const float *A,
+          const float *B,
+          const index_t batch,
+          const index_t height,
+          const index_t K,
+          const index_t width,
+          float *C) {
+  memset(C, 0, sizeof(float) * batch * height * width);
+
+
+  // It is better to use large block size if it fits for fast cache.
+  // Assume l1 cache size is 32k, we load three blocks at a time (A, B, C),
+  // the block size should be sqrt(32k / sizeof(T) / 3).
+  const index_t block_size = 48;
+  const index_t block_tile_height = RoundUpDiv(height, block_size);
+  const index_t block_tile_width = RoundUpDiv(width, block_size);
+  const index_t block_tile_k = RoundUpDiv(K, block_size);
+  const index_t remain_height = height % block_size;
+  const index_t remain_width = width % block_size;
+  const index_t remain_k = K % block_size;
+
+#pragma omp parallel for collapse(3)
+  for (index_t n = 0; n < batch; ++n) {
+    for (index_t bh = 0; bh < block_tile_height; ++bh) {
+      for (index_t bw = 0; bw < block_tile_width; ++bw) {
+        const float *a_base = A + n * height * K;
+        const float *b_base = B + n * K * width;
+        float *c_base = C + n * height * width;
+
+        const index_t ih_begin = bh * block_size;
+        const index_t ih_end =
+            bh * block_size + (bh == block_tile_height - 1 && remain_height > 0
+                                   ? remain_height : block_size);
+        const index_t iw_begin = bw * block_size;
+        const index_t iw_end =
+            bw * block_size +
+            (bw == block_tile_width - 1 && remain_width > 0 ? remain_width
+                                                            : block_size);
+
+        for (index_t bk = 0; bk < block_tile_k; ++bk) {
+          const index_t ik_begin = bk * block_size;
+          const index_t ik_end =
+              bk * block_size +
+              (bk == block_tile_k - 1 && remain_k > 0 ? remain_k : block_size);
+
+          // inside block:
+          // calculate C[bh, bw] += A[bh, bk] * B[bk, bw] for one k
+          GemmTile(a_base + (ih_begin * K + ik_begin),
+                   b_base + (ik_begin * width + iw_begin),
+                   ih_end - ih_begin,
+                   ik_end - ik_begin,
+                   iw_end - iw_begin,
+                   K,
+                   width,
+                   c_base + (ih_begin * width + iw_begin));
+        }  // bk
+      }  // bw
+    }  // bh
+  }  // n
+}
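The block size above follows from the L1 budget sketched in the comment: with a 32 KB data cache holding one A, one B and one C block of float32 at a time, the bound is sqrt(32768 / 4 / 3) ≈ 52, and 48 is the nearest value below it that the 8x8x4 micro-kernel tiles evenly. A quick check of that arithmetic (the 32 KB cache size is the comment's assumption, not probed at runtime):

#include <cmath>
#include <cstdio>

int main() {
  const double cache_bytes = 32 * 1024;
  // Three square float blocks must fit at once: 3 * b^2 * 4 <= 32768.
  double upper = std::sqrt(cache_bytes / sizeof(float) / 3);  // ~52.3
  printf("upper bound %.1f -> block_size 48 (multiple of 8 and 4)\n", upper);
  return 0;
}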
+
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/kernels/gemm.h b/mace/kernels/gemm.h
new file mode 100644
index 0000000000000000000000000000000000000000..d17eab83e12d5531eed8bdaddd6af352b179b327
--- /dev/null
+++ b/mace/kernels/gemm.h
@@ -0,0 +1,36 @@
+//
+// Copyright (c) 2018 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_GEMM_H_
+#define MACE_KERNELS_GEMM_H_
+
+#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+
+#include "mace/core/types.h"
+
+namespace mace {
+namespace kernels {
+
+void Gemm(const float *A,
+          const float *B,
+          const index_t batch,
+          const index_t height,
+          const index_t K,
+          const index_t width,
+          float *C);
+
+// Unvectorized reference implementation, used to verify Gemm in tests.
+void GemmRef(const float *A,
+             const float *B,
+             const index_t height,
+             const index_t K,
+             const index_t width,
+             float *C);
+
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_GEMM_H_
diff --git a/mace/kernels/gemm_test.cc b/mace/kernels/gemm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9e4b964daf5922a6fd0912ca65d9b01225aaf2fb
--- /dev/null
+++ b/mace/kernels/gemm_test.cc
@@ -0,0 +1,48 @@
+//
+// Copyright (c) 2018 XiaoMi All rights reserved.
+//
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <random>
+
+#include "mace/kernels/gemm.h"
+#include "mace/core/types.h"
+
+namespace mace {
+
+TEST(GEMMTest, gemm) {
+  index_t N = 17;
+  index_t M = 33;
+  index_t K = 64;
+  float *A = new float[N * K];
+  float *B = new float[K * M];
+  float *C = new float[N * M];
+  float *C_ref = new float[N * M];
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::normal_distribution<float> nd(0, 1);
+
+  std::generate(A, A + N * K, [&gen, &nd] {
+    return nd(gen);
+  });
+  std::generate(B, B + K * M, [&gen, &nd] {
+    return nd(gen);
+  });
+  // batch = 1: C (N x M) = A (N x K) * B (K x M)
+  kernels::Gemm(A, B, 1, N, K, M, C);
+  kernels::GemmRef(A, B, N, K, M, C_ref);
+
+  for (int i = 0; i < N * M; ++i) {
+    EXPECT_NEAR(C_ref[i], C[i], 0.1);
+  }
+
+  delete[] A;
+  delete[] B;
+  delete[] C;
+  delete[] C_ref;
+}
+
+}  // namespace mace
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index b025cbfebe29efa20d65838328458eb73befb823..28db80c0cf8896a6fd48143a8aa4167b0a04e1cd 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -16,142 +16,12 @@
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
+#include "mace/kernels/gemm.h"
 #include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
-template <int register_tile_size, int h_count, int w_count, int k_count,
-          typename T>
-inline void MatMulKernelFunc(const T *A,
-                             const T *B,
-                             T *C,
-                             index_t offset_h,
-                             index_t offset_w,
-                             index_t offset_k,
-                             index_t stride_h,
-                             index_t stride_w,
-                             index_t stride_k) {
-  T a_tmp[register_tile_size][register_tile_size] = {0};
-  T b_tmp[register_tile_size][register_tile_size] = {0};
-  T c_tmp[register_tile_size][register_tile_size] = {0};
-
-  for (int h = 0; h < h_count; ++h) {
-    for (int k = 0; k < k_count; ++k) {
-      a_tmp[h][k] = A[(offset_h + h) * stride_k + (offset_k + k)];
-    }
-  }
-  for (int k = 0; k < k_count; ++k) {
-    for (int w = 0; w < w_count; ++w) {
-      b_tmp[k][w] = B[(offset_k + k) * stride_w + (offset_w + w)];
-    }
-  }
-
-#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
-  static_assert(register_tile_size == 4, "register tile size must be 4");
-  float32x4_t a_dup;
-  float32x4_t b_vec[4] =
-      {vld1q_f32(b_tmp[0]), vld1q_f32(b_tmp[1]), vld1q_f32(b_tmp[2]),
vld1q_f32(b_tmp[3])}; - float32x4_t - c_vec[4] = {vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0)}; - - for (int h = 0; h < register_tile_size; ++h) { - for (int k = 0; k < register_tile_size; ++k) { - a_dup = vdupq_n_f32(a_tmp[h][k]); - c_vec[h] = vfmaq_f32(c_vec[h], a_dup, b_vec[k]); - } - } - - for (int h = 0; h < register_tile_size; ++h) { - vst1q_f32(c_tmp[h], c_vec[h]); - } - -#else - for (int h = 0; h < register_tile_size; ++h) { - for (int w = 0; w < register_tile_size; ++w) { - for (int k = 0; k < register_tile_size; ++k) { - c_tmp[h][w] += a_tmp[h][k] * b_tmp[k][w]; - } - } - } -#endif - - for (int h = 0; h < h_count; ++h) { - for (int w = 0; w < w_count; ++w) { - C[(offset_h + h) * stride_w + (offset_w + w)] += c_tmp[h][w]; - } - } -} - -#define MACE_DO_MATMUL(HC, WC, KC) \ -MatMulKernelFunc(a_ptr_batch_base, \ - b_ptr_batch_base, \ - c_ptr_batch_base, \ - ih, \ - iw, \ - ik, \ - height, \ - width, \ - K); - -#define MACE_CASE_K_MATMUL(HC, WC) \ -switch (k_count) { \ - case 1: \ - MACE_DO_MATMUL(HC, WC, 1); \ - break; \ - case 2: \ - MACE_DO_MATMUL(HC, WC, 2); \ - break; \ - case 3: \ - MACE_DO_MATMUL(HC, WC, 3); \ - break; \ - case 4: \ - MACE_DO_MATMUL(HC, WC, 4); \ - break; \ - default: \ - LOG(FATAL) << "Unsupported k tile: " << k_count; \ -} - -#define MACE_CASE_W_MATMUL(HC) \ -switch (w_count) { \ - case 1: \ - MACE_CASE_K_MATMUL(HC, 1); \ - break; \ - case 2: \ - MACE_CASE_K_MATMUL(HC, 2); \ - break; \ - case 3: \ - MACE_CASE_K_MATMUL(HC, 3); \ - break; \ - case 4: \ - MACE_CASE_K_MATMUL(HC, 4); \ - break; \ - default: \ - LOG(FATAL) << "Unsupported w tile: " << w_count; \ -} - -#define MACE_CASE_H_MATMUL \ -switch (h_count) { \ - case 1: \ - MACE_CASE_W_MATMUL(1); \ - break; \ - case 2: \ - MACE_CASE_W_MATMUL(2); \ - break; \ - case 3: \ - MACE_CASE_W_MATMUL(3); \ - break; \ - case 4: \ - MACE_CASE_W_MATMUL(4); \ - break; \ - default: \ - LOG(FATAL) << "Unsupported h tile: " << h_count; \ -} - template struct MatMulFunctor { void operator()(const Tensor *A, @@ -185,51 +55,7 @@ struct MatMulFunctor { constexpr index_t register_tile_size = 4; memset(c_ptr_base, 0, batch * height * width * sizeof(T)); -#pragma omp parallel for collapse(3) - for (index_t n = 0; n < batch; ++n) { - // handle block - for (index_t bh = 0; bh < block_tile_height; ++bh) { - for (index_t bw = 0; bw < block_tile_width; ++bw) { - const T *a_ptr_batch_base = a_ptr_base + n * height * K; - const T *b_ptr_batch_base = b_ptr_base + n * K * width; - T *c_ptr_batch_base = c_ptr_base + n * height * width; - const index_t ih_begin = bh * block_size; - const index_t ih_end = - bh * block_size + (bh == block_tile_height - 1 && remain_height > 0 - ? remain_height : block_size); - const index_t iw_begin = bw * block_size; - const index_t iw_end = - bw * block_size - + (bw == block_tile_width - 1 && remain_width > 0 ? remain_width - : block_size); - - for (index_t bk = 0; bk < block_tile_k; ++bk) { - const index_t ik_begin = bk * block_size; - const index_t ik_end = - bk * block_size - + (bk == block_tile_k - 1 && remain_k > 0 ? 
remain_k - : block_size); - - // inside block: - // calculate C[bh, bw] += A[bh, bk] * B[bk, bw] for one k - for (index_t ih = ih_begin; ih < ih_end; - ih += register_tile_size) { - for (index_t iw = iw_begin; iw < iw_end; - iw += register_tile_size) { - for (index_t ik = ik_begin; ik < ik_end; - ik += register_tile_size) { - const int h_count = std::min(register_tile_size, ih_end - ih); - const int w_count = std::min(register_tile_size, iw_end - iw); - const int k_count = std::min(register_tile_size, ik_end - ik); - - MACE_CASE_H_MATMUL; - } // ik - } // iw - } // ih - } // bk - } // bw - } // bh - } // n + Gemm(a_ptr_base, b_ptr_base, batch, height, K, width, c_ptr_base); } }; diff --git a/mace/kernels/neon/avg_pooling_neon_2x2.cc b/mace/kernels/neon/avg_pooling_neon_2x2.cc deleted file mode 100644 index a1079100d909aa4a6a05b464051d577c807e421f..0000000000000000000000000000000000000000 --- a/mace/kernels/neon/avg_pooling_neon_2x2.cc +++ /dev/null @@ -1,175 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include -#include -#include - -namespace mace { -namespace kernels { - -void PoolingAvgNeonK2x2S2x2(const float *input, - const index_t *in_shape, - float *output, - const index_t *out_shape, - const int *paddings) { - index_t batch = in_shape[0]; - index_t channels = in_shape[1]; - index_t in_height = in_shape[2]; - index_t in_width = in_shape[3]; - - index_t out_height = out_shape[2]; - index_t out_width = out_shape[3]; - - int padding_top = paddings[0] / 2; - int padding_bottom = paddings[0] - padding_top; - int padding_left = paddings[1] / 2; - int padding_right = paddings[1] - padding_left; - - int in_image_size = in_height * in_width; - int out_image_size = out_height * out_width; - index_t input_offset = 0; - index_t output_offset = 0; - float avg_factors[4] = {0.25, 0.25, 0.25, 0.25}; - -#pragma omp parallel for collapse(2) - for (int b = 0; b < batch; ++b) { - for (int c = 0; c < channels; ++c) { - float *outptr = output + output_offset; - const float *r0, *r1; - - for (int h = 0; h < out_height; ++h) { - int w = 0; - int num_vectors = 0; - if (!((h == 0 && padding_top > 0) || - (h == out_height - 1 && padding_bottom > 0))) { - r0 = input + input_offset + (h * 2 - padding_top) * in_width; - r1 = r0 + in_width; - if (padding_left > 0) { - *outptr = (r0[0] + r1[0]) * 0.25; - ++r0; - ++r1; - ++outptr; - ++w; - } - if (padding_right > 0) { - num_vectors = (out_width - w - 1) >> 2; - } else { - num_vectors = (out_width - w) >> 2; - } - } - - w += num_vectors << 2; - float32x4_t factors = vld1q_f32(avg_factors); - for (; num_vectors > 0; --num_vectors) { - float32x4_t r00 = vld1q_f32(r0); - float32x4_t r10 = vld1q_f32(r1); - float32x4_t r01 = vld1q_f32(r0 + 4); - float32x4_t r11 = vld1q_f32(r1 + 4); - - float32x4_t sum0 = vaddq_f32(r00, r10); - float32x4_t sum1 = vaddq_f32(r01, r11); - - float32x4_t sum_result = vpaddq_f32(sum0, sum1); - float32x4_t avg_result = vmulq_f32(sum_result, factors); - - vst1q_f32(outptr, avg_result); - - r0 += 8; - r1 += 8; - outptr += 4; - } - - for (; w < out_width; ++w) { - float sum = 0.0; - for (int kh = 0; kh < 2; ++kh) { - for (int kw = 0; kw < 2; ++kw) { - int inh = h * 2 - padding_top + kh; - int inw = w * 2 - padding_left + kw; - if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { - sum += input[input_offset + inh * in_width + inw]; - } - } - } - - *outptr = sum * 0.25; - ++outptr; - } - } - input_offset += in_image_size; - output_offset += out_image_size; - } - } -} - -// assume the input has already been padded 
-void PoolingAvgNeonK2x2S2x2Padded(const float *input, - const index_t *in_shape, - float *output, - const index_t *out_shape) { - index_t batch = in_shape[0]; - index_t channels = in_shape[1]; - index_t in_height = in_shape[2]; - index_t in_width = in_shape[3]; - - index_t out_height = out_shape[2]; - index_t out_width = out_shape[3]; - - int in_image_size = in_height * in_width; - int out_image_size = out_height * out_width; - index_t input_offset = 0; - index_t output_offset = 0; - float avg_factors[4] = {0.25, 0.25, 0.25, 0.25}; - -#pragma omp parallel for collapse(2) - for (int b = 0; b < batch; ++b) { - for (int c = 0; c < channels; ++c) { - const float *img0 = input + input_offset; - float *outptr = output + output_offset; - - const float *r0 = img0; - const float *r1 = img0 + in_width; - - for (int h = 0; h < out_height; ++h) { - int num_vectors = out_width >> 2; - int remain = out_width - (num_vectors << 2); - - float32x4_t factors = vld1q_f32(avg_factors); - for (; num_vectors > 0; --num_vectors) { - float32x4_t r00 = vld1q_f32(r0); - float32x4_t r10 = vld1q_f32(r1); - float32x4_t r01 = vld1q_f32(r0 + 4); - float32x4_t r11 = vld1q_f32(r1 + 4); - - float32x4_t sum0 = vaddq_f32(r00, r10); - float32x4_t sum1 = vaddq_f32(r01, r11); - - float32x4_t sum_result = vpaddq_f32(sum0, sum1); - float32x4_t avg_result = vmulq_f32(sum_result, factors); - - vst1q_f32(outptr, avg_result); - - r0 += 8; - r1 += 8; - outptr += 4; - } - - for (; remain > 0; --remain) { - *outptr = (r0[0] + r0[1] + r1[0] + r1[1]) * 0.25; - - r0 += 2; - r1 += 2; - outptr++; - } - r0 += in_width; - r1 += in_width; - } - input_offset += in_image_size; - output_offset += out_image_size; - } - } -} - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc deleted file mode 100644 index 930a0c5d5692a120c71bfd962c569443aa90d805..0000000000000000000000000000000000000000 --- a/mace/kernels/neon/batch_norm_neon.cc +++ /dev/null @@ -1,84 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include "mace/kernels/batch_norm.h" -#include - -namespace mace { -namespace kernels { - -template <> -void BatchNormFunctor::operator()( - const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) { - // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . 
- // The calculation formula for inference is - // Y = \frac{ \scale } { \sqrt{var+\epsilon} } * X + - // ( \offset - \frac { \scale * mean } { \sqrt{var+\epsilon} - // } - // new_scale = \frac{ \scale } { \sqrt{var+\epsilon} } - // new_offset = \offset - mean * common_val; - // Y = new_scale * X + new_offset; - const index_t n = input->dim(0); - const index_t sample_size = input->dim(1) * input->dim(2); - const index_t channel = input->dim(3); - - const float *input_ptr = input->data(); - const float *scale_ptr = scale->data(); - const float *offset_ptr = offset->data(); - const float *mean_ptr = mean->data(); - const float *var_ptr = var->data(); - float *output_ptr = output->mutable_data(); - - const index_t ch_blks = channel >> 2; - const index_t remain_chs = channel - (ch_blks << 2); - - std::vector new_scale(channel); - std::vector new_offset(channel); - -#pragma omp parallel for - for (index_t c = 0; c < channel; ++c) { - new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon); - new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; - } - -#pragma omp parallel for collapse(2) - for (index_t i = 0; i < n; ++i) { - for (index_t j = 0; j < sample_size; ++j) { - const float *input_sample_ptr = - input_ptr + (i * sample_size + j) * channel; - float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel; - const float *new_scale_ptr = new_scale.data(); - const float *new_offset_ptr = new_offset.data(); - for (index_t cb = 0; cb < ch_blks; ++cb) { - float32x4_t new_scale_f = vld1q_f32(new_scale_ptr); - float32x4_t new_offset_f = vld1q_f32(new_offset_ptr); - float32x4_t input_f = vld1q_f32(input_sample_ptr); - float32x4_t output_f = vfmaq_f32(new_offset_f, input_f, new_scale_f); - vst1q_f32(output_sample_ptr, output_f); - - input_sample_ptr += 4; - output_sample_ptr += 4; - new_scale_ptr += 4; - new_offset_ptr += 4; - } - for (index_t c = (ch_blks << 2); c < channel; ++c) { - *output_sample_ptr = new_scale[c] * *input_sample_ptr + new_offset[c]; - ++output_sample_ptr; - ++input_sample_ptr; - ++new_scale_ptr; - ++new_offset_ptr; - } - } - } -} - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/neon/conv_2d_neon.cc b/mace/kernels/neon/conv_2d_neon.cc deleted file mode 100644 index 8b937ddf539c04c717868c1b28992908821df4e4..0000000000000000000000000000000000000000 --- a/mace/kernels/neon/conv_2d_neon.cc +++ /dev/null @@ -1,109 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. 
-// - -#include "mace/kernels/conv_2d.h" -#include "mace/kernels/conv_pool_2d_util.h" - -namespace mace { -namespace kernels { - -extern void Conv2dNeonK1x1S1(const float *input, - const index_t *input_shape, - const float *filter, - const index_t *filter_shape, - const float *bias, - float *output, - const index_t *output_shape); - -extern void Conv2dNeonK3x3S1(const float *input, - const index_t *input_shape, - const float *filter, - const index_t *filter_shape, - const float *bias, - float *output, - const index_t *output_shape); - -extern void Conv2dNeonK3x3S2(const float *input, - const index_t *input_shape, - const float *filter, - const index_t *filter_shape, - const float *bias, - float *output, - const index_t *output_shape); - -extern void Conv2dNeonK5x5S1(const float *input, - const index_t *input_shape, - const float *filter, - const index_t *filter_shape, - const float *bias, - float *output, - const index_t *output_shape); - -template <> -void Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - MACE_CHECK_NOTNULL(input); - MACE_CHECK_NOTNULL(filter); - MACE_CHECK_NOTNULL(output); - - std::vector output_shape_vec(4); - std::vector paddings(2); - kernels::CalcPaddingAndOutputSize( - input->shape().data(), filter->shape().data(), dilations_, strides_, - paddings_, output_shape_vec.data(), paddings.data()); - output->Resize(output_shape_vec); - - typedef void (*Conv2dNeonFunction)( - const float *input, const index_t *input_shape, const float *filter, - const index_t *filter_shape, const float *bias, float *output, - const index_t *output_shape); - // Selection matrix: kernel_size x stride_size - static const Conv2dNeonFunction selector[5][2] = { - {Conv2dNeonK1x1S1, nullptr}, - {nullptr, nullptr}, - {Conv2dNeonK3x3S1, Conv2dNeonK3x3S2}, - {nullptr, nullptr}, - {Conv2dNeonK5x5S1, nullptr}}; - // not implement yet - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || - strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || - selector[kernel_h - 1][strides_[0] - 1] == nullptr) { - LOG(WARNING) << "NEON conv2d kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides_[0] << "x" << strides_[1] - << " is not implemented yet, using slow version"; - Conv2dFunctor(strides_, paddings_, dilations_)( - input, filter, bias, output, future); - return; - } - - Tensor padded_input; - // Keep this alive during kernel execution - if (paddings[0] > 0 || paddings[1] > 0) { - ConstructInputWithPadding(input, paddings.data(), &padded_input); - input = &padded_input; - } - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard bias_mapper(bias); - Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto input_shape = input->shape().data(); - auto filter_data = filter->data(); - auto bias_data = bias == nullptr ? 
nullptr : bias->data(); - auto output_data = output->mutable_data(); - auto output_shape = output->shape().data(); - - auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_neon_func(input_data, input_shape, filter_data, nullptr, bias_data, - output_data, output_shape); -} - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/neon/conv_2d_neon_1x1.cc b/mace/kernels/neon/conv_2d_neon_1x1.cc deleted file mode 100644 index 14c20cc387c03b41f2b190e038a693f9236514ab..0000000000000000000000000000000000000000 --- a/mace/kernels/neon/conv_2d_neon_1x1.cc +++ /dev/null @@ -1,475 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include -#include "mace/utils/utils.h" - -namespace mace { -namespace kernels { -static constexpr index_t kInputChannelBlockSize = 2; -static constexpr index_t kOutputChannelBlockSize = 4; -static __attribute__((__aligned__(64))) -int32_t mask_array[8] = {0, 0, 0, 0, -1, -1, -1, -1}; - -static inline void NeonConv2x4Kernel(index_t input_channels, - index_t pixel_size, - const float *input, - const float *filter, - float *output) { - const float *input0 = input; - const float *input1 = input + pixel_size; - - const float32x2_t vfilter0x = vld1_f32(filter); - filter += input_channels; - const float32x2_t vfilter1x = vld1_f32(filter); - filter += input_channels; - const float32x2_t vfilter2x = vld1_f32(filter); - filter += input_channels; - const float32x2_t vfilter3x = vld1_f32(filter); - - float *output0 = output; - float *output1 = output0 + pixel_size; - float *output2 = output1 + pixel_size; - float *output3 = output2 + pixel_size; - while (pixel_size >= 4) { - float32x4_t voutput0 = vld1q_f32(output0); - float32x4_t voutput1 = vld1q_f32(output1); - float32x4_t voutput2 = vld1q_f32(output2); - float32x4_t voutput3 = vld1q_f32(output3); - - const float32x4_t vinput0 = vld1q_f32(input0); - input0 += 4; - voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0); - voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0); - voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0); - voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0); - - const float32x4_t vinput1 = vld1q_f32(input1); - input1 += 4; - voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1); - voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1); - voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1); - voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1); - - vst1q_f32(output0, voutput0); - output0 += 4; - vst1q_f32(output1, voutput1); - output1 += 4; - vst1q_f32(output2, voutput2); - output2 += 4; - vst1q_f32(output3, voutput3); - output3 += 4; - - pixel_size -= 4; - } - if (pixel_size != 0) { - const int32x4_t vmask = vld1q_s32(&mask_array[pixel_size]); - - output0 = output0 + pixel_size - 4; - float32x4_t voutput0 = vld1q_f32(output0); - output1 = output1 + pixel_size - 4; - float32x4_t voutput1 = vld1q_f32(output1); - output2 = output2 + pixel_size - 4; - float32x4_t voutput2 = vld1q_f32(output2); - output3 = output3 + pixel_size - 4; - float32x4_t voutput3 = vld1q_f32(output3); - - const float32x4_t vinput0 = vreinterpretq_f32_s32(vandq_s32( - vmask, vreinterpretq_s32_f32(vld1q_f32(&input0[pixel_size - 4])))); - voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0); - voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0); - voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0); - voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0); - - const float32x4_t vinput1 = 
vreinterpretq_f32_s32(vandq_s32( - vmask, vreinterpretq_s32_f32(vld1q_f32(&input1[pixel_size - 4])))); - voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1); - voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1); - voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1); - voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1); - - vst1q_f32(output0, voutput0); - vst1q_f32(output1, voutput1); - vst1q_f32(output2, voutput2); - vst1q_f32(output3, voutput3); - } -} - -static inline void NeonConv2x4SubBlockKernel( - index_t input_channels_subblock_size, - index_t output_channels_subblock_size, - index_t input_channels, - index_t pixel_size, - const float *input, - const float *filter, - float *output) { - const float *input0 = input; - const float *input1 = input + pixel_size; - - float32x2_t vfilter0x, vfilter1x, vfilter2x, vfilter3x; - vfilter0x = vld1_dup_f32(&filter[0]); - if (input_channels_subblock_size > 1) { - vfilter0x = vld1_lane_f32(&filter[1], vfilter0x, 1); - } - if (output_channels_subblock_size > 1) { - filter += input_channels; - vfilter1x = vld1_dup_f32(&filter[0]); - if (input_channels_subblock_size > 1) { - vfilter1x = vld1_lane_f32(&filter[1], vfilter1x, 1); - } - if (output_channels_subblock_size > 2) { - filter += input_channels; - vfilter2x = vld1_dup_f32(&filter[0]); - if (input_channels_subblock_size > 1) { - vfilter2x = vld1_lane_f32(&filter[1], vfilter2x, 1); - } - if (output_channels_subblock_size > 3) { - filter += input_channels; - vfilter3x = vld1_dup_f32(&filter[0]); - if (input_channels_subblock_size > 1) { - vfilter3x = vld1_lane_f32(&filter[1], vfilter3x, 1); - } - } - } - } - - float *output0 = output; - float *output1 = output0 + pixel_size; - float *output2 = output1 + pixel_size; - float *output3 = output2 + pixel_size; - while (pixel_size >= 4) { - float32x4_t voutput0, voutput1, voutput2, voutput3; - voutput0 = vld1q_f32(output0); - if (output_channels_subblock_size > 1) { - voutput1 = vld1q_f32(output1); - if (output_channels_subblock_size > 2) { - voutput2 = vld1q_f32(output2); - if (output_channels_subblock_size > 3) { - voutput3 = vld1q_f32(output3); - } - } - } - - const float32x4_t vinput0 = vld1q_f32(input0); - input0 += 4; - voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0); - voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0); - voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0); - voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0); - - if (input_channels_subblock_size > 1) { - const float32x4_t vinput1 = vld1q_f32(input1); - input1 += 4; - voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1); - voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1); - voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1); - voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1); - } - - vst1q_f32(output0, voutput0); - output0 += 4; - if (output_channels_subblock_size > 1) { - vst1q_f32(output1, voutput1); - output1 += 4; - if (output_channels_subblock_size > 2) { - vst1q_f32(output2, voutput2); - output2 += 4; - if (output_channels_subblock_size > 3) { - vst1q_f32(output3, voutput3); - output3 += 4; - } - } - } - - pixel_size -= 4; - } - if (pixel_size != 0) { - const int32x4_t vmask = vld1q_s32(&mask_array[pixel_size]); - - float32x4_t voutput0, voutput1, voutput2, voutput3; - output0 += pixel_size - 4; - voutput0 = vld1q_f32(output0); - if (output_channels_subblock_size > 1) { - output1 += pixel_size - 4; - voutput1 = vld1q_f32(output1); - if (output_channels_subblock_size > 2) { - 
output2 += pixel_size - 4; - voutput2 = vld1q_f32(output2); - if (output_channels_subblock_size > 3) { - output3 += pixel_size - 4; - voutput3 = vld1q_f32(output3); - } - } - } - - const float32x4_t vinput0 = vreinterpretq_f32_s32(vandq_s32( - vmask, vreinterpretq_s32_f32(vld1q_f32(&input0[pixel_size - 4])))); - voutput0 = vfmaq_lane_f32(voutput0, vinput0, vfilter0x, 0); - voutput1 = vfmaq_lane_f32(voutput1, vinput0, vfilter1x, 0); - voutput2 = vfmaq_lane_f32(voutput2, vinput0, vfilter2x, 0); - voutput3 = vfmaq_lane_f32(voutput3, vinput0, vfilter3x, 0); - - if (input_channels_subblock_size > 1) { - const float32x4_t vinput1 = vreinterpretq_f32_s32(vandq_s32( - vmask, vreinterpretq_s32_f32(vld1q_f32(&input1[pixel_size - 4])))); - voutput0 = vfmaq_lane_f32(voutput0, vinput1, vfilter0x, 1); - voutput1 = vfmaq_lane_f32(voutput1, vinput1, vfilter1x, 1); - voutput2 = vfmaq_lane_f32(voutput2, vinput1, vfilter2x, 1); - voutput3 = vfmaq_lane_f32(voutput3, vinput1, vfilter3x, 1); - } - - vst1q_f32(output0, voutput0); - if (output_channels_subblock_size > 1) { - vst1q_f32(output1, voutput1); - if (output_channels_subblock_size > 2) { - vst1q_f32(output2, voutput2); - if (output_channels_subblock_size > 3) { - vst1q_f32(output3, voutput3); - } - } - } - } -} - -void Conv2dNeonK1x1S1(const float *input, // NCHW - const index_t *input_shape, - const float *filter, // c_out, c_in, filter_h, filter_w - const index_t *filter_shape, - const float *bias, // c_out - float *output, // NCHW - const index_t *output_shape) { - const index_t batch = output_shape[0]; - const index_t channels = output_shape[1]; - const index_t height = output_shape[2]; - const index_t width = output_shape[3]; - - const index_t input_batch = input_shape[0]; - const index_t input_channels = input_shape[1]; - const index_t input_height = input_shape[2]; - const index_t input_width = input_shape[3]; - - MACE_CHECK(input_batch == batch && input_height == height && - input_width == width); - - const index_t total_pixels = height * width; - const index_t round_up_channels = RoundUp(channels, kOutputChannelBlockSize); - -#pragma omp parallel for collapse(2) - for (index_t n = 0; n < batch; ++n) { - for (int i = 0; i < channels; ++i) { - float *output_ptr_base = - output + n * channels * total_pixels + i * total_pixels; - std::fill(output_ptr_base, output_ptr_base + total_pixels, - bias ? 
bias[i] : 0); - } - } -#pragma omp parallel for collapse(2) - for (index_t n = 0; n < batch; ++n) { - for (index_t c = 0; c < round_up_channels; c += kOutputChannelBlockSize) { - const float *input_ptr = input + n * input_channels * total_pixels; - const float *filter_ptr = filter + c * input_channels; - float *output_ptr = - output + n * channels * total_pixels + c * total_pixels; - const index_t output_channel_block_size = - std::min(channels - c, kOutputChannelBlockSize); - index_t remain_input_channels = input_channels; - if (c + kOutputChannelBlockSize <= channels) { - while (remain_input_channels >= kInputChannelBlockSize) { - NeonConv2x4Kernel(input_channels, total_pixels, input_ptr, filter_ptr, - output_ptr); - - input_ptr += kInputChannelBlockSize * total_pixels; - filter_ptr += kInputChannelBlockSize; - remain_input_channels -= kInputChannelBlockSize; - } - } - while (remain_input_channels != 0) { - const index_t input_channel_block_size = - std::min(remain_input_channels, kInputChannelBlockSize); - NeonConv2x4SubBlockKernel( - input_channel_block_size, output_channel_block_size, input_channels, - total_pixels, input_ptr, filter_ptr, output_ptr); - input_ptr += kInputChannelBlockSize * total_pixels; - filter_ptr += kInputChannelBlockSize; - remain_input_channels -= input_channel_block_size; - } - } - } -} - -void Conv2dNeonPixelK1x1S1( - const float *input, // NCHW - const index_t *input_shape, - const float *filter, // c_out, c_in, kernel_h, kernel_w - const index_t *filter_shape, - const float *bias, // c_out - float *output, // NCHW - const index_t *output_shape) { - const index_t batch = output_shape[0]; - const index_t channels = output_shape[1]; - const index_t height = output_shape[2]; - const index_t width = output_shape[3]; - - const index_t input_batch = input_shape[0]; - const index_t input_channels = input_shape[1]; - const index_t input_height = input_shape[2]; - const index_t input_width = input_shape[3]; - - MACE_CHECK(input_batch == batch && input_height == height && - input_width == width); - - const index_t total_pixels = height * width; - // Process 4 * 2 = 8 pixels for each innermost loop - // TODO(heliangliang): Does 64 bit v.s. 32 bit index matters? need benchmark - const index_t total_loops = total_pixels >> 3; - const index_t loop_remaining = total_pixels & 7; - -#pragma omp parallel for collapse(2) - for (index_t n = 0; n < batch; ++n) { - for (index_t c = 0; c < channels; ++c) { - const float *filter_ptr = filter + c * input_channels; - // TODO(heliangliang): Will GCC opt these out? - float *channel_output_start = - output + n * channels * height * width + c * height * width; - const float *input_ptr = - input + n * input_channels * input_height * input_width; - - // Fill with bias - float *output_ptr = channel_output_start; - std::fill(output_ptr, output_ptr + total_pixels, bias ? 
bias[c] : 0); - - index_t inc = 0; - // Process 4 input channels in batch - for (; inc + 3 < input_channels; inc += 4) { - float *output_ptr = channel_output_start; - // The begining of each input feature map channel - MACE_ASSERT(input_ptr == - input + n * input_channels * input_height * input_width + - inc * input_height * input_width); - - const float *input_ptr1 = input_ptr + total_pixels; - const float *input_ptr2 = input_ptr1 + total_pixels; - const float *input_ptr3 = input_ptr2 + total_pixels; - - // filter is in c_out, c_in, 1, 1 order - MACE_ASSERT(filter_ptr == filter + c * input_channels + inc); - const float k0 = filter_ptr[0]; - const float k1 = filter_ptr[1]; - const float k2 = filter_ptr[2]; - const float k3 = filter_ptr[3]; - filter_ptr += 4; - - const float32x4_t vk0 = vdupq_n_f32(k0); - const float32x4_t vk1 = vdupq_n_f32(k1); - const float32x4_t vk2 = vdupq_n_f32(k2); - const float32x4_t vk3 = vdupq_n_f32(k3); - - index_t loop_itr = total_loops; - for (; loop_itr > 0; --loop_itr) { - // Process 2 group of 4 floats - float32x4_t out0 = vld1q_f32(output_ptr); - float32x4_t out4 = vld1q_f32(output_ptr + 4); - - const float32x4_t in00 = vld1q_f32(input_ptr); - const float32x4_t in04 = vld1q_f32(input_ptr + 4); - - out0 = vfmaq_f32(out0, in00, vk0); - out4 = vfmaq_f32(out4, in04, vk0); - - const float32x4_t in10 = vld1q_f32(input_ptr1); - const float32x4_t in14 = vld1q_f32(input_ptr1 + 4); - - out0 = vfmaq_f32(out0, in10, vk1); - out4 = vfmaq_f32(out4, in14, vk1); - - const float32x4_t in20 = vld1q_f32(input_ptr2); - const float32x4_t in24 = vld1q_f32(input_ptr2 + 4); - - out0 = vfmaq_f32(out0, in20, vk2); - out4 = vfmaq_f32(out4, in24, vk2); - - const float32x4_t in30 = vld1q_f32(input_ptr3); - const float32x4_t in34 = vld1q_f32(input_ptr3 + 4); - - out0 = vfmaq_f32(out0, in30, vk3); - out4 = vfmaq_f32(out4, in34, vk3); - - float prev_output = output_ptr[0]; - // Save output - vst1q_f32(output_ptr, out0); - vst1q_f32(output_ptr + 4, out4); - - output_ptr += 8; - input_ptr += 8; - input_ptr1 += 8; - input_ptr2 += 8; - input_ptr3 += 8; - } - // Process the remaining pixels - index_t remaining_pixels = loop_remaining; - for (; remaining_pixels > 0; --remaining_pixels) { - const float mul = *input_ptr * k0; - const float mul1 = *input_ptr1 * k1; - const float mul2 = *input_ptr2 * k2; - const float mul3 = *input_ptr3 * k3; - - float prev_output = output_ptr[0]; - *output_ptr += mul + mul1 + mul2 + mul3; - - ++output_ptr; - ++input_ptr; - ++input_ptr1; - ++input_ptr2; - ++input_ptr3; - } - // Skip these 4 feature maps - input_ptr += 3 * total_pixels; - } - // Process the remaining channels - for (; inc < input_channels; ++inc) { - float *output_ptr = channel_output_start; - MACE_ASSERT(input_ptr == - input + n * input_channels * input_height * input_width + - inc * input_height * input_width); - MACE_ASSERT(filter_ptr == filter + c * input_channels + inc); - - const float k0 = filter_ptr[0]; - ++filter_ptr; - const float32x4_t vk0 = vdupq_n_f32(k0); - - index_t loop_itr = total_loops; - for (; loop_itr > 0; --loop_itr) { - float32x4_t out0 = vld1q_f32(output_ptr); - float32x4_t out4 = vld1q_f32(output_ptr + 4); - - const float32x4_t in0 = vld1q_f32(input_ptr); - const float32x4_t in4 = vld1q_f32(input_ptr + 4); - - out0 = vfmaq_f32(out0, in0, vk0); - out4 = vfmaq_f32(out4, in4, vk0); - - // Save output - vst1q_f32(output_ptr, out0); - vst1q_f32(output_ptr + 4, out4); - - output_ptr += 8; - input_ptr += 8; - } - // Process the remaining pixels - index_t remaining_pixels = 
loop_remaining; - for (; remaining_pixels > 0; --remaining_pixels) { - const float mul = *input_ptr * k0; - - *output_ptr += mul; - - ++output_ptr; - ++input_ptr; - } - } - } - } -} - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/neon/conv_2d_neon_3x3.cc b/mace/kernels/neon/conv_2d_neon_3x3.cc deleted file mode 100644 index af1e83cba7e73dd0e15bb2e52b79ccd645c0ec24..0000000000000000000000000000000000000000 --- a/mace/kernels/neon/conv_2d_neon_3x3.cc +++ /dev/null @@ -1,331 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include - -namespace mace { -namespace kernels { - -static const int kRegisterSize = 4; -static const int kFilterSize = 9; - -void Conv2dNeonK3x3S1(const float *input, // NCHW - const index_t *input_shape, - const float *filter, // c_out, c_in, kernel_h, kernel_w - const index_t *filter_shape, - const float *bias, // c_out - float *output, // NCHW - const index_t *output_shape) { - int height_count = (output_shape[2] >> 1) << 1; - - int output_batch = output_shape[0]; - int output_channels = output_shape[1]; - int output_height = output_shape[2]; - int output_width = output_shape[3]; - int input_batch = input_shape[0]; - int input_channels = input_shape[1]; - int input_height = input_shape[2]; - int input_width = input_shape[3]; - int multiplier = filter_shape == nullptr ? 0 : filter_shape[0]; - int filter_in_channels = filter_shape == nullptr ? input_channels : 1; -#pragma omp parallel for collapse(2) - for (int b = 0; b < output_batch; ++b) { - for (int oc = 0; oc < output_channels; ++oc) { - float *output_ptr_base = - output + b * output_channels * output_height * output_width; - const float *filter_ptr = filter + oc * filter_in_channels * kFilterSize; - const float *input_ptr = - input + b * input_channels * input_height * input_width; - if (filter_shape != nullptr) { - input_ptr += (oc / multiplier) * input_height * input_width; - } - float *output_ptr = output_ptr_base + oc * output_height * output_width; - std::fill(output_ptr, output_ptr + output_height * output_width, - bias ? 
bias[oc] : 0); - for (int ic = 0; ic < filter_in_channels; ++ic) { - float32x4_t n_filter_v[3] = {vld1q_f32(filter_ptr), - vld1q_f32(filter_ptr + 3), - vld1q_f32(filter_ptr + 6)}; - - const float *row_ptr_v[kRegisterSize] = { - input_ptr, input_ptr + input_width, input_ptr + 2 * input_width, - input_ptr + 3 * input_width}; - - float *output_ptr_v[] = {output_ptr, output_ptr + output_width}; - - for (int h = 0; h < height_count; h += 2) { - int count = output_width >> 2; - int remain_count = output_width & 3; - - for (; count > 0; --count) { - float32x4_t n_sum0 = vdupq_n_f32(.0f); - - float32x4_t n_row_former = vld1q_f32(row_ptr_v[0]); - float32x4_t n_row_latter = vld1q_f32(row_ptr_v[0] + kRegisterSize); - float32x4_t n_row_ext0 = vextq_f32(n_row_former, n_row_latter, 1); - float32x4_t n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 2); - n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_former, n_filter_v[0], 0); - n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_ext0, n_filter_v[0], 1); - n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_ext1, n_filter_v[0], 2); - - float32x4_t n_row1_former = vld1q_f32(row_ptr_v[1]); - float32x4_t n_row1_latter = vld1q_f32(row_ptr_v[1] + kRegisterSize); - float32x4_t n_row1_ext0 = - vextq_f32(n_row1_former, n_row1_latter, 1); - float32x4_t n_row1_ext1 = - vextq_f32(n_row1_former, n_row1_latter, 2); - n_sum0 = vfmaq_laneq_f32(n_sum0, n_row1_former, n_filter_v[1], 0); - n_sum0 = vfmaq_laneq_f32(n_sum0, n_row1_ext0, n_filter_v[1], 1); - n_sum0 = vfmaq_laneq_f32(n_sum0, n_row1_ext1, n_filter_v[1], 2); - - n_row_former = vld1q_f32(row_ptr_v[2]); - n_row_latter = vld1q_f32(row_ptr_v[2] + kRegisterSize); - n_row_ext0 = vextq_f32(n_row_former, n_row_latter, 1); - n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 2); - n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_former, n_filter_v[2], 0); - n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_ext0, n_filter_v[2], 1); - n_sum0 = vfmaq_laneq_f32(n_sum0, n_row_ext1, n_filter_v[2], 2); - - // second row - float32x4_t n_sum1 = vdupq_n_f32(.0f); - - n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_former, n_filter_v[0], 0); - n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_ext0, n_filter_v[0], 1); - n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_ext1, n_filter_v[0], 2); - - n_sum1 = vfmaq_laneq_f32(n_sum1, n_row_former, n_filter_v[1], 0); - n_sum1 = vfmaq_laneq_f32(n_sum1, n_row_ext0, n_filter_v[1], 1); - n_sum1 = vfmaq_laneq_f32(n_sum1, n_row_ext1, n_filter_v[1], 2); - - n_row1_former = vld1q_f32(row_ptr_v[3]); - n_row1_latter = vld1q_f32(row_ptr_v[3] + kRegisterSize); - n_row1_ext0 = vextq_f32(n_row1_former, n_row1_latter, 1); - n_row1_ext1 = vextq_f32(n_row1_former, n_row1_latter, 2); - n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_former, n_filter_v[2], 0); - n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_ext0, n_filter_v[2], 1); - n_sum1 = vfmaq_laneq_f32(n_sum1, n_row1_ext1, n_filter_v[2], 2); - - float32x4_t n_output_row = vld1q_f32(output_ptr_v[0]); - float32x4_t n_output_row1 = vld1q_f32(output_ptr_v[1]); - n_output_row = vaddq_f32(n_output_row, n_sum0); - n_output_row1 = vaddq_f32(n_output_row1, n_sum1); - vst1q_f32(output_ptr_v[0], n_output_row); - vst1q_f32(output_ptr_v[1], n_output_row1); - output_ptr_v[0] += kRegisterSize; - output_ptr_v[1] += kRegisterSize; - for (int i = 0; i < kRegisterSize; ++i) { - row_ptr_v[i] += kRegisterSize; - } - } - for (; remain_count > 0; --remain_count) { - float32x4_t n_row_v[] = {vld1q_f32(row_ptr_v[0]), - vld1q_f32(row_ptr_v[1]), - vld1q_f32(row_ptr_v[2])}; - float32x4_t n_sum0 = vmulq_f32(n_row_v[0], n_filter_v[0]); - n_sum0 = vmlaq_f32(n_sum0, 
n_row_v[1], n_filter_v[1]); - n_sum0 = vmlaq_f32(n_sum0, n_row_v[2], n_filter_v[2]); - n_sum0 = vsetq_lane_f32(*output_ptr_v[0], n_sum0, 3); - *output_ptr_v[0] = vaddvq_f32(n_sum0); - - float32x4_t n_row3 = vld1q_f32(row_ptr_v[3]); - float32x4_t n_sum1 = vmulq_f32(n_row_v[1], n_filter_v[0]); - n_sum1 = vmlaq_f32(n_sum1, n_row_v[2], n_filter_v[1]); - n_sum1 = vmlaq_f32(n_sum1, n_row3, n_filter_v[2]); - n_sum1 = vsetq_lane_f32(*output_ptr_v[1], n_sum1, 3); - *output_ptr_v[1] = vaddvq_f32(n_sum1); - - ++output_ptr_v[0]; - ++output_ptr_v[1]; - for (int i = 0; i < kRegisterSize; ++i) { - row_ptr_v[i] += 1; - } - } - output_ptr_v[0] += output_width; - output_ptr_v[1] += output_width; - for (int i = 0; i < kRegisterSize; ++i) { - row_ptr_v[i] += 2 + input_width; - } - } - - if (output_height != height_count) { - int count = output_width >> 2; - int remain_count = output_width & 3; - for (; count > 0; --count) { - float32x4_t n_sum = vdupq_n_f32(.0f); - float32x4_t n_row_former = vld1q_f32(row_ptr_v[0]); - float32x4_t n_row_latter = vld1q_f32(row_ptr_v[0] + kRegisterSize); - float32x4_t n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 1); - float32x4_t n_row_ext2 = vextq_f32(n_row_former, n_row_latter, 2); - n_sum = vfmaq_laneq_f32(n_sum, n_row_former, n_filter_v[0], 0); - n_sum = vfmaq_laneq_f32(n_sum, n_row_ext1, n_filter_v[0], 1); - n_sum = vfmaq_laneq_f32(n_sum, n_row_ext2, n_filter_v[0], 2); - - n_row_former = vld1q_f32(row_ptr_v[1]); - n_row_latter = vld1q_f32(row_ptr_v[1] + kRegisterSize); - n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 1); - n_row_ext2 = vextq_f32(n_row_former, n_row_latter, 2); - n_sum = vfmaq_laneq_f32(n_sum, n_row_former, n_filter_v[1], 0); - n_sum = vfmaq_laneq_f32(n_sum, n_row_ext1, n_filter_v[1], 1); - n_sum = vfmaq_laneq_f32(n_sum, n_row_ext2, n_filter_v[1], 2); - - n_row_former = vld1q_f32(row_ptr_v[2]); - n_row_latter = vld1q_f32(row_ptr_v[2] + kRegisterSize); - n_row_ext1 = vextq_f32(n_row_former, n_row_latter, 1); - n_row_ext2 = vextq_f32(n_row_former, n_row_latter, 2); - n_sum = vfmaq_laneq_f32(n_sum, n_row_former, n_filter_v[2], 0); - n_sum = vfmaq_laneq_f32(n_sum, n_row_ext1, n_filter_v[2], 1); - n_sum = vfmaq_laneq_f32(n_sum, n_row_ext2, n_filter_v[2], 2); - - float32x4_t n_output_row = vld1q_f32(output_ptr_v[0]); - n_output_row = vaddq_f32(n_output_row, n_sum); - vst1q_f32(output_ptr_v[0], n_output_row); - output_ptr_v[0] += kRegisterSize; - for (int i = 0; i < 3; ++i) { - row_ptr_v[i] += kRegisterSize; - } - } - for (; remain_count > 0; --remain_count) { - float32x4_t n_row_v[] = { - vld1q_f32(row_ptr_v[0]), vld1q_f32(row_ptr_v[1]), - vld1q_f32(row_ptr_v[2]), - }; - - float32x4_t n_sum = vmulq_f32(n_row_v[0], n_filter_v[0]); - n_sum = vmlaq_f32(n_sum, n_row_v[1], n_filter_v[1]); - n_sum = vmlaq_f32(n_sum, n_row_v[2], n_filter_v[2]); - n_sum = vsetq_lane_f32(*output_ptr_v[0], n_sum, 3); - *output_ptr_v[0] = vaddvq_f32(n_sum); - - ++output_ptr_v[0]; - for (int i = 0; i < 3; ++i) { - row_ptr_v[i] += 1; - } - } - } - - filter_ptr += kFilterSize; - input_ptr += input_height * input_width; - } - } - } -} - -void Conv2dNeonK3x3S2(const float *input, // NCHW - const index_t *input_shape, - const float *filter, // c_out, c_in, kernel_h, kernel_w - const index_t *filter_shape, - const float *bias, // c_out - float *output, // NCHW - const index_t *output_shape) { - int tail_step = 2 * (input_shape[3] - output_shape[3]); - - int output_batch = output_shape[0]; - int output_channels = output_shape[1]; - int output_height = output_shape[2]; - int output_width = 
output_shape[3]; - int input_batch = input_shape[0]; - int input_channels = input_shape[1]; - int input_height = input_shape[2]; - int input_width = input_shape[3]; - int multiplier = filter_shape == nullptr ? 0 : filter_shape[0]; - int filter_in_channels = filter_shape == nullptr ? input_channels : 1; - -#pragma omp parallel for collapse(2) - for (int b = 0; b < output_batch; ++b) { - for (int oc = 0; oc < output_channels; ++oc) { - float *output_ptr_base = - output + b * output_channels * output_height * output_width; - const float *filter_ptr = filter + oc * filter_in_channels * kFilterSize; - const float *input_ptr = - input + b * input_channels * input_height * input_width; - if (filter_shape != nullptr) { - input_ptr += (oc / multiplier) * input_height * input_width; - } - float *output_ptr = output_ptr_base + oc * output_height * output_width; - std::fill(output_ptr, output_ptr + output_height * output_width, - bias ? bias[oc] : 0); - for (int ic = 0; ic < filter_in_channels; ++ic) { - float32x4_t n_filter_v[3] = {vld1q_f32(filter_ptr), - vld1q_f32(filter_ptr + 3), - vld1q_f32(filter_ptr + 6)}; - - const float *row_ptr_v[3] = {input_ptr, input_ptr + input_width, - input_ptr + 2 * input_width}; - - float *output_ptr_inner = output_ptr; - - for (int h = 0; h < output_height; ++h) { - int count = output_width >> 2; - int remain_count = output_width & 3; - - for (; count > 0; --count) { - float32x4_t n_sum = vdupq_n_f32(.0f); - - float32x4x2_t n_row_former = vld2q_f32(row_ptr_v[0]); - float32x4_t n_row_latter = vld1q_f32(row_ptr_v[0] + 8); - float32x4_t n_row_ext = - vextq_f32(n_row_former.val[0], n_row_latter, 1); - - n_sum = - vfmaq_laneq_f32(n_sum, n_row_former.val[0], n_filter_v[0], 0); - n_sum = - vfmaq_laneq_f32(n_sum, n_row_former.val[1], n_filter_v[0], 1); - n_sum = vfmaq_laneq_f32(n_sum, n_row_ext, n_filter_v[0], 2); - - float32x4x2_t n_row1_former = vld2q_f32(row_ptr_v[1]); - float32x4_t n_row1_latter = vld1q_f32(row_ptr_v[1] + 8); - float32x4_t n_row1_ext = - vextq_f32(n_row1_former.val[0], n_row1_latter, 1); - n_sum = - vfmaq_laneq_f32(n_sum, n_row1_former.val[0], n_filter_v[1], 0); - n_sum = - vfmaq_laneq_f32(n_sum, n_row1_former.val[1], n_filter_v[1], 1); - n_sum = vfmaq_laneq_f32(n_sum, n_row1_ext, n_filter_v[1], 2); - - float32x4x2_t n_row2_former = vld2q_f32(row_ptr_v[2]); - float32x4_t n_row2_latter = vld1q_f32(row_ptr_v[2] + 8); - float32x4_t n_row2_ext = - vextq_f32(n_row2_former.val[0], n_row2_latter, 1); - n_sum = - vfmaq_laneq_f32(n_sum, n_row2_former.val[0], n_filter_v[2], 0); - n_sum = - vfmaq_laneq_f32(n_sum, n_row2_former.val[1], n_filter_v[2], 1); - n_sum = vfmaq_laneq_f32(n_sum, n_row2_ext, n_filter_v[2], 2); - - float32x4_t n_output_row = vld1q_f32(output_ptr_inner); - n_output_row = vaddq_f32(n_output_row, n_sum); - vst1q_f32(output_ptr_inner, n_output_row); - output_ptr_inner += kRegisterSize; - for (int i = 0; i < 3; ++i) { - row_ptr_v[i] += 2 * kRegisterSize; - } - } - for (; remain_count > 0; --remain_count) { - float32x4_t n_row_v[] = {vld1q_f32(row_ptr_v[0]), - vld1q_f32(row_ptr_v[1]), - vld1q_f32(row_ptr_v[2])}; - float32x4_t n_sum = vmulq_f32(n_row_v[0], n_filter_v[0]); - n_sum = vmlaq_f32(n_sum, n_row_v[1], n_filter_v[1]); - n_sum = vmlaq_f32(n_sum, n_row_v[2], n_filter_v[2]); - n_sum = vsetq_lane_f32(*output_ptr_inner, n_sum, 3); - *output_ptr_inner = vaddvq_f32(n_sum); - - ++output_ptr_inner; - for (int i = 0; i < 3; ++i) { - row_ptr_v[i] += 2; - } - } - for (int i = 0; i < 3; ++i) { - row_ptr_v[i] += tail_step; - } - } - - filter_ptr += 
kFilterSize; - input_ptr += input_height * input_width; - } - } - } -} -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/neon/conv_2d_neon_5x5.cc b/mace/kernels/neon/conv_2d_neon_5x5.cc deleted file mode 100644 index 709610a3ddc57b65eb83d85c81e9bb65cd732f19..0000000000000000000000000000000000000000 --- a/mace/kernels/neon/conv_2d_neon_5x5.cc +++ /dev/null @@ -1,419 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// -#ifndef MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_ -#define MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_ - -#include - -namespace mace { -namespace kernels { - -void Conv2dNeonK5x5S1(const float *input, // NCHW - const index_t *input_shape, - const float *filter, // c_out, c_in, kernel_h, kernel_w - const index_t *filter_shape, - const float *bias, // c_out - float *output, // NCHW - const index_t *output_shape) { - const index_t batch = output_shape[0]; - const index_t channels = output_shape[1]; - const index_t height = output_shape[2]; - const index_t width = output_shape[3]; - - const index_t input_batch = input_shape[0]; - const index_t input_channels = input_shape[1]; - const index_t input_height = input_shape[2]; - const index_t input_width = input_shape[3]; - - MACE_ASSERT(input_batch == batch); - - const index_t input_total_pixels_per_channel = input_height * input_width; - const index_t output_total_pixels_per_channel = height * width; - const index_t input_total_pixels_per_batch = - input_total_pixels_per_channel * input_channels; - const index_t output_total_pixels_per_batch = - output_total_pixels_per_channel * channels; - const index_t patch_size = input_channels * 25; - -#pragma omp parallel for collapse(2) - for (index_t n = 0; n < batch; ++n) { - for (index_t c = 0; c < channels; ++c) { - float *output_ptr = output + n * output_total_pixels_per_batch + - c * output_total_pixels_per_channel; - const float *input_ptr = input + n * input_total_pixels_per_batch; - - // Fill with bias - std::fill(output_ptr, output_ptr + output_total_pixels_per_channel, - bias ? 
bias[c] : 0); - - for (index_t inc = 0; inc < input_channels; ++inc) { - float *outptr = output_ptr; - float *outptr2 = outptr + width; - - const float *inptr = input_ptr + inc * input_total_pixels_per_channel; - const float *filter_ptr = filter + c * patch_size + inc * 25; - - const float *r0 = inptr; - const float *r1 = inptr + input_width; - const float *r2 = inptr + input_width * 2; - const float *r3 = inptr + input_width * 3; - const float *r4 = inptr + input_width * 4; - const float *r5 = inptr + input_width * 5; - - const float *k0 = filter_ptr; - const float *k1 = filter_ptr + 5; - const float *k2 = filter_ptr + 10; - const float *k3 = filter_ptr + 15; - const float *k4 = filter_ptr + 20; - - float32x4_t _k0123 = vld1q_f32(filter_ptr); - float32x4_t _k4567 = vld1q_f32(filter_ptr + 4); - float32x4_t _k891011 = vld1q_f32(filter_ptr + 8); - float32x4_t _k12131415 = vld1q_f32(filter_ptr + 12); - float32x4_t _k16171819 = vld1q_f32(filter_ptr + 16); - float32x4_t _k20212223 = vld1q_f32(filter_ptr + 20); - float32x4_t _k24242424 = vdupq_n_f32(filter_ptr[24]); - - // height_block_size = 2, width_block_size = 4 - int h = 0; - for (; h + 1 < height; h += 2) { - int width_blocks = width >> 2; - int remain = width - (width_blocks << 2); - - for (; width_blocks > 0; --width_blocks) { - float32x4_t _sum = vld1q_f32(outptr); - float32x4_t _sum2 = vld1q_f32(outptr2); - - float32x4_t _r00 = vld1q_f32(r0); - float32x4_t _r04 = vld1q_f32(r0 + 4); - float32x4_t _r01 = vextq_f32(_r00, _r04, 1); - float32x4_t _r02 = vextq_f32(_r00, _r04, 2); - float32x4_t _r03 = vextq_f32(_r00, _r04, 3); - - float32x4_t _r10 = vld1q_f32(r1); - float32x4_t _r14 = vld1q_f32(r1 + 4); - float32x4_t _r11 = vextq_f32(_r10, _r14, 1); - float32x4_t _r12 = vextq_f32(_r10, _r14, 2); - float32x4_t _r13 = vextq_f32(_r10, _r14, 3); - - float32x4_t _r20 = vld1q_f32(r2); - float32x4_t _r24 = vld1q_f32(r2 + 4); - float32x4_t _r21 = vextq_f32(_r20, _r24, 1); - float32x4_t _r22 = vextq_f32(_r20, _r24, 2); - float32x4_t _r23 = vextq_f32(_r20, _r24, 3); - - float32x4_t _r30 = vld1q_f32(r3); - float32x4_t _r34 = vld1q_f32(r3 + 4); - float32x4_t _r31 = vextq_f32(_r30, _r34, 1); - float32x4_t _r32 = vextq_f32(_r30, _r34, 2); - float32x4_t _r33 = vextq_f32(_r30, _r34, 3); - - float32x4_t _r40 = vld1q_f32(r4); - float32x4_t _r44 = vld1q_f32(r4 + 4); - float32x4_t _r41 = vextq_f32(_r40, _r44, 1); - float32x4_t _r42 = vextq_f32(_r40, _r44, 2); - float32x4_t _r43 = vextq_f32(_r40, _r44, 3); - - float32x4_t _r50 = vld1q_f32(r5); - float32x4_t _r54 = vld1q_f32(r5 + 4); - float32x4_t _r51 = vextq_f32(_r50, _r54, 1); - float32x4_t _r52 = vextq_f32(_r50, _r54, 2); - float32x4_t _r53 = vextq_f32(_r50, _r54, 3); - - _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); - _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); - _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); - _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); - _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); - - _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); - _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); - _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); - _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); - _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); - - _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); - _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); - _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); - _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); - _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); - - _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); - _sum = vfmaq_laneq_f32(_sum, _r31, 
_k16171819, 0); - _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); - _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); - _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); - - _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); - _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); - _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); - _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); - _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); - - _sum2 = vfmaq_laneq_f32(_sum2, _r10, _k0123, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r11, _k0123, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r12, _k0123, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r13, _k0123, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r14, _k4567, 0); - - _sum2 = vfmaq_laneq_f32(_sum2, _r20, _k4567, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r21, _k4567, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r22, _k4567, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r23, _k891011, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r24, _k891011, 1); - - _sum2 = vfmaq_laneq_f32(_sum2, _r30, _k891011, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r31, _k891011, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r32, _k12131415, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r33, _k12131415, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r34, _k12131415, 2); - - _sum2 = vfmaq_laneq_f32(_sum2, _r40, _k12131415, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r41, _k16171819, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r42, _k16171819, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r43, _k16171819, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r44, _k16171819, 3); - - _sum2 = vfmaq_laneq_f32(_sum2, _r50, _k20212223, 0); - _sum2 = vfmaq_laneq_f32(_sum2, _r51, _k20212223, 1); - _sum2 = vfmaq_laneq_f32(_sum2, _r52, _k20212223, 2); - _sum2 = vfmaq_laneq_f32(_sum2, _r53, _k20212223, 3); - _sum2 = vfmaq_laneq_f32(_sum2, _r54, _k24242424, 0); - - vst1q_f32(outptr, _sum); - vst1q_f32(outptr2, _sum2); - - r0 += 4; - r1 += 4; - r2 += 4; - r3 += 4; - r4 += 4; - r5 += 4; - outptr += 4; - outptr2 += 4; - } - - for (; remain > 0; --remain) { - float sum = 0; - float sum2 = 0; - - float32x4_t _r1 = vld1q_f32(r1); - float32x4_t _k1 = vld1q_f32(k1); - float32x4_t _sum = vmulq_f32(_r1, _k1); - float32x4_t _sum2 = vmulq_f32(_r1, _k0123); - - float32x4_t _r2 = vld1q_f32(r2); - float32x4_t _k2 = vld1q_f32(k2); - _sum = vmlaq_f32(_sum, _r2, _k2); - _sum2 = vmlaq_f32(_sum2, _r2, _k1); - - float32x4_t _r3 = vld1q_f32(r3); - float32x4_t _k3 = vld1q_f32(k3); - _sum = vmlaq_f32(_sum, _r3, _k3); - _sum2 = vmlaq_f32(_sum2, _r3, _k2); - - float32x4_t _r4 = vld1q_f32(r4); - _sum = vmlaq_f32(_sum, _r4, _k20212223); - _sum2 = vmlaq_f32(_sum2, _r4, _k3); - - float32x4_t _r0 = vld1q_f32(r0); - _sum = vmlaq_f32(_sum, _r0, _k0123); - float32x4_t _r5 = vld1q_f32(r5); - _sum2 = vmlaq_f32(_sum2, _r5, _k20212223); - - float32x4_t _k_t4; - _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0); - _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1); - _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2); - _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3); - - float32x4_t _r_t4; - - _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0); - _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1); - _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2); - _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3); - _sum = vmlaq_f32(_sum, _r_t4, _k_t4); - - sum = r4[4] * k4[4]; - - _r_t4 = vextq_f32(_r_t4, _r_t4, 1); - _r_t4 = vsetq_lane_f32(r4[4], _r_t4, 3); - _sum2 = vmlaq_f32(_sum2, _r_t4, _k_t4); - - sum2 = r5[4] * k4[4]; - - float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum)); - float32x2_t _ss2 = - vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2)); - float32x2_t _ss_ss2 = vpadd_f32(_ss, _ss2); - - sum 
+= vget_lane_f32(_ss_ss2, 0); - sum2 += vget_lane_f32(_ss_ss2, 1); - - *outptr += sum; - *outptr2 += sum2; - - ++r0; - ++r1; - ++r2; - ++r3; - ++r4; - ++r5; - ++outptr; - ++outptr2; - } - - r0 += 4 + input_width; // 4 = 5 - 1 - r1 += 4 + input_width; - r2 += 4 + input_width; - r3 += 4 + input_width; - r4 += 4 + input_width; - r5 += 4 + input_width; - outptr += width; - outptr2 += width; - } - - for (; h < height; ++h) { - // may left one row if odd rows - int width_blocks = width >> 2; - int remain = width - (width_blocks << 2); - for (; width_blocks > 0; --width_blocks) { - float32x4_t _sum = vld1q_f32(outptr); - - float32x4_t _r00 = vld1q_f32(r0); - float32x4_t _r04 = vld1q_f32(r0 + 4); - float32x4_t _r01 = vextq_f32(_r00, _r04, 1); - float32x4_t _r02 = vextq_f32(_r00, _r04, 2); - float32x4_t _r03 = vextq_f32(_r00, _r04, 3); - - float32x4_t _r10 = vld1q_f32(r1); - float32x4_t _r14 = vld1q_f32(r1 + 4); - float32x4_t _r11 = vextq_f32(_r10, _r14, 1); - float32x4_t _r12 = vextq_f32(_r10, _r14, 2); - float32x4_t _r13 = vextq_f32(_r10, _r14, 3); - - float32x4_t _r20 = vld1q_f32(r2); - float32x4_t _r24 = vld1q_f32(r2 + 4); - float32x4_t _r21 = vextq_f32(_r20, _r24, 1); - float32x4_t _r22 = vextq_f32(_r20, _r24, 2); - float32x4_t _r23 = vextq_f32(_r20, _r24, 3); - - float32x4_t _r30 = vld1q_f32(r3); - float32x4_t _r34 = vld1q_f32(r3 + 4); - float32x4_t _r31 = vextq_f32(_r30, _r34, 1); - float32x4_t _r32 = vextq_f32(_r30, _r34, 2); - float32x4_t _r33 = vextq_f32(_r30, _r34, 3); - - float32x4_t _r40 = vld1q_f32(r4); - float32x4_t _r44 = vld1q_f32(r4 + 4); - float32x4_t _r41 = vextq_f32(_r40, _r44, 1); - float32x4_t _r42 = vextq_f32(_r40, _r44, 2); - float32x4_t _r43 = vextq_f32(_r40, _r44, 3); - - _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0); - _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1); - _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2); - _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3); - _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0); - - _sum = vfmaq_laneq_f32(_sum, _r10, _k4567, 1); - _sum = vfmaq_laneq_f32(_sum, _r11, _k4567, 2); - _sum = vfmaq_laneq_f32(_sum, _r12, _k4567, 3); - _sum = vfmaq_laneq_f32(_sum, _r13, _k891011, 0); - _sum = vfmaq_laneq_f32(_sum, _r14, _k891011, 1); - - _sum = vfmaq_laneq_f32(_sum, _r20, _k891011, 2); - _sum = vfmaq_laneq_f32(_sum, _r21, _k891011, 3); - _sum = vfmaq_laneq_f32(_sum, _r22, _k12131415, 0); - _sum = vfmaq_laneq_f32(_sum, _r23, _k12131415, 1); - _sum = vfmaq_laneq_f32(_sum, _r24, _k12131415, 2); - - _sum = vfmaq_laneq_f32(_sum, _r30, _k12131415, 3); - _sum = vfmaq_laneq_f32(_sum, _r31, _k16171819, 0); - _sum = vfmaq_laneq_f32(_sum, _r32, _k16171819, 1); - _sum = vfmaq_laneq_f32(_sum, _r33, _k16171819, 2); - _sum = vfmaq_laneq_f32(_sum, _r34, _k16171819, 3); - - _sum = vfmaq_laneq_f32(_sum, _r40, _k20212223, 0); - _sum = vfmaq_laneq_f32(_sum, _r41, _k20212223, 1); - _sum = vfmaq_laneq_f32(_sum, _r42, _k20212223, 2); - _sum = vfmaq_laneq_f32(_sum, _r43, _k20212223, 3); - _sum = vfmaq_laneq_f32(_sum, _r44, _k24242424, 0); - - vst1q_f32(outptr, _sum); - - r0 += 4; - r1 += 4; - r2 += 4; - r3 += 4; - r4 += 4; - r5 += 4; - outptr += 4; - } - - for (; remain > 0; --remain) { - float sum = 0; - float32x4_t _r0 = vld1q_f32(r0); - float32x4_t _sum = vmulq_f32(_r0, _k0123); - - float debug[4]; - vst1q_f32(debug, _sum); - - float32x4_t _r1 = vld1q_f32(r1); - _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1)); - - float32x4_t _r2 = vld1q_f32(r2); - _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2)); - - float32x4_t _r3 = vld1q_f32(r3); - _sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3)); - 
- float32x4_t _r4 = vld1q_f32(r4); - _sum = vmlaq_f32(_sum, _r4, _k20212223); - - float32x4_t _k_t4; - _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0); - _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1); - _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2); - _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3); - - float32x4_t _r_t4; - - _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0); - _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1); - _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2); - _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3); - _sum = vmlaq_f32(_sum, _r_t4, _k_t4); - - sum = r4[4] * k4[4]; - - float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum)); - _ss = vpadd_f32(_ss, _ss); - - sum += vget_lane_f32(_ss, 0); - *outptr += sum; - - ++r0; - ++r1; - ++r2; - ++r3; - ++r4; - ++outptr; - } - r0 += 4; - r1 += 4; - r2 += 4; - r3 += 4; - r4 += 4; - } - } - } - } -} - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_NEON_CONV_2D_NEON_5X5_H_ diff --git a/mace/kernels/neon/depthwise_conv_neon.cc b/mace/kernels/neon/depthwise_conv_neon.cc deleted file mode 100644 index fc8f457aabfd6acfa821ba8675417011f6766a34..0000000000000000000000000000000000000000 --- a/mace/kernels/neon/depthwise_conv_neon.cc +++ /dev/null @@ -1,80 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include "mace/kernels/conv_2d.h" -#include "mace/kernels/depthwise_conv2d.h" - -namespace mace { -namespace kernels { - -extern void Conv2dNeonK3x3S1(const float *input, - const index_t *input_shape, - const float *filter, - const index_t *filter_shape, - const float *bias, - float *output, - const index_t *output_shape); - -extern void Conv2dNeonK3x3S2(const float *input, - const index_t *input_shape, - const float *filter, - const index_t *filter_shape, - const float *bias, - float *output, - const index_t *output_shape); - -template <> -void DepthwiseConv2dFunctor::operator()( - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - typedef void (*Conv2dNeonFunction)( - const float *input, const index_t *input_shape, const float *filter, - const index_t *filter_shape, const float *bias, float *output, - const index_t *output_shape); - // Selection matrix: kernel_size x stride_size - static const Conv2dNeonFunction selector[5][2] = { - {nullptr, nullptr}, - {nullptr, nullptr}, - {Conv2dNeonK3x3S1, Conv2dNeonK3x3S2}, - {nullptr, nullptr}, - {nullptr, nullptr}}; - // not implement yet - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || - strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || - selector[kernel_h - 1][strides_[0] - 1] == nullptr) { - LOG(WARNING) << "Depthwise-Conv2d NEON kernel with " - << "filter" << kernel_h << "x" << kernel_w << "," - << " stride " << strides_[0] << "x" << strides_[1] - << " is not implemented yet, using slow version"; - DepthwiseConv2dFunctor( - strides_, paddings_, dilations_)(input, filter, bias, output, future); - return; - } - - const float *input_ptr = input->data(); - const index_t *input_shape = input->shape().data(); - const float *filter_ptr = filter->data(); - const index_t *filter_shape = filter->shape().data(); - const float *bias_ptr = bias->data(); - float *output_ptr = output->mutable_data(); - const index_t *output_shape = output->shape().data(); - // Keep this alive during kernel execution - Tensor padded_input; - if (paddings_[0] > 0 || paddings_[1] > 0) { - ConstructInputWithPadding(input, paddings_.data(), &padded_input); - 
input_ptr = padded_input.data<float>();
-    input_shape = padded_input.shape().data();
-  }
-  auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr,
-                   output_ptr, output_shape);
-}
-
-}  // namespace kernels
-}  // namespace mace
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index ac5c7987fbeda1e76c7e5a13f4a5166c3a2f6c07..b5a5001271e78ff54c2fbf4efe3b541a08467390 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -166,8 +166,20 @@ struct PoolingFunctor : PoolingFunctorBase {
 };
 
 template <>
-void PoolingFunctor<DeviceType::NEON, float>::operator()(
-    const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future);
+struct PoolingFunctor<DeviceType::NEON, float> : PoolingFunctorBase {
+  PoolingFunctor(const PoolingType pooling_type,
+                 const int *kernels,
+                 const int *strides,
+                 const Padding padding_type,
+                 const std::vector<int> &paddings,
+                 const int *dilations)
+      : PoolingFunctorBase(
+            pooling_type, kernels, strides, padding_type, paddings, dilations) {
+  }
+  void operator()(const Tensor *input_tensor,
+                  Tensor *output_tensor,
+                  StatsFuture *future);
+};
 
 template <typename T>
 struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index e2b8efc8d8673f26b01a2124dcc3fb64730ad0d5..62cbcbd532938a3ae8854fec7a2d55123bc34e7e 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -56,6 +56,11 @@ struct SoftmaxFunctor {
   }
 };
 
+template <>
+struct SoftmaxFunctor<DeviceType::NEON, float> {
+  void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
+};
+
 template <typename T>
 struct SoftmaxFunctor<DeviceType::OPENCL, T> {
   void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad04d7284d9b2e9b6ecc47dc2aa6cb14d4ea0dca
--- /dev/null
+++ b/mace/kernels/transpose.h
@@ -0,0 +1,64 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_TRANSPOSE_H_
+#define MACE_KERNELS_TRANSPOSE_H_
+
+#include <vector>
+
+#include "mace/core/future.h"
+#include "mace/core/tensor.h"
+#include "mace/public/mace.h"
+#include "mace/utils/utils.h"
+
+namespace mace {
+namespace kernels {
+
+template <DeviceType D, typename T>
+struct TransposeFunctor {
+  explicit TransposeFunctor(const std::vector<int> &dims) : dims_(dims) {}
+
+  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+    Tensor::MappingGuard input_guard(input);
+    Tensor::MappingGuard output_guard(output);
+    const std::vector<index_t> &input_shape = input->shape();
+    const std::vector<index_t> &output_shape = output->shape();
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+
+    std::vector<index_t>
+        in_stride{input_shape[1] * input_shape[2] * input_shape[3],
+                  input_shape[2] * input_shape[3], input_shape[3], 1};
+    std::vector<index_t>
+        out_stride{output_shape[1] * output_shape[2] * output_shape[3],
+                   output_shape[2] * output_shape[3], output_shape[3], 1};
+
+    std::vector<index_t> idim(4, 0);
+    std::vector<index_t> odim(4, 0);
+    for (odim[0] = 0; odim[0] < output_shape[0]; ++odim[0]) {
+      for (odim[1] = 0; odim[1] < output_shape[1]; ++odim[1]) {
+        for (odim[2] = 0; odim[2] < output_shape[2]; ++odim[2]) {
+          for (odim[3] = 0; odim[3] < output_shape[3]; ++odim[3]) {
+            idim[dims_[0]] = odim[0];
+            idim[dims_[1]] = odim[1];
+            idim[dims_[2]] = odim[2];
+            idim[dims_[3]] = odim[3];
+
+            output_data[odim[0] * out_stride[0] + odim[1] * out_stride[1] +
+                        odim[2] * out_stride[2] + odim[3]] =
+                input_data[idim[0] * in_stride[0] + idim[1] * in_stride[1] +
+                           idim[2] * in_stride[2] + idim[3]];
+          }
+        }
+      }
+    }
+  }
+
+  std::vector<int> dims_;
+};
+
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_TRANSPOSE_H_
diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc
index d7a000807654b576199482d63b7249d0049dc1f1..4e49eb244cd4a63c8f9c1d0d83bf36c5b5de01fa 100644
--- a/mace/ops/activation.cc
+++ b/mace/ops/activation.cc
@@ -25,6 +25,11 @@ void Register_Activation(OperatorRegistry *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     ActivationOp<DeviceType::OPENCL, float>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
+                        .Device(DeviceType::NEON)
+                        .TypeConstraint<float>("T")
+                        .Build(),
+                    ActivationOp<DeviceType::NEON, float>);
 }
 
 }  // namespace ops
diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc
index 6e804cd1ae3535acfe6f402683d8ac9550f702dd..8b42007641f01cf5b81a1047d6304593f09aa603 100644
--- a/mace/ops/batch_norm.cc
+++ b/mace/ops/batch_norm.cc
@@ -25,6 +25,11 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     BatchNormOp<DeviceType::OPENCL, float>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
+                        .Device(DeviceType::NEON)
+                        .TypeConstraint<float>("T")
+                        .Build(),
+                    BatchNormOp<DeviceType::NEON, float>);
 }
 
 }  // namespace ops
diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc
index eac95fa51f08d39a699ae382d64ed6bd3abd31f4..af3f51c569d998e58a428e31489bdbf94c5988a6 100644
--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -11,7 +11,7 @@ namespace test {
 
 class BatchNormOpTest : public OpsTestBase {};
 
-template
+template
 void Simple() {
   OpsTestNet net;
 
@@ -36,14 +36,14 @@ void Simple() {
                         kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("BatchNorm", "BatchNormTest")
-      .Input("InputImage")
-      .Input("ScaleImage")
-      .Input("OffsetImage")
-      .Input("MeanImage")
-      .Input("VarImage")
-      .AddFloatArg("epsilon", 1e-3)
-      .Output("OutputImage")
-      .Finalize(net.NewOperatorDef());
+        .Input("InputImage")
+        .Input("ScaleImage")
+        .Input("OffsetImage")
+        .Input("MeanImage")
+        .Input("VarImage")
+        .AddFloatArg("epsilon",
1e-3) + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -52,22 +52,22 @@ void Simple() { kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .AddFloatArg("epsilon", 1e-3) - .Output("Output") - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .AddFloatArg("epsilon", 1e-3) + .Output("Output") + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } // Check auto expected = - CreateTensor({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, - 3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); + CreateTensor({1, 6, 2, 1}, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83, + 3.17, 3.17, 5.51, 5.51, 7.86, 7.86}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-2); } @@ -87,18 +87,18 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { // Construct graph OpsTestNet net; OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .AddFloatArg("epsilon", 1e-3) - .Output("Output") - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .AddFloatArg("epsilon", 1e-3) + .Output("Output") + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput( - "Input", {batch, height, width, channels}); + "Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); @@ -124,14 +124,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) { kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") - .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .AddFloatArg("epsilon", 1e-3) + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); // Tuning setenv("MACE_TUNING", "1", 1); @@ -158,18 +158,18 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { // Construct graph OpsTestNet net; OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .AddFloatArg("epsilon", 1e-3) - .Output("Output") - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .AddFloatArg("epsilon", 1e-3) + .Output("Output") + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput( - "Input", {batch, height, width, channels}); + "Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); @@ -195,15 +195,15 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) { kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") - .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") - .AddIntArg("T", static_cast(DataType::DT_HALF)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .AddFloatArg("epsilon", 1e-3) + 
.Output("OutputImage") + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); // Tuning setenv("MACE_TUNING", "1", 1); @@ -230,18 +230,18 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { // Construct graph OpsTestNet net; OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .AddFloatArg("epsilon", 1e-3) - .Output("Output") - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .AddFloatArg("epsilon", 1e-3) + .Output("Output") + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput( - "Input", {batch, height, width, channels}); + "Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); @@ -267,14 +267,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) { kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") - .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .AddFloatArg("epsilon", 1e-3) + .Output("OutputImage") + .Finalize(net.NewOperatorDef()); // tuning setenv("MACE_TUNING", "1", 1); @@ -301,18 +301,18 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { // Construct graph OpsTestNet net; OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("Input") - .Input("Scale") - .Input("Offset") - .Input("Mean") - .Input("Var") - .AddFloatArg("epsilon", 1e-3) - .Output("Output") - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .AddFloatArg("epsilon", 1e-3) + .Output("Output") + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput( - "Input", {batch, height, width, channels}); + "Input", {batch, height, width, channels}); net.AddRandomInput("Scale", {channels}); net.AddRandomInput("Offset", {channels}); net.AddRandomInput("Mean", {channels}); @@ -338,15 +338,15 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { kernels::BufferType::ARGUMENT); OpDefBuilder("BatchNorm", "BatchNormTest") - .Input("InputImage") - .Input("ScaleImage") - .Input("OffsetImage") - .Input("MeanImage") - .Input("VarImage") - .AddFloatArg("epsilon", 1e-3) - .Output("OutputImage") - .AddIntArg("T", static_cast(DataType::DT_HALF)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("ScaleImage") + .Input("OffsetImage") + .Input("MeanImage") + .Input("VarImage") + .AddFloatArg("epsilon", 1e-3) + .Output("OutputImage") + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); // tuning setenv("MACE_TUNING", "1", 1); @@ -362,6 +362,63 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) { ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.5); } +TEST_F(BatchNormOpTest, NEONTest) { + srand(time(NULL)); + unsigned int seed; + + // generate random input + index_t batch = 1 + rand_r(&seed) % 10; + index_t channels = 3 + rand_r(&seed) % 50; + index_t height = 64; + index_t width = 64; + + // Construct graph + OpsTestNet net; + OpDefBuilder("BatchNorm", "BatchNormTest") + .Input("Input") + .Input("Scale") + .Input("Offset") + .Input("Mean") + .Input("Var") + .AddFloatArg("epsilon", 1e-3) + .Output("Output") + 
.Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::CPU, float>(
+      "Input", {batch, height, width, channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
+  net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
+
+  // Run on CPU
+  net.RunOp();
+
+  OpDefBuilder("BatchNorm", "BatchNormTest")
+      .Input("InputNeon")
+      .Input("Scale")
+      .Input("Offset")
+      .Input("Mean")
+      .Input("Var")
+      .AddFloatArg("epsilon", 1e-3)
+      .Output("OutputNeon")
+      .Finalize(net.NewOperatorDef());
+
+  net.FillNHWCInputToNCHWInput<DeviceType::NEON, float>("InputNeon", "Input");
+
+  // Run on NEON
+  net.RunOp(DeviceType::NEON);
+  net.Sync();
+
+  net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
+                                                       "Output");
+
+  ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
+                          *net.GetOutput("OutputNeon"),
+                          0.001);
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc
index cf45d9df7acf38b4da0c1afa6f3f6c501b5a9a8b..1c612a426b5dccbfad830fa5387010b6f4ef8d23 100644
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -25,6 +25,12 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     Conv2dOp<DeviceType::OPENCL, float>);
+
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
+                        .Device(DeviceType::NEON)
+                        .TypeConstraint<float>("T")
+                        .Build(),
+                    Conv2dOp<DeviceType::NEON, float>);
 }
 
 }  // namespace ops
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index f06a7e127359e391a54b28bb4d35891416f32cbb..4197f65c8e5a35bc060825eec071d5f92e95be87 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -30,10 +30,19 @@ static void Conv2d(int iters,
   OpsTestNet net;
 
   // Add input data
-  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
-  net.AddRandomInput<D, float>("Filter",
-                               {kernel_h, kernel_w, output_channels, channels});
-  net.AddRandomInput<D, float>("Bias", {output_channels});
+  if (D == DeviceType::NEON) {
+    net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
+    net.AddRandomInput<D, float>("Filter",
+                                 {output_channels, channels, kernel_h,
+                                  kernel_w});
+    net.AddRandomInput<D, float>("Bias", {output_channels});
+  } else {
+    net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+    net.AddRandomInput<D, float>("Filter",
+                                 {kernel_h, kernel_w, output_channels,
+                                  channels});
+    net.AddRandomInput<D, float>("Bias", {output_channels});
+  }
 
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, T>(&net, "Input", "InputImage",
@@ -65,15 +74,17 @@
         .Finalize(net.NewOperatorDef());
   }
 
+  net.Setup(D);
+
   // Warm-up
   for (int i = 0; i < 2; ++i) {
-    net.RunOp(D);
+    net.Run();
     net.Sync();
   }
 
   mace::testing::StartTiming();
   while (iters--) {
-    net.RunOp(D);
+    net.Run();
     net.Sync();
   }
 }
@@ -112,7 +123,8 @@
 #define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC)                  \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU);     \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL);  \
-  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL);
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL);   \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, NEON);
 
 BM_CONV_2D(1, 256, 64, 64, 3, 3, 1, 1, VALID, 256);
@@ -133,6 +145,8 @@
 BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, 1, VALID, 128);
 BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, 1, VALID, 128);  // Test bad alignments
 BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, 1, SAME, 128);
 BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, 1, SAME, 128);
+BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, SAME, 32);
+BM_CONV_2D(1, 3, 224, 224, 3, 3, 2, 1, VALID, 32);
 BM_CONV_2D(1,
64, 32, 32, 5, 5, 1, 1, SAME, 128); BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128); diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index eef5a560d6a190932681173a0038fa4edd738703..6bb541a9c35373ff92e8f0dc323555c584391f01 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -14,17 +14,17 @@ namespace test { class Conv2dOpTest : public OpsTestBase {}; -template +template void TestNHWCSimple3x3VALID() { OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + "Input", {1, 3, 3, 2}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + "Filter", {3, 3, 1, 2}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); net.AddInputFromArray("Bias", {1}, {0.1f}); if (D == DeviceType::OPENCL) { @@ -35,15 +35,15 @@ void TestNHWCSimple3x3VALID() { BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.RunOp(D); @@ -53,15 +53,15 @@ void TestNHWCSimple3x3VALID() { } else { OpDefBuilder("Conv2D", "Conv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } @@ -70,18 +70,18 @@ void TestNHWCSimple3x3VALID() { ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } -template +template void TestNHWCSimple3x3SAME() { OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + "Input", {1, 3, 3, 2}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + "Filter", {3, 3, 1, 2}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); net.AddInputFromArray("Bias", {1}, {0.1f}); if (D == DeviceType::OPENCL) { @@ -92,15 +92,15 @@ void TestNHWCSimple3x3SAME() { BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", 
Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -110,22 +110,22 @@ void TestNHWCSimple3x3SAME() { } else { OpDefBuilder("Conv2D", "Conv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } auto expected = CreateTensor( - {1, 3, 3, 1}, - {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); + {1, 3, 3, 1}, + {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } @@ -140,18 +140,18 @@ TEST_F(Conv2dOpTest, OPENCLSimple) { TestNHWCSimple3x3SAME(); } -template +template void TestNHWCSimple3x3WithoutBias() { OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + "Input", {1, 3, 3, 2}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {3, 3, 1, 2}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + "Filter", {3, 3, 1, 2}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); if (D == DeviceType::OPENCL) { BufferToImage(&net, "Input", "InputImage", @@ -160,14 +160,14 @@ void TestNHWCSimple3x3WithoutBias() { kernels::BufferType::CONV2D_FILTER); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); // Transfer output @@ -175,14 +175,14 @@ void TestNHWCSimple3x3WithoutBias() { kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Conv2D", "Conv2dTest") - .Input("Input") - .Input("Filter") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -202,21 +202,21 @@ TEST_F(Conv2dOpTest, 
OPENCLWithoutBias) { TestNHWCSimple3x3WithoutBias(); } -template +template static void TestNHWCCombined3x3() { // Construct graph OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + "Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {3, 3, 2, 2}, - {1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, - 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, - 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f}); + "Filter", {3, 3, 2, 2}, + {1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, + 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, + 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f}); net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); if (D == DeviceType::OPENCL) { @@ -228,15 +228,15 @@ static void TestNHWCCombined3x3() { kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -244,23 +244,23 @@ static void TestNHWCCombined3x3() { kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Conv2D", "Conv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } // Check auto expected = CreateTensor( - {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, - 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); + {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, + 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } @@ -272,24 +272,24 @@ TEST_F(Conv2dOpTest, OPENCLStride2) { TestNHWCCombined3x3(); } -template +template void TestConv1x1() { // Construct graph OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 3, 10, 5}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + "Input", {1, 3, 10, 5}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {1, 1, 2, 5}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}); + "Filter", {1, 1, 2, 5}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}); net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); if (D == DeviceType::OPENCL) { @@ -301,14 +301,14 @@ void TestConv1x1() { kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -316,27 +316,27 @@ void TestConv1x1() { kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("Conv2D", "Conv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } // Check auto expected = CreateTensor( - {1, 3, 10, 2}, - {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f}); + {1, 3, 10, 2}, + {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -345,7 +345,7 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1(); } TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } -template +template static void TestComplexConvNxNS12(const std::vector &shape, const int stride) { testing::internal::LogToStderr(); @@ -361,20 +361,20 @@ static void TestComplexConvNxNS12(const std::vector &shape, // Construct graph OpsTestNet net; OpDefBuilder("Conv2D", "Conv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride_h, stride_w}) - 
.AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {kernel_h, kernel_w, output_channels, input_channels}); net.AddRandomInput("Bias", {output_channels}); // run on cpu @@ -392,15 +392,15 @@ static void TestComplexConvNxNS12(const std::vector &shape, kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); @@ -430,7 +430,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) { TestComplexConvNxNS12({32, 32, 13, 17}, 4); } -template +template static void TestHalfComplexConvNxNS12(const std::vector &input_shape, const std::vector &filter_shape, const std::vector &dilations) { @@ -449,30 +449,30 @@ static void TestHalfComplexConvNxNS12(const std::vector &input_shape, // Construct graph OpsTestNet net; OpDefBuilder("Conv2D", "Conv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {dilations[0], dilations[1]}) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {dilations[0], dilations[1]}) + .Finalize(net.NewOperatorDef()); std::vector float_input_data; GenerateRandomRealTypeData({batch, height, width, input_channels}, &float_input_data); std::vector float_filter_data; GenerateRandomRealTypeData( - {kernel_h, kernel_w, output_channels, input_channels}, - &float_filter_data); + {kernel_h, kernel_w, output_channels, input_channels}, + &float_filter_data); std::vector float_bias_data; GenerateRandomRealTypeData({output_channels}, &float_bias_data); // Add input data net.AddInputFromArray( - "Input", {batch, height, width, input_channels}, float_input_data); + "Input", {batch, height, width, input_channels}, float_input_data); net.AddInputFromArray( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}, - float_filter_data); + "Filter", {kernel_h, kernel_w, output_channels, input_channels}, + float_filter_data); net.AddInputFromArray("Bias", {output_channels}, float_bias_data); // run on cpu @@ -490,15 +490,15 @@ static void TestHalfComplexConvNxNS12(const std::vector &input_shape, kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - 
.Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {dilations[0], dilations[1]}) - .AddIntArg("T", static_cast(DataType::DT_HALF)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {dilations[0], dilations[1]}) + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); @@ -566,7 +566,7 @@ TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) { {4, 4}); } -template +template static void TestDilationConvNxN(const std::vector &shape, const int dilation_rate) { testing::internal::LogToStderr(); @@ -583,20 +583,20 @@ static void TestDilationConvNxN(const std::vector &shape, // Construct graph OpsTestNet net; OpDefBuilder("Conv2D", "Conv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {dilation_rate, dilation_rate}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {dilation_rate, dilation_rate}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {kernel_h, kernel_w, output_channels, input_channels}); net.AddRandomInput("Bias", {output_channels}); // run on cpu @@ -614,15 +614,15 @@ static void TestDilationConvNxN(const std::vector &shape, kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {dilation_rate, dilation_rate}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {dilation_rate, dilation_rate}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); @@ -651,7 +651,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) { TestDilationConvNxN({107, 113, 5, 7}, 4); } -template +template static void TestArbitraryPadConvNxN(const std::vector &shape, const std::vector &paddings) { testing::internal::LogToStderr(); @@ -667,19 +667,19 @@ static void TestArbitraryPadConvNxN(const std::vector &shape, // Construct graph OpsTestNet net; OpDefBuilder("Conv2D", "Conv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntsArg("padding_values", paddings) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntsArg("padding_values", paddings) + .AddIntArg("T", 
static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
   net.AddRandomInput<D, float>(
-        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
+      "Filter", {kernel_h, kernel_w, output_channels, input_channels});
   net.AddRandomInput<D, float>("Bias", {output_channels});
 
   // run on cpu
@@ -697,14 +697,14 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
                              kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
-          .Input("InputImage")
-          .Input("FilterImage")
-          .Input("BiasImage")
-          .Output("OutputImage")
-          .AddIntsArg("strides", {stride_h, stride_w})
-          .AddIntsArg("padding_values", paddings)
-          .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
-          .Finalize(net.NewOperatorDef());
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntsArg("padding_values", paddings)
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
 
     // Run on device
     net.RunOp(D);
@@ -733,6 +733,83 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
   TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
 }
 
+static void TestNeonArbitraryPadConvNxN(const std::vector<index_t> &shape,
+                                        const std::vector<int> &paddings) {
+  testing::internal::LogToStderr();
+  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) {
+    srand(time(NULL));
+
+    // generate random input
+    index_t batch = 1;
+    index_t height = shape[0];
+    index_t width = shape[1];
+    index_t input_channels = shape[2];
+    index_t output_channels = shape[3];
+    // Construct graph
+    OpsTestNet net;
+    OpDefBuilder("Conv2D", "Conv2dTestCPU")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntsArg("padding_values", paddings)
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
+        .Finalize(net.NewOperatorDef());
+
+    // Add input data
+    net.AddRandomInput<DeviceType::CPU, float>("Input",
+                                               {batch, height, width,
+                                                input_channels});
+    net.AddRandomInput<DeviceType::CPU, float>(
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
+    net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
+
+    // run cpu
+    net.RunOp();
+
+    // run neon
+    OpDefBuilder("Conv2D", "Conv2dTestNEON")
+        .Input("InputNeon")
+        .Input("FilterNeon")
+        .Input("Bias")
+        .Output("OutputNeon")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntsArg("padding_values", paddings)
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
+        .Finalize(net.NewOperatorDef());
+
+    net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
+    net.FillHWOIInputToOIHWInput<DeviceType::CPU, float>("FilterNeon",
+                                                         "Filter");
+
+    // Run on device
+    net.RunOp(DeviceType::NEON);
+
+    net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
+                                                         "Output");
+
+    ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
+                            *net.GetOutput("OutputNeon"),
+                            0.001);
+  };
+
+  for (int kernel_size : {1, 3, 5}) {
+    for (int stride : {1, 2}) {
+      if (stride < kernel_size) {
+        func(kernel_size, kernel_size, stride, stride);
+      }
+    }
+  }
+}
+
+TEST_F(Conv2dOpTest, NEONTest) {
+  TestNeonArbitraryPadConvNxN({32, 34, 32, 64}, {0, 0});
+  TestNeonArbitraryPadConvNxN({32, 32, 32, 64}, {1, 1});
+  TestNeonArbitraryPadConvNxN({128, 128, 16, 16}, {2, 2});
+  TestNeonArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4});
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc
index 112cb03163f384e6b5ea1361b53e512f57f3c999..fe5a6b2aa80f35d5882e90418e5f460c1a8476fe
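
// [Editor's sketch] The NEON kernels above consume NCHW tensors while the CPU
// reference path stays NHWC, so the tests mirror tensors through
// net.FillNHWCInputToNCHWInput() before and after the NEON run. Assuming that
// helper is a plain layout transpose, the index mapping is the following
// (standalone illustration, not the OpsTestNet implementation):
#include <cstdint>
#include <vector>

typedef int64_t index_t;

// src holds NHWC data; dst receives the same values in NCHW order.
void TransposeNHWCToNCHW(const std::vector<float> &src,
                         index_t batch, index_t height, index_t width,
                         index_t channels, std::vector<float> *dst) {
  dst->resize(src.size());
  for (index_t n = 0; n < batch; ++n)
    for (index_t c = 0; c < channels; ++c)
      for (index_t h = 0; h < height; ++h)
        for (index_t w = 0; w < width; ++w)
          (*dst)[((n * channels + c) * height + h) * width + w] =
              src[((n * height + h) * width + w) * channels + c];
}
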
100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -25,6 +25,12 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) { .TypeConstraint("T") .Build(), DepthwiseConv2dOp); + + REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") + .Device(DeviceType::NEON) + .TypeConstraint("T") + .Build(), + DepthwiseConv2dOp); } } // namespace ops diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index d97df4585d1925ca2a2c3f07e7a4ae29992636bb..5ce7ae3d14624fa44df5d2d1dd157b9dfc211659 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ b/mace/ops/depthwise_conv2d_benchmark.cc @@ -29,10 +29,19 @@ static void DepthwiseConv2d(int iters, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( + if (D == DeviceType::NEON) { + net.AddRandomInput("Input", + {batch, input_channels, height, width}); + net.AddRandomInput( + "Filter", {multiplier, input_channels, kernel_h, kernel_w}); + net.AddRandomInput("Bias", {input_channels * multiplier}); + } else { + net.AddRandomInput("Input", + {batch, height, width, input_channels}); + net.AddRandomInput( "Filter", {kernel_h, kernel_w, input_channels, multiplier}); - net.AddRandomInput("Bias", {input_channels * multiplier}); + net.AddRandomInput("Bias", {input_channels * multiplier}); + } if (D == DeviceType::OPENCL) { BufferToImage(&net, "Input", "InputImage", @@ -64,15 +73,17 @@ static void DepthwiseConv2d(int iters, .Finalize(net.NewOperatorDef()); } + net.Setup(D); + // Warm-up for (int i = 0; i < 2; ++i) { - net.RunOp(D); + net.Run(); net.Sync(); } mace::testing::StartTiming(); while (iters--) { - net.RunOp(D); + net.Run(); net.Sync(); } } @@ -108,10 +119,16 @@ static void DepthwiseConv2d(int iters, #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \ - BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL); + BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL); \ + BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, NEON); BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1); -BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, SAME, 1); +BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1); +BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 2, VALID, 1); +BM_DEPTHWISE_CONV_2D(1, 32, 224, 224, 3, 3, 2, VALID, 1); +BM_DEPTHWISE_CONV_2D(1, 64, 56, 56, 3, 3, 2, VALID, 1); +BM_DEPTHWISE_CONV_2D(1, 64, 112, 112, 3, 3, 2, VALID, 1); +BM_DEPTHWISE_CONV_2D(1, 64, 224, 224, 3, 3, 2, VALID, 1); BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1); BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1); BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1); @@ -124,6 +141,10 @@ BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1); BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1); BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1); BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1); +BM_DEPTHWISE_CONV_2D(1, 3, 112, 112, 3, 3, 2, VALID, 1); +BM_DEPTHWISE_CONV_2D(1, 3, 224, 224, 3, 3, 2, SAME, 1); +BM_DEPTHWISE_CONV_2D(1, 8, 224, 224, 3, 3, 2, SAME, 1); + } // namespace test } // namespace ops diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 59073899e24351dc798fd7f5da787a1eb6b25474..d401da97f587fb9ea15fdc4bcac424f6a2830ae4 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ 
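
// [Editor's sketch] For NEON the depthwise benchmark above lays filters out as
// {multiplier, input_channels, kernel_h, kernel_w} (OIHW-style) instead of the
// {kernel_h, kernel_w, input_channels, multiplier} (HWIO-style) order used by
// the CPU/OpenCL paths; the tests perform the same reordering via
// net.FillHWIOInputToOIHWInput(). Assuming that helper is a plain permutation,
// the mapping looks like this (standalone illustration only):
#include <cstdint>
#include <vector>

typedef int64_t index_t;

// src is HWIO (kh, kw, in, out); dst receives OIHW (out, in, kh, kw).
void TransposeHWIOToOIHW(const std::vector<float> &src,
                         index_t kh, index_t kw, index_t in, index_t out,
                         std::vector<float> *dst) {
  dst->resize(src.size());
  for (index_t o = 0; o < out; ++o)
    for (index_t i = 0; i < in; ++i)
      for (index_t h = 0; h < kh; ++h)
        for (index_t w = 0; w < kw; ++w)
          (*dst)[((o * in + i) * kh + h) * kw + w] =
              src[((h * kw + w) * in + i) * out + o];
}
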
b/mace/ops/depthwise_conv2d_test.cc @@ -11,7 +11,7 @@ namespace test { class DepthwiseConv2dOpTest : public OpsTestBase {}; -template +template void SimpleValidTest() { testing::internal::LogToStderr(); // Construct graph @@ -19,10 +19,10 @@ void SimpleValidTest() { // Add input data net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18}); + "Input", {1, 3, 3, 2}, + {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18}); net.AddInputFromArray( - "Filter", {2, 2, 2, 1}, {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f}); + "Filter", {2, 2, 2, 1}, {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f}); net.AddInputFromArray("Bias", {2}, {.1f, .2f}); if (D == DeviceType::OPENCL) { BufferToImage(&net, "Input", "InputImage", @@ -32,15 +32,15 @@ void SimpleValidTest() { BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.RunOp(D); @@ -50,23 +50,23 @@ void SimpleValidTest() { } else { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } // Check auto expected = CreateTensor( - {1, 2, 2, 2}, VectorStaticCast({37.1f, 148.2f, 47.1f, 188.2f, 67.1f, - 268.2f, 77.1f, 308.2f})); + {1, 2, 2, 2}, VectorStaticCast({37.1f, 148.2f, 47.1f, 188.2f, 67.1f, + 268.2f, 77.1f, 308.2f})); ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); } @@ -83,7 +83,7 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) { SimpleValidTest(); } -template +template void ComplexValidTest() { testing::internal::LogToStderr(); // Construct graph @@ -91,41 +91,41 @@ void ComplexValidTest() { // Add input data net.AddInputFromArray( - "Input", {1, 10, 10, 3}, - {0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, - 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, - 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, - 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, - 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, - 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, - 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, - 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, - 0.96, 0.97, 0.98, 0.99, 1.0, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, - 1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 
1.19, - 1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31, - 1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43, - 1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55, - 1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67, - 1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79, - 1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91, - 1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2.0, 2.01, 2.02, 2.03, - 2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15, - 2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27, - 2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39, - 2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51, - 2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63, - 2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75, - 2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87, - 2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99}); + "Input", {1, 10, 10, 3}, + {0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, + 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, + 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, + 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, + 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, + 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, + 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, + 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, + 0.96, 0.97, 0.98, 0.99, 1.0, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, + 1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, + 1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31, + 1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43, + 1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55, + 1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67, + 1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79, + 1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91, + 1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2.0, 2.01, 2.02, 2.03, + 2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15, + 2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27, + 2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39, + 2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51, + 2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63, + 2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75, + 2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87, + 2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 2.96, 2.97, 2.98, 2.99}); net.AddInputFromArray( - "Filter", {5, 5, 3, 1}, - {0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, - 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, - 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, - 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, - 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, - 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, - 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74}); + "Filter", {5, 5, 3, 1}, + {0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, + 
0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, + 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, + 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, + 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, + 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, + 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74}); net.AddInputFromArray("Bias", {6}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); if (D == DeviceType::OPENCL) { @@ -136,15 +136,15 @@ void ComplexValidTest() { BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.RunOp(D); @@ -154,38 +154,38 @@ void ComplexValidTest() { } else { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } // Check auto expected = CreateTensor( - {1, 5, 5, 3}, - VectorStaticCast( - {4.48200035, 4.63479996, 4.79079962, 5.85899973, 6.05599976, - 6.25699997, 6.38100004, 6.59000015, 6.80300045, 6.90299988, - 7.1239996, 7.34899998, 4.03559971, 4.16820002, 4.30319977, - 8.90999985, 9.1760006, 9.44599915, 11.20499992, 11.54500103, - 11.89000034, 11.74499989, 12.09999943, 12.46000004, 12.28499985, - 12.65500069, 13.03000069, 7.00200033, 7.22399998, 7.44900036, - 13.4100008, 13.79599953, 14.18599987, 16.60500145, 17.09499741, - 17.59000015, 17.14500046, 17.65000153, 18.15999794, 17.68499947, - 18.20499992, 18.72999954, 9.97200012, 10.28399944, 10.59899998, - 17.90999985, 18.41600037, 18.92599869, 22.00500107, 22.64500046, - 23.28999901, 22.54500008, 23.19999886, 23.8599987, 23.0850029, - 23.75500107, 24.43000031, 12.94200039, 13.34400082, 13.7489996, - 6.97500038, 7.29659986, 7.62060022, 8.32049942, 8.72700024, - 9.13650036, 8.5095005, 8.92500019, 9.34349918, 8.69849968, - 9.12300014, 9.55049992, 4.55220032, 4.80690002, 5.06340027})); + {1, 5, 5, 3}, + VectorStaticCast( + {4.48200035, 4.63479996, 4.79079962, 5.85899973, 6.05599976, + 6.25699997, 6.38100004, 6.59000015, 6.80300045, 6.90299988, + 7.1239996, 7.34899998, 4.03559971, 4.16820002, 4.30319977, + 8.90999985, 9.1760006, 9.44599915, 11.20499992, 11.54500103, + 11.89000034, 11.74499989, 12.09999943, 12.46000004, 12.28499985, + 12.65500069, 13.03000069, 7.00200033, 7.22399998, 7.44900036, + 13.4100008, 13.79599953, 14.18599987, 16.60500145, 17.09499741, + 17.59000015, 17.14500046, 17.65000153, 18.15999794, 17.68499947, + 18.20499992, 
18.72999954, 9.97200012, 10.28399944, 10.59899998, + 17.90999985, 18.41600037, 18.92599869, 22.00500107, 22.64500046, + 23.28999901, 22.54500008, 23.19999886, 23.8599987, 23.0850029, + 23.75500107, 24.43000031, 12.94200039, 13.34400082, 13.7489996, + 6.97500038, 7.29659986, 7.62060022, 8.32049942, 8.72700024, + 9.13650036, 8.5095005, 8.92500019, 9.34349918, 8.69849968, + 9.12300014, 9.55049992, 4.55220032, 4.80690002, 5.06340027})); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.2); } @@ -202,7 +202,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) { ComplexValidTest(); } -template +template void TestNxNS12(const index_t height, const index_t width) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, @@ -219,18 +219,18 @@ void TestNxNS12(const index_t height, const index_t width) { net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, input_channels, multiplier}); + "Filter", {kernel_h, kernel_w, input_channels, multiplier}); net.AddRandomInput("Bias", {multiplier * input_channels}); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run on cpu net.RunOp(); @@ -246,15 +246,15 @@ void TestNxNS12(const index_t height, const index_t width) { BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.RunOp(D); @@ -263,15 +263,15 @@ void TestNxNS12(const index_t height, const index_t width) { kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("DeviceOutput") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("DeviceOutput") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -315,6 +315,86 @@ TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) { TestNxNS12(107, 113); } +void TestNEONNxNS12(const index_t height, + const index_t width, + const index_t input_channels, + 
const index_t multiplier) {
+  testing::internal::LogToStderr();
+  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
+                  Padding type) {
+    // generate random input
+    index_t batch = 1;
+    // Construct graph
+    OpsTestNet net;
+
+    // Add input data
+    net.AddRandomInput<DeviceType::CPU, float>(
+        "Input", {batch, height, width, input_channels});
+    net.AddRandomInput<DeviceType::CPU, float>(
+        "Filter", {kernel_h, kernel_w, input_channels, multiplier});
+    net.AddRandomInput<DeviceType::CPU, float>("Bias",
+                                               {multiplier * input_channels});
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
+        .Finalize(net.NewOperatorDef());
+
+    // Run on cpu
+    net.RunOp();
+    // Check
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("InputNeon")
+        .Input("FilterNeon")
+        .Input("Bias")
+        .Output("OutputNeon")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
+        .Finalize(net.NewOperatorDef());
+
+    net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
+    net.FillHWIOInputToOIHWInput<DeviceType::CPU, float>("FilterNeon",
+                                                         "Filter");
+
+    // Run
+    net.RunOp(NEON);
+
+    net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
+                                                         "Output");
+
+    // Check
+    ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
+                            *net.GetOutput("OutputNeon"),
+                            0.001);
+  };
+
+  for (int kernel_size : {1, 3, 5}) {
+    for (int stride : {1, 2}) {
+      if (kernel_size > stride) {
+        func(kernel_size, kernel_size, stride, stride, VALID);
+        func(kernel_size, kernel_size, stride, stride, SAME);
+      }
+    }
+  }
+}
+
+TEST_F(DepthwiseConv2dOpTest, NEONTest) {
+  TestNEONNxNS12(4, 4, 32, 1);
+  TestNEONNxNS12(64, 64, 32, 1);
+  TestNEONNxNS12(112, 112, 32, 1);
+  TestNEONNxNS12(128, 128, 15, 1);
+  TestNEONNxNS12(107, 113, 15, 1);
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc
index 10cc39272c814f09e37cee22450ed51259aac710..9bc436524f08db5b0d7e906a491120d64560652d 100644
--- a/mace/ops/folded_batch_norm.cc
+++ b/mace/ops/folded_batch_norm.cc
@@ -25,6 +25,11 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     FoldedBatchNormOp<DeviceType::OPENCL, float>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
+                                     .Device(DeviceType::NEON)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    FoldedBatchNormOp<DeviceType::NEON, float>);
 }
 
 }  // namespace ops
diff --git a/mace/ops/fused_conv_2d.cc b/mace/ops/fused_conv_2d.cc
index d4b5de4f809c7398adc6622ae9a11ee7aa911524..ebc360e255864c0ecb5f44a3794f067bd54861c2 100644
--- a/mace/ops/fused_conv_2d.cc
+++ b/mace/ops/fused_conv_2d.cc
@@ -25,6 +25,11 @@ void Register_FusedConv2D(OperatorRegistry *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     FusedConv2dOp<DeviceType::OPENCL, float>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
+                                     .Device(DeviceType::NEON)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    FusedConv2dOp<DeviceType::NEON, float>);
 }
 
 }  // namespace ops
diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc
index 554f2cba53025e8ca038fd8c420d8099184e35ba..04e00ae05a8911cb3b4998e351e760ee1d1f37ba 100644
--- a/mace/ops/fused_conv_2d_test.cc
+++ b/mace/ops/fused_conv_2d_test.cc
@@ -13,17 +13,17 @@ namespace test {
 
 class FusedConv2dOpTest : public OpsTestBase {};
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 void TestNHWCSimple3x3VALID() {
   OpsTestNet
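
// [Editor's sketch] Every NEON test above treats the CPU result as ground
// truth and accepts it within an absolute tolerance of 0.001. Assuming
// ExpectTensorNear<float> reduces to an element-wise EXPECT_NEAR over the
// flattened data, its core is roughly the following (standalone gtest
// illustration, not the real OpsTestBase helper):
#include <gtest/gtest.h>
#include <vector>

void ExpectVectorNear(const std::vector<float> &expected,
                      const std::vector<float> &actual, float abs_error) {
  ASSERT_EQ(expected.size(), actual.size());
  for (size_t i = 0; i < expected.size(); ++i) {
    EXPECT_NEAR(expected[i], actual[i], abs_error) << "mismatch at index " << i;
  }
}
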
net; // Add input data net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); net.AddInputFromArray( - "Filter", {3, 3, 2, 1}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); net.AddInputFromArray("Bias", {1}, {-0.1f}); if (D == DeviceType::OPENCL) { @@ -34,15 +34,15 @@ void TestNHWCSimple3x3VALID() { BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.RunOp(D); @@ -52,15 +52,15 @@ void TestNHWCSimple3x3VALID() { } else { OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } @@ -69,18 +69,18 @@ void TestNHWCSimple3x3VALID() { ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } -template +template void TestNHWCSimple3x3SAME() { OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); net.AddInputFromArray( - "Filter", {3, 3, 2, 1}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); net.AddInputFromArray("Bias", {1}, {-0.1f}); if (D == DeviceType::OPENCL) { @@ -91,15 +91,15 @@ void TestNHWCSimple3x3SAME() { BufferToImage(&net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + 
.AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -109,21 +109,21 @@ void TestNHWCSimple3x3SAME() { } else { OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } auto expected = CreateTensor( - {1, 3, 3, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + {1, 3, 3, 1}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } @@ -138,18 +138,18 @@ TEST_F(FusedConv2dOpTest, OPENCLSimple) { TestNHWCSimple3x3SAME(); } -template +template void TestNHWCSimple3x3WithoutBias() { OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 3, 3, 2}, - {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); net.AddInputFromArray( - "Filter", {3, 3, 2, 1}, - {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, - 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); if (D == DeviceType::OPENCL) { BufferToImage(&net, "Input", "InputImage", @@ -158,14 +158,14 @@ void TestNHWCSimple3x3WithoutBias() { kernels::BufferType::CONV2D_FILTER); OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); // Transfer output @@ -173,14 +173,14 @@ void TestNHWCSimple3x3WithoutBias() { kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("Input") - .Input("Filter") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -200,24 +200,24 @@ TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) { TestNHWCSimple3x3WithoutBias(); } -template +template void TestConv1x1() { // Construct graph OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 3, 10, 5}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + "Input", {1, 3, 10, 5}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); net.AddInputFromArray( - "Filter", {1, 1, 5, 2}, - {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); + "Filter", {1, 1, 5, 2}, + {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); if (D == DeviceType::OPENCL) { @@ -229,14 +229,14 @@ void TestConv1x1() { kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -244,27 +244,27 @@ void TestConv1x1() { kernels::BufferType::IN_OUT_CHANNEL); } else { OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } // Check auto expected = CreateTensor( - {1, 3, 10, 2}, - {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, - 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f}); + {1, 3, 10, 2}, + {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -273,7 +273,7 @@ TEST_F(FusedConv2dOpTest, CPUConv1x1) { TestConv1x1(); } TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } -template +template static void TestComplexConvNxNS12(const std::vector &shape) { 
testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, @@ -288,20 +288,20 @@ static void TestComplexConvNxNS12(const std::vector &shape) { // Construct graph OpsTestNet net; OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {kernel_h, kernel_w, output_channels, input_channels}); net.AddRandomInput("Bias", {output_channels}); // run on cpu @@ -319,15 +319,15 @@ static void TestComplexConvNxNS12(const std::vector &shape) { kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); @@ -348,7 +348,7 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) { TestComplexConvNxNS12({107, 113, 5, 7}); } -template +template static void TestHalfComplexConvNxNS12(const std::vector &shape) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, @@ -363,30 +363,30 @@ static void TestHalfComplexConvNxNS12(const std::vector &shape) { // Construct graph OpsTestNet net; OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); std::vector float_input_data; GenerateRandomRealTypeData({batch, height, width, input_channels}, &float_input_data); std::vector float_filter_data; GenerateRandomRealTypeData( - {kernel_h, kernel_w, output_channels, input_channels}, - &float_filter_data); + {kernel_h, kernel_w, output_channels, input_channels}, + &float_filter_data); std::vector float_bias_data; GenerateRandomRealTypeData({output_channels}, &float_bias_data); // Add input data net.AddInputFromArray( - "Input", {batch, height, width, input_channels}, float_input_data); + "Input", {batch, height, width, input_channels}, float_input_data); net.AddInputFromArray( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}, - float_filter_data); 
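
// [Editor's sketch] The half-precision variants above still generate float
// test data (GenerateRandomRealTypeData) and let the device side cast to
// DT_HALF, which is why those tests run with looser tolerances than the float
// paths. A plausible shape for such a generator (illustrative only; the real
// helper takes a shape vector rather than a flat size):
#include <random>
#include <vector>

void GenerateRandomFloats(std::size_t size, std::vector<float> *data) {
  std::mt19937 gen(std::random_device{}());
  std::normal_distribution<float> dis(0.0f, 1.0f);
  data->resize(size);
  for (float &v : *data) v = dis(gen);  // values the half range represents well
}
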
+ "Filter", {kernel_h, kernel_w, output_channels, input_channels}, + float_filter_data); net.AddInputFromArray("Bias", {output_channels}, float_bias_data); // run on cpu @@ -404,15 +404,15 @@ static void TestHalfComplexConvNxNS12(const std::vector &shape) { kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataType::DT_HALF)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); @@ -433,7 +433,7 @@ TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) { TestHalfComplexConvNxNS12({32, 32, 32, 64}); } -template +template static void TestGeneralConvNxNS12(const std::vector &image_shape, const std::vector &filter_shape) { testing::internal::LogToStderr(); @@ -451,20 +451,20 @@ static void TestGeneralConvNxNS12(const std::vector &image_shape, // Construct graph OpsTestNet net; OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {kernel_h, kernel_w, output_channels, input_channels}); net.AddRandomInput("Bias", {output_channels}); // run on cpu @@ -482,15 +482,15 @@ static void TestGeneralConvNxNS12(const std::vector &image_shape, kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); @@ -513,7 +513,7 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) { TestGeneralConvNxNS12({40, 40}, {15, 1, 32, 64}); } -template +template static void TestAtrousConvNxN(const std::vector &shape, const int dilation) { testing::internal::LogToStderr(); @@ -530,20 +530,20 @@ static void TestAtrousConvNxN(const std::vector &shape, // Construct graph OpsTestNet net; OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", 
{stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {dilation, dilation}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {dilation, dilation}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {kernel_h, kernel_w, output_channels, input_channels}); net.AddRandomInput("Bias", {output_channels}); // run on cpu @@ -561,15 +561,15 @@ static void TestAtrousConvNxN(const std::vector &shape, kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {dilation, dilation}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {dilation, dilation}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); @@ -598,7 +598,7 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) { TestAtrousConvNxN({107, 113, 5, 7}, 2); } -template +template static void TestGeneralHalfAtrousConv(const std::vector &image_shape, const std::vector &filter_shape, const std::vector &dilations) { @@ -617,20 +617,20 @@ static void TestGeneralHalfAtrousConv(const std::vector &image_shape, // Construct graph OpsTestNet net; OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", {batch, height, width, input_channels}); net.AddRandomInput( - "Filter", {kernel_h, kernel_w, output_channels, input_channels}); + "Filter", {kernel_h, kernel_w, output_channels, input_channels}); net.AddRandomInput("Bias", {output_channels}); // run on cpu @@ -648,15 +648,15 @@ static void TestGeneralHalfAtrousConv(const std::vector &image_shape, kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {stride_h, stride_w}) - .AddIntArg("padding", type) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Run on 
device
     net.RunOp(D);
@@ -679,6 +679,79 @@ TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
                          {2, 2});
 }
 
+static void TestNEONGeneralConvNxNS12(
+    const std::vector<index_t> &image_shape,
+    const std::vector<index_t> &filter_shape) {
+  testing::internal::LogToStderr();
+  auto func = [&](int stride_h, int stride_w, Padding type) {
+    srand(time(NULL));
+
+    // generate random input
+    index_t batch = 1;
+    index_t height = image_shape[0];
+    index_t width = image_shape[1];
+    index_t input_channels = filter_shape[2];
+    index_t output_channels = filter_shape[3];
+    index_t kernel_h = filter_shape[0];
+    index_t kernel_w = filter_shape[1];
+    // Construct graph
+    OpsTestNet net;
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
+        .Finalize(net.NewOperatorDef());
+
+    // Add input data
+    net.AddRandomInput<DeviceType::CPU, float>("Input",
+        {batch, height, width, input_channels});
+    net.AddRandomInput<DeviceType::CPU, float>(
+        "Filter", {kernel_h, kernel_w, output_channels, input_channels});
+    net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
+
+    // run on cpu
+    net.RunOp();
+
+    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
+        .Input("InputNeon")
+        .Input("FilterNeon")
+        .Input("Bias")
+        .Output("OutputNeon")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
+        .Finalize(net.NewOperatorDef());
+
+    net.FillNHWCInputToNCHWInput<DeviceType::NEON, float>("InputNeon", "Input");
+    net.FillHWOIInputToOIHWInput<DeviceType::NEON, float>("FilterNeon",
+                                                          "Filter");
+
+    // Run on device
+    net.RunOp(DeviceType::NEON);
+
+    net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
+                                                         "Output");
+
+    ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
+                            *net.GetOutput("OutputNeon"),
+                            0.001);
+  };
+
+  for (int stride : {1, 2}) {
+    func(stride, stride, VALID);
+    func(stride, stride, SAME);
+  }
+}
+
+TEST_F(FusedConv2dOpTest, NEONTest) {
+  TestNEONGeneralConvNxNS12({32, 32}, {7, 7, 3, 64});
+}
 } // namespace test
 } // namespace ops
 } // namespace mace
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index ece4885717ad9336f753be57499f4c2eec87e548..595f8000348ad4a8087fc01787d0d2e5932a3b80 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -105,12 +105,12 @@ class OpsTestNet {
  public:
   OpsTestNet() : op_registry_(new OperatorRegistry()) {}
 
-  template <DeviceType D, typename T>
+  template<DeviceType D, typename T>
   void AddInputFromArray(const std::string &name,
                          const std::vector<index_t> &shape,
                          const std::vector<T> &data) {
     Tensor *input =
-        ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
+      ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
     input->Resize(shape);
     Tensor::MappingGuard input_mapper(input);
     T *input_data = input->mutable_data<T>();
@@ -118,24 +118,24 @@ class OpsTestNet {
     memcpy(input_data, data.data(), data.size() * sizeof(T));
   }
 
-  template <DeviceType D, typename T>
+  template<DeviceType D, typename T>
   void AddRepeatedInput(const std::string &name,
                         const std::vector<index_t> &shape,
                         const T data) {
     Tensor *input =
-        ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
+      ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
     input->Resize(shape);
     Tensor::MappingGuard input_mapper(input);
     T *input_data = input->mutable_data<T>();
     std::fill(input_data, input_data + input->size(), data);
   }
 
-  template <DeviceType D, typename T>
+  template<DeviceType D, typename T>
   void AddRandomInput(const std::string &name,
                       const std::vector<index_t> &shape,
                       bool positive = false) {
     Tensor *input =
-        ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
+      ws_.CreateTensor(name, GetDeviceAllocator(D), DataTypeToEnum<T>::v());
     input->Resize(shape);
     Tensor::MappingGuard input_mapper(input);
     T *input_data = input->mutable_data<T>();
@@ -145,10 +145,10 @@ class OpsTestNet {
     std::normal_distribution<float> nd(0, 1);
     if (DataTypeToEnum<T>::value == DT_HALF) {
       std::generate(
-          input_data, input_data + input->size(), [&gen, &nd, positive] {
-            return half_float::half_cast<half>(positive ? std::abs(nd(gen))
-                                                        : nd(gen));
-          });
+        input_data, input_data + input->size(), [&gen, &nd, positive] {
+          return half_float::half_cast<half>(positive ? std::abs(nd(gen))
+                                                      : nd(gen));
+        });
     } else {
       std::generate(input_data, input_data + input->size(),
                     [&gen, &nd, positive] {
@@ -157,6 +157,84 @@ class OpsTestNet {
     }
   }
 
+  template<DeviceType D, typename T>
+  void FillNHWCInputToNCHWInput(const std::string &name_nchw,
+                                const std::string &name_nhwc) {
+    Tensor *input = ws_.GetTensor(name_nhwc);
+    Tensor *output = ws_.CreateTensor(name_nchw,
+                                      GetDeviceAllocator(D),
+                                      DataTypeToEnum<T>::v());
+    const std::vector<index_t> input_shape = input->shape();
+    index_t batch = input_shape[0];
+    index_t height = input_shape[1];
+    index_t width = input_shape[2];
+    index_t channels = input_shape[3];
+    output->Resize({batch, channels, height, width});
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+    for (index_t b = 0; b < batch; ++b) {
+      for (index_t c = 0; c < channels; ++c) {
+        for (index_t h = 0; h < height; ++h) {
+          for (index_t w = 0; w < width; ++w) {
+            output_data[((b * channels + c) * height + h) * width + w] =
+                input_data[((b * height + h) * width + w) * channels + c];
+          }
+        }
+      }
+    }
+  }
+
+  template<DeviceType D, typename T>
+  void FillHWOIInputToOIHWInput(const std::string &name_oihw,
+                                const std::string &name_hwoi) {
+    Tensor *input = ws_.GetTensor(name_hwoi);
+    Tensor *output = ws_.CreateTensor(name_oihw,
+                                      GetDeviceAllocator(D),
+                                      DataTypeToEnum<T>::v());
+    const std::vector<index_t> input_shape = input->shape();
+    index_t height = input_shape[0];
+    index_t width = input_shape[1];
+    index_t out_channels = input_shape[2];
+    index_t in_channels = input_shape[3];
+    index_t hw = height * width;
+    index_t oi = out_channels * in_channels;
+    output->Resize({out_channels, in_channels, height, width});
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+    for (index_t i = 0; i < oi; ++i) {
+      for (index_t j = 0; j < hw; ++j) {
+        output_data[i * height * width + j] =
+            input_data[j * out_channels * in_channels + i];
+      }
+    }
+  }
+
+  template<DeviceType D, typename T>
+  void FillHWIOInputToOIHWInput(const std::string &name_oihw,
+                                const std::string &name_hwio) {
+    Tensor *input = ws_.GetTensor(name_hwio);
+    Tensor *output = ws_.CreateTensor(name_oihw,
+                                      GetDeviceAllocator(D),
+                                      DataTypeToEnum<T>::v());
+    const std::vector<index_t> input_shape = input->shape();
+    index_t height = input_shape[0];
+    index_t width = input_shape[1];
+    index_t in_channels = input_shape[2];
+    index_t out_channels = input_shape[3];
+    index_t hw = height * width;
+    output->Resize({out_channels, in_channels, height, width});
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>();
+    for (index_t m = 0; m < out_channels; ++m) {
+      for (index_t c = 0; c < in_channels; ++c) {
+        for (index_t k = 0; k < hw; ++k) {
+          output_data[((m * in_channels) + c) * height * width + k] =
+              input_data[k * out_channels * in_channels + c * out_channels + m];
+        }
+      }
+    }
+  }
+
   OperatorDef *NewOperatorDef() {
     op_defs_.clear();
     op_defs_.emplace_back(OperatorDef());
@@ -165,17 +243,35 @@
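The three Fill*Input helpers above are the bridge between the NHWC/HWIO reference path and the NCHW/OIHW layouts the NEON kernels consume, and they all reduce to the same flat-offset arithmetic. A minimal self-contained sketch of the NHWC-to-NCHW case (the names Shape4 and NHWCToNCHW are illustrative only, not part of the patch):

#include <cassert>
#include <vector>

// Sketch of the offset arithmetic behind FillNHWCInputToNCHWInput above.
struct Shape4 { int n, h, w, c; };

std::vector<float> NHWCToNCHW(const std::vector<float> &src, const Shape4 &s) {
  assert(static_cast<int>(src.size()) == s.n * s.h * s.w * s.c);
  std::vector<float> dst(src.size());
  for (int b = 0; b < s.n; ++b) {
    for (int c = 0; c < s.c; ++c) {
      for (int h = 0; h < s.h; ++h) {
        for (int w = 0; w < s.w; ++w) {
          // NCHW offset: ((b * C + c) * H + h) * W + w
          // NHWC offset: ((b * H + h) * W + w) * C + c
          dst[((b * s.c + c) * s.h + h) * s.w + w] =
              src[((b * s.h + h) * s.w + w) * s.c + c];
        }
      }
    }
  }
  return dst;
}

The filter conversions (HWOI/HWIO to OIHW) are the same idea with the spatial and channel axes playing opposite roles.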
Workspace *ws() { return &ws_; } - bool RunOp(DeviceType device) { + bool Setup(DeviceType device) { NetDef net_def; for (auto &op_def_ : op_defs_) { net_def.add_op()->CopyFrom(op_def_); } net_ = CreateNet(op_registry_, net_def, &ws_, device); device_ = device; + return net_ != nullptr; + } + + bool Run() { + MACE_CHECK_NOTNULL(net_); return net_->Run(); } - bool RunOp() { return RunOp(DeviceType::CPU); } + // DEPRECATED(liyin): + // Test and benchmark should setup model once and run multiple times. + // Setup time should not be counted during benchmark. + bool RunOp(DeviceType device) { + Setup(device); + return Run(); + } + + // DEPRECATED(liyin): + // Test and benchmark should setup model once and run multiple times. + // Setup time should not be counted during benchmark. + bool RunOp() { + return RunOp(DeviceType::CPU); + } Tensor *GetOutput(const char *output_name) { return ws_.GetTensor(output_name); @@ -210,7 +306,7 @@ class OpsTestBase : public ::testing::Test { } }; -template +template void GenerateRandomRealTypeData(const std::vector &shape, std::vector *res) { MACE_CHECK_NOTNULL(res); @@ -231,7 +327,7 @@ void GenerateRandomRealTypeData(const std::vector &shape, } } -template +template void GenerateRandomIntTypeData(const std::vector &shape, std::vector *res, const T a = 0, @@ -249,7 +345,7 @@ void GenerateRandomIntTypeData(const std::vector &shape, std::generate(res->begin(), res->end(), [&gen, &nd] { return nd(gen); }); } -template +template std::vector VectorStaticCast(const std::vector &&src) { std::vector dest; dest.reserve(src.size()); @@ -259,11 +355,11 @@ std::vector VectorStaticCast(const std::vector &&src) { return std::move(dest); } -template +template std::unique_ptr CreateTensor(const std::vector &shape, const std::vector &data) { std::unique_ptr res( - new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum::v())); + new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum::v())); res->Resize(shape); T *input_data = res->mutable_data(); memcpy(input_data, data.data(), data.size() * sizeof(T)); @@ -293,24 +389,24 @@ inline std::string ShapeToString(const Tensor &x) { return std::string(stream.str()); } -template +template struct is_floating_point_type { static const bool value = std::is_same::value || - std::is_same::value || - std::is_same::value; + std::is_same::value || + std::is_same::value; }; -template +template inline void ExpectEqual(const T &a, const T &b) { EXPECT_EQ(a, b); } -template <> +template<> inline void ExpectEqual(const float &a, const float &b) { EXPECT_FLOAT_EQ(a, b); } -template <> +template<> inline void ExpectEqual(const double &a, const double &b) { EXPECT_DOUBLE_EQ(a, b); } @@ -320,13 +416,13 @@ inline void AssertSameDims(const Tensor &x, const Tensor &y) { << "y.shape [ " << ShapeToString(y) << "]"; } -template ::value> +template::value> struct Expector; // Partial specialization for float and double. 
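The Setup()/Run() split that deprecates RunOp above exists so a benchmark can pay graph construction once and time only steady-state iterations. A sketch of the intended call pattern, assuming the OpsTestNet API introduced in this header (the timing helper itself is illustrative, not part of the patch):

#include <chrono>
#include <cstdio>

#include "mace/ops/ops_test_util.h"

// Assume this helper lives alongside the tests, inside the same namespaces
// that ops_test_util.h uses, so OpsTestNet and DeviceType resolve directly.
void BenchmarkNet(OpsTestNet *net, int iterations) {
  net->Setup(DeviceType::NEON);  // graph construction: not timed
  net->Run();                    // warm-up run
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < iterations; ++i) {
    net->Run();                  // steady-state runs: timed
  }
  auto end = std::chrono::steady_clock::now();
  double ms = std::chrono::duration<double, std::milli>(end - start).count();
  std::printf("avg latency: %.3f ms over %d runs\n", ms / iterations, iterations);
}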
-template +template struct Expector { static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); } @@ -373,22 +469,22 @@ struct Expector { } }; -template +template void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { static_assert(is_floating_point_type::value, "T is not a floating point type"); Expector::Near(x, y, abs_err); } -template +template void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { static_assert(is_floating_point_type::value && - is_floating_point_type::value, + is_floating_point_type::value, "T is not a floating point type"); Expector::Near(x, y, abs_err); } -template +template void BufferToImage(OpsTestNet *net, const std::string &input_name, const std::string &output_name, @@ -396,11 +492,11 @@ void BufferToImage(OpsTestNet *net, MACE_CHECK_NOTNULL(net); OpDefBuilder("BufferToImage", "BufferToImageTest") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net->NewOperatorDef()); + .Input(input_name) + .Output(output_name) + .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net->NewOperatorDef()); // Run net->RunOp(D); @@ -408,7 +504,7 @@ void BufferToImage(OpsTestNet *net, net->Sync(); } -template +template void ImageToBuffer(OpsTestNet *net, const std::string &input_name, const std::string &output_name, @@ -416,11 +512,11 @@ void ImageToBuffer(OpsTestNet *net, MACE_CHECK_NOTNULL(net); OpDefBuilder("ImageToBuffer", "ImageToBufferTest") - .Input(input_name) - .Output(output_name) - .AddIntArg("buffer_type", type) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net->NewOperatorDef()); + .Input(input_name) + .Output(output_name) + .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net->NewOperatorDef()); // Run net->RunOp(D); diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 166a71ade097b34249b865b9d8ded158f415a22a..e74dd8442c9c55837770dc0ce1a39dd860ef5a01 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -29,6 +29,11 @@ void Register_Pooling(OperatorRegistry *op_registry) { .TypeConstraint("T") .Build(), PoolingOp); + REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") + .Device(DeviceType::NEON) + .TypeConstraint("T") + .Build(), + PoolingOp); } } // namespace ops diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index dd81a4bb3cbd0b4fb0b7e5f5ee105810ce19df09..d74b6fdf8eed926455a551f1b6254fc8e19c6d43 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -19,27 +19,27 @@ TEST_F(PoolingOpTest, MAX_VALID) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("pooling_type", PoolingType::MAX) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Output("Output") + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::MAX) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( - "Input", {1, 4, 4, 2}, - {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 
21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); // Run net.RunOp(); // Check auto expected = - CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); + CreateTensor({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } @@ -48,14 +48,14 @@ TEST_F(PoolingOpTest, MAX_SAME) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("pooling_type", PoolingType::MAX) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Output("Output") + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::MAX) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray("Input", {1, 3, 3, 1}, @@ -74,19 +74,19 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {2, 2}) - .AddIntArg("pooling_type", PoolingType::MAX) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Output("Output") + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {2, 2}) + .AddIntArg("pooling_type", PoolingType::MAX) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( - "Input", {1, 4, 4, 1}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + "Input", {1, 4, 4, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); // Run net.RunOp(); @@ -101,19 +101,19 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Output("Output") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( - "Input", {1, 2, 9, 1}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); + "Input", {1, 2, 9, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}); // Run net.RunOp(); @@ -123,43 +123,43 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -template +template static void SimpleMaxPooling3S2() { // Construct graph OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 3, 9, 1}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); + "Input", {1, 3, 9, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}); if (D == DeviceType::OPENCL) { BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") - 
.AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); net.RunOp(D); ImageToBuffer(&net, "OutputImage", "Output", kernels::BufferType::IN_OUT_CHANNEL); } else { // Run OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Output("Output") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); net.RunOp(D); } @@ -175,22 +175,22 @@ TEST_F(PoolingOpTest, OPENCLSimpleMaxPooling3S2) { SimpleMaxPooling3S2(); } -template +template static void MaxPooling3S2(const std::vector &input_shape, const std::vector strides, Padding padding) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", strides) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Output("Output") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", input_shape); @@ -203,15 +203,15 @@ static void MaxPooling3S2(const std::vector &input_shape, BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntArg("pooling_type", PoolingType::MAX) - .AddIntsArg("kernels", {3, 3}) - .AddIntsArg("strides", strides) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::MAX) + .AddIntsArg("kernels", {3, 3}) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); net.RunOp(D); ImageToBuffer(&net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL); @@ -250,52 +250,52 @@ TEST_F(PoolingOpTest, AVG_VALID) { // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("pooling_type", PoolingType::AVG) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Output("Output") + .AddIntsArg("kernels", 
{2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("pooling_type", PoolingType::AVG) + .Finalize(net.NewOperatorDef()); // Add input data net.AddInputFromArray( - "Input", {1, 4, 4, 2}, - {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); + "Input", {1, 4, 4, 2}, + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); // Run net.RunOp(); // Check auto expected = CreateTensor( - {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); + {1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -template +template static void SimpleAvgPoolingTest() { // Construct graph OpsTestNet net; // Add input data net.AddInputFromArray( - "Input", {1, 2, 8, 1}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + "Input", {1, 2, 8, 1}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", {2, 2}) - .AddIntsArg("strides", {2, 2}) - .AddIntArg("padding", Padding::SAME) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::AVG) + .AddIntsArg("kernels", {2, 2}) + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); ImageToBuffer(&net, "OutputImage", "Output", @@ -311,7 +311,7 @@ TEST_F(PoolingOpTest, OPENCLSimpleAvgPooling) { SimpleAvgPoolingTest(); } -template +template static void AvgPoolingTest(const std::vector &shape, const std::vector &kernels, const std::vector &strides, @@ -319,14 +319,14 @@ static void AvgPoolingTest(const std::vector &shape, // Construct graph OpsTestNet net; OpDefBuilder("Pooling", "PoolingTest") - .Input("Input") - .Output("Output") - .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", kernels) - .AddIntsArg("strides", strides) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Input("Input") + .Output("Output") + .AddIntArg("pooling_type", PoolingType::AVG) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); // Add input data net.AddRandomInput("Input", shape); @@ -339,15 +339,15 @@ static void AvgPoolingTest(const std::vector &shape, BufferToImage(&net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL); OpDefBuilder("Pooling", "PoolingTest") - .Input("InputImage") - .Output("OutputImage") - .AddIntArg("pooling_type", PoolingType::AVG) - .AddIntsArg("kernels", kernels) - .AddIntsArg("strides", strides) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) - .Finalize(net.NewOperatorDef()); + .Input("InputImage") + .Output("OutputImage") + .AddIntArg("pooling_type", PoolingType::AVG) + .AddIntsArg("kernels", kernels) + .AddIntsArg("strides", strides) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", 
static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
     net.RunOp(D);
     ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
                             kernels::BufferType::IN_OUT_CHANNEL);
@@ -396,6 +396,62 @@ TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
                  Padding::SAME);
 }
 
+static void AvgPoolingNEONTest(const std::vector<index_t> &shape,
+                               const std::vector<int> &kernels,
+                               const std::vector<int> &strides,
+                               Padding padding,
+                               PoolingType pooling_type) {
+  // Construct graph
+  OpsTestNet net;
+  OpDefBuilder("Pooling", "PoolingTest")
+      .Input("Input")
+      .Output("Output")
+      .AddIntArg("pooling_type", pooling_type)
+      .AddIntsArg("kernels", kernels)
+      .AddIntsArg("strides", strides)
+      .AddIntArg("padding", padding)
+      .AddIntsArg("dilations", {1, 1})
+      .Finalize(net.NewOperatorDef());
+
+  // Add input data
+  net.AddRandomInput<DeviceType::CPU, float>("Input", shape);
+
+  // run on cpu
+  net.RunOp();
+
+  OpDefBuilder("Pooling", "PoolingTest")
+      .Input("InputNeon")
+      .Output("OutputNeon")
+      .AddIntArg("pooling_type", pooling_type)
+      .AddIntsArg("kernels", kernels)
+      .AddIntsArg("strides", strides)
+      .AddIntArg("padding", padding)
+      .AddIntsArg("dilations", {1, 1})
+      .Finalize(net.NewOperatorDef());
+
+  net.FillNHWCInputToNCHWInput<DeviceType::NEON, float>("InputNeon", "Input");
+
+  // run on neon
+  net.RunOp(DeviceType::NEON);
+
+  net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
+                                                       "Output");
+
+  ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
+                          *net.GetOutput("OutputNeon"),
+                          0.01);
+}
+
+TEST_F(PoolingOpTest, NEONTest) {
+  AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
+                     Padding::VALID, PoolingType::MAX);
+  AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
+                     Padding::SAME, PoolingType::MAX);
+  AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
+                     Padding::VALID, PoolingType::AVG);
+  AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
+                     Padding::SAME, PoolingType::AVG);
+}
 } // namespace test
 } // namespace ops
 } // namespace mace
diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc
index 6b8ead8164736f59a59b6079992c7bbd28a8b1b8..d8c9cd49dcbf8867f3eeabbcfff00dc271930f2c 100644
--- a/mace/ops/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -25,6 +25,11 @@ void Register_Softmax(OperatorRegistry *op_registry) {
                                      .TypeConstraint<half>("T")
                                      .Build(),
                     SoftmaxOp<DeviceType::OPENCL, half>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
+                                     .Device(DeviceType::NEON)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    SoftmaxOp<DeviceType::NEON, float>);
 }
 
 } // namespace ops
diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc
index b4ad23097db6474df978f1537572aecac11b6dfa..97afa33069ff4ac083b5ea8af1d2d8ac9de3333b 100644
--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -11,7 +11,7 @@ namespace test {
 
 class SoftmaxOpTest : public OpsTestBase {};
 
-template <DeviceType D>
+template<DeviceType D>
 void Simple() {
   // Construct graph
   OpsTestNet net;
@@ -24,9 +24,9 @@ void Simple() {
                             kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Softmax", "SoftmaxTest")
-        .Input("InputImage")
-        .Output("OutputImage")
-        .Finalize(net.NewOperatorDef());
+      .Input("InputImage")
+      .Output("OutputImage")
+      .Finalize(net.NewOperatorDef());
 
     // Run
     net.RunOp(D);
@@ -36,17 +36,17 @@ void Simple() {
                             kernels::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Softmax", "SoftmaxTest")
-        .Input("Input")
-        .Output("Output")
-        .Finalize(net.NewOperatorDef());
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
 
     // Run
     net.RunOp(D);
   }
 
   auto expected = CreateTensor<float>(
-      {1, 1, 2, 4},
-      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
+    {1, 1, 2, 4},
+    {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282,
+     0.64391426});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-7);
 }
@@ -54,7 +54,7 @@ void Simple() {
 TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
 TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::OPENCL>(); }
 
-template <DeviceType D>
+template<DeviceType D>
 void Complex(const std::vector<index_t> &logits_shape) {
   // Construct graph
   OpsTestNet net;
@@ -62,9 +62,9 @@ void Complex(const std::vector<index_t> &logits_shape) {
   net.AddRandomInput<DeviceType::CPU, float>("Input", logits_shape);
 
   OpDefBuilder("Softmax", "SoftmaxTest")
-      .Input("Input")
-      .Output("Output")
-      .Finalize(net.NewOperatorDef());
+    .Input("Input")
+    .Output("Output")
+    .Finalize(net.NewOperatorDef());
 
   // Run on cpu
   net.RunOp();
@@ -75,9 +75,9 @@ void Complex(const std::vector<index_t> &logits_shape) {
                           kernels::BufferType::IN_OUT_CHANNEL);
 
   OpDefBuilder("Softmax", "SoftmaxTest")
-      .Input("InputImage")
-      .Output("OutputImage")
-      .Finalize(net.NewOperatorDef());
+    .Input("InputImage")
+    .Output("OutputImage")
+    .Finalize(net.NewOperatorDef());
 
   // Run on gpu
   net.RunOp(D);
@@ -104,6 +104,45 @@ TEST_F(SoftmaxOpTest, OPENCLUnAligned) {
   Complex<DeviceType::OPENCL>({5, 211, 107, 1});
 }
 
+void SoftMaxNEONTest(const std::vector<index_t> &logits_shape) {
+  // Construct graph
+  OpsTestNet net;
+  // Add input data
+  net.AddRandomInput<DeviceType::CPU, float>("Input", logits_shape);
+
+  OpDefBuilder("Softmax", "SoftmaxTest")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  // Run on cpu
+  net.RunOp();
+
+  OpDefBuilder("Softmax", "SoftmaxTest")
+      .Input("InputNeon")
+      .Output("OutputNeon")
+      .Finalize(net.NewOperatorDef());
+
+  net.FillNHWCInputToNCHWInput<DeviceType::NEON, float>("InputNeon", "Input");
+
+  // run on neon
+  net.RunOp(DeviceType::NEON);
+
+  net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
+                                                       "Output");
+
+  ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
+                          *net.GetOutput("OutputNeon"),
+                          0.01);
+}
+
+TEST_F(SoftmaxOpTest, NEONTest) {
+  SoftMaxNEONTest({5, 64, 64, 3});
+  SoftMaxNEONTest({8, 128, 128, 8});
+  SoftMaxNEONTest({1, 113, 107, 13});
+  SoftMaxNEONTest({5, 211, 107, 1});
+}
+
 } // namespace test
 } // namespace ops
 } // namespace mace
diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7634a62b46838185ec2bbad3d6edc0a5b4063d60
--- /dev/null
+++ b/mace/ops/transpose.cc
@@ -0,0 +1,25 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/transpose.h"
+
+namespace mace {
+namespace ops {
+
+void Register_Transpose(OperatorRegistry *op_registry) {
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
+                                     .Device(DeviceType::CPU)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    TransposeOp<DeviceType::CPU, float>);
+
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
+                                     .Device(DeviceType::NEON)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    TransposeOp<DeviceType::NEON, float>);
+}
+
+} // namespace ops
+} // namespace mace
diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d04bae5c75fab9072eab600ccb465317df1bf3b
--- /dev/null
+++ b/mace/ops/transpose.h
@@ -0,0 +1,50 @@
+//
+// Copyright (c) 2018 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_OPS_TRANSPOSE_H_
+#define MACE_OPS_TRANSPOSE_H_
+
+#include <vector>
+
+#include "mace/core/operator.h"
+#include "mace/kernels/transpose.h"
+#include "mace/kernels/softmax.h"
+
+namespace mace {
+
+template <DeviceType D, typename T>
+class TransposeOp : public Operator<D, T> {
+ public:
+  TransposeOp(const OperatorDef &operator_def, Workspace *ws)
+      : Operator<D, T>(operator_def, ws),
+        dims_(OperatorBase::GetRepeatedArgument<int>(
+            "dims")),
+        functor_(dims_) {}
+
+  bool Run(StatsFuture *future) override {
+    const Tensor *input = this->Input(INPUT);
+    Tensor *output = this->Output(OUTPUT);
+    const std::vector<index_t> &input_shape = input->shape();
+    MACE_CHECK(input_shape.size() == 4 && dims_.size() == 4,
+               "rank should be 4");
+    std::vector<index_t> output_shape;
+    for (int i = 0; i < dims_.size(); ++i) {
+      output_shape.push_back(input_shape[dims_[i]]);
+    }
+    output->Resize(output_shape);
+    functor_(input, output, future);
+    return true;
+  }
+
+ protected:
+  std::vector<int> dims_;
+  kernels::TransposeFunctor<D, T> functor_;
+
+  OP_INPUT_TAGS(INPUT);
+  OP_OUTPUT_TAGS(OUTPUT);
+};
+
+} // namespace mace
+
+#endif // MACE_OPS_TRANSPOSE_H_
diff --git a/mace/ops/transpose_test.cc b/mace/ops/transpose_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ecc8d08d916b7df82da767d01ac5148bb56f1445
--- /dev/null
+++ b/mace/ops/transpose_test.cc
@@ -0,0 +1,43 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/operator.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+class TransposeOpTest : public OpsTestBase {};
+
+void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
+  // Construct graph
+  OpsTestNet net;
+  // Add input data
+  net.AddRandomInput<DeviceType::CPU, float>("Input", input_shape);
+
+  OpDefBuilder("Transpose", "TransposeNCHWTest")
+      .Input("Input")
+      .Output("Output")
+      .AddIntsArg("dims", {0, 3, 1, 2})
+      .Finalize(net.NewOperatorDef());
+
+  // Run on cpu
+  net.RunOp();
+
+  net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNCHW", "Input");
+
+  ExpectTensorNear<float>(*net.GetOutput("InputNCHW"),
+                          *net.GetOutput("Output"),
+                          0.01);
+}
+
+TEST_F(TransposeOpTest, NCHW) {
+  TransposeNCHWTest({3, 64, 64, 128});
+  TransposeNCHWTest({1, 64, 48, 128});
+}
+
+} // namespace test
+} // namespace ops
+} // namespace mace
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index 93fbea92a8f283a3646e3746ad496bf35a050f15..61f3e92621a9f7e50ac4e556d8c10fce47132b69 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -8,7 +8,6 @@ from mace.python.tools import memory_optimizer
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import tensor_shape_pb2
 
-# TODO: support NCHW formt, now only support NHWC.
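TransposeOp above only computes the output shape from dims and defers the data movement to kernels::TransposeFunctor, whose body is not part of this diff. For reference, a naive rank-4 permutation with the same dims semantics (dims[i] is the input axis that becomes output axis i, so {0, 3, 1, 2} maps NHWC to NCHW) could look like the sketch below; the real functor is presumably a cache- and NEON-friendly variant of this:

#include <cstdint>
#include <vector>

// Naive rank-4 transpose sketch, matching TransposeOp's shape rule
// output_shape[i] = input_shape[dims[i]]. Illustrative only.
void Transpose4D(const float *input, const std::vector<int64_t> &in_shape,
                 const std::vector<int> &dims, float *output) {
  std::vector<int64_t> out_shape(4);
  for (int i = 0; i < 4; ++i) out_shape[i] = in_shape[dims[i]];
  // Row-major strides of the input tensor.
  const int64_t in_stride[4] = {in_shape[1] * in_shape[2] * in_shape[3],
                                in_shape[2] * in_shape[3], in_shape[3], 1};
  int64_t idx = 0;
  for (int64_t i = 0; i < out_shape[0]; ++i)
    for (int64_t j = 0; j < out_shape[1]; ++j)
      for (int64_t k = 0; k < out_shape[2]; ++k)
        for (int64_t l = 0; l < out_shape[3]; ++l)
          // Output coordinate (i, j, k, l) reads the input element whose
          // coordinate along axis dims[n] equals the n-th output coordinate.
          output[idx++] = input[i * in_stride[dims[0]] + j * in_stride[dims[1]] +
                                k * in_stride[dims[2]] + l * in_stride[dims[3]]];
}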
padding_mode = { 'VALID': 0, 'SAME': 1, @@ -133,7 +132,7 @@ class TFConverter(object): arg.i = self.dt return output_name - def add_input_transform(self, names): + def add_gpu_input_transform(self, names): for name in names: new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" op_def = self.net_def.op.add() @@ -150,7 +149,24 @@ class TFConverter(object): arg.name = 'T' arg.i = self.dt - def add_output_transform(self, names): + def add_neon_input_transform(self, names): + for name in names: + new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'Transpose' + op_def.input.extend([new_input_name]) + op_def.output.extend([name+':0']) + + dims_arg = op_def.arg.add() + dims_arg.name = 'dims' + dims_arg.ints.extend([0, 3, 1, 2]) + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + def add_gpu_output_transform(self, names): for name in names: output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" op_def = self.net_def.op.add() @@ -163,6 +179,19 @@ class TFConverter(object): epsilon_arg.name = 'buffer_type' epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL'] + def add_neon_output_transform(self, names): + for name in names: + output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'Transpose' + op_def.input.extend([name+':0']) + op_def.output.extend([output_name]) + + dims_arg = op_def.arg.add() + dims_arg.name = 'dims' + dims_arg.ints.extend([0, 2, 3, 1]) + @staticmethod def add_output_shape(outputs, op): output_shapes = [] @@ -335,9 +364,14 @@ class TFConverter(object): op_def.name = op.name if op.type == 'DepthwiseConv2dNative': op_def.type = 'DepthwiseConv2d' + if self.device == 'neon': + self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (3, 2, 0, 1) else: op_def.type = op.type - self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (0, 1, 3, 2) + if self.device == 'neon': + self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (3, 2, 0, 1) + else: + self.transpose_filter_tensor[get_input_tensor(op, 1).name] = (0, 1, 3, 2) if self.device == 'gpu': op_def.input.extend([op.inputs[0].name]) buffer_type = "DW_CONV2D_FILTER" if op_def.type == 'DepthwiseConv2d' else "CONV2D_FILTER" @@ -354,7 +388,10 @@ class TFConverter(object): strides_arg.ints.extend(op.get_attr('strides')[1:3]) data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' final_op = op self.resolved_ops[op.name] = 1 @@ -394,7 +431,10 @@ class TFConverter(object): arg.i = self.dt data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' op_def.name = op.name op_def.type = 'FoldedBatchNorm' @@ -497,7 +537,10 @@ class TFConverter(object): epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float) data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' self.unused_tensor.add(get_input_tensor(op, 1).name) self.net_def.op.extend([op_def]) @@ -528,7 +571,10 @@ class TFConverter(object): kernels_arg.ints.extend(op.get_attr('ksize')[1:3]) data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' 
+ if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' self.resolved_ops[op.name] = 1 def convert_global_avg_pooling(self, op): @@ -555,7 +601,10 @@ class TFConverter(object): kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3]) data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' self.resolved_ops[op.name] = 1 def convert_activation(self, op): @@ -771,7 +820,10 @@ class TFConverter(object): strides_arg.ints.extend([1, 1]) data_format_arg = op_def.arg.add() data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' + if self.device == 'neon': + data_format_arg.s = 'NCHW' + else: + data_format_arg.s = 'NHWC' final_op = conv_op self.resolved_ops[op.name] = 1 self.resolved_ops[conv_op.name] = 1 @@ -879,7 +931,9 @@ class TFConverter(object): def convert(self, input_nodes, output_nodes): if self.device == 'gpu': - self.add_input_transform(input_nodes) + self.add_gpu_input_transform(input_nodes) + if self.device == 'neon': + self.add_neon_input_transform(input_nodes) for op in self.tf_ops: if self.resolved_ops[op.name] == 1: @@ -957,7 +1011,10 @@ class TFConverter(object): raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) if self.device == 'gpu': - self.add_output_transform(output_nodes) + self.add_gpu_output_transform(output_nodes) + + if self.device == 'neon': + self.add_neon_output_transform(output_nodes) if self.device == 'cpu': self.replace_in_out_name(input_nodes, output_nodes) @@ -1007,12 +1064,20 @@ class Optimizer: scale_tensor = self.tensor_map[scale_buffer_name] weight_shape = weight_tensor.dims idx = 0 - for i in range(weight_shape[0]): - for j in range(weight_shape[1]): - for ic in range(weight_shape[2]): - for oc in range(weight_shape[3]): - weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[3] + oc] - idx += 1 + if self.device == 'neon': # OIHW + for oc in range(weight_shape[0]): + for ic in range(weight_shape[1]): + for i in range(weight_shape[2]): + for j in range(weight_shape[3]): + weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[0] + oc] + idx += 1 + else: # HWIO + for i in range(weight_shape[0]): + for j in range(weight_shape[1]): + for ic in range(weight_shape[2]): + for oc in range(weight_shape[3]): + weight_tensor.float_data[idx] *= scale_tensor.float_data[ic * weight_shape[3] + oc] + idx += 1 new_tensors.append(weight_tensor) unused_tensors.add(weight_tensor.name) diff --git a/tools/benchmark.sh b/tools/benchmark.sh index 409bfeb4a2b368de06af136f9846e09f13301fc4..a546ca0e1281d301f0900f396377dee722bf100b 100644 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -1,5 +1,6 @@ #!/bin/bash +set -x Usage() { echo "Usage: bash tools/benchmark.sh target_soc model_output_dir option_args" } @@ -70,6 +71,7 @@ else --copt="-DMACE_OBFUSCATE_LITERALS" \ --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \ --define openmp=true \ + --define neon=true \ --copt="-O3" \ --define production=true || exit 1 @@ -85,7 +87,7 @@ else adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/benchmark_model \ ${PHONE_DATA_DIR} > /dev/null || exit 1 if [ "$EMBED_MODEL_DATA" = 0 ]; then - adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data + adb -s $DEVICE_ID push ${MODEL_OUTPUT_DIR}/${MODEL_TAG}.data \ ${PHONE_DATA_DIR} > /dev/null || exit 1 fi diff --git a/tools/build_mace_run.sh b/tools/build_mace_run.sh index 
4606fde6ca4a2299200266873b831a7113134a27..658ae26745f330cd0dded36da0f24d85d1bb9361 100644
--- a/tools/build_mace_run.sh
+++ b/tools/build_mace_run.sh
@@ -57,6 +57,7 @@ else
     --copt="-DMACE_OBFUSCATE_LITERALS" \
     --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
     --define openmp=true \
+    --define neon=true \
     --copt="-O3" \
     $NEON_ENABLE_FLAG \
     $PRODUCTION_MODE_BUILD_FLAGS \
diff --git a/tools/env.sh b/tools/env.sh
index e61180e1dd32a0fca24c886e9aab2cf2a5542c53..5ef841a1ed14eb75f8a75dc3f7b87d0ec7b55db6 100644
--- a/tools/env.sh
+++ b/tools/env.sh
@@ -23,6 +23,9 @@ elif [ x"$RUNTIME" = x"gpu" ]; then
 elif [ x"$RUNTIME" = x"cpu" ]; then
   DATA_TYPE="DT_FLOAT"
   DEVICE_TYPE="CPU"
+elif [ x"$RUNTIME" = x"neon" ]; then
+  DATA_TYPE="DT_FLOAT"
+  DEVICE_TYPE="NEON"
 fi
 
 GENERATED_MODEL_LIB_NAME="libgenerated_models.a"
diff --git a/tools/mace_tools.py b/tools/mace_tools.py
index c952bafa5604a79c38443e976ce2c297cce0d1e6..f6892fa1e0727a9aa16a84865f77d35f1772b116 100644
--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -50,6 +50,8 @@ def get_global_runtime(configs):
     global_runtime = "gpu"
   elif "cpu" in runtime_list:
     global_runtime = "cpu"
+  elif "neon" in runtime_list:
+    global_runtime = "neon"
   else:
     raise Exception("Not found available RUNTIME in config files!")
 
@@ -379,3 +381,4 @@ def main(unused_args):
 if __name__ == "__main__":
   FLAGS, unparsed = parse_args()
   main(unused_args=[sys.argv[0]] + unparsed)
+
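One subtlety worth spelling out from the tf_converter_lib.py change earlier in this patch: when batch-norm scales are folded into filter weights, the loop order follows the tensor layout (oc varies fastest in HWIO, the spatial position varies fastest in OIHW), but both branches multiply each weight by the scale of its (ic, oc) channel pair. The same arithmetic written out over flat arrays (function names are illustrative, and this assumes the scale buffer is indexed as ic * O + oc exactly as the Python loops above do):

#include <cstdint>

// HWIO flat index: ((h * W + w) * I + ic) * O + oc
void FoldScaleHWIO(float *w, const float *scale, int H, int W, int I, int O) {
  int64_t idx = 0;
  for (int h = 0; h < H; ++h)
    for (int x = 0; x < W; ++x)
      for (int ic = 0; ic < I; ++ic)
        for (int oc = 0; oc < O; ++oc)
          w[idx++] *= scale[ic * O + oc];
}

// OIHW flat index: ((oc * I + ic) * H + h) * W + w
void FoldScaleOIHW(float *w, const float *scale, int O, int I, int H, int W) {
  int64_t idx = 0;
  for (int oc = 0; oc < O; ++oc)
    for (int ic = 0; ic < I; ++ic)
      for (int h = 0; h < H; ++h)
        for (int x = 0; x < W; ++x)
          w[idx++] *= scale[ic * O + oc];
}

Both loops visit every weight exactly once in its layout's storage order, so the fold is layout-independent in effect even though the index arithmetic differs.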