From 327b02b35b6fd06098c5bd57d5fdf871ea46c18a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= Date: Tue, 16 Oct 2018 15:45:43 +0800 Subject: [PATCH] Improve performance of activation --- mace/kernels/activation.h | 33 ++++++++++++++ mace/kernels/arm/activation_neon.cc | 71 +++++++++++++++++++++++++++++ mace/kernels/arm/activation_neon.h | 31 +++++++++++++ mace/kernels/conv_2d.h | 46 ++++++++++++------- 4 files changed, 165 insertions(+), 16 deletions(-) create mode 100644 mace/kernels/arm/activation_neon.cc create mode 100644 mace/kernels/arm/activation_neon.h diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 59f7edd8..66ec407f 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -25,6 +25,7 @@ #include "mace/core/tensor.h" #include "mace/core/types.h" #include "mace/kernels/kernel.h" +#include "mace/kernels/arm/activation_neon.h" namespace mace { namespace kernels { @@ -98,6 +99,38 @@ void DoActivation(const T *input_ptr, } } +template<> +inline void DoActivation(const float *input_ptr, + float *output_ptr, + const index_t size, + const ActivationType type, + const float relux_max_limit) { + switch (type) { + case NOOP: + break; + case RELU: + ReluNeon(input_ptr, size, output_ptr); + break; + case RELUX: + ReluxNeon(input_ptr, relux_max_limit, size, output_ptr); + break; + case TANH: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output_ptr[i] = std::tanh(input_ptr[i]); + } + break; + case SIGMOID: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i])); + } + break; + default: + LOG(FATAL) << "Unknown activation type: " << type; + } +} + template void PReLUActivation(const T *input_ptr, const index_t outer_size, diff --git a/mace/kernels/arm/activation_neon.cc b/mace/kernels/arm/activation_neon.cc new file mode 100644 index 00000000..6067077c --- /dev/null +++ b/mace/kernels/arm/activation_neon.cc @@ -0,0 +1,71 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include +#include "mace/kernels/arm/activation_neon.h" + +namespace mace { +namespace kernels { + +void ReluNeon(const float *input, const index_t size, float *output) { +#if defined(MACE_ENABLE_NEON) + float32x4_t vzero = vdupq_n_f32(0.f); +#pragma omp parallel for + for (index_t i = 0; i <= size - 4; i += 4) { + float32x4_t v = vld1q_f32(input + i); + v = vmaxq_f32(v, vzero); + vst1q_f32(output + i, v); + } + // remain + for (index_t i = (size >> 2) << 2; i < size; ++i) { + output[i] = std::max(input[i], 0.f); + } +#else +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::max(input[i], 0.f); + } +#endif +} + +void ReluxNeon(const float *input, const float limit, + const index_t size, float *output) { +#if defined(MACE_ENABLE_NEON) + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vlimit = vdupq_n_f32(limit); +#pragma omp parallel for + for (index_t i = 0; i <= size - 4; i += 4) { + float32x4_t v = vld1q_f32(input + i); + v = vmaxq_f32(v, vzero); + v = vminq_f32(v, vlimit); + vst1q_f32(output + i, v); + } + // remain + for (index_t i = (size >> 2) << 2; i < size; ++i) { + output[i] = std::min(std::max(input[i], 0.f), limit); + } +#else +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::min(std::max(input[i], 0.f), limit); + } +#endif +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/arm/activation_neon.h b/mace/kernels/arm/activation_neon.h new file mode 100644 index 00000000..886c95fe --- /dev/null +++ b/mace/kernels/arm/activation_neon.h @@ -0,0 +1,31 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_ARM_ACTIVATION_NEON_H_ +#define MACE_KERNELS_ARM_ACTIVATION_NEON_H_ + +#include "mace/core/types.h" + +namespace mace { +namespace kernels { + +void ReluNeon(const float *input, const index_t size, float *output); + +void ReluxNeon(const float *input, const float limit, + const index_t size, float *output); + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_ARM_ACTIVATION_NEON_H_ diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index c96b70ef..ebd23576 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -544,6 +544,19 @@ struct Conv2dFunctor : Conv2dFunctorBase { &sgemm_, scratch); }; + } else if (use_neon_1x1_s1) { + conv_func = [=](const float *pad_input, float *pad_output) { + Conv2dNeonK1x1S1(pad_input, + filter_data, + batch, + extra_input_height, + extra_input_width, + input_channels, + channels, + pad_output, + &sgemm_, + scratch); + }; } else if (use_neon_3x3_s1) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK3x3S1(pad_input, @@ -560,19 +573,6 @@ struct Conv2dFunctor : Conv2dFunctorBase { extra_output_shape, pad_output); }; - } else if (use_neon_1x1_s1) { - conv_func = [=](const float *pad_input, float *pad_output) { - Conv2dNeonK1x1S1(pad_input, - filter_data, - batch, - extra_input_height, - extra_input_width, - input_channels, - channels, - pad_output, - &sgemm_, - scratch); - }; } else if (use_neon_5x5_s1) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK5x5S1(pad_input, @@ -699,13 +699,27 @@ struct Conv2dFunctor : Conv2dFunctorBase { } if (bias_data != nullptr) { + const index_t image_size = height * width; #pragma omp parallel for collapse(2) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { - for (index_t i = 0; i < height * width; ++i) { - output_data[(b * channels + c) * height * width + i] += - bias_data[c]; + float *output_ptr = output_data + (b * channels + c) * image_size; + const float bias = bias_data[c]; +#if defined(MACE_ENABLE_NEON) + float32x4_t vbias = vdupq_n_f32(bias); + for (index_t i = 0; i <= image_size - 4; i += 4) { + float32x4_t v = vld1q_f32(output_ptr + i); + v = vaddq_f32(v, vbias); + vst1q_f32(output_ptr + i, v); } + for (index_t i = (image_size >> 2) << 2; i < image_size; ++i) { + output_ptr[i] += bias; + } +#else + for (index_t i = 0; i < image_size; ++i) { + output_ptr[i] += bias; + } +#endif } } } -- GitLab