Commit a372fa5e authored by 李滨

Merge branch 'quantize' into 'master'

Improve performance of activation

See merge request !833
@@ -25,6 +25,7 @@
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/kernel.h"
#include "mace/kernels/arm/activation_neon.h"

namespace mace {
namespace kernels {

@@ -98,6 +99,38 @@ void DoActivation(const T *input_ptr,
  }
}

// Float specialization: RELU and RELUX are dispatched to the NEON kernels,
// while TANH and SIGMOID fall back to OpenMP-parallelized scalar loops.
template <>
inline void DoActivation(const float *input_ptr,
                         float *output_ptr,
                         const index_t size,
                         const ActivationType type,
                         const float relux_max_limit) {
  switch (type) {
    case NOOP:
      break;
    case RELU:
      ReluNeon(input_ptr, size, output_ptr);
      break;
    case RELUX:
      ReluxNeon(input_ptr, relux_max_limit, size, output_ptr);
      break;
    case TANH:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output_ptr[i] = std::tanh(input_ptr[i]);
      }
      break;
    case SIGMOID:
#pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
        output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
      }
      break;
    default:
      LOG(FATAL) << "Unknown activation type: " << type;
  }
}
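
As context, a minimal call-site sketch for this specialization. The ApplyRelu6 helper is hypothetical, and the include path assumes the file shown above is mace/kernels/activation.h; only DoActivation and the RELUX enumerator come from the change itself.

// Illustrative sketch only, not part of this commit.
#include "mace/kernels/activation.h"  // assumed path of the file shown above

namespace mace {
namespace kernels {

// Clamp every element of `in` to [0, 6] (ReLU6) via the float
// specialization of DoActivation, which routes RELUX to ReluxNeon.
inline void ApplyRelu6(const float *in, float *out, const index_t size) {
  DoActivation(in, out, size, RELUX, 6.0f);
}

}  // namespace kernels
}  // namespace mace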
template <typename T>
void PReLUActivation(const T *input_ptr,
                     const index_t outer_size,
...
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include <algorithm>
#include "mace/kernels/arm/activation_neon.h"
namespace mace {
namespace kernels {
void ReluNeon(const float *input, const index_t size, float *output) {
#if defined(MACE_ENABLE_NEON)
  float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
  for (index_t i = 0; i <= size - 4; i += 4) {
    float32x4_t v = vld1q_f32(input + i);
    v = vmaxq_f32(v, vzero);
    vst1q_f32(output + i, v);
  }
  // Scalar tail for the remaining (size % 4) elements.
  for (index_t i = (size >> 2) << 2; i < size; ++i) {
    output[i] = std::max(input[i], 0.f);
  }
#else
#pragma omp parallel for
  for (index_t i = 0; i < size; ++i) {
    output[i] = std::max(input[i], 0.f);
  }
#endif
}

void ReluxNeon(const float *input, const float limit,
               const index_t size, float *output) {
#if defined(MACE_ENABLE_NEON)
  float32x4_t vzero = vdupq_n_f32(0.f);
  float32x4_t vlimit = vdupq_n_f32(limit);
#pragma omp parallel for
  for (index_t i = 0; i <= size - 4; i += 4) {
    float32x4_t v = vld1q_f32(input + i);
    v = vmaxq_f32(v, vzero);
    v = vminq_f32(v, vlimit);
    vst1q_f32(output + i, v);
  }
  // Scalar tail for the remaining (size % 4) elements.
  for (index_t i = (size >> 2) << 2; i < size; ++i) {
    output[i] = std::min(std::max(input[i], 0.f), limit);
  }
#else
#pragma omp parallel for
  for (index_t i = 0; i < size; ++i) {
    output[i] = std::min(std::max(input[i], 0.f), limit);
  }
#endif
}
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARM_ACTIVATION_NEON_H_
#define MACE_KERNELS_ARM_ACTIVATION_NEON_H_
#include "mace/core/types.h"
namespace mace {
namespace kernels {
void ReluNeon(const float *input, const index_t size, float *output);
void ReluxNeon(const float *input, const float limit,
               const index_t size, float *output);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_ARM_ACTIVATION_NEON_H_
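
A rough self-check sketch for these declarations, comparing ReluxNeon against a scalar reference. The test values, the limit of 6, and the main function are illustrative; only ReluNeon/ReluxNeon are from the sources above, and index_t is assumed to be defined in namespace mace by mace/core/types.h.

// Illustrative sketch only, not part of this commit.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>
#include "mace/kernels/arm/activation_neon.h"

int main() {
  const mace::index_t n = 7;  // deliberately not a multiple of 4, to hit the scalar tail
  std::vector<float> in = {-2.f, -0.5f, 0.f, 1.f, 3.f, 6.5f, 10.f};
  std::vector<float> out(static_cast<size_t>(n));
  mace::kernels::ReluxNeon(in.data(), 6.f, n, out.data());
  for (size_t i = 0; i < in.size(); ++i) {
    const float ref = std::min(std::max(in[i], 0.f), 6.f);
    if (std::fabs(out[i] - ref) > 1e-6f) {
      std::printf("mismatch at index %zu\n", i);
      return 1;
    }
  }
  return 0;
}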
@@ -544,6 +544,19 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
&sgemm_,
scratch);
};
} else if (use_neon_1x1_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK1x1S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
channels,
pad_output,
&sgemm_,
scratch);
};
} else if (use_neon_3x3_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S1(pad_input,
@@ -560,19 +573,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
extra_output_shape,
pad_output);
};
} else if (use_neon_1x1_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK1x1S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
channels,
pad_output,
&sgemm_,
scratch);
};
} else if (use_neon_5x5_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK5x5S1(pad_input,
@@ -699,13 +699,27 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
}

if (bias_data != nullptr) {
  const index_t image_size = height * width;
#pragma omp parallel for collapse(2)
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      float *output_ptr = output_data + (b * channels + c) * image_size;
      const float bias = bias_data[c];
#if defined(MACE_ENABLE_NEON)
      float32x4_t vbias = vdupq_n_f32(bias);
      for (index_t i = 0; i <= image_size - 4; i += 4) {
        float32x4_t v = vld1q_f32(output_ptr + i);
        v = vaddq_f32(v, vbias);
        vst1q_f32(output_ptr + i, v);
      }
      // Scalar tail for the remaining (image_size % 4) elements.
      for (index_t i = (image_size >> 2) << 2; i < image_size; ++i) {
        output_ptr[i] += bias;
      }
#else
      for (index_t i = 0; i < image_size; ++i) {
        output_ptr[i] += bias;
      }
#endif
    }
  }
}
...
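
The broadcast-and-add pattern introduced for the bias above, pulled out as a standalone sketch. AddBiasPlane is a hypothetical name, not a MACE function; the intrinsics, the MACE_ENABLE_NEON guard, and mace::index_t mirror the diff.

// Illustrative sketch only, not part of this commit.
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
#include "mace/core/types.h"

namespace {

// Add one channel's bias to a contiguous H*W plane of `size` floats:
// NEON handles groups of four, a scalar loop handles the tail.
inline void AddBiasPlane(float *ptr, const float bias, const mace::index_t size) {
#if defined(MACE_ENABLE_NEON)
  const float32x4_t vbias = vdupq_n_f32(bias);
  for (mace::index_t i = 0; i <= size - 4; i += 4) {
    float32x4_t v = vld1q_f32(ptr + i);
    v = vaddq_f32(v, vbias);
    vst1q_f32(ptr + i, v);
  }
  // (size >> 2) << 2 is the first index after the last full group of four.
  for (mace::index_t i = (size >> 2) << 2; i < size; ++i) {
    ptr[i] += bias;
  }
#else
  for (mace::index_t i = 0; i < size; ++i) {
    ptr[i] += bias;
  }
#endif
}

}  // namespace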