From 0c54e0207d5fd8b25071eebbc806fd14562ca0e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= Date: Mon, 18 Sep 2017 13:17:38 +0800 Subject: [PATCH] Implement ReluN --- mace/kernels/neon/relu_neon.cc | 63 ++++++++++++++++++++++++---------- mace/kernels/relu.h | 12 +++++-- mace/ops/pooling_test.cc | 8 ++--- mace/ops/relu.h | 5 ++- mace/ops/relu_test.cc | 27 ++++++++++++++- 5 files changed, 89 insertions(+), 26 deletions(-) diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc index b03b8960..426d8c22 100644 --- a/mace/kernels/neon/relu_neon.cc +++ b/mace/kernels/neon/relu_neon.cc @@ -12,26 +12,53 @@ template <> void ReluFunctor::operator()(const float *input, float *output, index_t size) { -#pragma omp parallel for num_threads(1) // no significant performance improve - for (int64_t i = 0; i < size; i += kCostPerGroup) { - int64_t count = std::min(static_cast(kCostPerGroup), size - i); - int nn = count >> 2; - int remain = count - (nn << 2); - const float *inptr = input + i; - float *outptr = output + i; - float32x4_t _zero = vdupq_n_f32(0.f); - for (; nn > 0; --nn) { - float32x4_t _inptr = vld1q_f32(inptr); - float32x4_t _outptr = vmaxq_f32(_inptr, _zero); - vst1q_f32(outptr, _outptr); + if (max_limit_ < 0) { +#pragma omp parallel for num_threads(1) // no significant perf improve + for (int64_t i = 0; i < size; i += kCostPerGroup) { + int64_t count = std::min(static_cast(kCostPerGroup), size - i); + int block = count >> 2; + int remain = count - (block << 2); + const float *inptr = input + i; + float *outptr = output + i; + float32x4_t zero = vdupq_n_f32(0.f); + for (; block > 0; --block) { + float32x4_t in = vld1q_f32(inptr); + float32x4_t out = vmaxq_f32(in, zero); + vst1q_f32(outptr, out); - inptr += 4; - outptr += 4; + inptr += 4; + outptr += 4; + } + for (; remain > 0; --remain) { + *outptr = std::max(*inptr, 0.f); + ++inptr; + ++outptr; + } } - for (; remain > 0; --remain) { - *outptr = std::max(*inptr, 0.f); - ++inptr; - ++outptr; + } else { +#pragma omp parallel for num_threads(1) // no significant perf improve + for (int64_t i = 0; i < size; i += kCostPerGroup) { + int64_t count = std::min(static_cast(kCostPerGroup), size - i); + int block = count >> 2; + int remain = count - (block << 2); + const float *inptr = input + i; + float *outptr = output + i; + float32x4_t zero = vdupq_n_f32(0.f); + float32x4_t vmax = vdupq_n_f32(max_limit_); + for (; block > 0; --block) { + float32x4_t in = vld1q_f32(inptr); + float32x4_t out = vmaxq_f32(in, zero); + out = vminq_f32(out, vmax); + vst1q_f32(outptr, out); + + inptr += 4; + outptr += 4; + } + for (; remain > 0; --remain) { + *outptr = std::min(std::max(*inptr, 0.f), max_limit_); + ++inptr; + ++outptr; + } } } }; diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h index 79788f03..71cd07ab 100644 --- a/mace/kernels/relu.h +++ b/mace/kernels/relu.h @@ -12,9 +12,17 @@ namespace kernels { template struct ReluFunctor { + T max_limit_; + void operator()(const T *input, T *output, index_t size) { - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input[i], static_cast(0)); + if (max_limit_ < 0) { + for (index_t i = 0; i < size; ++i) { + output[i] = std::max(input[i], static_cast(0)); + } + } else { + for (index_t i = 0; i < size; ++i) { + output[i] = std::min(std::max(input[i], static_cast(0)), max_limit_); + } } } }; diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index 7ca43f19..3972743c 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -155,9 +155,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) { net.RunOp(DeviceType::NEON); // Check - Tensor expected = CreateTensor({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19}); + auto expected = CreateTensor({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } TEST_F(PoolingOpTest, MAX_k3x3s2x2) { @@ -183,7 +183,7 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) { net.RunOp(DeviceType::NEON); // Check - Tensor expected = CreateTensor({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19}); + auto expected = CreateTensor({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19}); - ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } diff --git a/mace/ops/relu.h b/mace/ops/relu.h index c195c78f..5f68cca9 100644 --- a/mace/ops/relu.h +++ b/mace/ops/relu.h @@ -14,7 +14,10 @@ template class ReluOp : public Operator { public: ReluOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} + : Operator(operator_def, ws) { + functor_.max_limit_ = + OperatorBase::GetSingleArgument("max_limit", static_cast(-1)); + } bool Run() override { const Tensor* input_tensor = this->inputs_[0]; Tensor* output_tensor = this->outputs_[0]; diff --git a/mace/ops/relu_test.cc b/mace/ops/relu_test.cc index 1277722c..bf4c8100 100644 --- a/mace/ops/relu_test.cc +++ b/mace/ops/relu_test.cc @@ -18,7 +18,7 @@ TEST_F(ReluOpTest, ReluOp) { .Finalize(net.operator_def()); // Add input data - net.AddRandomInput("Input", {1, 2, 3, 4}); + net.AddRandomInput("Input", {1, 2, 3, 5}); // Run net.RunOp(); @@ -32,4 +32,29 @@ TEST_F(ReluOpTest, ReluOp) { ExpectTensorNear(expected, *net.GetOutput("Output"), 0.01); } +TEST_F(ReluOpTest, ReluOpWithMax) { + // Construct graph + auto& net = test_net(); + OpDefBuilder("Relu", "ReluTestWithMax") + .Input("Input") + .Output("Output") + .Finalize(net.operator_def()); + + // Add input data + net.AddRandomInput("Input", {1, 2, 3, 5}); + net.AddFloatArg("max_limit", 0.5); + + // Run + net.RunOp(); + + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // Check + net.RunOp(DeviceType::NEON); + + ExpectTensorNear(expected, *net.GetOutput("Output"), 0.01); +} + + } // namespace mace -- GitLab