From 0c54e0207d5fd8b25071eebbc806fd14562ca0e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= <liyin@xiaomi.com>
Date: Mon, 18 Sep 2017 13:17:38 +0800
Subject: [PATCH] Implement ReluN

---
 mace/kernels/neon/relu_neon.cc | 63 ++++++++++++++++++++++++----------
 mace/kernels/relu.h            | 12 +++++--
 mace/ops/pooling_test.cc       |  8 ++---
 mace/ops/relu.h                |  5 ++-
 mace/ops/relu_test.cc          | 27 ++++++++++++++-
 5 files changed, 89 insertions(+), 26 deletions(-)
diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc
index b03b8960..426d8c22 100644
--- a/mace/kernels/neon/relu_neon.cc
+++ b/mace/kernels/neon/relu_neon.cc
@@ -12,26 +12,53 @@ template <>
 void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
                                                       float *output,
                                                       index_t size) {
-#pragma omp parallel for num_threads(1)  // no significant performance improve
-  for (int64_t i = 0; i < size; i += kCostPerGroup) {
-    int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
-    int nn = count >> 2;
-    int remain = count - (nn << 2);
-    const float *inptr = input + i;
-    float *outptr = output + i;
-    float32x4_t _zero = vdupq_n_f32(0.f);
-    for (; nn > 0; --nn) {
-      float32x4_t _inptr = vld1q_f32(inptr);
-      float32x4_t _outptr = vmaxq_f32(_inptr, _zero);
-      vst1q_f32(outptr, _outptr);
+  if (max_limit_ < 0) {
+#pragma omp parallel for num_threads(1)  // no significant perf improve
+    for (int64_t i = 0; i < size; i += kCostPerGroup) {
+      int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
+      int block = count >> 2;
+      int remain = count - (block << 2);
+      const float *inptr = input + i;
+      float *outptr = output + i;
+      float32x4_t zero = vdupq_n_f32(0.f);
+      for (; block > 0; --block) {
+        float32x4_t in = vld1q_f32(inptr);
+        float32x4_t out = vmaxq_f32(in, zero);
+        vst1q_f32(outptr, out);
 
-      inptr += 4;
-      outptr += 4;
+        inptr += 4;
+        outptr += 4;
+      }
+      for (; remain > 0; --remain) {
+        *outptr = std::max(*inptr, 0.f);
+        ++inptr;
+        ++outptr;
+      }
     }
-    for (; remain > 0; --remain) {
-      *outptr = std::max(*inptr, 0.f);
-      ++inptr;
-      ++outptr;
+  } else {
+#pragma omp parallel for num_threads(1)  // no significant perf improve
+    for (int64_t i = 0; i < size; i += kCostPerGroup) {
+      int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
+      int block = count >> 2;
+      int remain = count - (block << 2);
+      const float *inptr = input + i;
+      float *outptr = output + i;
+      float32x4_t zero = vdupq_n_f32(0.f);
+      float32x4_t vmax = vdupq_n_f32(max_limit_);
+      for (; block > 0; --block) {
+        float32x4_t in = vld1q_f32(inptr);
+        float32x4_t out = vmaxq_f32(in, zero);
+        out = vminq_f32(out, vmax);
+        vst1q_f32(outptr, out);
+
+        inptr += 4;
+        outptr += 4;
+      }
+      for (; remain > 0; --remain) {
+        *outptr = std::min(std::max(*inptr, 0.f), max_limit_);
+        ++inptr;
+        ++outptr;
+      }
     }
   }
 };
diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h
index 79788f03..71cd07ab 100644
--- a/mace/kernels/relu.h
+++ b/mace/kernels/relu.h
@@ -12,9 +12,17 @@ namespace kernels {
 
 template <DeviceType D, typename T>
 struct ReluFunctor {
+  T max_limit_;
+
   void operator()(const T *input, T *output, index_t size) {
-    for (index_t i = 0; i < size; ++i) {
-      output[i] = std::max(input[i], static_cast<T>(0));
+    if (max_limit_ < 0) {
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::max(input[i], static_cast<T>(0));
+      }
+    } else {
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::min(std::max(input[i], static_cast<T>(0)), max_limit_);
+      }
     }
   }
 };
diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc
index 7ca43f19..3972743c 100644
--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -155,9 +155,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
   net.RunOp(DeviceType::NEON);
 
   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
 
-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
 
 TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
@@ -183,7 +183,7 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
   net.RunOp(DeviceType::NEON);
 
   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
 
-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
diff --git a/mace/ops/relu.h b/mace/ops/relu.h
index c195c78f..5f68cca9 100644
--- a/mace/ops/relu.h
+++ b/mace/ops/relu.h
@@ -14,7 +14,10 @@ template <DeviceType D, class T>
 class ReluOp : public Operator<D, T> {
  public:
   ReluOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<D, T>(operator_def, ws) {}
+      : Operator<D, T>(operator_def, ws) {
+      functor_.max_limit_ =
+          OperatorBase::GetSingleArgument<T>("max_limit", static_cast<T>(-1));
+  }
   bool Run() override {
     const Tensor* input_tensor = this->inputs_[0];
     Tensor* output_tensor = this->outputs_[0];
diff --git a/mace/ops/relu_test.cc b/mace/ops/relu_test.cc
index 1277722c..bf4c8100 100644
--- a/mace/ops/relu_test.cc
+++ b/mace/ops/relu_test.cc
@@ -18,7 +18,7 @@ TEST_F(ReluOpTest, ReluOp) {
       .Finalize(net.operator_def());
 
   // Add input data
-  net.AddRandomInput<float>("Input", {1, 2, 3, 4});
+  net.AddRandomInput<float>("Input", {1, 2, 3, 5});
 
   // Run
   net.RunOp();
@@ -32,4 +32,29 @@ TEST_F(ReluOpTest, ReluOp) {
   ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
 }
 
+TEST_F(ReluOpTest, ReluOpWithMax) {
+  // Construct graph
+  auto& net = test_net();
+  OpDefBuilder("Relu", "ReluTestWithMax")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Add input data
+  net.AddRandomInput<float>("Input", {1, 2, 3, 5});
+  net.AddFloatArg("max_limit", 0.5);
+
+  // Run
+  net.RunOp();
+
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Check
+  net.RunOp(DeviceType::NEON);
+
+  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
+}
+
+
 }  // namespace mace
-- 
GitLab