Commit 0c54e020 authored by 李寅

Implement ReluN

Parent 291a5ee6
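Here "ReluN" means a ReLU with an upper clamp (ReLU6 is the common special case): each element becomes output[i] = min(max(input[i], 0), max_limit). The diff below threads a max_limit_ field through ReluFunctor and the ReluOp constructor; a negative max_limit_ (the default of -1) selects the original unbounded ReLU path. For example, with max_limit = 0.5 the inputs {-1, 0.25, 2} map to {0, 0.25, 0.5}.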
@@ -12,18 +12,19 @@ template <>
 void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
                                                       float *output,
                                                       index_t size) {
-#pragma omp parallel for num_threads(1)  // no significant performance improve
+  if (max_limit_ < 0) {
+#pragma omp parallel for num_threads(1)  // no significant perf improve
   for (int64_t i = 0; i < size; i += kCostPerGroup) {
     int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
-    int nn = count >> 2;
-    int remain = count - (nn << 2);
+    int block = count >> 2;
+    int remain = count - (block << 2);
     const float *inptr = input + i;
     float *outptr = output + i;
-    float32x4_t _zero = vdupq_n_f32(0.f);
-    for (; nn > 0; --nn) {
-      float32x4_t _inptr = vld1q_f32(inptr);
-      float32x4_t _outptr = vmaxq_f32(_inptr, _zero);
-      vst1q_f32(outptr, _outptr);
+    float32x4_t zero = vdupq_n_f32(0.f);
+    for (; block > 0; --block) {
+      float32x4_t in = vld1q_f32(inptr);
+      float32x4_t out = vmaxq_f32(in, zero);
+      vst1q_f32(outptr, out);
       inptr += 4;
       outptr += 4;
@@ -34,6 +35,32 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
       ++outptr;
     }
   }
+  } else {
+#pragma omp parallel for num_threads(1)  // no significant perf improve
+    for (int64_t i = 0; i < size; i += kCostPerGroup) {
+      int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
+      int block = count >> 2;
+      int remain = count - (block << 2);
+      const float *inptr = input + i;
+      float *outptr = output + i;
+      float32x4_t zero = vdupq_n_f32(0.f);
+      float32x4_t vmax = vdupq_n_f32(max_limit_);
+      for (; block > 0; --block) {
+        float32x4_t in = vld1q_f32(inptr);
+        float32x4_t out = vmaxq_f32(in, zero);
+        out = vminq_f32(out, vmax);
+        vst1q_f32(outptr, out);
+        inptr += 4;
+        outptr += 4;
+      }
+      for (; remain > 0; --remain) {
+        *outptr = std::min(std::max(*inptr, 0.f), max_limit_);
+        ++inptr;
+        ++outptr;
+      }
+    }
+  }
 };
 }  // namespace kernels
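The NEON kernel above clamps four floats per iteration: vmaxq_f32 against a zero vector, then vminq_f32 against the broadcast limit, with a scalar loop for the leftover elements. A minimal standalone sketch of that clamp idiom (assuming a compiler targeting ARM with NEON; the data and limit here are invented for illustration):

    #include <arm_neon.h>
    #include <cstdio>

    int main() {
      const float input[8] = {-2.f, -0.5f, 0.f, 0.3f, 0.5f, 1.f, 2.f, 10.f};
      float output[8];
      float32x4_t zero = vdupq_n_f32(0.f);   // four lanes of 0
      float32x4_t vmax = vdupq_n_f32(0.5f);  // four lanes of the limit
      // Size is divisible by 4 here; the kernel above handles the
      // remainder with a scalar loop.
      for (int i = 0; i < 8; i += 4) {
        float32x4_t in = vld1q_f32(input + i);
        vst1q_f32(output + i, vminq_f32(vmaxq_f32(in, zero), vmax));
      }
      for (int i = 0; i < 8; ++i) printf("%g ", output[i]);
      printf("\n");  // prints: 0 0 0 0.3 0.5 0.5 0.5 0.5
      return 0;
    }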
...
@@ -12,10 +12,18 @@ namespace kernels {
 template <DeviceType D, typename T>
 struct ReluFunctor {
+  T max_limit_;
+
   void operator()(const T *input, T *output, index_t size) {
+    if (max_limit_ < 0) {
     for (index_t i = 0; i < size; ++i) {
       output[i] = std::max(input[i], static_cast<T>(0));
     }
+    } else {
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::min(std::max(input[i], static_cast<T>(0)), max_limit_);
+      }
+    }
   }
 };
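The portable fallback above keeps two separate loops so the common unbounded case never pays an extra min per element. A self-contained sketch of the same sentinel convention (names mirror the diff, but this compiles and runs outside the MACE tree):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    struct ReluFunctorSketch {
      float max_limit_ = -1.f;  // negative sentinel: plain, unbounded ReLU
      void operator()(const float *input, float *output, int64_t size) const {
        if (max_limit_ < 0) {
          for (int64_t i = 0; i < size; ++i)
            output[i] = std::max(input[i], 0.f);
        } else {
          for (int64_t i = 0; i < size; ++i)
            output[i] = std::min(std::max(input[i], 0.f), max_limit_);
        }
      }
    };

    int main() {
      const float in[4] = {-1.f, 0.25f, 0.75f, 2.f};
      float out[4];
      ReluFunctorSketch relu;
      relu.max_limit_ = 0.5f;  // as the op would set it from "max_limit"
      relu(in, out, 4);
      for (float v : out) printf("%g ", v);  // prints: 0 0.25 0.5 0.5
      printf("\n");
      return 0;
    }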
...
@@ -155,9 +155,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
   net.RunOp(DeviceType::NEON);

   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});

-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

 TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
@@ -183,7 +183,7 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
   net.RunOp(DeviceType::NEON);

   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});

-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -14,7 +14,10 @@ template <DeviceType D, class T>
 class ReluOp : public Operator<D, T> {
  public:
   ReluOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<D, T>(operator_def, ws) {}
+      : Operator<D, T>(operator_def, ws) {
+    functor_.max_limit_ =
+        OperatorBase::GetSingleArgument<T>("max_limit", static_cast<T>(-1));
+  }

   bool Run() override {
     const Tensor* input_tensor = this->inputs_[0];
     Tensor* output_tensor = this->outputs_[0];
...
@@ -18,7 +18,7 @@ TEST_F(ReluOpTest, ReluOp) {
       .Finalize(net.operator_def());

   // Add input data
-  net.AddRandomInput<float>("Input", {1, 2, 3, 4});
+  net.AddRandomInput<float>("Input", {1, 2, 3, 5});

   // Run
   net.RunOp();
@@ -32,4 +32,29 @@ TEST_F(ReluOpTest, ReluOp) {
   ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
 }

+TEST_F(ReluOpTest, ReluOpWithMax) {
+  // Construct graph
+  auto& net = test_net();
+  OpDefBuilder("Relu", "ReluTestWithMax")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Add input data
+  net.AddRandomInput<float>("Input", {1, 2, 3, 5});
+  net.AddFloatArg("max_limit", 0.5);
+
+  // Run
+  net.RunOp();
+
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Check
+  net.RunOp(DeviceType::NEON);
+
+  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
+}
+
 }  // namespace mace