Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
慢慢CG
Mace
提交
0c54e020
Mace
项目概览
慢慢CG
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
0c54e020
编写于
9月 18, 2017
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Implement ReluN
上级
291a5ee6
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
89 additions
and
26 deletions
+89
-26
mace/kernels/neon/relu_neon.cc
mace/kernels/neon/relu_neon.cc
+45
-18
mace/kernels/relu.h
mace/kernels/relu.h
+10
-2
mace/ops/pooling_test.cc
mace/ops/pooling_test.cc
+4
-4
mace/ops/relu.h
mace/ops/relu.h
+4
-1
mace/ops/relu_test.cc
mace/ops/relu_test.cc
+26
-1
未找到文件。
mace/kernels/neon/relu_neon.cc
浏览文件 @
0c54e020
...
...
@@ -12,26 +12,53 @@ template <>
// NEON specialization of ReluFunctor: computes output[i] = max(input[i], 0),
// optionally clamped above by max_limit_ (ReLU-N / ReLU6-style activation).
// A negative max_limit_ selects plain ReLU with no upper bound.
// Data is processed in chunks of kCostPerGroup elements; within each chunk,
// four floats at a time go through NEON registers and the (count % 4) tail
// falls back to scalar std::max / std::min.
template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
                                                      float *output,
                                                      index_t size) {
  if (max_limit_ < 0) {
    // Plain ReLU path: no upper clamp.
#pragma omp parallel for num_threads(1)  // no significant perf improve
    for (int64_t i = 0; i < size; i += kCostPerGroup) {
      int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
      int block = count >> 2;             // number of full 4-float vectors
      int remain = count - (block << 2);  // scalar tail (0..3 elements)
      const float *inptr = input + i;
      float *outptr = output + i;
      float32x4_t zero = vdupq_n_f32(0.f);
      for (; block > 0; --block) {
        float32x4_t in = vld1q_f32(inptr);
        float32x4_t out = vmaxq_f32(in, zero);
        vst1q_f32(outptr, out);
        inptr += 4;
        outptr += 4;
      }
      for (; remain > 0; --remain) {
        *outptr = std::max(*inptr, 0.f);
        ++inptr;
        ++outptr;
      }
    }
  } else {
    // ReluN path: clamp the result into [0, max_limit_].
#pragma omp parallel for num_threads(1)  // no significant perf improve
    for (int64_t i = 0; i < size; i += kCostPerGroup) {
      int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
      int block = count >> 2;
      int remain = count - (block << 2);
      const float *inptr = input + i;
      float *outptr = output + i;
      float32x4_t zero = vdupq_n_f32(0.f);
      float32x4_t vmax = vdupq_n_f32(max_limit_);  // broadcast upper bound
      for (; block > 0; --block) {
        float32x4_t in = vld1q_f32(inptr);
        float32x4_t out = vmaxq_f32(in, zero);
        out = vminq_f32(out, vmax);
        vst1q_f32(outptr, out);
        inptr += 4;
        outptr += 4;
      }
      for (; remain > 0; --remain) {
        *outptr = std::min(std::max(*inptr, 0.f), max_limit_);
        ++inptr;
        ++outptr;
      }
    }
  }
};
...
...
mace/kernels/relu.h
浏览文件 @
0c54e020
...
...
@@ -12,9 +12,17 @@ namespace kernels {
// Generic (scalar) ReLU / ReluN functor, used by devices without a
// specialized implementation. A negative max_limit_ means plain ReLU;
// otherwise the output is clamped into [0, max_limit_].
template <DeviceType D, typename T>
struct ReluFunctor {
  // Upper clamp for ReluN; negative disables the clamp (plain ReLU).
  T max_limit_;

  // Applies the activation element-wise: output[i] = max(input[i], 0),
  // additionally bounded by max_limit_ when it is non-negative.
  void operator()(const T *input, T *output, index_t size) {
    if (max_limit_ < 0) {
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::max(input[i], static_cast<T>(0));
      }
    } else {
      for (index_t i = 0; i < size; ++i) {
        output[i] =
            std::min(std::max(input[i], static_cast<T>(0)), max_limit_);
      }
    }
  }
};
...
...
mace/ops/pooling_test.cc
浏览文件 @
0c54e020
...
...
@@ -155,9 +155,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
net
.
RunOp
(
DeviceType
::
NEON
);
// Check
Tensor
expected
=
CreateTensor
<
float
>
({
1
,
1
,
2
,
3
},
{
6
,
8
,
9
,
16
,
18
,
19
});
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
2
,
3
},
{
6
,
8
,
9
,
16
,
18
,
19
});
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
TEST_F
(
PoolingOpTest
,
MAX_k3x3s2x2
)
{
...
...
@@ -183,7 +183,7 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
net
.
RunOp
(
DeviceType
::
NEON
);
// Check
Tensor
expected
=
CreateTensor
<
float
>
({
1
,
1
,
2
,
3
},
{
11
,
13
,
14
,
16
,
18
,
19
});
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
2
,
3
},
{
11
,
13
,
14
,
16
,
18
,
19
});
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
mace/ops/relu.h
浏览文件 @
0c54e020
...
...
@@ -14,7 +14,10 @@ template <DeviceType D, class T>
class
ReluOp
:
public
Operator
<
D
,
T
>
{
public:
// Constructs the Relu operator and reads the optional "max_limit" argument
// from the operator definition. The default of -1 (negative) makes the
// functor behave as a plain ReLU with no upper clamp; a non-negative value
// turns it into ReluN (output clamped into [0, max_limit]).
ReluOp(const OperatorDef &operator_def, Workspace *ws)
    : Operator<D, T>(operator_def, ws) {
  functor_.max_limit_ =
      OperatorBase::GetSingleArgument<T>("max_limit", static_cast<T>(-1));
}
bool
Run
()
override
{
const
Tensor
*
input_tensor
=
this
->
inputs_
[
0
];
Tensor
*
output_tensor
=
this
->
outputs_
[
0
];
...
...
mace/ops/relu_test.cc
浏览文件 @
0c54e020
...
...
@@ -18,7 +18,7 @@ TEST_F(ReluOpTest, ReluOp) {
.
Finalize
(
net
.
operator_def
());
// Add input data
net
.
AddRandomInput
<
float
>
(
"Input"
,
{
1
,
2
,
3
,
4
});
net
.
AddRandomInput
<
float
>
(
"Input"
,
{
1
,
2
,
3
,
5
});
// Run
net
.
RunOp
();
...
...
@@ -32,4 +32,29 @@ TEST_F(ReluOpTest, ReluOp) {
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.01
);
}
// Verifies that the NEON ReluN implementation matches the reference (CPU)
// implementation when a "max_limit" argument is set: runs the same graph on
// both devices with random input and compares the outputs element-wise.
TEST_F(ReluOpTest, ReluOpWithMax) {
  // Construct graph
  auto &net = test_net();
  OpDefBuilder("Relu", "ReluTestWithMax")
      .Input("Input")
      .Output("Output")
      .Finalize(net.operator_def());

  // Add input data; max_limit of 0.5 exercises the clamped (ReluN) path.
  net.AddRandomInput<float>("Input", {1, 2, 3, 5});
  net.AddFloatArg("max_limit", 0.5);

  // Run on the default device and keep its output as the reference.
  net.RunOp();

  Tensor expected;
  expected.Copy(*net.GetOutput("Output"));

  // Re-run on NEON and check it agrees with the reference output.
  net.RunOp(DeviceType::NEON);
  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
}
}
// namespace mace
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录