From 327b02b35b6fd06098c5bd57d5fdf871ea46c18a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= <liyin@xiaomi.com>
Date: Tue, 16 Oct 2018 15:45:43 +0800
Subject: [PATCH] Improve performance of activation

---
 mace/kernels/activation.h           | 33 ++++++++++++++
 mace/kernels/arm/activation_neon.cc | 71 +++++++++++++++++++++++++++++
 mace/kernels/arm/activation_neon.h  | 31 +++++++++++++
 mace/kernels/conv_2d.h              | 46 ++++++++++++-------
 4 files changed, 165 insertions(+), 16 deletions(-)
 create mode 100644 mace/kernels/arm/activation_neon.cc
 create mode 100644 mace/kernels/arm/activation_neon.h
diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index 59f7edd8..66ec407f 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -25,6 +25,7 @@
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
 #include "mace/kernels/kernel.h"
+#include "mace/kernels/arm/activation_neon.h"
 
 namespace mace {
 namespace kernels {
@@ -98,6 +99,38 @@ void DoActivation(const T *input_ptr,
   }
 }
 
+template<>
+inline void DoActivation(const float *input_ptr,
+                         float *output_ptr,
+                         const index_t size,
+                         const ActivationType type,
+                         const float relux_max_limit) {
+  switch (type) {
+    case NOOP:
+      break;
+    case RELU:
+      ReluNeon(input_ptr, size, output_ptr);
+      break;
+    case RELUX:
+      ReluxNeon(input_ptr, relux_max_limit, size, output_ptr);
+      break;
+    case TANH:
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output_ptr[i] = std::tanh(input_ptr[i]);
+      }
+      break;
+    case SIGMOID:
+#pragma omp parallel for
+      for (index_t i = 0; i < size; ++i) {
+        output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unknown activation type: " << type;
+  }
+}
+
 template <typename T>
 void PReLUActivation(const T *input_ptr,
                      const index_t outer_size,
diff --git a/mace/kernels/arm/activation_neon.cc b/mace/kernels/arm/activation_neon.cc
new file mode 100644
index 00000000..6067077c
--- /dev/null
+++ b/mace/kernels/arm/activation_neon.cc
@@ -0,0 +1,71 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(MACE_ENABLE_NEON)
+#include <arm_neon.h>
+#endif
+
+#include <algorithm>
+#include "mace/kernels/arm/activation_neon.h"
+
+namespace mace {
+namespace kernels {
+
+void ReluNeon(const float *input, const index_t size, float *output) {
+#if defined(MACE_ENABLE_NEON)
+  float32x4_t vzero = vdupq_n_f32(0.f);
+#pragma omp parallel for
+  for (index_t i = 0; i <= size - 4; i += 4) {
+    float32x4_t v = vld1q_f32(input + i);
+    v = vmaxq_f32(v, vzero);
+    vst1q_f32(output + i, v);
+  }
+  // remain
+  for (index_t i = (size >> 2) << 2; i < size; ++i) {
+    output[i] = std::max(input[i], 0.f);
+  }
+#else
+#pragma omp parallel for
+  for (index_t i = 0; i < size; ++i) {
+    output[i] = std::max(input[i], 0.f);
+  }
+#endif
+}
+
+void ReluxNeon(const float *input, const float limit,
+               const index_t size, float *output) {
+#if defined(MACE_ENABLE_NEON)
+  float32x4_t vzero = vdupq_n_f32(0.f);
+  float32x4_t vlimit = vdupq_n_f32(limit);
+#pragma omp parallel for
+  for (index_t i = 0; i <= size - 4; i += 4) {
+    float32x4_t v = vld1q_f32(input + i);
+    v = vmaxq_f32(v, vzero);
+    v = vminq_f32(v, vlimit);
+    vst1q_f32(output + i, v);
+  }
+  // remain
+  for (index_t i = (size >> 2) << 2; i < size; ++i) {
+    output[i] = std::min(std::max(input[i], 0.f), limit);
+  }
+#else
+#pragma omp parallel for
+  for (index_t i = 0; i < size; ++i) {
+    output[i] = std::min(std::max(input[i], 0.f), limit);
+  }
+#endif
+}
+
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/kernels/arm/activation_neon.h b/mace/kernels/arm/activation_neon.h
new file mode 100644
index 00000000..886c95fe
--- /dev/null
+++ b/mace/kernels/arm/activation_neon.h
@@ -0,0 +1,31 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_KERNELS_ARM_ACTIVATION_NEON_H_
+#define MACE_KERNELS_ARM_ACTIVATION_NEON_H_
+
+#include "mace/core/types.h"
+
+namespace mace {
+namespace kernels {
+
+void ReluNeon(const float *input, const index_t size, float *output);
+
+void ReluxNeon(const float *input, const float limit,
+               const index_t size, float *output);
+
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_ARM_ACTIVATION_NEON_H_
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index c96b70ef..ebd23576 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -544,6 +544,19 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                           &sgemm_,
                           scratch);
       };
+    } else if (use_neon_1x1_s1) {
+      conv_func = [=](const float *pad_input, float *pad_output) {
+        Conv2dNeonK1x1S1(pad_input,
+                         filter_data,
+                         batch,
+                         extra_input_height,
+                         extra_input_width,
+                         input_channels,
+                         channels,
+                         pad_output,
+                         &sgemm_,
+                         scratch);
+      };
     } else if (use_neon_3x3_s1) {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK3x3S1(pad_input,
@@ -560,19 +573,6 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                          extra_output_shape,
                          pad_output);
       };
-    } else if (use_neon_1x1_s1) {
-      conv_func = [=](const float *pad_input, float *pad_output) {
-        Conv2dNeonK1x1S1(pad_input,
-                         filter_data,
-                         batch,
-                         extra_input_height,
-                         extra_input_width,
-                         input_channels,
-                         channels,
-                         pad_output,
-                         &sgemm_,
-                         scratch);
-      };
     } else if (use_neon_5x5_s1) {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dNeonK5x5S1(pad_input,
@@ -699,13 +699,27 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
     }
 
     if (bias_data != nullptr) {
+      const index_t image_size = height * width;
 #pragma omp parallel for collapse(2)
       for (index_t b = 0; b < batch; ++b) {
         for (index_t c = 0; c < channels; ++c) {
-          for (index_t i = 0; i < height * width; ++i) {
-            output_data[(b * channels + c) * height * width + i] +=
-              bias_data[c];
+          float *output_ptr = output_data + (b * channels + c) * image_size;
+          const float bias = bias_data[c];
+#if defined(MACE_ENABLE_NEON)
+          float32x4_t vbias = vdupq_n_f32(bias);
+          for (index_t i = 0; i <= image_size - 4; i += 4) {
+            float32x4_t v = vld1q_f32(output_ptr + i);
+            v = vaddq_f32(v, vbias);
+            vst1q_f32(output_ptr + i, v);
           }
+          for (index_t i = (image_size >> 2) << 2; i < image_size; ++i) {
+            output_ptr[i] += bias;
+          }
+#else
+          for (index_t i = 0; i < image_size; ++i) {
+            output_ptr[i] += bias;
+          }
+#endif
         }
       }
     }
-- 
GitLab