From 0c4fedc90d73bb942e4d30bf124b19678b324836 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= <liyin@xiaomi.com>
Date: Thu, 28 Sep 2017 12:53:25 +0800
Subject: [PATCH] Improve relu & addn performance using multithreading

---
 mace/core/common.h             | 3 +--
 mace/core/workspace.cc         | 2 ++
 mace/kernels/neon/addn_neon.cc | 2 +-
 mace/kernels/neon/relu_neon.cc | 4 ++--
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/mace/core/common.h b/mace/core/common.h
index b52526f7..75060255 100644
--- a/mace/core/common.h
+++ b/mace/core/common.h
@@ -32,7 +32,6 @@ typedef int64_t index_t;
 
 #define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented")
 
-// TODO: need to fine tune this
-#define kCostPerGroup 1024000000
+#define kCostPerGroup 10240
 
 #endif  // MACE_CORE_COMMON_H_
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index ecd5af3e..a421770b 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -41,6 +41,8 @@ bool Workspace::RemoveTensor(const string& name) {
 const Tensor* Workspace::GetTensor(const string& name) const {
   if (tensor_map_.count(name)) {
     return tensor_map_.at(name).get();
+  } else {
+    LOG(WARNING) << "Tensor " << name << " does not exist.";
   }
   return nullptr;
 }
diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc
index 19f621d4..fed0c4e1 100644
--- a/mace/kernels/neon/addn_neon.cc
+++ b/mace/kernels/neon/addn_neon.cc
@@ -21,7 +21,7 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
   }
   int64_t element_per_group = size / groups;
 
-#pragma omp parallel for num_threads(1)  // no significant performance improve
+#pragma omp parallel for
   for (int64_t i = 0; i < size; i += element_per_group) {
     int64_t count = std::min(element_per_group, size - i);
     int nn = count >> 2;
diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc
index 426d8c22..7898e9b0 100644
--- a/mace/kernels/neon/relu_neon.cc
+++ b/mace/kernels/neon/relu_neon.cc
@@ -13,7 +13,7 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
                                                       float *output,
                                                       index_t size) {
   if (max_limit_ < 0) {
-#pragma omp parallel for num_threads(1)  // no significant perf improve
+#pragma omp parallel for
     for (int64_t i = 0; i < size; i += kCostPerGroup) {
       int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
       int block = count >> 2;
@@ -36,7 +36,7 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
       }
     }
   } else {
-#pragma omp parallel for num_threads(1)  // no significant perf improve
+#pragma omp parallel for
     for (int64_t i = 0; i < size; i += kCostPerGroup) {
       int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
       int block = count >> 2;
-- 
GitLab