From 0c4fedc90d73bb942e4d30bf124b19678b324836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= Date: Thu, 28 Sep 2017 12:53:25 +0800 Subject: [PATCH] Improve relu & addn performance using multithread --- mace/core/common.h | 3 +-- mace/core/workspace.cc | 2 ++ mace/kernels/neon/addn_neon.cc | 2 +- mace/kernels/neon/relu_neon.cc | 4 ++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/mace/core/common.h b/mace/core/common.h index b52526f7..75060255 100644 --- a/mace/core/common.h +++ b/mace/core/common.h @@ -32,7 +32,6 @@ typedef int64_t index_t; #define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented") -// TODO: need to fine tune this -#define kCostPerGroup 1024000000 +#define kCostPerGroup 10240 #endif // MACE_CORE_COMMON_H_ diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc index ecd5af3e..a421770b 100644 --- a/mace/core/workspace.cc +++ b/mace/core/workspace.cc @@ -41,6 +41,8 @@ bool Workspace::RemoveTensor(const string& name) { const Tensor* Workspace::GetTensor(const string& name) const { if (tensor_map_.count(name)) { return tensor_map_.at(name).get(); + } else { + LOG(WARNING) << "Tensor " << name << " does not exist."; } return nullptr; } diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc index 19f621d4..fed0c4e1 100644 --- a/mace/kernels/neon/addn_neon.cc +++ b/mace/kernels/neon/addn_neon.cc @@ -21,7 +21,7 @@ void AddNFunctor::operator()( } int64_t element_per_group = size / groups; -#pragma omp parallel for num_threads(1) // no significant performance improve +#pragma omp parallel for for (int64_t i = 0; i < size; i += element_per_group) { int64_t count = std::min(element_per_group, size - i); int nn = count >> 2; diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc index 426d8c22..7898e9b0 100644 --- a/mace/kernels/neon/relu_neon.cc +++ b/mace/kernels/neon/relu_neon.cc @@ -13,7 +13,7 @@ void ReluFunctor::operator()(const float *input, float *output, index_t 
size) { if (max_limit_ < 0) { -#pragma omp parallel for num_threads(1) // no significant perf improve +#pragma omp parallel for for (int64_t i = 0; i < size; i += kCostPerGroup) { int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i); int block = count >> 2; @@ -36,7 +36,7 @@ void ReluFunctor::operator()(const float *input, } } } else { -#pragma omp parallel for num_threads(1) // no significant perf improve +#pragma omp parallel for for (int64_t i = 0; i < size; i += kCostPerGroup) { int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i); int block = count >> 2; -- GitLab