diff --git a/mace/core/common.h b/mace/core/common.h
index b52526f7d3f4c5a6227fd6681f20feee9c510576..75060255f45c7241b68f3a33fcea46cd14a8535f 100644
--- a/mace/core/common.h
+++ b/mace/core/common.h
@@ -32,7 +32,6 @@ typedef int64_t index_t;
 
 #define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented")
 
-// TODO: need to fine tune this
-#define kCostPerGroup 1024000000
+#define kCostPerGroup 10240
 
 #endif  // MACE_CORE_COMMON_H_
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index ecd5af3eb4f714021ddf2fc2f3abc64af583cee0d..a421770bd35dd7ded1b02ada17e0264f583cee0d 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -41,6 +41,8 @@ bool Workspace::RemoveTensor(const string& name) {
 const Tensor* Workspace::GetTensor(const string& name) const {
   if (tensor_map_.count(name)) {
     return tensor_map_.at(name).get();
+  } else {
+    LOG(WARNING) << "Tensor " << name << " does not exist.";
   }
   return nullptr;
 }
diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc
index 19f621d4f34ae7d64eddb188254f65e3738a8d2d..fed0c4e1e97726aaed46115de236515250b3c8ea 100644
--- a/mace/kernels/neon/addn_neon.cc
+++ b/mace/kernels/neon/addn_neon.cc
@@ -21,7 +21,7 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
   }
   int64_t element_per_group = size / groups;
 
-#pragma omp parallel for num_threads(1)  // no significant performance improve
+#pragma omp parallel for
   for (int64_t i = 0; i < size; i += element_per_group) {
     int64_t count = std::min(element_per_group, size - i);
     int nn = count >> 2;
diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc
index 426d8c222da5c27fc79a158ba8dd4594c8ca63cb..7898e9b0c444a52b311cf9b9cbd4a95e4427352e 100644
--- a/mace/kernels/neon/relu_neon.cc
+++ b/mace/kernels/neon/relu_neon.cc
@@ -13,7 +13,7 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
                                                        float *output,
                                                        index_t size) {
   if (max_limit_ < 0) {
-#pragma omp parallel for num_threads(1)  // no significant perf improve
+#pragma omp parallel for
     for (int64_t i = 0; i < size; i += kCostPerGroup) {
       int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
       int block = count >> 2;
@@ -36,7 +36,7 @@
       }
     }
   } else {
-#pragma omp parallel for num_threads(1)  // no significant perf improve
+#pragma omp parallel for
     for (int64_t i = 0; i < size; i += kCostPerGroup) {
      int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
       int block = count >> 2;