提交 8c00b57e 编写于 作者: L Liangliang He

Merge branch 'master' into 'master'

Improve relu & addn performance using multithreading

See merge request !66
......@@ -32,7 +32,6 @@ typedef int64_t index_t;
#define MACE_NOT_IMPLEMENTED MACE_CHECK(false, "not implemented")
// TODO: need to fine tune this
#define kCostPerGroup 1024000000
#define kCostPerGroup 10240
#endif // MACE_CORE_COMMON_H_
......@@ -41,6 +41,8 @@ bool Workspace::RemoveTensor(const string& name) {
// Looks up a tensor by name in the workspace.
// Returns a non-owning pointer to the tensor, or nullptr (with a warning
// logged) when no tensor with that name exists. The caller must not take
// ownership of the returned pointer.
const Tensor* Workspace::GetTensor(const string& name) const {
  // Single lookup via find() instead of count() + at(), which searched
  // the map twice for the hit path.
  auto iter = tensor_map_.find(name);
  if (iter != tensor_map_.end()) {
    return iter->second.get();
  }
  LOG(WARNING) << "Tensor " << name << " does not exist.";
  return nullptr;
}
......
......@@ -21,7 +21,7 @@ void AddNFunctor<DeviceType::NEON, float>::operator()(
}
int64_t element_per_group = size / groups;
#pragma omp parallel for num_threads(1) // no significant performance improve
#pragma omp parallel for
for (int64_t i = 0; i < size; i += element_per_group) {
int64_t count = std::min(element_per_group, size - i);
int nn = count >> 2;
......
......@@ -13,7 +13,7 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
float *output,
index_t size) {
if (max_limit_ < 0) {
#pragma omp parallel for num_threads(1) // no significant perf improve
#pragma omp parallel for
for (int64_t i = 0; i < size; i += kCostPerGroup) {
int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
int block = count >> 2;
......@@ -36,7 +36,7 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
}
}
} else {
#pragma omp parallel for num_threads(1) // no significant perf improve
#pragma omp parallel for
for (int64_t i = 0; i < size; i += kCostPerGroup) {
int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
int block = count >> 2;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册