提交 cd0a494c 编写于 作者: B Bin Li 提交者: 赵奇可

Refactor gemmlowp context and optimize op quantize

上级 98756dbc
...@@ -72,7 +72,7 @@ extra_tests: ...@@ -72,7 +72,7 @@ extra_tests:
platform_compatible_tests: platform_compatible_tests:
stage: platform_compatible_tests stage: platform_compatible_tests
script: script:
- bazel build mace/core:core - bazel build mace/core:core --define openmp=true
build_libraries: build_libraries:
stage: build_libraries stage: build_libraries
......
...@@ -260,7 +260,8 @@ int Main(int argc, char **argv) { ...@@ -260,7 +260,8 @@ int Main(int argc, char **argv) {
// config runtime // config runtime
MaceStatus ret = mace::SetOpenMPThreadPolicy( MaceStatus ret = mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy)); static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
true);
if (ret != MACE_SUCCESS) { if (ret != MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(WARNING) << "Set openmp or cpu affinity failed.";
} }
......
...@@ -64,6 +64,7 @@ cc_library( ...@@ -64,6 +64,7 @@ cc_library(
"//mace/codegen:generated_version", "//mace/codegen:generated_version",
"//mace/proto:mace_cc", "//mace/proto:mace_cc",
"//mace/utils", "//mace/utils",
"@gemmlowp",
] + if_opencl_enabled([ ] + if_opencl_enabled([
":opencl_headers", ":opencl_headers",
"//mace/codegen:generated_opencl", "//mace/codegen:generated_opencl",
...@@ -84,7 +85,7 @@ cc_library( ...@@ -84,7 +85,7 @@ cc_library(
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
], ],
deps = [ deps = [
"@opencl_clhpp//:opencl_clhpp", "@opencl_clhpp",
"@opencl_headers//:opencl20_headers", "@opencl_headers//:opencl20_headers",
], ],
) )
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "public/gemmlowp.h"
#include "mace/core/macros.h" #include "mace/core/macros.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
#include "mace/public/mace_runtime.h" #include "mace/public/mace_runtime.h"
...@@ -57,8 +58,8 @@ int GetCPUCount() { ...@@ -57,8 +58,8 @@ int GetCPUCount() {
int GetCPUMaxFreq(int cpu_id) { int GetCPUMaxFreq(int cpu_id) {
char path[64]; char path[64];
snprintf(path, sizeof(path), snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id); cpu_id);
FILE *fp = fopen(path, "rb"); FILE *fp = fopen(path, "rb");
if (!fp) { if (!fp) {
...@@ -92,6 +93,11 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) { ...@@ -92,6 +93,11 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
} // namespace } // namespace
// Returns the shared gemmlowp context, creating it on the first call.
// The instance is heap-allocated and never deleted, so the returned
// reference stays valid for the remainder of the program.
gemmlowp::GemmContext& GetGemmlowpContext() {
  static gemmlowp::GemmContext *const shared_context =
      new gemmlowp::GemmContext;
  return *shared_context;
}
MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids, MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids) { std::vector<int> *little_core_ids) {
MACE_CHECK_NOTNULL(big_core_ids); MACE_CHECK_NOTNULL(big_core_ids);
...@@ -166,8 +172,13 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, ...@@ -166,8 +172,13 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
} }
MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
CPUAffinityPolicy policy) { CPUAffinityPolicy policy,
bool use_gemmlowp) {
if (policy == CPUAffinityPolicy::AFFINITY_NONE) { if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
if (use_gemmlowp) {
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
gemm_context.set_max_num_threads(std::max(0, omp_num_threads_hint));
}
#ifdef MACE_ENABLE_OPENMP #ifdef MACE_ENABLE_OPENMP
if (omp_num_threads_hint > 0) { if (omp_num_threads_hint > 0) {
omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs())); omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
...@@ -197,6 +208,11 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, ...@@ -197,6 +208,11 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
omp_num_threads_hint = use_cpu_ids.size(); omp_num_threads_hint = use_cpu_ids.size();
} }
if (use_gemmlowp) {
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
gemm_context.set_max_num_threads(omp_num_threads_hint);
}
return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids); return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
} }
......
...@@ -29,7 +29,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, ...@@ -29,7 +29,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
const std::vector<int> &cpu_ids); const std::vector<int> &cpu_ids);
MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
CPUAffinityPolicy policy); CPUAffinityPolicy policy,
bool use_gemmlowp = false);
} // namespace mace } // namespace mace
......
...@@ -42,12 +42,7 @@ int main(int argc, char **argv) { ...@@ -42,12 +42,7 @@ int main(int argc, char **argv) {
if (status != mace::MACE_SUCCESS) { if (status != mace::MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(WARNING) << "Set openmp or cpu affinity failed.";
} }
status = SetGemmlowpThreadPolicy(
FLAGS_omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
if (status != mace::MACE_SUCCESS) {
LOG(WARNING) << "Set gemmlowp threads or cpu affinity failed.";
}
mace::OpenCLRuntime::Configure( mace::OpenCLRuntime::Configure(
static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint), static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint),
static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint)); static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint));
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/gemmlowp_util.h"
#include <algorithm>
#include <vector>
#include "mace/core/runtime/cpu/cpu_runtime.h"
namespace mace {
// Returns the shared gemmlowp context, creating it on the first call.
// The instance is heap-allocated and never deleted, so the returned
// reference stays valid for the remainder of the program.
gemmlowp::GemmContext& GetGemmlowpContext() {
  static gemmlowp::GemmContext *const shared_context =
      new gemmlowp::GemmContext;
  return *shared_context;
}
// Derives a thread count from the hint and the affinity policy and passes it
// to set_max_num_threads() on the shared gemmlowp context.
//
// For any policy other than AFFINITY_NONE the hint is limited by the size of
// the selected core list (big cores for AFFINITY_BIG_ONLY, little cores
// otherwise): a non-positive hint, or one larger than that list, is replaced
// by the list size. For AFFINITY_NONE the hint is used as given. In both
// cases a negative result is raised to 0 before it is applied.
//
// Returns the status reported by GetCPUBigLittleCoreIDs when that call does
// not succeed (the thread count is left untouched in that case), and
// MACE_SUCCESS otherwise.
MaceStatus SetGemmlowpThreadPolicy(int num_threads_hint,
                                   CPUAffinityPolicy policy) {
  gemmlowp::GemmContext& context = GetGemmlowpContext();

  int thread_count = num_threads_hint;
  if (policy != AFFINITY_NONE) {
    std::vector<int> big_cores;
    std::vector<int> little_cores;
    const MaceStatus status =
        GetCPUBigLittleCoreIDs(&big_cores, &little_cores);
    if (status != MACE_SUCCESS) {
      return status;
    }

    const std::vector<int>& selected_cores =
        (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) ? big_cores
                                                         : little_cores;
    const int core_count = static_cast<int>(selected_cores.size());
    const bool hint_usable = thread_count > 0 && thread_count <= core_count;
    if (!hint_usable) {
      thread_count = core_count;
    }
  }

  context.set_max_num_threads(std::max(0, thread_count));
  return MACE_SUCCESS;
}
} // namespace mace
...@@ -104,6 +104,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input, ...@@ -104,6 +104,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
int32_t zero_point, int32_t zero_point,
T *output) { T *output) {
float recip_scale = 1 / scale; float recip_scale = 1 / scale;
#pragma omp parallel for
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i])); output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
} }
...@@ -132,6 +133,7 @@ inline void Dequantize(const T *input, ...@@ -132,6 +133,7 @@ inline void Dequantize(const T *input,
const float scale, const float scale,
const int32_t zero_point, const int32_t zero_point,
float *output) { float *output) {
#pragma omp parallel for
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
output[i] = scale * (input[i] - zero_point); output[i] = scale * (input[i] - zero_point);
} }
......
...@@ -90,10 +90,13 @@ void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) { ...@@ -90,10 +90,13 @@ void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) {
} }
MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy) { CPUAffinityPolicy policy,
bool use_gemmlowp) {
VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint
<< ", affinity policy: " << policy; << ", affinity policy: " << policy;
return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, policy); return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint,
policy,
use_gemmlowp);
} }
MaceStatus SetOpenMPThreadAffinity(int num_threads, MaceStatus SetOpenMPThreadAffinity(int num_threads,
......
...@@ -137,15 +137,19 @@ void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); ...@@ -137,15 +137,19 @@ void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
/// is larger than it. /// is larger than it.
/// The OpenMP threads will be bind to (via sched_setaffinity) big cores /// The OpenMP threads will be bind to (via sched_setaffinity) big cores
/// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY). /// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
/// If use_gemmlowp is true, the gemmlowp thread count is also configured for
/// quantized inference.
/// ///
/// \param num_threads_hint it is only a hint. /// \param num_threads_hint it is only a hint.
/// \param policy one of CPUAffinityPolicy /// \param policy one of CPUAffinityPolicy
/// \param use_gemmlowp use gemmlowp for quantized inference
/// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE /// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE
/// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use /// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use
/// AFFINITY_NONE to use all cores. /// AFFINITY_NONE to use all cores.
__attribute__((visibility("default"))) __attribute__((visibility("default")))
MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy); CPUAffinityPolicy policy,
bool use_gemmlowp = false);
/// \brief Set OpenMP threads number and processor affinity. /// \brief Set OpenMP threads number and processor affinity.
/// ///
...@@ -177,29 +181,6 @@ MaceStatus SetOpenMPThreadAffinity(int num_threads, ...@@ -177,29 +181,6 @@ MaceStatus SetOpenMPThreadAffinity(int num_threads,
__attribute__((visibility("default"))) __attribute__((visibility("default")))
MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids, MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids); std::vector<int> *little_core_ids);
/// \brief Set gemmlowp threads number and affinity policy for quantization.
///
/// Caution: this function may hurt performance if improper parameters are
/// provided.
/// gemmlowp shares threads with OpenMP, which are set by SetOpenMPThreadPolicy,
/// so affinity policy set by these two functions should be the same.
/// When num_threads_hint is zero or negative,
/// the function will set the number of threads equal to the number of
/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
/// (AFFINITY_NONE) cores according to the policy. The number of threads will
/// also be truncated to the corresponding number of cores when
/// num_threads_hint is larger than it.
/// The gemmlowp threads will be bound to (via sched_setaffinity) big cores
/// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
///
/// \param num_threads_hint it is only a hint.
/// \param policy one of CPUAffinityPolicy
/// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE
/// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use
/// AFFINITY_NONE to use all cores.
__attribute__((visibility("default")))
MaceStatus SetGemmlowpThreadPolicy(int num_threads_hint,
CPUAffinityPolicy policy);
} // namespace mace } // namespace mace
#endif // MACE_PUBLIC_MACE_RUNTIME_H_ #endif // MACE_PUBLIC_MACE_RUNTIME_H_
...@@ -205,7 +205,8 @@ bool RunModel(const std::string &model_name, ...@@ -205,7 +205,8 @@ bool RunModel(const std::string &model_name,
// config runtime // config runtime
MaceStatus status = mace::SetOpenMPThreadPolicy( MaceStatus status = mace::SetOpenMPThreadPolicy(
FLAGS_omp_num_threads, FLAGS_omp_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy)); static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
true);
if (status != MACE_SUCCESS) { if (status != MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(WARNING) << "Set openmp or cpu affinity failed.";
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册