From cd0a494c75c757d93623a834c16660d4d05a106d Mon Sep 17 00:00:00 2001 From: Bin Li Date: Tue, 4 Sep 2018 16:39:18 +0800 Subject: [PATCH] Refactor gemmlowp context and optimize op quantize --- .gitlab-ci.yml | 2 +- mace/benchmark/benchmark_model.cc | 3 +- mace/core/BUILD | 3 +- mace/core/runtime/cpu/cpu_runtime.cc | 22 +++++++-- mace/core/runtime/cpu/cpu_runtime.h | 3 +- mace/core/testing/test_benchmark_main.cc | 7 +-- mace/kernels/gemmlowp_util.cc | 58 ------------------------ mace/kernels/quantize.h | 2 + mace/libmace/mace_runtime.cc | 7 ++- mace/public/mace_runtime.h | 29 ++---------- mace/tools/validation/mace_run.cc | 3 +- 11 files changed, 41 insertions(+), 98 deletions(-) delete mode 100644 mace/kernels/gemmlowp_util.cc diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 339a71e1..11979074 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -72,7 +72,7 @@ extra_tests: platform_compatible_tests: stage: platform_compatible_tests script: - - bazel build mace/core:core + - bazel build mace/core:core --define openmp=true build_libraries: stage: build_libraries diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index b6e2a9c2..9a689f45 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -260,7 +260,8 @@ int Main(int argc, char **argv) { // config runtime MaceStatus ret = mace::SetOpenMPThreadPolicy( FLAGS_omp_num_threads, - static_cast(FLAGS_cpu_affinity_policy)); + static_cast(FLAGS_cpu_affinity_policy), + true); if (ret != MACE_SUCCESS) { LOG(WARNING) << "Set openmp or cpu affinity failed."; } diff --git a/mace/core/BUILD b/mace/core/BUILD index fd9ec5a3..bacde19a 100644 --- a/mace/core/BUILD +++ b/mace/core/BUILD @@ -64,6 +64,7 @@ cc_library( "//mace/codegen:generated_version", "//mace/proto:mace_cc", "//mace/utils", + "@gemmlowp", ] + if_opencl_enabled([ ":opencl_headers", "//mace/codegen:generated_opencl", @@ -84,7 +85,7 @@ cc_library( "-Wno-missing-field-initializers", ], deps = [ - "@opencl_clhpp//:opencl_clhpp", + "@opencl_clhpp", "@opencl_headers//:opencl20_headers", ], ) diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 5bef3805..10bfbee8 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -27,6 +27,7 @@ #include #include +#include "public/gemmlowp.h" #include "mace/core/macros.h" #include "mace/public/mace.h" #include "mace/public/mace_runtime.h" @@ -57,8 +58,8 @@ int GetCPUCount() { int GetCPUMaxFreq(int cpu_id) { char path[64]; snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", - cpu_id); + "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", + cpu_id); FILE *fp = fopen(path, "rb"); if (!fp) { @@ -92,6 +93,11 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) { } // namespace +gemmlowp::GemmContext& GetGemmlowpContext() { + static auto *gemm_context = new gemmlowp::GemmContext; + return *gemm_context; +} + MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, std::vector *little_core_ids) { MACE_CHECK_NOTNULL(big_core_ids); @@ -166,8 +172,13 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, } MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, - CPUAffinityPolicy policy) { + CPUAffinityPolicy policy, + bool use_gemmlowp) { if (policy == CPUAffinityPolicy::AFFINITY_NONE) { + if (use_gemmlowp) { + gemmlowp::GemmContext& gemm_context = GetGemmlowpContext(); + gemm_context.set_max_num_threads(std::max(0, omp_num_threads_hint)); + } #ifdef MACE_ENABLE_OPENMP if (omp_num_threads_hint > 0) { omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs())); @@ -197,6 +208,11 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, omp_num_threads_hint = use_cpu_ids.size(); } + if (use_gemmlowp) { + gemmlowp::GemmContext& gemm_context = GetGemmlowpContext(); + gemm_context.set_max_num_threads(omp_num_threads_hint); + } + return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids); } diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h index 333729e1..1fb463f5 100644 --- a/mace/core/runtime/cpu/cpu_runtime.h +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -29,7 +29,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, const std::vector &cpu_ids); MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, - CPUAffinityPolicy policy); + CPUAffinityPolicy policy, + bool use_gemmlowp = false); } // namespace mace diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc index e730c10e..48a6928d 100644 --- a/mace/core/testing/test_benchmark_main.cc +++ b/mace/core/testing/test_benchmark_main.cc @@ -42,12 +42,7 @@ int main(int argc, char **argv) { if (status != mace::MACE_SUCCESS) { LOG(WARNING) << "Set openmp or cpu affinity failed."; } - status = SetGemmlowpThreadPolicy( - FLAGS_omp_num_threads, - static_cast(FLAGS_cpu_affinity_policy)); - if (status != mace::MACE_SUCCESS) { - LOG(WARNING) << "Set gemmlowp threads or cpu affinity failed."; - } + mace::OpenCLRuntime::Configure( static_cast(FLAGS_gpu_perf_hint), static_cast(FLAGS_gpu_priority_hint)); diff --git a/mace/kernels/gemmlowp_util.cc b/mace/kernels/gemmlowp_util.cc deleted file mode 100644 index 50716d52..00000000 --- a/mace/kernels/gemmlowp_util.cc +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/gemmlowp_util.h" - -#include -#include - -#include "mace/core/runtime/cpu/cpu_runtime.h" - -namespace mace { - -gemmlowp::GemmContext& GetGemmlowpContext() { - static auto *gemm_context = new gemmlowp::GemmContext; - return *gemm_context; -} - -MaceStatus SetGemmlowpThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy) { - gemmlowp::GemmContext& gemm_context = GetGemmlowpContext(); - - if (policy != AFFINITY_NONE) { - std::vector big_core_ids; - std::vector little_core_ids; - MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids); - if (res != MACE_SUCCESS) { - return res; - } - - int use_cpu_size; - if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) { - use_cpu_size = static_cast(big_core_ids.size()); - } else { - use_cpu_size = static_cast(little_core_ids.size()); - } - - if (num_threads_hint <= 0 || num_threads_hint > use_cpu_size) { - num_threads_hint = use_cpu_size; - } - } - - gemm_context.set_max_num_threads(std::max(0, num_threads_hint)); - - return MACE_SUCCESS; -} - -} // namespace mace diff --git a/mace/kernels/quantize.h b/mace/kernels/quantize.h index 7030d79f..1f1cb8d1 100644 --- a/mace/kernels/quantize.h +++ b/mace/kernels/quantize.h @@ -104,6 +104,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input, int32_t zero_point, T *output) { float recip_scale = 1 / scale; +#pragma omp parallel for for (int i = 0; i < size; ++i) { output[i] = Saturate(roundf(zero_point + recip_scale * input[i])); } @@ -132,6 +133,7 @@ inline void Dequantize(const T *input, const float scale, const int32_t zero_point, float *output) { +#pragma omp parallel for for (int i = 0; i < size; ++i) { output[i] = scale * (input[i] - zero_point); } diff --git a/mace/libmace/mace_runtime.cc b/mace/libmace/mace_runtime.cc index 45ae962c..24b2cd8f 100644 --- a/mace/libmace/mace_runtime.cc +++ b/mace/libmace/mace_runtime.cc @@ -90,10 +90,13 @@ void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) { } MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy) { + CPUAffinityPolicy policy, + bool use_gemmlowp) { VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint << ", affinity policy: " << policy; - return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, policy); + return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, + policy, + use_gemmlowp); } MaceStatus SetOpenMPThreadAffinity(int num_threads, diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h index f97a2cf7..4cd60d2b 100644 --- a/mace/public/mace_runtime.h +++ b/mace/public/mace_runtime.h @@ -137,15 +137,19 @@ void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); /// is larger than it. /// The OpenMP threads will be bind to (via sched_setaffinity) big cores /// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY). +/// If use_gemmlowp is set to be true, then gemmlowp threads would be set for +/// quantized inference. /// /// \param num_threads_hint it is only a hint. /// \param policy one of CPUAffinityPolicy +/// \param use_gemmlowp use gemmlowp for quantized inference /// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE /// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use /// AFFINITY_NONE to use all cores. __attribute__((visibility("default"))) MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy); + CPUAffinityPolicy policy, + bool use_gemmlowp = false); /// \brief Set OpenMP threads number and processor affinity. /// @@ -177,29 +181,6 @@ MaceStatus SetOpenMPThreadAffinity(int num_threads, __attribute__((visibility("default"))) MaceStatus GetBigLittleCoreIDs(std::vector *big_core_ids, std::vector *little_core_ids); - -/// \brief Set gemmlowp threads number and affinity policy for quantization. -/// -/// Caution: this function may hurt performance if improper parameters provided. -/// gemmlowp shares threads with OpenMP, which are set by SetOpenMPThreadPolicy, -/// so affinity policy set by these two functions should be the same. -/// When num_threads_hint is zero or negative, -/// the function will set the threads number equaling to the number of -/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all -/// (AFFINITY_NONE) cores according to the policy. The threads number will -/// also be truncated to the corresponding cores number when num_threads_hint -/// is larger than it. -/// The gemmlowp threads will be bind to (via sched_setaffinity) big cores -/// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY). -/// -/// \param num_threads_hint it is only a hint. -/// \param policy one of CPUAffinityPolicy -/// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE -/// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use -/// AFFINITY_NONE to use all cores. -__attribute__((visibility("default"))) -MaceStatus SetGemmlowpThreadPolicy(int num_threads_hint, - CPUAffinityPolicy policy); } // namespace mace #endif // MACE_PUBLIC_MACE_RUNTIME_H_ diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 54532d32..0aeefb78 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -205,7 +205,8 @@ bool RunModel(const std::string &model_name, // config runtime MaceStatus status = mace::SetOpenMPThreadPolicy( FLAGS_omp_num_threads, - static_cast(FLAGS_cpu_affinity_policy)); + static_cast(FLAGS_cpu_affinity_policy), + true); if (status != MACE_SUCCESS) { LOG(WARNING) << "Set openmp or cpu affinity failed."; } -- GitLab