From cd0a494c75c757d93623a834c16660d4d05a106d Mon Sep 17 00:00:00 2001
From: Bin Li <libin11@xiaomi.com>
Date: Tue, 4 Sep 2018 16:39:18 +0800
Subject: [PATCH] Refactor gemmlowp context and optimize op quantize

---
 .gitlab-ci.yml                           |  2 +-
 mace/benchmark/benchmark_model.cc        |  3 +-
 mace/core/BUILD                          |  3 +-
 mace/core/runtime/cpu/cpu_runtime.cc     | 22 +++++++--
 mace/core/runtime/cpu/cpu_runtime.h      |  3 +-
 mace/core/testing/test_benchmark_main.cc |  7 +--
 mace/kernels/gemmlowp_util.cc            | 58 ------------------------
 mace/kernels/quantize.h                  |  2 +
 mace/libmace/mace_runtime.cc             |  7 ++-
 mace/public/mace_runtime.h               | 29 ++----------
 mace/tools/validation/mace_run.cc        |  3 +-
 11 files changed, 41 insertions(+), 98 deletions(-)
 delete mode 100644 mace/kernels/gemmlowp_util.cc
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 339a71e1..11979074 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -72,7 +72,7 @@ extra_tests:
 platform_compatible_tests:
   stage: platform_compatible_tests
   script:
-    - bazel build mace/core:core
+    - bazel build mace/core:core --define openmp=true
 
 build_libraries:
   stage: build_libraries
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc
index b6e2a9c2..9a689f45 100644
--- a/mace/benchmark/benchmark_model.cc
+++ b/mace/benchmark/benchmark_model.cc
@@ -260,7 +260,8 @@ int Main(int argc, char **argv) {
   // config runtime
   MaceStatus ret = mace::SetOpenMPThreadPolicy(
       FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
+      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
+      true);
   if (ret != MACE_SUCCESS) {
     LOG(WARNING) << "Set openmp or cpu affinity failed.";
   }
diff --git a/mace/core/BUILD b/mace/core/BUILD
index fd9ec5a3..bacde19a 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -64,6 +64,7 @@ cc_library(
         "//mace/codegen:generated_version",
         "//mace/proto:mace_cc",
         "//mace/utils",
+        "@gemmlowp",
     ] + if_opencl_enabled([
         ":opencl_headers",
         "//mace/codegen:generated_opencl",
@@ -84,7 +85,7 @@ cc_library(
         "-Wno-missing-field-initializers",
     ],
     deps = [
-        "@opencl_clhpp//:opencl_clhpp",
+        "@opencl_clhpp",
         "@opencl_headers//:opencl20_headers",
     ],
 )
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index 5bef3805..10bfbee8 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -27,6 +27,7 @@
 #include <utility>
 #include <vector>
 
+#include "public/gemmlowp.h"
 #include "mace/core/macros.h"
 #include "mace/public/mace.h"
 #include "mace/public/mace_runtime.h"
@@ -57,8 +58,8 @@ int GetCPUCount() {
 int GetCPUMaxFreq(int cpu_id) {
   char path[64];
   snprintf(path, sizeof(path),
-          "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
-          cpu_id);
+           "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
+           cpu_id);
 
   FILE *fp = fopen(path, "rb");
   if (!fp) {
@@ -92,6 +93,11 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
 
 }  // namespace
 
+gemmlowp::GemmContext& GetGemmlowpContext() {
+  static auto *gemm_context = new gemmlowp::GemmContext;
+  return *gemm_context;
+}
+
 MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                   std::vector<int> *little_core_ids) {
   MACE_CHECK_NOTNULL(big_core_ids);
@@ -166,8 +172,13 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
 }
 
 MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
-                                             CPUAffinityPolicy policy) {
+                                             CPUAffinityPolicy policy,
+                                             bool use_gemmlowp) {
   if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
+    if (use_gemmlowp) {
+      gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+      gemm_context.set_max_num_threads(std::max(0, omp_num_threads_hint));
+    }
 #ifdef MACE_ENABLE_OPENMP
     if (omp_num_threads_hint > 0) {
       omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
@@ -197,6 +208,11 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
     omp_num_threads_hint = use_cpu_ids.size();
   }
 
+  if (use_gemmlowp) {
+    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    gemm_context.set_max_num_threads(omp_num_threads_hint);
+  }
+
   return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
 }
 
diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h
index 333729e1..1fb463f5 100644
--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -29,7 +29,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
                                            const std::vector<int> &cpu_ids);
 
 MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
-                                             CPUAffinityPolicy policy);
+                                             CPUAffinityPolicy policy,
+                                             bool use_gemmlowp = false);
 
 }  // namespace mace
 
diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc
index e730c10e..48a6928d 100644
--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -42,12 +42,7 @@ int main(int argc, char **argv) {
   if (status != mace::MACE_SUCCESS) {
     LOG(WARNING) << "Set openmp or cpu affinity failed.";
   }
-  status = SetGemmlowpThreadPolicy(
-      FLAGS_omp_num_threads,
-      static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
-  if (status != mace::MACE_SUCCESS) {
-    LOG(WARNING) << "Set gemmlowp threads or cpu affinity failed.";
-  }
+
   mace::OpenCLRuntime::Configure(
       static_cast<mace::GPUPerfHint>(FLAGS_gpu_perf_hint),
       static_cast<mace::GPUPriorityHint>(FLAGS_gpu_priority_hint));
diff --git a/mace/kernels/gemmlowp_util.cc b/mace/kernels/gemmlowp_util.cc
deleted file mode 100644
index 50716d52..00000000
--- a/mace/kernels/gemmlowp_util.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright 2018 Xiaomi, Inc.  All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "mace/kernels/gemmlowp_util.h"
-
-#include <algorithm>
-#include <vector>
-
-#include "mace/core/runtime/cpu/cpu_runtime.h"
-
-namespace mace {
-
-gemmlowp::GemmContext& GetGemmlowpContext() {
-  static auto *gemm_context = new gemmlowp::GemmContext;
-  return *gemm_context;
-}
-
-MaceStatus SetGemmlowpThreadPolicy(int num_threads_hint,
-                                   CPUAffinityPolicy policy) {
-  gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
-
-  if (policy != AFFINITY_NONE) {
-    std::vector<int> big_core_ids;
-    std::vector<int> little_core_ids;
-    MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids);
-    if (res != MACE_SUCCESS) {
-      return res;
-    }
-
-    int use_cpu_size;
-    if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
-      use_cpu_size = static_cast<int>(big_core_ids.size());
-    } else {
-      use_cpu_size = static_cast<int>(little_core_ids.size());
-    }
-
-    if (num_threads_hint <= 0 || num_threads_hint > use_cpu_size) {
-      num_threads_hint = use_cpu_size;
-    }
-  }
-
-  gemm_context.set_max_num_threads(std::max(0, num_threads_hint));
-
-  return MACE_SUCCESS;
-}
-
-}  // namespace mace
diff --git a/mace/kernels/quantize.h b/mace/kernels/quantize.h
index 7030d79f..1f1cb8d1 100644
--- a/mace/kernels/quantize.h
+++ b/mace/kernels/quantize.h
@@ -104,6 +104,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
                                           int32_t zero_point,
                                           T *output) {
   float recip_scale = 1 / scale;
+#pragma omp parallel for
   for (int i = 0; i < size; ++i) {
     output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
   }
@@ -132,6 +133,7 @@ inline void Dequantize(const T *input,
                        const float scale,
                        const int32_t zero_point,
                        float *output) {
+#pragma omp parallel for
   for (int i = 0; i < size; ++i) {
     output[i] = scale * (input[i] - zero_point);
   }
diff --git a/mace/libmace/mace_runtime.cc b/mace/libmace/mace_runtime.cc
index 45ae962c..24b2cd8f 100644
--- a/mace/libmace/mace_runtime.cc
+++ b/mace/libmace/mace_runtime.cc
@@ -90,10 +90,13 @@ void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) {
 }
 
 MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
-                                 CPUAffinityPolicy policy) {
+                                 CPUAffinityPolicy policy,
+                                 bool use_gemmlowp) {
   VLOG(1) << "Set OpenMP threads number hint: " << num_threads_hint
           << ", affinity policy: " << policy;
-  return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, policy);
+  return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint,
+                                           policy,
+                                           use_gemmlowp);
 }
 
 MaceStatus SetOpenMPThreadAffinity(int num_threads,
diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h
index f97a2cf7..4cd60d2b 100644
--- a/mace/public/mace_runtime.h
+++ b/mace/public/mace_runtime.h
@@ -137,15 +137,19 @@ void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint);
 /// is larger than it.
 /// The OpenMP threads will be bind to (via sched_setaffinity) big cores
 /// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
+/// If use_gemmlowp is set to be true, then gemmlowp threads would be set for
+/// quantized inference.
 ///
 /// \param num_threads_hint it is only a hint.
 /// \param policy one of CPUAffinityPolicy
+/// \param use_gemmlowp use gemmlowp for quantized inference
 /// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE
 /// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use
 /// AFFINITY_NONE to use all cores.
 __attribute__((visibility("default")))
 MaceStatus SetOpenMPThreadPolicy(int num_threads_hint,
-                                 CPUAffinityPolicy policy);
+                                 CPUAffinityPolicy policy,
+                                 bool use_gemmlowp = false);
 
 /// \brief Set OpenMP threads number and processor affinity.
 ///
@@ -177,29 +181,6 @@ MaceStatus SetOpenMPThreadAffinity(int num_threads,
 __attribute__((visibility("default")))
 MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                std::vector<int> *little_core_ids);
-
-/// \brief Set gemmlowp threads number and affinity policy for quantization.
-///
-/// Caution: this function may hurt performance if improper parameters provided.
-/// gemmlowp shares threads with OpenMP, which are set by SetOpenMPThreadPolicy,
-/// so affinity policy set by these two functions should be the same.
-/// When num_threads_hint is zero or negative,
-/// the function will set the threads number equaling to the number of
-/// big (AFFINITY_BIG_ONLY), little (AFFINITY_LITTLE_ONLY) or all
-/// (AFFINITY_NONE) cores according to the policy. The threads number will
-/// also be truncated to the corresponding cores number when num_threads_hint
-/// is larger than it.
-/// The gemmlowp threads will be bind to (via sched_setaffinity) big cores
-/// (AFFINITY_BIG_ONLY) or little cores (AFFINITY_LITTLE_ONLY).
-///
-/// \param num_threads_hint it is only a hint.
-/// \param policy one of CPUAffinityPolicy
-/// \return MACE_SUCCESS for success, or it can't reliably detect big-LITTLE
-/// cores (see GetBigLittleCoreIDs). In such cases, it's suggested to use
-/// AFFINITY_NONE to use all cores.
-__attribute__((visibility("default")))
-MaceStatus SetGemmlowpThreadPolicy(int num_threads_hint,
-                                   CPUAffinityPolicy policy);
 }  // namespace mace
 
 #endif  // MACE_PUBLIC_MACE_RUNTIME_H_
diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc
index 54532d32..0aeefb78 100644
--- a/mace/tools/validation/mace_run.cc
+++ b/mace/tools/validation/mace_run.cc
@@ -205,7 +205,8 @@ bool RunModel(const std::string &model_name,
   // config runtime
   MaceStatus status = mace::SetOpenMPThreadPolicy(
       FLAGS_omp_num_threads,
-      static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
+      static_cast<CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
+      true);
   if (status != MACE_SUCCESS) {
     LOG(WARNING) << "Set openmp or cpu affinity failed.";
   }
-- 
GitLab