From 840aa2d0c077acaf003b5dd50a8a3c402904560e Mon Sep 17 00:00:00 2001
From: Bin Li <libin11@xiaomi.com>
Date: Mon, 17 Sep 2018 15:13:01 +0800
Subject: [PATCH] Refactor CPURuntime

---
 mace/core/BUILD                          |  5 +-
 mace/core/device.cc                      |  8 ++-
 mace/core/device.h                       |  4 +-
 mace/core/runtime/cpu/cpu_runtime.cc     | 27 ++++-----
 mace/core/runtime/cpu/cpu_runtime.h      | 46 +++++++++++----
 mace/core/runtime/opencl/gpu_device.cc   |  6 +-
 mace/core/runtime/opencl/gpu_device.h    |  4 +-
 mace/core/testing/test_benchmark_main.cc |  7 +--
 mace/kernels/conv_2d.h                   |  5 +-
 mace/kernels/fully_connected.h           |  5 +-
 mace/kernels/gemmlowp_util.h             |  2 -
 mace/kernels/matmul.h                    |  5 +-
 mace/kernels/matmul_benchmark.cc         | 22 ++++---
 mace/libmace/mace.cc                     | 74 +++++++++---------------
 mace/ops/ops_test_util.cc                | 19 ++++--
 mace/ops/ops_test_util.h                 | 12 ++--
 mace/public/mace.h                       | 43 +-------------
 17 files changed, 143 insertions(+), 151 deletions(-)
diff --git a/mace/core/BUILD b/mace/core/BUILD
index bacde19a..8b97ed91 100644
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -104,10 +104,13 @@ cc_library(
         "-Werror",
         "-Wextra",
         "-Wno-missing-field-initializers",
-    ],
+    ] + if_opencl_enabled([
+        "-DMACE_ENABLE_OPENCL",
+    ]),
     deps = [
         ":core",
         "//external:gflags_nothreads",
+        "//mace/ops:test",
         "//mace/utils",
     ],
 )
diff --git a/mace/core/device.cc b/mace/core/device.cc
index 09f5a068..aa0d1663 100644
--- a/mace/core/device.cc
+++ b/mace/core/device.cc
@@ -16,8 +16,12 @@
 
 namespace mace {
 
-CPUDevice::CPUDevice(const int num_threads)
-    : cpu_runtime_(new CPURuntime(num_threads)) {}
+// Creates the CPURuntime backing this device; the thread hint, affinity
+// policy and gemmlowp flag are forwarded to it unchanged.
+CPUDevice::CPUDevice(const int num_threads,
+                     const CPUAffinityPolicy policy,
+                     const bool use_gemmlowp)
+    : cpu_runtime_(new CPURuntime(num_threads, policy, use_gemmlowp)) {}
 
 CPUDevice::~CPUDevice() = default;
 
diff --git a/mace/core/device.h b/mace/core/device.h
index 7336d79f..ec1c7e6a 100644
--- a/mace/core/device.h
+++ b/mace/core/device.h
@@ -41,7 +41,9 @@ class Device {
 
 class CPUDevice : public Device {
  public:
-  explicit CPUDevice(const int num_threads);
+  CPUDevice(const int num_threads,
+            const CPUAffinityPolicy policy,
+            const bool use_gemmlowp);
   virtual ~CPUDevice();
 
 #ifdef MACE_ENABLE_OPENCL
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index 5e76e499..ac8a3582 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -27,7 +27,6 @@
 #include <utility>
 #include <vector>
 
-#include "public/gemmlowp.h"
 #include "mace/core/macros.h"
 #include "mace/public/mace.h"
 #include "mace/utils/logging.h"
@@ -92,13 +91,6 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
   }
 }
 
-}  // namespace
-
-gemmlowp::GemmContext& GetGemmlowpContext() {
-  static auto *gemm_context = new gemmlowp::GemmContext;
-  return *gemm_context;
-}
-
 MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                   std::vector<int> *little_core_ids) {
   MACE_CHECK_NOTNULL(big_core_ids);
@@ -174,13 +166,15 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
 #endif
 }
 
-MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
-                                             CPUAffinityPolicy policy,
-                                             bool use_gemmlowp) {
+}  // namespace
+
+MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
+    int omp_num_threads_hint,
+    CPUAffinityPolicy policy,
+    gemmlowp::GemmContext *gemm_context) {
   if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
-    if (use_gemmlowp) {
-      gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
-      gemm_context.set_max_num_threads(std::max(0, omp_num_threads_hint));
+    if (gemm_context) {
+      gemm_context->set_max_num_threads(std::max(0, omp_num_threads_hint));
     }
 #ifdef MACE_ENABLE_OPENMP
     if (omp_num_threads_hint > 0) {
@@ -211,9 +205,8 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
     omp_num_threads_hint = use_cpu_ids.size();
   }
 
-  if (use_gemmlowp) {
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
-    gemm_context.set_max_num_threads(omp_num_threads_hint);
+  if (gemm_context) {
+    gemm_context->set_max_num_threads(omp_num_threads_hint);
   }
 
   return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h
index 83d397ee..4b0f796b 100644
--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -15,33 +15,55 @@
 #ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
 #define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
 
+#include <memory>
 #include <vector>
 
+#include "public/gemmlowp.h"
 #include "mace/public/mace.h"
+#include "mace/utils/logging.h"
 
 namespace mace {
 
 extern int MaceOpenMPThreadCount;
 
-MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
-                                  std::vector<int> *little_core_ids);
-
-MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
-                                           const std::vector<int> &cpu_ids);
-
-MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
-                                             CPUAffinityPolicy policy,
-                                             bool use_gemmlowp = false);
-
 class CPURuntime {
  public:
-  explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
+  // Applies the thread/affinity policy; failure is logged but not fatal.
+  CPURuntime(const int num_threads,
+             CPUAffinityPolicy policy,
+             bool use_gemmlowp)
+      : num_threads_(num_threads), policy_(policy), gemm_context_(nullptr) {
+    if (use_gemmlowp) {
+      MACE_CHECK_NOTNULL(GetGemmlowpContext());  // allocates the context
+    }
+    if (SetOpenMPThreadsAndAffinityPolicy(num_threads_, policy_,
+                                          gemm_context_.get()) !=
+        MACE_SUCCESS) {
+      LOG(WARNING) << "Set openmp or cpu affinity failed.";
+    }
+  }
   ~CPURuntime() = default;
-  inline int num_threads() const {
+
+  // Returns the gemmlowp context, allocating it on the first request.
+  gemmlowp::GemmContext *GetGemmlowpContext() {
+    if (gemm_context_) return gemm_context_.get();
+    gemm_context_.reset(new gemmlowp::GemmContext());
+    return gemm_context_.get();
+  }
+
+  int num_threads() const {
     return num_threads_;
   }
+
  private:
+  MaceStatus SetOpenMPThreadsAndAffinityPolicy(
+      int omp_num_threads_hint,
+      CPUAffinityPolicy policy,
+      gemmlowp::GemmContext *gemm_context);
+
   int num_threads_;
+  CPUAffinityPolicy policy_;
+  std::unique_ptr<gemmlowp::GemmContext> gemm_context_;
 };
 }  // namespace mace
 
diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc
index cd9e41bb..65686f83 100644
--- a/mace/core/runtime/opencl/gpu_device.cc
+++ b/mace/core/runtime/opencl/gpu_device.cc
@@ -21,8 +21,10 @@ GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
                      const GPUPriorityHint priority,
                      const GPUPerfHint perf,
                      KVStorage *opencl_binary_storage,
-                     const int num_threads) :
-    CPUDevice(num_threads),
+                     const int num_threads,
+                     CPUAffinityPolicy cpu_affinity_policy,
+                     bool use_gemmlowp) :
+    CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp),
     runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                                opencl_binary_storage, tuner)),
     allocator_(new OpenCLAllocator(runtime_.get())) {}
diff --git a/mace/core/runtime/opencl/gpu_device.h b/mace/core/runtime/opencl/gpu_device.h
index 1526ba0a..350d53c8 100644
--- a/mace/core/runtime/opencl/gpu_device.h
+++ b/mace/core/runtime/opencl/gpu_device.h
@@ -30,7 +30,9 @@ class GPUDevice : public CPUDevice {
             const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW,
             const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL,
             KVStorage *opencl_binary_storage = nullptr,
-            const int num_threads = -1);
+            const int num_threads = -1,
+            CPUAffinityPolicy cpu_affinity_policy = AFFINITY_NONE,
+            bool use_gemmlowp = false);
   ~GPUDevice();
   OpenCLRuntime *opencl_runtime() override;
   Allocator *allocator() override;
diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc
index 49c26326..173b8873 100644
--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -17,7 +17,7 @@
 #include "gflags/gflags.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/utils/logging.h"
+#include "mace/ops/ops_test_util.h"
 
 DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
 DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
@@ -31,13 +31,10 @@ int main(int argc, char **argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
 
   // config runtime
-  mace::MaceStatus status = mace::SetOpenMPThreadsAndAffinityPolicy(
+  mace::ops::test::OpTestContext::Get(
       FLAGS_omp_num_threads,
       static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
       true);
-  if (status != mace::MACE_SUCCESS) {
-    LOG(WARNING) << "Set openmp or cpu affinity failed.";
-  }
 
   mace::testing::Benchmark::Run(FLAGS_filter.c_str());
   return 0;
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index 024644f3..a8ecaa89 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -838,7 +838,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
     MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1,
                "Quantization convolution does not support dilation > 1 yet.");
 
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
+    MACE_CHECK_NOTNULL(gemm_context);
 
     std::vector<index_t> output_shape(4);
     std::vector<int> paddings(2);
@@ -955,7 +956,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
 
     using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
     gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
-        &gemm_context, filter_matrix, input_matrix, &output_matrix,
+        gemm_context, filter_matrix, input_matrix, &output_matrix,
         -filter->zero_point(), -input->zero_point(), output_pipeline);
 
     return MACE_SUCCESS;
diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h
index e6743aa4..ccbc6344 100644
--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -100,7 +100,8 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
                         Tensor *output,
                         StatsFuture *future) {
     MACE_UNUSED(future);
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
+    MACE_CHECK_NOTNULL(gemm_context);
 
     std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
     MACE_RETURN_IF_ERROR(output->Resize(output_shape));
@@ -142,7 +143,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
 
     using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
     gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
-        &gemm_context, weight_matrix, input_matrix, &output_matrix,
+        gemm_context, weight_matrix, input_matrix, &output_matrix,
         -weight->zero_point(), -input->zero_point(), output_pipeline);
 
     return MACE_SUCCESS;
diff --git a/mace/kernels/gemmlowp_util.h b/mace/kernels/gemmlowp_util.h
index 28d45d3a..f8fd26e0 100644
--- a/mace/kernels/gemmlowp_util.h
+++ b/mace/kernels/gemmlowp_util.h
@@ -22,8 +22,6 @@
 
 namespace mace {
 
-gemmlowp::GemmContext& GetGemmlowpContext();
-
 struct GemmlowpOutputPipeline {
   typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
       ColVectorMap;
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index 9c5292d2..d22d391f 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -119,7 +119,8 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
                   const index_t K,
                   const index_t width,
                   Tensor *C) {
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
+    MACE_CHECK_NOTNULL(gemm_context);
 
     Tensor::MappingGuard guarda(A);
     Tensor::MappingGuard guardb(B);
@@ -146,7 +147,7 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
 
       using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
       gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
-          &gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
+          gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
           -B->zero_point(), output_pipeline);
     }
   }
diff --git a/mace/kernels/matmul_benchmark.cc b/mace/kernels/matmul_benchmark.cc
index be76a88e..ef19bd6c 100644
--- a/mace/kernels/matmul_benchmark.cc
+++ b/mace/kernels/matmul_benchmark.cc
@@ -21,8 +21,8 @@
 #include "public/gemmlowp.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/kernels/gemm.h"
-#include "mace/kernels/gemmlowp_util.h"
 #include "mace/kernels/sgemm.h"
+#include "mace/ops/ops_test_util.h"
 
 namespace gemmlowp {
 
@@ -164,18 +164,22 @@ void MatmulBenchmark_gemmlowp_uint8(int iters, int rows, int depth, int cols) {
   const auto output_pipeline =
       std::make_tuple(quantize_down_stage, saturating_cast_stage);
 
-  gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+  auto gemm_context =
+      mace::ops::test::OpTestContext::Get()
+          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
+  MACE_CHECK_NOTNULL(gemm_context);
+
   using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
 
   gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
-      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
       -128, output_pipeline);
 
   mace::testing::StartTiming();
   while (iters--) {
     gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
                                      BitDepthParams>(
-        &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
         -128, output_pipeline);
   }
 }
@@ -195,18 +199,22 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
 
   const auto output_pipeline = std::make_tuple();
 
-  gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+  auto gemm_context =
+      mace::ops::test::OpTestContext::Get()
+          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
+  MACE_CHECK_NOTNULL(gemm_context);
+
   using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
 
   gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
-      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
       -128, output_pipeline);
 
   mace::testing::StartTiming();
   while (iters--) {
     gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
                                      BitDepthParams>(
-        &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
         -128, output_pipeline);
   }
 }
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 80a35943..b9ae1497 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -177,9 +177,6 @@ class MaceEngineConfig::Impl {
                                 CPUAffinityPolicy policy,
                                 bool use_gemmlowp);
 
-  MaceStatus SetOpenMPThreadAffinity(int num_threads,
-                                     const std::vector<int> &cpu_ids);
-
   inline DeviceType device_type() const {
     return device_type_;
   }
@@ -188,6 +185,14 @@ class MaceEngineConfig::Impl {
     return num_threads_;
   }
 
+  inline CPUAffinityPolicy cpu_affinity_policy() const {
+    return cpu_affinity_policy_;
+  }
+
+  inline bool use_gemmlowp() const {
+    return use_gemmlowp_;
+  }
+
   inline std::shared_ptr<GPUContext> gpu_context() const {
     return gpu_context_;
   }
@@ -203,6 +208,8 @@ class MaceEngineConfig::Impl {
  private:
   DeviceType device_type_;
   int num_threads_;
+  CPUAffinityPolicy cpu_affinity_policy_;
+  bool use_gemmlowp_;
   std::shared_ptr<GPUContext> gpu_context_;
   GPUPriorityHint gpu_priority_hint_;
   GPUPerfHint gpu_perf_hint_;
@@ -211,6 +218,8 @@ class MaceEngineConfig::Impl {
 MaceEngineConfig::Impl::Impl(const DeviceType device_type)
     : device_type_(device_type),
       num_threads_(-1),
+      cpu_affinity_policy_(CPUAffinityPolicy::AFFINITY_NONE),
+      use_gemmlowp_(false),
       gpu_context_(new GPUContext),
       gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW),
       gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {}
@@ -234,15 +243,9 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
     CPUAffinityPolicy policy,
     bool use_gemmlowp) {
   num_threads_ = num_threads;
-  return mace::SetOpenMPThreadsAndAffinityPolicy(
-      num_threads, policy, use_gemmlowp);
-}
-
-MaceStatus MaceEngineConfig::Impl::SetOpenMPThreadAffinity(
-    int num_threads,
-    const std::vector<int> &cpu_ids) {
-  num_threads_ = num_threads;
-  return mace::SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
+  cpu_affinity_policy_ = policy;
+  use_gemmlowp_ = use_gemmlowp;
+  return MACE_SUCCESS;
 }
 
 
@@ -270,32 +273,6 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
   return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp);
 }
 
-MaceStatus MaceEngineConfig::SetOpenMPThreadAffinity(
-    int num_threads,
-    const std::vector<int> &cpu_ids) {
-  return impl_->SetOpenMPThreadAffinity(num_threads, cpu_ids);
-}
-
-DeviceType MaceEngineConfig::device_type() const {
-  return impl_->device_type();
-}
-
-int MaceEngineConfig::num_threads() const {
-  return impl_->num_threads();
-}
-
-std::shared_ptr<GPUContext> MaceEngineConfig::gpu_context() const {
-  return impl_->gpu_context();
-}
-
-GPUPerfHint MaceEngineConfig::gpu_perf_hint() const {
-  return impl_->gpu_perf_hint();
-}
-
-GPUPriorityHint MaceEngineConfig::gpu_priority_hint() const {
-  return impl_->gpu_priority_hint();
-}
-
 // Mace Tensor
 class MaceTensor::Impl {
  public:
@@ -389,7 +366,7 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
     : model_data_(nullptr),
       model_data_size_(0),
       op_registry_(new OperatorRegistry()),
-      device_type_(config.device_type()),
+      device_type_(config.impl_->device_type()),
       device_(nullptr),
       ws_(new Workspace()),
       net_(nullptr)
@@ -399,16 +376,21 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
 {
   LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
   if (device_type_ == DeviceType::CPU || device_type_ == DeviceType::HEXAGON) {
-    device_.reset(new CPUDevice(config.num_threads()));
+    device_.reset(new CPUDevice(config.impl_->num_threads(),
+                                config.impl_->cpu_affinity_policy(),
+                                config.impl_->use_gemmlowp()));
   }
 #ifdef MACE_ENABLE_OPENCL
   if (device_type_ == DeviceType::GPU) {
-    device_.reset(new GPUDevice(config.gpu_context()->opencl_tuner(),
-                                config.gpu_context()->opencl_cache_storage(),
-                                config.gpu_priority_hint(),
-                                config.gpu_perf_hint(),
-                                config.gpu_context()->opencl_binary_storage(),
-                                config.num_threads()));
+    device_.reset(new GPUDevice(
+        config.impl_->gpu_context()->opencl_tuner(),
+        config.impl_->gpu_context()->opencl_cache_storage(),
+        config.impl_->gpu_priority_hint(),
+        config.impl_->gpu_perf_hint(),
+        config.impl_->gpu_context()->opencl_binary_storage(),
+        config.impl_->num_threads(),
+        config.impl_->cpu_affinity_policy(),
+        config.impl_->use_gemmlowp()));
   }
 #endif
 }
diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc
index 5be4cb96..5e94c3a6 100644
--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
@@ -18,8 +18,12 @@ namespace mace {
 namespace ops {
 namespace test {
 
-OpTestContext *OpTestContext::Get() {
-  static OpTestContext instance;
+// Arguments only take effect on the first call, which builds the singleton.
+OpTestContext *OpTestContext::Get(int num_threads,
+                                  CPUAffinityPolicy cpu_affinity_policy,
+                                  bool use_gemmlowp) {
+  static OpTestContext instance(
+      num_threads, cpu_affinity_policy, use_gemmlowp);
   return &instance;
 }
 
@@ -31,8 +35,15 @@ Device *OpTestContext::GetDevice(DeviceType device_type) {
   return device_map_[device_type].get();
 }
 
-OpTestContext::OpTestContext() : gpu_context_(new GPUContext()) {
-  device_map_[DeviceType::CPU] = std::unique_ptr<Device>(new CPUDevice(-1));
+OpTestContext::OpTestContext(int num_threads,
+                             CPUAffinityPolicy cpu_affinity_policy,
+                             bool use_gemmlowp)
+    : gpu_context_(new GPUContext()) {
+  device_map_[DeviceType::CPU] = std::unique_ptr<Device>(
+      new CPUDevice(num_threads,
+                    cpu_affinity_policy,
+                    use_gemmlowp));
+
   device_map_[DeviceType::GPU] = std::unique_ptr<Device>(
       new GPUDevice(gpu_context_->opencl_tuner(),
                     gpu_context_->opencl_cache_storage(),
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 3a248ac1..4ebfb4d4 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -114,11 +114,17 @@ class OpDefBuilder {
 
 class OpTestContext {
  public:
-  static OpTestContext *Get();
+  static OpTestContext *Get(
+      int num_threads = -1,
+      CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY,
+      bool use_gemmlowp = true);
   std::shared_ptr<GPUContext> gpu_context() const;
   Device *GetDevice(DeviceType device_type);
+
  private:
-  OpTestContext();
+  OpTestContext(int num_threads,
+                CPUAffinityPolicy cpu_affinity_policy,
+                bool use_gemmlowp);
   MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);
 
   std::shared_ptr<GPUContext> gpu_context_;
@@ -501,8 +507,6 @@ class OpsTestNet {
 class OpsTestBase : public ::testing::Test {
  protected:
   virtual void SetUp() {
-    SetOpenMPThreadsAndAffinityPolicy(-1,
-                                      CPUAffinityPolicy::AFFINITY_BIG_ONLY);
   }
 
   virtual void TearDown() {
diff --git a/mace/public/mace.h b/mace/public/mace.h
index 0b743423..db50a58e 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -97,21 +97,6 @@ enum MaceStatus {
     }                                                                      \
   }
 
-/// \brief Get ARM big.LITTLE configuration.
-///
-/// This function will detect the max frequencies of all CPU cores, and assume
-/// the cores with largest max frequencies as big cores, and all the remaining
-/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-/// little_core_ids will both be filled with all cpu core ids.
-///
-/// \param [out] big_core_ids
-/// \param [out] little_core_ids
-/// \return If successful, it returns MACE_SUCCESS and error if it can't
-///         reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
-
-MACE_API MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
-                                        std::vector<int> *little_core_ids);
-
 /// \brief GPU context contain the status used for GPU device.
 ///
 /// The life cycle of GPUContext object is the same as MaceEngines use it.
@@ -170,6 +155,8 @@ class MACE_API GPUContextBuilder {
 };
 
 class MACE_API MaceEngineConfig {
+  friend class MaceEngine;
+
  public:
   explicit MaceEngineConfig(const DeviceType device_type);
   ~MaceEngineConfig();
@@ -219,32 +206,6 @@ class MACE_API MaceEngineConfig {
                                 CPUAffinityPolicy policy,
                                 bool use_gemmlowp = false);
 
-  /// \brief Set OpenMP threads number and processor affinity.
-  ///
-  /// Caution: this function may hurt performance
-  /// if improper parameters provided.
-  /// This function may not work well on some chips (e.g. MTK). Setting thread
-  /// affinity to offline cores may run very slow or unexpectedly.
-  /// In such cases, please use SetOpenMPThreadPolicy with default policy
-  /// instead.
-  ///
-  /// \param num_threads
-  /// \param cpu_ids
-  /// \return MACE_SUCCESS for success, other for failed.
-  MaceStatus SetOpenMPThreadAffinity(
-      int num_threads,
-      const std::vector<int> &cpu_ids);
-
-  DeviceType device_type() const;
-
-  int num_threads() const;
-
-  std::shared_ptr<GPUContext> gpu_context() const;
-
-  GPUPriorityHint gpu_priority_hint() const;
-
-  GPUPerfHint gpu_perf_hint() const;
-
  private:
   class Impl;
   std::unique_ptr<Impl> impl_;
-- 
GitLab