Merge branch 'cpu_runtime' into 'master'

Refactor CPURuntime. See merge request !800

Merge branch 'cpu_runtime' into 'master'
Refactor CPURuntime. See merge request !800
34797c2b · 李寅 · 46311d82 · 840aa2d0 · 34797c2b · 34797c2b
17 changed files
--- a/mace/core/BUILD
+++ b/mace/core/BUILD
@@ -104,10 +104,13 @@ cc_library(
        "-Werror",
        "-Wextra",
        "-Wno-missing-field-initializers",
-    ],
+    ] + if_opencl_enabled([
+        "-DMACE_ENABLE_OPENCL",
+    ]),
    deps = [
        ":core",
        "//external:gflags_nothreads",
+        "//mace/ops:test",
        "//mace/utils",
    ],
 )
--- a/mace/core/device.cc
+++ b/mace/core/device.cc
@@ -16,8 +16,12 @@

 namespace mace {

-CPUDevice::CPUDevice(const int num_threads)
-    : cpu_runtime_(new CPURuntime(num_threads)) {}
+CPUDevice::CPUDevice(const int num_threads,
+                     const CPUAffinityPolicy policy,
+                     const bool use_gemmlowp)
+    : cpu_runtime_(new CPURuntime(num_threads,
+                                  policy,
+                                  use_gemmlowp)) {}

 CPUDevice::~CPUDevice() = default;


--- a/mace/core/device.h
+++ b/mace/core/device.h
@@ -41,7 +41,9 @@ class Device {

 class CPUDevice : public Device {
 public:
-  explicit CPUDevice(const int num_threads);
+  CPUDevice(const int num_threads,
+            const CPUAffinityPolicy policy,
+            const bool use_gemmlowp);
  virtual ~CPUDevice();

 #ifdef MACE_ENABLE_OPENCL

--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -27,7 +27,6 @@
 #include <utility>
 #include <vector>

-#include "public/gemmlowp.h"
 #include "mace/core/macros.h"
 #include "mace/public/mace.h"
 #include "mace/utils/logging.h"
@@ -92,13 +91,6 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
  }
 }

-}  // namespace
-
-gemmlowp::GemmContext& GetGemmlowpContext() {
-  static auto *gemm_context = new gemmlowp::GemmContext;
-  return *gemm_context;
-}
-
 MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                  std::vector<int> *little_core_ids) {
  MACE_CHECK_NOTNULL(big_core_ids);
@@ -174,13 +166,15 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
 #endif
 }

-MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
-                                             CPUAffinityPolicy policy,
-                                             bool use_gemmlowp) {
+}  // namespace
+
+MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
+    int omp_num_threads_hint,
+    CPUAffinityPolicy policy,
+    gemmlowp::GemmContext *gemm_context) {
  if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
-    if (use_gemmlowp) {
-      gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
-      gemm_context.set_max_num_threads(std::max(0, omp_num_threads_hint));
+    if (gemm_context) {
+      gemm_context->set_max_num_threads(std::max(0, omp_num_threads_hint));
    }
 #ifdef MACE_ENABLE_OPENMP
    if (omp_num_threads_hint > 0) {
@@ -211,9 +205,8 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
    omp_num_threads_hint = use_cpu_ids.size();
  }

-  if (use_gemmlowp) {
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
-    gemm_context.set_max_num_threads(omp_num_threads_hint);
+  if (gemm_context) {
+    gemm_context->set_max_num_threads(omp_num_threads_hint);
  }

  return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);

--- a/mace/core/runtime/cpu/cpu_runtime.h
+++ b/mace/core/runtime/cpu/cpu_runtime.h
@@ -15,33 +15,55 @@
 #ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
 #define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_

+#include <memory>
 #include <vector>

+#include "public/gemmlowp.h"
 #include "mace/public/mace.h"
+#include "mace/utils/logging.h"

 namespace mace {

 extern int MaceOpenMPThreadCount;

-MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
-                                  std::vector<int> *little_core_ids);
-
-MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
-                                           const std::vector<int> &cpu_ids);
-
-MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
-                                             CPUAffinityPolicy policy,
-                                             bool use_gemmlowp = false);
-
 class CPURuntime {
 public:
-  explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
+  CPURuntime(const int num_threads,
+             CPUAffinityPolicy policy,
+             bool use_gemmlowp)
+      : num_threads_(num_threads),
+        policy_(policy),
+        gemm_context_(nullptr) {
+    if (use_gemmlowp) {  // otherwise gemm_context_ is still null below
+      MACE_CHECK_NOTNULL(GetGemmlowpContext());  // allocates the context
+    }
+
+    SetOpenMPThreadsAndAffinityPolicy(num_threads_,  // MaceStatus result unused
+                                      policy_,
+                                      gemm_context_.get());
+  }
  ~CPURuntime() = default;
-  inline int num_threads() const {
+
+  gemmlowp::GemmContext *GetGemmlowpContext() {  // creates it on demand
+    if (!gemm_context_) {  // nothing allocated yet
+      gemm_context_.reset(new gemmlowp::GemmContext());
+    }  // NOTE(review): set_max_num_threads() is not called on this path
+    return gemm_context_.get();  // non-owning; gemm_context_ keeps ownership
+  }
+
+  int num_threads() const {
    return num_threads_;
  }
+
 private:
+  MaceStatus SetOpenMPThreadsAndAffinityPolicy(
+      int omp_num_threads_hint,
+      CPUAffinityPolicy policy,
+      gemmlowp::GemmContext *gemm_context);
+
  int num_threads_;
+  CPUAffinityPolicy policy_;
+  std::unique_ptr<gemmlowp::GemmContext> gemm_context_;
 };
 }  // namespace mace


--- a/mace/core/runtime/opencl/gpu_device.cc
+++ b/mace/core/runtime/opencl/gpu_device.cc
@@ -21,8 +21,10 @@ GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
                     const GPUPriorityHint priority,
                     const GPUPerfHint perf,
                     KVStorage *opencl_binary_storage,
-                     const int num_threads) :
-    CPUDevice(num_threads),
+                     const int num_threads,
+                     CPUAffinityPolicy cpu_affinity_policy,
+                     bool use_gemmlowp) :
+    CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp),
    runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                               opencl_binary_storage, tuner)),
    allocator_(new OpenCLAllocator(runtime_.get())) {}

--- a/mace/core/runtime/opencl/gpu_device.h
+++ b/mace/core/runtime/opencl/gpu_device.h
@@ -30,7 +30,9 @@ class GPUDevice : public CPUDevice {
            const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW,
            const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL,
            KVStorage *opencl_binary_storage = nullptr,
-            const int num_threads = -1);
+            const int num_threads = -1,
+            CPUAffinityPolicy cpu_affinity_policy = AFFINITY_NONE,
+            bool use_gemmlowp = false);
  ~GPUDevice();
  OpenCLRuntime *opencl_runtime() override;
  Allocator *allocator() override;

--- a/mace/core/testing/test_benchmark_main.cc
+++ b/mace/core/testing/test_benchmark_main.cc
@@ -17,7 +17,7 @@
 #include "gflags/gflags.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/utils/logging.h"
+#include "mace/ops/ops_test_util.h"

 DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
 DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
@@ -31,13 +31,10 @@ int main(int argc, char **argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  // config runtime
-  mace::MaceStatus status = mace::SetOpenMPThreadsAndAffinityPolicy(
+  mace::ops::test::OpTestContext::Get(
      FLAGS_omp_num_threads,
      static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
      true);
-  if (status != mace::MACE_SUCCESS) {
-    LOG(WARNING) << "Set openmp or cpu affinity failed.";
-  }

  mace::testing::Benchmark::Run(FLAGS_filter.c_str());
  return 0;

--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -853,7 +853,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
    MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1,
               "Quantization convolution does not support dilation > 1 yet.");

-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
+    MACE_CHECK_NOTNULL(gemm_context);

    std::vector<index_t> output_shape(4);
    std::vector<int> paddings(2);
@@ -970,7 +971,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {

    using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
    gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
-        &gemm_context, filter_matrix, input_matrix, &output_matrix,
+        gemm_context, filter_matrix, input_matrix, &output_matrix,
        -filter->zero_point(), -input->zero_point(), output_pipeline);

    return MACE_SUCCESS;

--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -100,7 +100,8 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
+    MACE_CHECK_NOTNULL(gemm_context);

    std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
@@ -142,7 +143,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {

    using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
    gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
-        &gemm_context, weight_matrix, input_matrix, &output_matrix,
+        gemm_context, weight_matrix, input_matrix, &output_matrix,
        -weight->zero_point(), -input->zero_point(), output_pipeline);

    return MACE_SUCCESS;

--- a/mace/kernels/gemmlowp_util.h
+++ b/mace/kernels/gemmlowp_util.h
@@ -22,8 +22,6 @@

 namespace mace {

-gemmlowp::GemmContext& GetGemmlowpContext();
-
 struct GemmlowpOutputPipeline {
  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
      ColVectorMap;

--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -122,7 +122,8 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
                  const index_t K,
                  const index_t width,
                  Tensor *C) {
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
+    MACE_CHECK_NOTNULL(gemm_context);

    Tensor::MappingGuard guarda(A);
    Tensor::MappingGuard guardb(B);
@@ -149,7 +150,7 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {

      using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
      gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
-          &gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
+          gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
          -B->zero_point(), output_pipeline);
    }
  }

--- a/mace/kernels/matmul_benchmark.cc
+++ b/mace/kernels/matmul_benchmark.cc
@@ -21,8 +21,8 @@
 #include "public/gemmlowp.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/kernels/gemm.h"
-#include "mace/kernels/gemmlowp_util.h"
 #include "mace/kernels/sgemm.h"
+#include "mace/ops/ops_test_util.h"

 namespace gemmlowp {

@@ -164,18 +164,22 @@ void MatmulBenchmark_gemmlowp_uint8(int iters, int rows, int depth, int cols) {
  const auto output_pipeline =
      std::make_tuple(quantize_down_stage, saturating_cast_stage);

-  gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+  auto gemm_context =
+      mace::ops::test::OpTestContext::Get()
+          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
+  MACE_CHECK_NOTNULL(gemm_context);
+
  using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;

  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
-      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
      -128, output_pipeline);

  mace::testing::StartTiming();
  while (iters--) {
    gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
                                     BitDepthParams>(
-        &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
        -128, output_pipeline);
  }
 }
@@ -195,18 +199,22 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {

  const auto output_pipeline = std::make_tuple();

-  gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+  auto gemm_context =
+      mace::ops::test::OpTestContext::Get()
+          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
+  MACE_CHECK_NOTNULL(gemm_context);
+
  using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;

  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
-      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
      -128, output_pipeline);

  mace::testing::StartTiming();
  while (iters--) {
    gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
                                     BitDepthParams>(
-        &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
        -128, output_pipeline);
  }
 }

--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -177,9 +177,6 @@ class MaceEngineConfig::Impl {
                                CPUAffinityPolicy policy,
                                bool use_gemmlowp);

-  MaceStatus SetOpenMPThreadAffinity(int num_threads,
-                                     const std::vector<int> &cpu_ids);
-
  inline DeviceType device_type() const {
    return device_type_;
  }
@@ -188,6 +185,14 @@ class MaceEngineConfig::Impl {
    return num_threads_;
  }

+  inline CPUAffinityPolicy cpu_affinity_policy() const {
+    return cpu_affinity_policy_;
+  }
+
+  inline bool use_gemmlowp() const {
+    return use_gemmlowp_;
+  }
+
  inline std::shared_ptr<GPUContext> gpu_context() const {
    return gpu_context_;
  }
@@ -203,6 +208,8 @@ class MaceEngineConfig::Impl {
 private:
  DeviceType device_type_;
  int num_threads_;
+  CPUAffinityPolicy cpu_affinity_policy_;
+  bool use_gemmlowp_;
  std::shared_ptr<GPUContext> gpu_context_;
  GPUPriorityHint gpu_priority_hint_;
  GPUPerfHint gpu_perf_hint_;
@@ -211,6 +218,8 @@ class MaceEngineConfig::Impl {
 MaceEngineConfig::Impl::Impl(const DeviceType device_type)
    : device_type_(device_type),
      num_threads_(-1),
+      cpu_affinity_policy_(CPUAffinityPolicy::AFFINITY_NONE),
+      use_gemmlowp_(false),
      gpu_context_(new GPUContext),
      gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW),
      gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {}
@@ -234,15 +243,9 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
    CPUAffinityPolicy policy,
    bool use_gemmlowp) {
  num_threads_ = num_threads;
-  return mace::SetOpenMPThreadsAndAffinityPolicy(
-      num_threads, policy, use_gemmlowp);
-}
-
-MaceStatus MaceEngineConfig::Impl::SetOpenMPThreadAffinity(
-    int num_threads,
-    const std::vector<int> &cpu_ids) {
-  num_threads_ = num_threads;
-  return mace::SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
+  cpu_affinity_policy_ = policy;
+  use_gemmlowp_ = use_gemmlowp;
+  return MACE_SUCCESS;
 }


@@ -270,32 +273,6 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
  return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp);
 }

-MaceStatus MaceEngineConfig::SetOpenMPThreadAffinity(
-    int num_threads,
-    const std::vector<int> &cpu_ids) {
-  return impl_->SetOpenMPThreadAffinity(num_threads, cpu_ids);
-}
-
-DeviceType MaceEngineConfig::device_type() const {
-  return impl_->device_type();
-}
-
-int MaceEngineConfig::num_threads() const {
-  return impl_->num_threads();
-}
-
-std::shared_ptr<GPUContext> MaceEngineConfig::gpu_context() const {
-  return impl_->gpu_context();
-}
-
-GPUPerfHint MaceEngineConfig::gpu_perf_hint() const {
-  return impl_->gpu_perf_hint();
-}
-
-GPUPriorityHint MaceEngineConfig::gpu_priority_hint() const {
-  return impl_->gpu_priority_hint();
-}
-
 // Mace Tensor
 class MaceTensor::Impl {
 public:
@@ -389,7 +366,7 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
    : model_data_(nullptr),
      model_data_size_(0),
      op_registry_(new OperatorRegistry()),
-      device_type_(config.device_type()),
+      device_type_(config.impl_->device_type()),
      device_(nullptr),
      ws_(new Workspace()),
      net_(nullptr)
@@ -399,16 +376,21 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
 {
  LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
  if (device_type_ == DeviceType::CPU || device_type_ == DeviceType::HEXAGON) {
-    device_.reset(new CPUDevice(config.num_threads()));
+    device_.reset(new CPUDevice(config.impl_->num_threads(),
+                                config.impl_->cpu_affinity_policy(),
+                                config.impl_->use_gemmlowp()));
  }
 #ifdef MACE_ENABLE_OPENCL
  if (device_type_ == DeviceType::GPU) {
-    device_.reset(new GPUDevice(config.gpu_context()->opencl_tuner(),
-                                config.gpu_context()->opencl_cache_storage(),
-                                config.gpu_priority_hint(),
-                                config.gpu_perf_hint(),
-                                config.gpu_context()->opencl_binary_storage(),
-                                config.num_threads()));
+    device_.reset(new GPUDevice(
+        config.impl_->gpu_context()->opencl_tuner(),
+        config.impl_->gpu_context()->opencl_cache_storage(),
+        config.impl_->gpu_priority_hint(),
+        config.impl_->gpu_perf_hint(),
+        config.impl_->gpu_context()->opencl_binary_storage(),
+        config.impl_->num_threads(),
+        config.impl_->cpu_affinity_policy(),
+        config.impl_->use_gemmlowp()));
  }
 #endif
 }

--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
@@ -18,8 +18,12 @@ namespace mace {
 namespace ops {
 namespace test {

-OpTestContext *OpTestContext::Get() {
-  static OpTestContext instance;
+OpTestContext *OpTestContext::Get(int num_threads,
+                                  CPUAffinityPolicy cpu_affinity_policy,
+                                  bool use_gemmlowp) {
+  static OpTestContext instance(num_threads,  // built on first call only
+                                cpu_affinity_policy,
+                                use_gemmlowp);  // args of later calls ignored
  return &instance;
 }

@@ -31,8 +35,15 @@ Device *OpTestContext::GetDevice(DeviceType device_type) {
  return device_map_[device_type].get();
 }

-OpTestContext::OpTestContext() : gpu_context_(new GPUContext()) {
-  device_map_[DeviceType::CPU] = std::unique_ptr<Device>(new CPUDevice(-1));
+OpTestContext::OpTestContext(int num_threads,
+                             CPUAffinityPolicy cpu_affinity_policy,
+                             bool use_gemmlowp)
+    : gpu_context_(new GPUContext()) {
+  device_map_[DeviceType::CPU] = std::unique_ptr<Device>(
+      new CPUDevice(num_threads,
+                    cpu_affinity_policy,
+                    use_gemmlowp));
+
  device_map_[DeviceType::GPU] = std::unique_ptr<Device>(
      new GPUDevice(gpu_context_->opencl_tuner(),
                    gpu_context_->opencl_cache_storage(),

--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -114,11 +114,17 @@ class OpDefBuilder {

 class OpTestContext {
 public:
-  static OpTestContext *Get();
+  static OpTestContext *Get(
+      int num_threads = -1,
+      CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY,
+      bool use_gemmlowp = true);
  std::shared_ptr<GPUContext> gpu_context() const;
  Device *GetDevice(DeviceType device_type);
+
 private:
-  OpTestContext();
+  OpTestContext(int num_threads,
+                CPUAffinityPolicy cpu_affinity_policy,
+                bool use_gemmlowp);
  MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);

  std::shared_ptr<GPUContext> gpu_context_;
@@ -504,8 +510,6 @@ class OpsTestNet {
 class OpsTestBase : public ::testing::Test {
 protected:
  virtual void SetUp() {
-    SetOpenMPThreadsAndAffinityPolicy(-1,
-                                      CPUAffinityPolicy::AFFINITY_BIG_ONLY);
  }

  virtual void TearDown() {

--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -97,21 +97,6 @@ enum MaceStatus {
    }                                                                      \
  }

-/// \brief Get ARM big.LITTLE configuration.
-///
-/// This function will detect the max frequencies of all CPU cores, and assume
-/// the cores with largest max frequencies as big cores, and all the remaining
-/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-/// little_core_ids will both be filled with all cpu core ids.
-///
-/// \param [out] big_core_ids
-/// \param [out] little_core_ids
-/// \return If successful, it returns MACE_SUCCESS and error if it can't
-///         reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
-
-MACE_API MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
-                                        std::vector<int> *little_core_ids);
-
 /// \brief GPU context contain the status used for GPU device.
 ///
 /// The life cycle of GPUContext object is the same as MaceEngines use it.
@@ -170,6 +155,8 @@ class MACE_API GPUContextBuilder {
 };

 class MACE_API MaceEngineConfig {
+  friend class MaceEngine;
+
 public:
  explicit MaceEngineConfig(const DeviceType device_type);
  ~MaceEngineConfig();
@@ -219,32 +206,6 @@ class MACE_API MaceEngineConfig {
                                CPUAffinityPolicy policy,
                                bool use_gemmlowp = false);

-  /// \brief Set OpenMP threads number and processor affinity.
-  ///
-  /// Caution: this function may hurt performance
-  /// if improper parameters provided.
-  /// This function may not work well on some chips (e.g. MTK). Setting thread
-  /// affinity to offline cores may run very slow or unexpectedly.
-  /// In such cases, please use SetOpenMPThreadPolicy with default policy
-  /// instead.
-  ///
-  /// \param num_threads
-  /// \param cpu_ids
-  /// \return MACE_SUCCESS for success, other for failed.
-  MaceStatus SetOpenMPThreadAffinity(
-      int num_threads,
-      const std::vector<int> &cpu_ids);
-
-  DeviceType device_type() const;
-
-  int num_threads() const;
-
-  std::shared_ptr<GPUContext> gpu_context() const;
-
-  GPUPriorityHint gpu_priority_hint() const;
-
-  GPUPerfHint gpu_perf_hint() const;
-
 private:
  class Impl;
  std::unique_ptr<Impl> impl_;