Commit 840aa2d0 authored by Bin Li

Refactor CPURuntime

Parent df23f428
@@ -104,10 +104,13 @@ cc_library(
         "-Werror",
         "-Wextra",
         "-Wno-missing-field-initializers",
-    ],
+    ] + if_opencl_enabled([
+        "-DMACE_ENABLE_OPENCL",
+    ]),
     deps = [
         ":core",
         "//external:gflags_nothreads",
+        "//mace/ops:test",
         "//mace/utils",
     ],
 )
@@ -16,8 +16,12 @@
 namespace mace {
-CPUDevice::CPUDevice(const int num_threads)
-    : cpu_runtime_(new CPURuntime(num_threads)) {}
+CPUDevice::CPUDevice(const int num_threads,
+                     const CPUAffinityPolicy policy,
+                     const bool use_gemmlowp)
+    : cpu_runtime_(new CPURuntime(num_threads,
+                                  policy,
+                                  use_gemmlowp)) {}
 CPUDevice::~CPUDevice() = default;
...
@@ -41,7 +41,9 @@ class Device {
 class CPUDevice : public Device {
  public:
-  explicit CPUDevice(const int num_threads);
+  CPUDevice(const int num_threads,
+            const CPUAffinityPolicy policy,
+            const bool use_gemmlowp);
   virtual ~CPUDevice();
 #ifdef MACE_ENABLE_OPENCL
...
@@ -27,7 +27,6 @@
 #include <utility>
 #include <vector>
-#include "public/gemmlowp.h"
 #include "mace/core/macros.h"
 #include "mace/public/mace.h"
 #include "mace/utils/logging.h"
@@ -92,13 +91,6 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
   }
 }
-}  // namespace
-
-gemmlowp::GemmContext& GetGemmlowpContext() {
-  static auto *gemm_context = new gemmlowp::GemmContext;
-  return *gemm_context;
-}
-
 MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
                                   std::vector<int> *little_core_ids) {
   MACE_CHECK_NOTNULL(big_core_ids);
@@ -174,13 +166,15 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
 #endif
 }
-MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
+}  // namespace
+
+MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
+    int omp_num_threads_hint,
     CPUAffinityPolicy policy,
-    bool use_gemmlowp) {
+    gemmlowp::GemmContext *gemm_context) {
   if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
-    if (use_gemmlowp) {
-      gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
-      gemm_context.set_max_num_threads(std::max(0, omp_num_threads_hint));
+    if (gemm_context) {
+      gemm_context->set_max_num_threads(std::max(0, omp_num_threads_hint));
     }
 #ifdef MACE_ENABLE_OPENMP
     if (omp_num_threads_hint > 0) {
@@ -211,9 +205,8 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
     omp_num_threads_hint = use_cpu_ids.size();
   }
-  if (use_gemmlowp) {
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
-    gemm_context.set_max_num_threads(omp_num_threads_hint);
+  if (gemm_context) {
+    gemm_context->set_max_num_threads(omp_num_threads_hint);
   }
   return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
...
@@ -15,33 +15,55 @@
 #ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
 #define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
+#include <memory>
 #include <vector>
+#include "public/gemmlowp.h"
 #include "mace/public/mace.h"
+#include "mace/utils/logging.h"
 namespace mace {
 extern int MaceOpenMPThreadCount;
-MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
-                                  std::vector<int> *little_core_ids);
-
-MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
-                                           const std::vector<int> &cpu_ids);
-
-MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
-                                             CPUAffinityPolicy policy,
-                                             bool use_gemmlowp = false);
-
 class CPURuntime {
  public:
-  explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
+  CPURuntime(const int num_threads,
+             CPUAffinityPolicy policy,
+             bool use_gemmlowp)
+      : num_threads_(num_threads),
+        policy_(policy),
+        gemm_context_(nullptr) {
+    if (use_gemmlowp) {
+      MACE_CHECK_NOTNULL(GetGemmlowpContext());
+    }
+    SetOpenMPThreadsAndAffinityPolicy(num_threads_,
+                                      policy_,
+                                      gemm_context_.get());
+  }
   ~CPURuntime() = default;
-  inline int num_threads() const {
+  gemmlowp::GemmContext *GetGemmlowpContext() {
+    if (!gemm_context_) {
+      gemm_context_.reset(new gemmlowp::GemmContext());
+    }
+    return gemm_context_.get();
+  }
+
+  int num_threads() const {
     return num_threads_;
   }
  private:
+  MaceStatus SetOpenMPThreadsAndAffinityPolicy(
+      int omp_num_threads_hint,
+      CPUAffinityPolicy policy,
+      gemmlowp::GemmContext *gemm_context);
+
   int num_threads_;
+  CPUAffinityPolicy policy_;
+  std::unique_ptr<gemmlowp::GemmContext> gemm_context_;
 };
 }  // namespace mace
...
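With this refactor the gemmlowp context stops being a process-wide singleton: CPURuntime owns it through a unique_ptr, creates it lazily in GetGemmlowpContext(), and the constructor applies the OpenMP thread count and affinity policy exactly once. The following is a minimal usage sketch, not code from this commit; it assumes the header above is included and that constructing a CPURuntime directly (rather than through CPUDevice) is acceptable for illustration.

#include "mace/core/runtime/cpu/cpu_runtime.h"

void ConfigureCpuSketch() {
  // 4 OpenMP threads, pinned to big cores, quantized (gemmlowp) kernels on.
  mace::CPURuntime cpu_runtime(4, mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY,
                               true);

  // Lazily created on first use and cached; later calls return the same
  // gemmlowp::GemmContext instance owned by this runtime.
  gemmlowp::GemmContext *ctx = cpu_runtime.GetGemmlowpContext();
  (void)ctx;  // kernels normally pass this pointer straight to gemmlowp
}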
@@ -21,8 +21,10 @@ GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
                      const GPUPriorityHint priority,
                      const GPUPerfHint perf,
                      KVStorage *opencl_binary_storage,
-                     const int num_threads) :
-    CPUDevice(num_threads),
+                     const int num_threads,
+                     CPUAffinityPolicy cpu_affinity_policy,
+                     bool use_gemmlowp) :
+    CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp),
     runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
                                opencl_binary_storage, tuner)),
     allocator_(new OpenCLAllocator(runtime_.get())) {}
...
@@ -30,7 +30,9 @@ class GPUDevice : public CPUDevice {
             const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW,
             const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL,
             KVStorage *opencl_binary_storage = nullptr,
-            const int num_threads = -1);
+            const int num_threads = -1,
+            CPUAffinityPolicy cpu_affinity_policy = AFFINITY_NONE,
+            bool use_gemmlowp = false);
   ~GPUDevice();
   OpenCLRuntime *opencl_runtime() override;
   Allocator *allocator() override;
...
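GPUDevice simply forwards the three new CPU arguments to its CPUDevice base, so a GPU engine still ends up with a fully configured CPU runtime for fallback and quantized ops. A hedged construction sketch follows; the tuner and storage objects, and the exact namespaces of Tuner and KVStorage, are assumptions taken from the surrounding code rather than part of this diff.

// Sketch only: argument order mirrors the declaration above
// (tuner, cache storage, priority, perf, binary storage, then CPU settings).
mace::Device *MakeGpuDeviceSketch(mace::Tuner<uint32_t> *tuner,
                                  mace::KVStorage *cache_storage,
                                  mace::KVStorage *binary_storage) {
  return new mace::GPUDevice(tuner,
                             cache_storage,
                             mace::GPUPriorityHint::PRIORITY_LOW,
                             mace::GPUPerfHint::PERF_NORMAL,
                             binary_storage,
                             /*num_threads=*/-1,
                             mace::CPUAffinityPolicy::AFFINITY_NONE,
                             /*use_gemmlowp=*/false);
}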
@@ -17,7 +17,7 @@
 #include "gflags/gflags.h"
 #include "mace/core/runtime/cpu/cpu_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
-#include "mace/utils/logging.h"
+#include "mace/ops/ops_test_util.h"
 DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
 DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
@@ -31,13 +31,10 @@ int main(int argc, char **argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
   // config runtime
-  mace::MaceStatus status = mace::SetOpenMPThreadsAndAffinityPolicy(
+  mace::ops::test::OpTestContext::Get(
       FLAGS_omp_num_threads,
       static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
       true);
-  if (status != mace::MACE_SUCCESS) {
-    LOG(WARNING) << "Set openmp or cpu affinity failed.";
-  }
   mace::testing::Benchmark::Run(FLAGS_filter.c_str());
   return 0;
...
@@ -838,7 +838,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
     MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1,
                "Quantization convolution does not support dilation > 1 yet.");
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
+    MACE_CHECK_NOTNULL(gemm_context);
     std::vector<index_t> output_shape(4);
     std::vector<int> paddings(2);
@@ -955,7 +956,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
     using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
     gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
-        &gemm_context, filter_matrix, input_matrix, &output_matrix,
+        gemm_context, filter_matrix, input_matrix, &output_matrix,
         -filter->zero_point(), -input->zero_point(), output_pipeline);
     return MACE_SUCCESS;
...
@@ -100,7 +100,8 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
                       Tensor *output,
                       StatsFuture *future) {
     MACE_UNUSED(future);
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
+    MACE_CHECK_NOTNULL(gemm_context);
     std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
     MACE_RETURN_IF_ERROR(output->Resize(output_shape));
@@ -142,7 +143,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
     using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
     gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
-        &gemm_context, weight_matrix, input_matrix, &output_matrix,
+        gemm_context, weight_matrix, input_matrix, &output_matrix,
         -weight->zero_point(), -input->zero_point(), output_pipeline);
     return MACE_SUCCESS;
...
@@ -22,8 +22,6 @@
 namespace mace {
-gemmlowp::GemmContext& GetGemmlowpContext();
-
 struct GemmlowpOutputPipeline {
   typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
       ColVectorMap;
...
@@ -119,7 +119,8 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
                   const index_t K,
                   const index_t width,
                   Tensor *C) {
-    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+    auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
+    MACE_CHECK_NOTNULL(gemm_context);
     Tensor::MappingGuard guarda(A);
     Tensor::MappingGuard guardb(B);
@@ -146,7 +147,7 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
     using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
     gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
-        &gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
+        gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
         -B->zero_point(), output_pipeline);
   }
 }
...
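The three quantized kernels touched here (Conv2d, FullyConnected, MatMul) now share one pattern: fetch the per-device gemmlowp context through the kernel context instead of the removed global accessor, check it, and hand the raw pointer to gemmlowp. Below is a condensed sketch of that shared call site; the lhs_/rhs_ names are placeholders for the per-kernel operands, and the matrix maps and output pipeline are assumed to be built exactly as before.

// Sketch of the common pattern; `context_` is the kernel's context member
// seen in the hunks above, and the lhs_/rhs_ identifiers are placeholders.
auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);

using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
    gemm_context,  // now a pointer owned by the device's CPURuntime
    lhs_matrix, rhs_matrix, &output_matrix,
    -lhs_zero_point, -rhs_zero_point, output_pipeline);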
@@ -21,8 +21,8 @@
 #include "public/gemmlowp.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/kernels/gemm.h"
-#include "mace/kernels/gemmlowp_util.h"
 #include "mace/kernels/sgemm.h"
+#include "mace/ops/ops_test_util.h"
 namespace gemmlowp {
...
@@ -164,18 +164,22 @@ void MatmulBenchmark_gemmlowp_uint8(int iters, int rows, int depth, int cols) {
   const auto output_pipeline =
       std::make_tuple(quantize_down_stage, saturating_cast_stage);
-  gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+  auto gemm_context =
+      mace::ops::test::OpTestContext::Get()
+          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
+  MACE_CHECK_NOTNULL(gemm_context);
   using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
   gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
-      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
       -128, output_pipeline);
   mace::testing::StartTiming();
   while (iters--) {
     gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
                                      BitDepthParams>(
-        &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
         -128, output_pipeline);
   }
 }
@@ -195,18 +199,22 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
   const auto output_pipeline = std::make_tuple();
-  gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+  auto gemm_context =
+      mace::ops::test::OpTestContext::Get()
+          ->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
+  MACE_CHECK_NOTNULL(gemm_context);
   using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
   gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
-      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
       -128, output_pipeline);
   mace::testing::StartTiming();
   while (iters--) {
     gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
                                      BitDepthParams>(
-        &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+        gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
        -128, output_pipeline);
   }
 }
...
@@ -177,9 +177,6 @@ class MaceEngineConfig::Impl {
       CPUAffinityPolicy policy,
       bool use_gemmlowp);
-  MaceStatus SetOpenMPThreadAffinity(int num_threads,
-                                     const std::vector<int> &cpu_ids);
-
   inline DeviceType device_type() const {
     return device_type_;
   }
@@ -188,6 +185,14 @@ class MaceEngineConfig::Impl {
     return num_threads_;
   }
+  inline CPUAffinityPolicy cpu_affinity_policy() const {
+    return cpu_affinity_policy_;
+  }
+
+  inline bool use_gemmlowp() const {
+    return use_gemmlowp_;
+  }
+
   inline std::shared_ptr<GPUContext> gpu_context() const {
     return gpu_context_;
   }
@@ -203,6 +208,8 @@ class MaceEngineConfig::Impl {
  private:
   DeviceType device_type_;
   int num_threads_;
+  CPUAffinityPolicy cpu_affinity_policy_;
+  bool use_gemmlowp_;
   std::shared_ptr<GPUContext> gpu_context_;
   GPUPriorityHint gpu_priority_hint_;
   GPUPerfHint gpu_perf_hint_;
@@ -211,6 +218,8 @@ class MaceEngineConfig::Impl {
 MaceEngineConfig::Impl::Impl(const DeviceType device_type)
     : device_type_(device_type),
       num_threads_(-1),
+      cpu_affinity_policy_(CPUAffinityPolicy::AFFINITY_NONE),
+      use_gemmlowp_(false),
       gpu_context_(new GPUContext),
       gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW),
       gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {}
@@ -234,15 +243,9 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
     CPUAffinityPolicy policy,
     bool use_gemmlowp) {
   num_threads_ = num_threads;
-  return mace::SetOpenMPThreadsAndAffinityPolicy(
-      num_threads, policy, use_gemmlowp);
-}
-
-MaceStatus MaceEngineConfig::Impl::SetOpenMPThreadAffinity(
-    int num_threads,
-    const std::vector<int> &cpu_ids) {
-  num_threads_ = num_threads;
-  return mace::SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
+  cpu_affinity_policy_ = policy;
+  use_gemmlowp_ = use_gemmlowp;
+  return MACE_SUCCESS;
 }
@@ -270,32 +273,6 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
   return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp);
 }
-MaceStatus MaceEngineConfig::SetOpenMPThreadAffinity(
-    int num_threads,
-    const std::vector<int> &cpu_ids) {
-  return impl_->SetOpenMPThreadAffinity(num_threads, cpu_ids);
-}
-
-DeviceType MaceEngineConfig::device_type() const {
-  return impl_->device_type();
-}
-
-int MaceEngineConfig::num_threads() const {
-  return impl_->num_threads();
-}
-
-std::shared_ptr<GPUContext> MaceEngineConfig::gpu_context() const {
-  return impl_->gpu_context();
-}
-
-GPUPerfHint MaceEngineConfig::gpu_perf_hint() const {
-  return impl_->gpu_perf_hint();
-}
-
-GPUPriorityHint MaceEngineConfig::gpu_priority_hint() const {
-  return impl_->gpu_priority_hint();
-}
 // Mace Tensor
 class MaceTensor::Impl {
  public:
@@ -389,7 +366,7 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
     : model_data_(nullptr),
       model_data_size_(0),
       op_registry_(new OperatorRegistry()),
-      device_type_(config.device_type()),
+      device_type_(config.impl_->device_type()),
       device_(nullptr),
       ws_(new Workspace()),
       net_(nullptr)
@@ -399,16 +376,21 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
 {
   LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
   if (device_type_ == DeviceType::CPU || device_type_ == DeviceType::HEXAGON) {
-    device_.reset(new CPUDevice(config.num_threads()));
+    device_.reset(new CPUDevice(config.impl_->num_threads(),
+                                config.impl_->cpu_affinity_policy(),
+                                config.impl_->use_gemmlowp()));
   }
 #ifdef MACE_ENABLE_OPENCL
   if (device_type_ == DeviceType::GPU) {
-    device_.reset(new GPUDevice(config.gpu_context()->opencl_tuner(),
-                                config.gpu_context()->opencl_cache_storage(),
-                                config.gpu_priority_hint(),
-                                config.gpu_perf_hint(),
-                                config.gpu_context()->opencl_binary_storage(),
-                                config.num_threads()));
+    device_.reset(new GPUDevice(
+        config.impl_->gpu_context()->opencl_tuner(),
+        config.impl_->gpu_context()->opencl_cache_storage(),
+        config.impl_->gpu_priority_hint(),
+        config.impl_->gpu_perf_hint(),
+        config.impl_->gpu_context()->opencl_binary_storage(),
+        config.impl_->num_threads(),
+        config.impl_->cpu_affinity_policy(),
+        config.impl_->use_gemmlowp()));
   }
 #endif
 }
...
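On the public API side, SetCPUThreadPolicy no longer touches OpenMP or gemmlowp directly; it only records the request in the config, and the settings take effect when MaceEngine builds its CPUDevice and hence its CPURuntime. The snippet below is a hedged end-to-end sketch of the intended flow; engine creation itself is elided because it is outside this diff.

// Sketch only: the policy is stored in the config here and applied later,
// inside the CPURuntime constructor, when the engine creates its device.
void ConfigureEngineSketch() {
  mace::MaceEngineConfig config(mace::DeviceType::CPU);
  mace::MaceStatus status = config.SetCPUThreadPolicy(
      4,                                           // num_threads_hint
      mace::CPUAffinityPolicy::AFFINITY_BIG_ONLY,
      true);                                       // use_gemmlowp
  (void)status;
  // ... create the MaceEngine from `config` as usual; only at that point are
  // the OpenMP threads, core affinity and gemmlowp context actually set up.
}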
@@ -18,8 +18,12 @@ namespace mace {
 namespace ops {
 namespace test {
-OpTestContext *OpTestContext::Get() {
-  static OpTestContext instance;
+OpTestContext *OpTestContext::Get(int num_threads,
+                                  CPUAffinityPolicy cpu_affinity_policy,
+                                  bool use_gemmlowp) {
+  static OpTestContext instance(num_threads,
+                                cpu_affinity_policy,
+                                use_gemmlowp);
   return &instance;
 }
@@ -31,8 +35,15 @@ Device *OpTestContext::GetDevice(DeviceType device_type) {
   return device_map_[device_type].get();
 }
-OpTestContext::OpTestContext() : gpu_context_(new GPUContext()) {
-  device_map_[DeviceType::CPU] = std::unique_ptr<Device>(new CPUDevice(-1));
+OpTestContext::OpTestContext(int num_threads,
+                             CPUAffinityPolicy cpu_affinity_policy,
+                             bool use_gemmlowp)
+    : gpu_context_(new GPUContext()) {
+  device_map_[DeviceType::CPU] = std::unique_ptr<Device>(
+      new CPUDevice(num_threads,
+                    cpu_affinity_policy,
+                    use_gemmlowp));
   device_map_[DeviceType::GPU] = std::unique_ptr<Device>(
       new GPUDevice(gpu_context_->opencl_tuner(),
                     gpu_context_->opencl_cache_storage(),
...
@@ -114,11 +114,17 @@ class OpDefBuilder {
 class OpTestContext {
  public:
-  static OpTestContext *Get();
+  static OpTestContext *Get(
+      int num_threads = -1,
+      CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY,
+      bool use_gemmlowp = true);
   std::shared_ptr<GPUContext> gpu_context() const;
   Device *GetDevice(DeviceType device_type);
  private:
-  OpTestContext();
+  OpTestContext(int num_threads,
+                CPUAffinityPolicy cpu_affinity_policy,
+                bool use_gemmlowp);
   MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);
   std::shared_ptr<GPUContext> gpu_context_;
...
@@ -501,8 +507,6 @@ class OpsTestNet {
 class OpsTestBase : public ::testing::Test {
  protected:
   virtual void SetUp() {
-    SetOpenMPThreadsAndAffinityPolicy(-1,
-                                      CPUAffinityPolicy::AFFINITY_BIG_ONLY);
   }
   virtual void TearDown() {
...
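For tests and benchmarks, the per-test affinity call in SetUp() goes away; the OpTestContext singleton takes over, and the first Get() call fixes the process-wide CPU configuration (defaulting to big cores with gemmlowp enabled). A short sketch of how the benchmark main above relies on that, with gflags parsing assumed to have happened already; the wrapper function name is illustrative only.

// Sketch only, mirroring main() in the benchmark file: the first Get()
// constructs the singleton with these arguments; later calls return the same
// instance, so it must be configured before any op runs.
void ConfigureBenchmarkContextSketch() {
  mace::ops::test::OpTestContext::Get(
      FLAGS_omp_num_threads,
      static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
      /*use_gemmlowp=*/true);

  // Benchmarks then reach the shared gemmlowp context through the CPU device:
  auto *ctx = mace::ops::test::OpTestContext::Get()
                  ->GetDevice(mace::DeviceType::CPU)
                  ->cpu_runtime()
                  ->GetGemmlowpContext();
  (void)ctx;
}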
@@ -97,21 +97,6 @@ enum MaceStatus {
   } \
 }
-/// \brief Get ARM big.LITTLE configuration.
-///
-/// This function will detect the max frequencies of all CPU cores, and assume
-/// the cores with largest max frequencies as big cores, and all the remaining
-/// cores as little. If all cpu core's max frequencies equals, big_core_ids and
-/// little_core_ids will both be filled with all cpu core ids.
-///
-/// \param [out] big_core_ids
-/// \param [out] little_core_ids
-/// \return If successful, it returns MACE_SUCCESS and error if it can't
-///         reliabley detect the frequency of big-LITTLE cores (e.g. MTK).
-MACE_API MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
-                                        std::vector<int> *little_core_ids);
 /// \brief GPU context contain the status used for GPU device.
 ///
 /// The life cycle of GPUContext object is the same as MaceEngines use it.
@@ -170,6 +155,8 @@ class MACE_API GPUContextBuilder {
 };
 class MACE_API MaceEngineConfig {
+  friend class MaceEngine;
+
  public:
   explicit MaceEngineConfig(const DeviceType device_type);
   ~MaceEngineConfig();
@@ -219,32 +206,6 @@ class MACE_API MaceEngineConfig {
       CPUAffinityPolicy policy,
       bool use_gemmlowp = false);
-  /// \brief Set OpenMP threads number and processor affinity.
-  ///
-  /// Caution: this function may hurt performance
-  /// if improper parameters provided.
-  /// This function may not work well on some chips (e.g. MTK). Setting thread
-  /// affinity to offline cores may run very slow or unexpectedly.
-  /// In such cases, please use SetOpenMPThreadPolicy with default policy
-  /// instead.
-  ///
-  /// \param num_threads
-  /// \param cpu_ids
-  /// \return MACE_SUCCESS for success, other for failed.
-  MaceStatus SetOpenMPThreadAffinity(
-      int num_threads,
-      const std::vector<int> &cpu_ids);
-
-  DeviceType device_type() const;
-
-  int num_threads() const;
-
-  std::shared_ptr<GPUContext> gpu_context() const;
-
-  GPUPriorityHint gpu_priority_hint() const;
-
-  GPUPerfHint gpu_perf_hint() const;
-
  private:
   class Impl;
   std::unique_ptr<Impl> impl_;
...