提交 34797c2b 编写于 作者: 李寅

Merge branch 'cpu_runtime' into 'master'

Refactor CPURuntime

See merge request !800
......@@ -104,10 +104,13 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
],
] + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]),
deps = [
":core",
"//external:gflags_nothreads",
"//mace/ops:test",
"//mace/utils",
],
)
......@@ -16,8 +16,12 @@
namespace mace {
// Initializes cpu_runtime_ with a new CPURuntime built from the thread count
// only.
CPUDevice::CPUDevice(const int num_threads)
: cpu_runtime_(new CPURuntime(num_threads)) {}
// Initializes cpu_runtime_ with a new CPURuntime, forwarding the thread count,
// the CPU affinity policy and the use_gemmlowp flag unchanged.
CPUDevice::CPUDevice(const int num_threads,
const CPUAffinityPolicy policy,
const bool use_gemmlowp)
: cpu_runtime_(new CPURuntime(num_threads,
policy,
use_gemmlowp)) {}
// Defaulted destructor; no explicit cleanup is written here.
CPUDevice::~CPUDevice() = default;
......
......@@ -41,7 +41,9 @@ class Device {
class CPUDevice : public Device {
public:
explicit CPUDevice(const int num_threads);
CPUDevice(const int num_threads,
const CPUAffinityPolicy policy,
const bool use_gemmlowp);
virtual ~CPUDevice();
#ifdef MACE_ENABLE_OPENCL
......
......@@ -27,7 +27,6 @@
#include <utility>
#include <vector>
#include "public/gemmlowp.h"
#include "mace/core/macros.h"
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
......@@ -92,13 +91,6 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
}
}
} // namespace
// Returns a reference to the single shared gemmlowp context. The context is
// heap-allocated the first time this function runs and is never deleted here,
// so the returned reference stays valid for every later caller.
gemmlowp::GemmContext& GetGemmlowpContext() {
  static gemmlowp::GemmContext *const shared_context =
      new gemmlowp::GemmContext;
  return *shared_context;
}
MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids) {
MACE_CHECK_NOTNULL(big_core_ids);
......@@ -174,13 +166,15 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
#endif
}
MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp) {
} // namespace
MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
int omp_num_threads_hint,
CPUAffinityPolicy policy,
gemmlowp::GemmContext *gemm_context) {
if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
if (use_gemmlowp) {
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
gemm_context.set_max_num_threads(std::max(0, omp_num_threads_hint));
if (gemm_context) {
gemm_context->set_max_num_threads(std::max(0, omp_num_threads_hint));
}
#ifdef MACE_ENABLE_OPENMP
if (omp_num_threads_hint > 0) {
......@@ -211,9 +205,8 @@ MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
omp_num_threads_hint = use_cpu_ids.size();
}
if (use_gemmlowp) {
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
gemm_context.set_max_num_threads(omp_num_threads_hint);
if (gemm_context) {
gemm_context->set_max_num_threads(omp_num_threads_hint);
}
return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
......
......@@ -15,33 +15,55 @@
#ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
#define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_
#include <memory>
#include <vector>
#include "public/gemmlowp.h"
#include "mace/public/mace.h"
#include "mace/utils/logging.h"
namespace mace {
extern int MaceOpenMPThreadCount;
MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids);
MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
const std::vector<int> &cpu_ids);
MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint,
CPUAffinityPolicy policy,
bool use_gemmlowp = false);
// Holds the CPU-side runtime settings visible in this class: the thread-count
// hint, the CPU affinity policy, and a gemmlowp context that is allocated on
// demand and owned through a unique_ptr.
class CPURuntime {
public:
// Stores only the thread count. This overload does not set policy_ and does
// not call SetOpenMPThreadsAndAffinityPolicy.
explicit CPURuntime(const int num_threads) : num_threads_(num_threads) {}
// Stores the thread count and affinity policy, allocates the gemmlowp context
// when use_gemmlowp is true, then applies the OpenMP thread/affinity settings.
CPURuntime(const int num_threads,
CPUAffinityPolicy policy,
bool use_gemmlowp)
: num_threads_(num_threads),
policy_(policy),
gemm_context_(nullptr) {
if (use_gemmlowp) {
// GetGemmlowpContext() allocates gemm_context_ when it is still null.
MACE_CHECK_NOTNULL(GetGemmlowpContext());
}
// gemm_context_.get() is null at this point unless use_gemmlowp was true.
// NOTE(review): the MaceStatus returned by this call is discarded.
SetOpenMPThreadsAndAffinityPolicy(num_threads_,
policy_,
gemm_context_.get());
}
~CPURuntime() = default;
inline int num_threads() const {
// Returns the gemmlowp context owned by this runtime, allocating it first if
// gemm_context_ is still null. No locking is visible around the null check.
gemmlowp::GemmContext *GetGemmlowpContext() {
if (!gemm_context_) {
gemm_context_.reset(new gemmlowp::GemmContext());
}
return gemm_context_.get();
}
// Returns the thread count passed to the constructor.
int num_threads() const {
return num_threads_;
}
private:
// Declared here and defined out of line; receives the thread hint, the
// affinity policy and a gemmlowp context pointer (null in the constructor call
// above when use_gemmlowp is false).
MaceStatus SetOpenMPThreadsAndAffinityPolicy(
int omp_num_threads_hint,
CPUAffinityPolicy policy,
gemmlowp::GemmContext *gemm_context);
int num_threads_;
CPUAffinityPolicy policy_;
std::unique_ptr<gemmlowp::GemmContext> gemm_context_;
};
} // namespace mace
......
......@@ -21,8 +21,10 @@ GPUDevice::GPUDevice(Tuner<uint32_t> *tuner,
const GPUPriorityHint priority,
const GPUPerfHint perf,
KVStorage *opencl_binary_storage,
const int num_threads) :
CPUDevice(num_threads),
const int num_threads,
CPUAffinityPolicy cpu_affinity_policy,
bool use_gemmlowp) :
CPUDevice(num_threads, cpu_affinity_policy, use_gemmlowp),
runtime_(new OpenCLRuntime(opencl_cache_storage, priority, perf,
opencl_binary_storage, tuner)),
allocator_(new OpenCLAllocator(runtime_.get())) {}
......
......@@ -30,7 +30,9 @@ class GPUDevice : public CPUDevice {
const GPUPriorityHint priority = GPUPriorityHint::PRIORITY_LOW,
const GPUPerfHint perf = GPUPerfHint::PERF_NORMAL,
KVStorage *opencl_binary_storage = nullptr,
const int num_threads = -1);
const int num_threads = -1,
CPUAffinityPolicy cpu_affinity_policy = AFFINITY_NONE,
bool use_gemmlowp = false);
~GPUDevice();
OpenCLRuntime *opencl_runtime() override;
Allocator *allocator() override;
......
......@@ -17,7 +17,7 @@
#include "gflags/gflags.h"
#include "mace/core/runtime/cpu/cpu_runtime.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/utils/logging.h"
#include "mace/ops/ops_test_util.h"
DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
......@@ -31,13 +31,10 @@ int main(int argc, char **argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
// config runtime
mace::MaceStatus status = mace::SetOpenMPThreadsAndAffinityPolicy(
mace::ops::test::OpTestContext::Get(
FLAGS_omp_num_threads,
static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy),
true);
if (status != mace::MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed.";
}
mace::testing::Benchmark::Run(FLAGS_filter.c_str());
return 0;
......
......@@ -853,7 +853,8 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1,
"Quantization convolution does not support dilation > 1 yet.");
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
......@@ -970,7 +971,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
&gemm_context, filter_matrix, input_matrix, &output_matrix,
gemm_context, filter_matrix, input_matrix, &output_matrix,
-filter->zero_point(), -input->zero_point(), output_pipeline);
return MACE_SUCCESS;
......
......@@ -100,7 +100,8 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
Tensor *output,
StatsFuture *future) {
MACE_UNUSED(future);
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
MACE_RETURN_IF_ERROR(output->Resize(output_shape));
......@@ -142,7 +143,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
&gemm_context, weight_matrix, input_matrix, &output_matrix,
gemm_context, weight_matrix, input_matrix, &output_matrix,
-weight->zero_point(), -input->zero_point(), output_pipeline);
return MACE_SUCCESS;
......
......@@ -22,8 +22,6 @@
namespace mace {
gemmlowp::GemmContext& GetGemmlowpContext();
struct GemmlowpOutputPipeline {
typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
ColVectorMap;
......
......@@ -122,7 +122,8 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
const index_t K,
const index_t width,
Tensor *C) {
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
Tensor::MappingGuard guarda(A);
Tensor::MappingGuard guardb(B);
......@@ -149,7 +150,7 @@ struct MatMulFunctor<CPU, uint8_t> : OpKernel {
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
&gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
gemm_context, a_matrix, b_matrix, &c_matrix, -A->zero_point(),
-B->zero_point(), output_pipeline);
}
}
......
......@@ -21,8 +21,8 @@
#include "public/gemmlowp.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/gemm.h"
#include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/sgemm.h"
#include "mace/ops/ops_test_util.h"
namespace gemmlowp {
......@@ -164,18 +164,22 @@ void MatmulBenchmark_gemmlowp_uint8(int iters, int rows, int depth, int cols) {
const auto output_pipeline =
std::make_tuple(quantize_down_stage, saturating_cast_stage);
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
auto gemm_context =
mace::ops::test::OpTestContext::Get()
->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
&gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-128, output_pipeline);
mace::testing::StartTiming();
while (iters--) {
gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
BitDepthParams>(
&gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-128, output_pipeline);
}
}
......@@ -195,18 +199,22 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
const auto output_pipeline = std::make_tuple();
gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
auto gemm_context =
mace::ops::test::OpTestContext::Get()
->GetDevice(CPU)->cpu_runtime()->GetGemmlowpContext();
MACE_CHECK_NOTNULL(gemm_context);
using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
&gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-128, output_pipeline);
mace::testing::StartTiming();
while (iters--) {
gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
BitDepthParams>(
&gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
-128, output_pipeline);
}
}
......
......@@ -177,9 +177,6 @@ class MaceEngineConfig::Impl {
CPUAffinityPolicy policy,
bool use_gemmlowp);
MaceStatus SetOpenMPThreadAffinity(int num_threads,
const std::vector<int> &cpu_ids);
inline DeviceType device_type() const {
return device_type_;
}
......@@ -188,6 +185,14 @@ class MaceEngineConfig::Impl {
return num_threads_;
}
inline CPUAffinityPolicy cpu_affinity_policy() const {
return cpu_affinity_policy_;
}
inline bool use_gemmlowp() const {
return use_gemmlowp_;
}
inline std::shared_ptr<GPUContext> gpu_context() const {
return gpu_context_;
}
......@@ -203,6 +208,8 @@ class MaceEngineConfig::Impl {
private:
DeviceType device_type_;
int num_threads_;
CPUAffinityPolicy cpu_affinity_policy_;
bool use_gemmlowp_;
std::shared_ptr<GPUContext> gpu_context_;
GPUPriorityHint gpu_priority_hint_;
GPUPerfHint gpu_perf_hint_;
......@@ -211,6 +218,8 @@ class MaceEngineConfig::Impl {
// Builds a config for the given device type with these initial values:
// num_threads_ = -1, AFFINITY_NONE, gemmlowp disabled, a newly allocated
// GPUContext, PRIORITY_LOW and PERF_NORMAL.
MaceEngineConfig::Impl::Impl(const DeviceType device_type)
: device_type_(device_type),
num_threads_(-1),
cpu_affinity_policy_(CPUAffinityPolicy::AFFINITY_NONE),
use_gemmlowp_(false),
gpu_context_(new GPUContext),
gpu_priority_hint_(GPUPriorityHint::PRIORITY_LOW),
gpu_perf_hint_(GPUPerfHint::PERF_NORMAL) {}
......@@ -234,15 +243,9 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy(
CPUAffinityPolicy policy,
bool use_gemmlowp) {
num_threads_ = num_threads;
return mace::SetOpenMPThreadsAndAffinityPolicy(
num_threads, policy, use_gemmlowp);
}
MaceStatus MaceEngineConfig::Impl::SetOpenMPThreadAffinity(
int num_threads,
const std::vector<int> &cpu_ids) {
num_threads_ = num_threads;
return mace::SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids);
cpu_affinity_policy_ = policy;
use_gemmlowp_ = use_gemmlowp;
return MACE_SUCCESS;
}
......@@ -270,32 +273,6 @@ MaceStatus MaceEngineConfig::SetCPUThreadPolicy(
return impl_->SetCPUThreadPolicy(num_threads_hint, policy, use_gemmlowp);
}
MaceStatus MaceEngineConfig::SetOpenMPThreadAffinity(
int num_threads,
const std::vector<int> &cpu_ids) {
return impl_->SetOpenMPThreadAffinity(num_threads, cpu_ids);
}
DeviceType MaceEngineConfig::device_type() const {
return impl_->device_type();
}
int MaceEngineConfig::num_threads() const {
return impl_->num_threads();
}
std::shared_ptr<GPUContext> MaceEngineConfig::gpu_context() const {
return impl_->gpu_context();
}
GPUPerfHint MaceEngineConfig::gpu_perf_hint() const {
return impl_->gpu_perf_hint();
}
GPUPriorityHint MaceEngineConfig::gpu_priority_hint() const {
return impl_->gpu_priority_hint();
}
// Mace Tensor
class MaceTensor::Impl {
public:
......@@ -389,7 +366,7 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
: model_data_(nullptr),
model_data_size_(0),
op_registry_(new OperatorRegistry()),
device_type_(config.device_type()),
device_type_(config.impl_->device_type()),
device_(nullptr),
ws_(new Workspace()),
net_(nullptr)
......@@ -399,16 +376,21 @@ MaceEngine::Impl::Impl(const MaceEngineConfig &config)
{
LOG(INFO) << "Creating MaceEngine, MACE version: " << MaceVersion();
if (device_type_ == DeviceType::CPU || device_type_ == DeviceType::HEXAGON) {
device_.reset(new CPUDevice(config.num_threads()));
device_.reset(new CPUDevice(config.impl_->num_threads(),
config.impl_->cpu_affinity_policy(),
config.impl_->use_gemmlowp()));
}
#ifdef MACE_ENABLE_OPENCL
if (device_type_ == DeviceType::GPU) {
device_.reset(new GPUDevice(config.gpu_context()->opencl_tuner(),
config.gpu_context()->opencl_cache_storage(),
config.gpu_priority_hint(),
config.gpu_perf_hint(),
config.gpu_context()->opencl_binary_storage(),
config.num_threads()));
device_.reset(new GPUDevice(
config.impl_->gpu_context()->opencl_tuner(),
config.impl_->gpu_context()->opencl_cache_storage(),
config.impl_->gpu_priority_hint(),
config.impl_->gpu_perf_hint(),
config.impl_->gpu_context()->opencl_binary_storage(),
config.impl_->num_threads(),
config.impl_->cpu_affinity_policy(),
config.impl_->use_gemmlowp()));
}
#endif
}
......
......@@ -18,8 +18,12 @@ namespace mace {
namespace ops {
namespace test {
OpTestContext *OpTestContext::Get() {
static OpTestContext instance;
// Returns a pointer to the shared OpTestContext. The instance is a
// function-local static, so it is constructed only on the first call; the
// arguments passed on any later call have no effect on the existing instance.
OpTestContext *OpTestContext::Get(int num_threads,
CPUAffinityPolicy cpu_affinity_policy,
bool use_gemmlowp) {
static OpTestContext instance(num_threads,
cpu_affinity_policy,
use_gemmlowp);
return &instance;
}
......@@ -31,8 +35,15 @@ Device *OpTestContext::GetDevice(DeviceType device_type) {
return device_map_[device_type].get();
}
OpTestContext::OpTestContext() : gpu_context_(new GPUContext()) {
device_map_[DeviceType::CPU] = std::unique_ptr<Device>(new CPUDevice(-1));
OpTestContext::OpTestContext(int num_threads,
CPUAffinityPolicy cpu_affinity_policy,
bool use_gemmlowp)
: gpu_context_(new GPUContext()) {
device_map_[DeviceType::CPU] = std::unique_ptr<Device>(
new CPUDevice(num_threads,
cpu_affinity_policy,
use_gemmlowp));
device_map_[DeviceType::GPU] = std::unique_ptr<Device>(
new GPUDevice(gpu_context_->opencl_tuner(),
gpu_context_->opencl_cache_storage(),
......
......@@ -114,11 +114,17 @@ class OpDefBuilder {
class OpTestContext {
public:
static OpTestContext *Get();
static OpTestContext *Get(
int num_threads = -1,
CPUAffinityPolicy cpu_affinity_policy = AFFINITY_BIG_ONLY,
bool use_gemmlowp = true);
std::shared_ptr<GPUContext> gpu_context() const;
Device *GetDevice(DeviceType device_type);
private:
OpTestContext();
OpTestContext(int num_threads,
CPUAffinityPolicy cpu_affinity_policy,
bool use_gemmlowp);
MACE_DISABLE_COPY_AND_ASSIGN(OpTestContext);
std::shared_ptr<GPUContext> gpu_context_;
......@@ -504,8 +510,6 @@ class OpsTestNet {
class OpsTestBase : public ::testing::Test {
protected:
virtual void SetUp() {
SetOpenMPThreadsAndAffinityPolicy(-1,
CPUAffinityPolicy::AFFINITY_BIG_ONLY);
}
virtual void TearDown() {
......
......@@ -97,21 +97,6 @@ enum MaceStatus {
} \
}
/// \brief Get ARM big.LITTLE configuration.
///
/// This function detects the max frequency of every CPU core and treats the
/// cores with the largest max frequency as big cores and all remaining cores
/// as little cores. If all CPU cores have the same max frequency, big_core_ids
/// and little_core_ids will both be filled with all CPU core ids.
///
/// \param [out] big_core_ids
/// \param [out] little_core_ids
/// \return MACE_SUCCESS if successful, or an error if it can't reliably
/// detect the frequency of big.LITTLE cores (e.g. MTK).
MACE_API MaceStatus GetBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids);
/// \brief GPU context contains the status used for the GPU device.
///
/// The life cycle of a GPUContext object is the same as that of the
/// MaceEngines that use it.
......@@ -170,6 +155,8 @@ class MACE_API GPUContextBuilder {
};
class MACE_API MaceEngineConfig {
friend class MaceEngine;
public:
explicit MaceEngineConfig(const DeviceType device_type);
~MaceEngineConfig();
......@@ -219,32 +206,6 @@ class MACE_API MaceEngineConfig {
CPUAffinityPolicy policy,
bool use_gemmlowp = false);
/// \brief Set OpenMP threads number and processor affinity.
///
/// Caution: this function may hurt performance
/// if improper parameters are provided.
/// This function may not work well on some chips (e.g. MTK). Setting thread
/// affinity to offline cores may make execution very slow or cause
/// unexpected behavior.
/// In such cases, please use SetOpenMPThreadPolicy with default policy
/// instead.
///
/// \param num_threads
/// \param cpu_ids
/// \return MACE_SUCCESS for success, other for failed.
MaceStatus SetOpenMPThreadAffinity(
int num_threads,
const std::vector<int> &cpu_ids);
DeviceType device_type() const;
int num_threads() const;
std::shared_ptr<GPUContext> gpu_context() const;
GPUPriorityHint gpu_priority_hint() const;
GPUPerfHint gpu_perf_hint() const;
private:
class Impl;
std::unique_ptr<Impl> impl_;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册