Commit 012de769 authored by Megvii Engine Team

feat(mgb/gopt): add profiler cache

To improve the performance of the profiling procedure and to make the layout transform test cases stable, the profiling results used in the CI environment are cached in files.

GitOrigin-RevId: ba2743f35fcdbd554b7cd82e70f433ccdcc66fa4
Parent b33ec46e
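In short, a hedged usage sketch (the cache file name is illustrative; the pattern mirrors the new test code below): a profiler created via make_cached_profiler looks profiling results up in ProfilerCache before measuring, and dumps the cache back to the file once profiling finishes.

auto profiler = gopt::ProfilerBase::make_cached_profiler(
        "layout_transform_profile.cache");  // illustrative path
std::unique_ptr<gopt::SolverBase> solver{
        new gopt::DynamicProgrammingSolver(std::move(profiler))};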
@@ -9,6 +9,7 @@ dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary
dnn/src/cuda/sass/prebuilt/map_defs.cpp binary
dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary
dnn/src/cuda/elemwise_multi_type/kimpl/* binary
src/gopt/test/cache_data.h binary
tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text
imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text
ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text
......
@@ -2,13 +2,11 @@ cc_library(
name = "mgblar",
copts = ["-std=c++14"],
srcs = [
"src/infile_persistent_cache.cpp",
"src/mgblar.cpp",
"src/json_loader.cpp",
"src/text_table.cpp",
],
hdrs = [
"src/infile_persistent_cache.h",
"src/mgblar.h",
"src/json_loader.h",
"src/text_table.h",
@@ -57,11 +55,9 @@ cc_megvii_binary(
cc_library(
name = "megbrain_ios_lar_lib",
srcs = [
"src/infile_persistent_cache.cpp",
"src/mgblar.cpp",
],
hdrs = [
"src/infile_persistent_cache.h",
"src/mgblar.h",
],
copts = ["-DMGB_NO_MAIN=1"],
......
@@ -10,7 +10,6 @@
*/
#include "./mgblar.h"
#include "./infile_persistent_cache.h"
#include "./json_loader.h"
#include "./npy.h"
#include "./text_table.h"
@@ -30,6 +29,7 @@
#include "megbrain/serialization/extern_c_opr.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/utils/debug.h"
#include "megbrain/utils/infile_persistent_cache.h"
#include "megbrain/system.h"
#include "megbrain/version.h"
......
/**
* \file sdk/load-and-run/src/infile_persistent_cache.cpp
* \file src/core/impl/utils/infile_persistent_cache.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -9,7 +9,7 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./infile_persistent_cache.h"
#include "megbrain/utils/infile_persistent_cache.h"
#if defined(_WIN32)
#include <io.h>
......
/**
* \file sdk/load-and-run/src/infile_persistent_cache.h
* \file src/core/include/megbrain/utils/infile_persistent_cache.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -70,6 +70,7 @@ public:
Maybe<Blob> get(const std::string& category, const Blob& key) override;
void put(const std::string& category, const Blob& key,
const Blob& value) override;
bool support_dump_cache() override { return true; }
};
} // namespace mgb
......
@@ -39,6 +39,8 @@ public:
virtual void put(
const std::string& category, const Blob& key, const Blob& value) = 0;
virtual bool support_dump_cache() { return false; }
//! set an implementation; return the original implementation
static std::shared_ptr<PersistentCache> set_impl(
std::shared_ptr<PersistentCache> impl);
......
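A minimal sketch of the two hooks added above (assuming the existing PersistentCache::inst() accessor; the file name is illustrative): set_impl swaps the process-wide cache implementation and returns the previous one, while support_dump_cache reports whether the active implementation can be written back to disk.

// swap in a file-backed cache, dump it, then restore the original impl
auto prev = PersistentCache::set_impl(
        std::make_shared<InFilePersistentCache>("algo_policy.cache"));
if (PersistentCache::inst().support_dump_cache()) {
    static_cast<InFilePersistentCache&>(PersistentCache::inst())
            .dump_cache("algo_policy.cache");
}
PersistentCache::set_impl(prev);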
/**
* \file src/gopt/impl/global_layout_transform/opr_safe_dump.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./opr_safe_dump.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/opr/tensor_manip.h"
#include "midout.h"
MIDOUT_DECL(megbrain_opr_safe_dump)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_safe_dump, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
using namespace mgb;
using namespace opr;
namespace {
template <typename Param>
void write_param(std::string& data, const Param& param) {
megdnn::Algorithm::serialize_write_pod(param, data);
}
template <>
void write_param(std::string& /* data */, const DType& /* dtype */) {}
template <class Opr>
struct OprDumpImpl {
static std::string dump(const cg::OperatorNodeBase* opr_) {
MIDOUT_B(Opr)
auto&& opr = opr_->cast_final_safe<Opr>();
std::string data;
write_param(data, opr.param());
return data;
MIDOUT_E
}
};
#define INST(_Opr) \
template <> \
struct OprDumpImpl<_Opr> { \
static std::string dump(const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
std::string data; \
write_param(data, opr.param()); \
using ExecutionPolicy = megdnn::param::ExecutionPolicy; \
ExecutionPolicy policy{ \
opr.execution_policy_transient().strategy, \
opr.execution_policy_transient().workspace_limit}; \
write_param(data, policy); \
return data; \
MIDOUT_E \
} \
};
INST(Convolution);
INST(ConvBiasForward);
INST(ConvolutionBackwardData);
INST(PoolingForward);
#undef INST
} // namespace
namespace mgb {
namespace gopt {
namespace intl {
std::string opr_safe_dump(const cg::OperatorNodeBase* opr) {
#define cb(_Opr) \
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
return OprDumpImpl<_Opr>::dump(opr); \
} else
FOREACH_SUPPORTED_OPR(cb) {
mgb_throw(InternalError, "unsupported operator(got:%s)",
opr->dyn_typeinfo()->name);
}
#undef cb
}
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
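A hedged usage note (opr is assumed to point at an operator of one of the supported types): the returned bytes serve as a stable fingerprint of the operator's param, which profiler_cache.cpp mixes into its cache key; any other operator type hits the InternalError in the dispatch above.

std::string data = gopt::intl::opr_safe_dump(opr);
// conv/pooling-like oprs additionally serialize their ExecutionPolicy
// (strategy + workspace limit); a DType param is skipped by the
// write_param specialization above.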
/**
* \file src/gopt/impl/global_layout_transform/opr_safe_dump.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/graph.h"
namespace mgb {
namespace gopt {
namespace intl {
#define FOREACH_SUPPORTED_OPR(cb) \
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \
cb(PoolingForward) cb(WarpPerspective) cb(Resize) cb(Elemwise) \
cb(ElemwiseMultiType) cb(Concat) cb(PowC) cb(TypeCvt)
std::string opr_safe_dump(const cg::OperatorNodeBase* opr);
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/profiler_cache.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./opr_safe_dump.h"
#include "megbrain/gopt/profiler.h"
#include "megbrain/comp_node_env.h"
using namespace mgb;
using namespace gopt;
using ReformatKey = ReformatManager::ReformatKey;
// =================== ProfilerCache ======================
void ProfilerCache::Key::build_blob_from_opr() {
auto&& opr = m_key_impl.opr_key.opr;
// process opr type
auto type = opr->dyn_typeinfo()->name;
size_t type_size = strlen(type);
// process opr param
auto data = intl::opr_safe_dump(opr);
size_t param_size = data.size();
size_t nr_inputs = opr->input().size();
size_t nr_outputs = opr->usable_output().size();
size_t nr_layouts = nr_inputs + nr_outputs;
m_blob_storage.reserve(sizeof(TensorLayout) * 3 * nr_layouts + type_size +
param_size);
// serialize opr type
m_blob_storage.append(type, type_size);
// serialize param
const char* data_ptr = reinterpret_cast<const char*>(data.data());
m_blob_storage.append(data_ptr, param_size);
// serialize layouts
auto append_layout = [this](const VarNode* v) {
TensorLayout ly{v->shape(), v->dtype(), v->format()};
for (size_t i = 0; i < ly.ndim; ++i) {
if (i)
m_blob_storage.push_back(',');
m_blob_storage.append(std::to_string(ly.shape[i]));
}
if (!ly.is_contiguous()) {
m_blob_storage.push_back(';');
for (size_t i = 0; i < ly.ndim; ++i) {
if (i)
m_blob_storage.push_back(',');
m_blob_storage.append(std::to_string(ly.stride[i]));
}
}
m_blob_storage.push_back(';');
m_blob_storage.append(ly.dtype.name());
m_blob_storage.push_back('|');
};
for (size_t i = 0; i < nr_inputs; ++i) {
append_layout(opr->input(i));
}
for (size_t i = 0; i < nr_outputs; ++i) {
append_layout(opr->output(i));
}
// serialize opr_format
m_blob_storage.append(std::to_string(
static_cast<uint32_t>(m_key_impl.opr_key.opr_format)));
// serialize extra_attribute
m_blob_storage.append(std::to_string(
static_cast<uint32_t>(m_key_impl.opr_key.extra_attribute)));
}
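// Illustration (hypothetical values): the blob assembled above concatenates
// the operator type name, the serialized param bytes, one
// "shape[;strides];dtype|" group per input/output var, and finally opr_format
// and extra_attribute as decimal uint32 strings. A contiguous single-input
// PoolingForward might serialize roughly as
//   "PoolingForward" <param bytes> "64,64,56,56;QuantizedS8|64,64,28,28;QuantizedS8|" "1" "0"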
void ProfilerCache::Key::build_category(CompNode cn) {
m_category = "layout_transform_profile:";
auto&& env = CompNodeEnv::from_comp_node(cn);
switch (env.property().type) {
#if MGB_CUDA
case CompNode::DeviceType::CUDA: {
auto&& prop = env.cuda_env().device_prop;
m_category += ssprintf("plat=cuda;dev=%s;cap=%d.%d", prop.name,
prop.major, prop.minor);
break;
}
#endif
case CompNode::DeviceType::CPU:
m_category += "plat=cpu";
break;
default:
mgb_throw(MegBrainError,
"unsupported comp node for global layout transform "
"profiler cache category");
}
}
void ProfilerCache::Key::build_blob_from_var() {
auto v = m_key_impl.var_key.var;
// serialize layouts
auto append_layout = [this](const VarNode* v) {
TensorLayout ly{v->shape(), v->dtype(), v->format()};
for (size_t i = 0; i < ly.ndim; ++i) {
if (i)
m_blob_storage.push_back(',');
m_blob_storage.append(std::to_string(ly.shape[i]));
}
if (!ly.is_contiguous()) {
m_blob_storage.push_back(';');
for (size_t i = 0; i < ly.ndim; ++i) {
if (i)
m_blob_storage.push_back(',');
m_blob_storage.append(std::to_string(ly.stride[i]));
}
}
m_blob_storage.push_back(';');
m_blob_storage.append(ly.dtype.name());
m_blob_storage.push_back('|');
};
append_layout(v);
// serialize reformat key
m_blob_storage.append(m_key_impl.var_key.key.to_string());
}
const std::string& ProfilerCache::Key::category() const {
mgb_assert(!m_category.empty());
return m_category;
}
PersistentCache::Blob ProfilerCache::Key::blob() const {
mgb_assert(!m_blob_storage.empty());
return {m_blob_storage.data(), m_blob_storage.size()};
}
ProfilerCache& ProfilerCache::inst() {
static ProfilerCache inst;
return inst;
}
ProfilerCache& ProfilerCache::set_impl(std::unique_ptr<PersistentCache> impl) {
mgb_assert(impl != nullptr);
m_impl.swap(impl);
return *this;
}
void ProfilerCache::dump_cache(const char* path) {
mgb_assert(m_impl->support_dump_cache(),
"current impl of ProfilerCache does not support dump cache to "
"file.");
auto cache = static_cast<InFilePersistentCache*>(m_impl.get());
cache->dump_cache(path);
}
Maybe<ProfilerCache::Result> ProfilerCache::get(const Key& key) {
auto raw_buf = m_impl->get(key.category(), key.blob());
if (!raw_buf.valid())
return None;
// data type of cost is float
auto buf = static_cast<const uint8_t*>(raw_buf->ptr);
auto size = raw_buf->size;
mgb_assert(buf && size == sizeof(float),
"ProfileCache invalid value: ptr=%p, size=%zu", buf, size);
auto read_f32 = [&]() {
auto ret = *reinterpret_cast<const float*>(buf);
return ret;
};
auto cost = read_f32();
return cost;
}
void ProfilerCache::put(const Key& key, Result& result) {
std::string val;
megdnn::Algorithm::serialize_write_pod(result, val);
m_impl->put(key.category(), key.blob(), {val.data(), val.size()});
}
// vim: syntax=cpp.doxygen
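A hedged sketch of the round trip implemented above (the path is illustrative): back the singleton with a file-based cache, let profiling populate it through get()/put(), then persist it; dump_cache asserts that the active implementation reports support_dump_cache().

ProfilerCache::inst().set_impl(
        std::make_unique<InFilePersistentCache>("layout_transform.cache"));
// ... profiling: get() misses fall back to real measurements, put() stores them ...
ProfilerCache::inst().dump_cache("layout_transform.cache");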
@@ -154,69 +154,61 @@ void MarkInputContiguous::init_output_static_infer_desc() {
} // namespace
/* ================== ProfilerImpl =================*/
class ProfilerImpl final : public ProfilerBase {
public:
ProfilerImpl(int runs = 10) : m_runs{runs} {};
~ProfilerImpl() = default;
ProfilingResult profile(const Problem& problem) const override;
private:
static constexpr float PROFILE_TIME_OUT = 1e7;
using ReformatAttribute = ReformatKey::Attribute;
/*!
* \brief profile opr format agnostic operators (like elemwise, elemwise
* multi type, typecvt etc.)
*
* \param opr pointer to the operator node to be profiled
* \param base_format the original tensor format of the operator node.
* \param available_tensor_formats the available tensor formats
* \return the operator node record
*/
OperatorNodeRecord profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats,
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
float profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
TensorFormats tensor_format,
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
/*!
* \brief profile opr format aware operators (like conv, deconv, conv_bias,
* etc.)
*
* \param opr pointer to the operator node to be profiled
* \param base_config the tensor formats configuration of base opr format
* \param config all the available configuration
* \return the operator node record
*/
OperatorNodeRecord profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const SmallVector<OprTensorFormatsConfiguration>& available_configs,
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
float profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
/*!
* \brief profile layout transform of the var node
*
* \param var pointer to the var node to be profiled
* \param base_format the original tensor formats in which the var node is
* stored
* \param available_tensor_formats the available tensor formats
* \param extra_attribute the extra attributes (options) of the problem
* \return the var node record
*/
VarNodeRecord profile_var_node(
const VarNode* var, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats,
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const;
float profile_var_node(
const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const;
int m_runs; /// number of sampling runs performed by the profiler
};
ProfilerImpl::ProfilerImpl(int runs, float opr_threshold,
float var_node_threshold)
: m_opr_threshold{opr_threshold},
m_var_node_threshold{var_node_threshold},
m_runs{runs} {
m_opr_filter = [this](const OperatorNodeBase* opr,
OperatorNodeBase* new_opr) {
/// \note: for performance considerations, we skip the nchw (naive)
/// kernels for conv bias on the CUDA platform. To be removed later.
if (auto conv = try_cast_as_op<opr::ConvBiasForward>(new_opr)) {
if (conv->output(0)->comp_node().device_type() ==
CompNode::DeviceType::CUDA &&
conv->input(0)->dtype().category() ==
DTypeCategory::QUANTIZED &&
conv->param().format == OprFormat::NCHW) {
return false;
}
}
float comp1 = m_opr_footprint.get_computation(
const_cast<OperatorNodeBase*>(opr));
float comp2 = m_opr_footprint.get_computation(new_opr);
if (comp2 > m_opr_threshold * comp1)
return false;
return true;
};
m_var_node_filter = [this](const VarNode* var, TensorShape from,
TensorShape to, ReformatKey key) {
/// \note: due to the alignment requirements of low-bit tensors, we skip
/// some layout transforms for low-bit tensors. The skipped layout
/// transforms do not have corresponding dnn kernels and cannot be
/// implemented by tensor manip operators (like reshape, dimshuffle,
/// subtensor, etc.).
if (var->dtype().enumv() == DTypeEnum::QuantizedS4 ||
var->dtype().enumv() == DTypeEnum::Quantized4Asymm) {
if (key.input_format == TensorFormats::NCHW &&
key.output_format != TensorFormats::NHWC &&
key.output_format != TensorFormats::NCHWc64) {
return false;
}
if (key.output_format == TensorFormats::NCHW &&
key.input_format != TensorFormats::NHWC &&
key.input_format != TensorFormats::NCHWc64) {
return false;
}
}
TensorLayout orig_ly = {var->shape(), var->dtype()},
from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()};
float orig_memory = orig_ly.span().dist_byte() * 2.f;
float reformat_memory =
from_ly.span().dist_byte() + to_ly.span().dist_byte();
if (reformat_memory > orig_memory * m_var_node_threshold)
return false;
return true;
};
}
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
@@ -507,56 +499,6 @@ ProfilerImpl::ProfilingResult ProfilerImpl::profile(const Problem& problem) cons
}
/* ================== ProfilerBase =================*/
ProfilerBase::ProfilerBase(float opr_threshold, float var_node_threshold)
: m_opr_threshold{opr_threshold}, m_var_node_threshold{var_node_threshold} {
m_opr_filter = [this](const OperatorNodeBase* opr, OperatorNodeBase* new_opr) {
/// \note: for performance considerations, we skip the nchw (naive)
/// kernels for conv bias on the CUDA platform. To be removed later.
if (auto conv = try_cast_as_op<opr::ConvBiasForward>(new_opr)) {
if (conv->output(0)->comp_node().device_type() ==
CompNode::DeviceType::CUDA &&
conv->input(0)->dtype().category() == DTypeCategory::QUANTIZED &&
conv->param().format == OprFormat::NCHW) {
return false;
}
}
float comp1 =
m_opr_footprint.get_computation(const_cast<OperatorNodeBase*>(opr));
float comp2 = m_opr_footprint.get_computation(new_opr);
if (comp2 > m_opr_threshold * comp1)
return false;
return true;
};
m_var_node_filter = [this](const VarNode* var, TensorShape from, TensorShape to,
ReformatKey key) {
/// \note: due to the alignment requirements of low-bit tensors, we skip
/// some layout transforms for low-bit tensors. The skipped layout
/// transforms do not have corresponding dnn kernels and cannot be
/// implemented by tensor manip operators (like reshape, dimshuffle,
/// subtensor, etc.).
if (var->dtype().enumv() == DTypeEnum::QuantizedS4 ||
var->dtype().enumv() == DTypeEnum::Quantized4Asymm) {
if (key.input_format == TensorFormats::NCHW &&
key.output_format != TensorFormats::NHWC &&
key.output_format != TensorFormats::NCHWc64) {
return false;
}
if (key.output_format == TensorFormats::NCHW &&
key.input_format != TensorFormats::NHWC &&
key.input_format != TensorFormats::NCHWc64) {
return false;
}
}
TensorLayout orig_ly = {var->shape(), var->dtype()},
from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()};
float orig_memory = orig_ly.span().dist_byte() * 2.f;
float reformat_memory = from_ly.span().dist_byte() + to_ly.span().dist_byte();
if (reformat_memory > orig_memory * m_var_node_threshold)
return false;
return true;
};
}
std::string ProfilerBase::OperatorNodeRecord::to_string() const {
auto str = ssprintf(
"\nopr type: %s\nopr name: %s\ninputs:\n", opr->dyn_typeinfo()->name,
@@ -595,4 +537,68 @@ std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() {
return std::make_unique<ProfilerImpl>();
}
std::unique_ptr<ProfilerBase> ProfilerBase::make_cached_profiler(
const char* path) {
return std::make_unique<CachedProfiler>(path);
}
/* ================== CachedProfiler =================*/
CachedProfiler::CachedProfiler(const char* path, int runs, float opr_threshold,
float var_node_threshold)
: ProfilerImpl(runs, opr_threshold, var_node_threshold), m_path{path} {
if (m_path != nullptr) { // file cache
ProfilerCache::inst().set_impl(
std::make_unique<InFilePersistentCache>(m_path));
}
}
CachedProfiler::ProfilingResult CachedProfiler::profile(
const Problem& problem) const {
auto ret = ProfilerImpl::profile(problem);
if (m_path != nullptr)
ProfilerCache::inst().dump_cache(m_path);
return ret;
}
float CachedProfiler::profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
TensorFormats tensor_format, ReformatAttribute extra_attribute) const {
ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format),
extra_attribute};
auto ret = ProfilerCache::inst().get(key);
if (ret.valid())
return ret.val();
auto rst = ProfilerImpl::profile_operator(opr, base_format, tensor_format,
extra_attribute);
ProfilerCache::inst().put(key, rst);
return rst;
}
float CachedProfiler::profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute) const {
ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
auto ret = ProfilerCache::inst().get(key);
if (ret.valid())
return ret.val();
auto rst = ProfilerImpl::profile_operator(opr, base_config, config,
extra_attribute);
ProfilerCache::inst().put(key, rst);
return rst;
}
float CachedProfiler::profile_var_node(const VarNode* var,
TensorFormats base_format,
const ReformatKey& key) const {
ProfilerCache::Key pf_key{var, key};
auto ret = ProfilerCache::inst().get(pf_key);
if (ret.valid())
return ret.val();
auto rst = ProfilerImpl::profile_var_node(var, base_format, key);
ProfilerCache::inst().put(pf_key, rst);
return rst;
}
// vim: syntax=cpp.doxygen
@@ -18,11 +18,13 @@
#include "megbrain/gopt/subgraph_extractor.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/plugin/opr_footprint.h"
#include "megbrain/utils/infile_persistent_cache.h"
namespace mgb {
namespace gopt {
class Problem;
class CachedProfiler;
/*!
* \brief A profiler that collects all the performance data to describe the
@@ -75,22 +77,245 @@ public:
using VarNodeFilter = thin_function<bool(
const VarNode*, TensorShape, TensorShape, ReformatManager::ReformatKey)>;
ProfilerBase(float opr_threshold = 2.f, float var_node_threshold = 2.f);
ProfilerBase(OprFilter opr_filter, VarNodeFilter var_node_filter = {})
: m_opr_filter{std::move(opr_filter)},
m_var_node_filter{std::move(var_node_filter)} {}
ProfilerBase() = default;
virtual ~ProfilerBase() = default;
virtual ProfilingResult profile(const Problem& problem) const = 0;
ProfilerBase& set_opr_filter(const OprFilter& opr_filter) {
m_opr_filter = opr_filter;
return *this;
}
ProfilerBase& set_var_node_filter(const VarNodeFilter& var_node_filter) {
m_var_node_filter = var_node_filter;
return *this;
}
static std::unique_ptr<ProfilerBase> make_profiler();
static std::unique_ptr<ProfilerBase> make_cached_profiler(
const char* path = nullptr);
protected:
OprFilter m_opr_filter;
VarNodeFilter m_var_node_filter;
float m_opr_threshold;
float m_var_node_threshold;
};
private:
/*! \brief A default profiler impl
*/
class ProfilerImpl : public ProfilerBase {
public:
ProfilerImpl(int runs = 10, float opr_threshold = 2.f,
float var_node_threshold = 2.f);
~ProfilerImpl() = default;
ProfilingResult profile(const Problem& problem) const override;
protected:
static constexpr float PROFILE_TIME_OUT = 1e7;
using ReformatKey = ReformatManager::ReformatKey;
using ReformatAttribute = ReformatKey::Attribute;
/*!
* \brief profile opr format agnostic operators (like elemwise, elemwise
* multi type, typecvt etc.)
*
* \param opr pointer to the operator node to be profiled
* \param base_format the original tensor format of the operator node.
* \param available_tensor_formats the available tensor formats
* \return the operator node record
*/
OperatorNodeRecord profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
/*!
* \brief profile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.)
*
* \param opr pointer to the operator to be profiled
* \param base_format the original tensor format of the operator node.
* \param tensor_format the tensor format to be profiled
* \param extra_attribute identifies whether to use an image object for OpenCL or to automatically pad the nhwc layout
* \return elapsed time of operator in the given tensor format configuration
*/
virtual float profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
TensorFormats tensor_format,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
/*!
* \brief profile opr format aware operators (like conv, deconv, conv_bias,
* etc.)
*
* \param opr pointer to the operator node to be profiled
* \param base_config the tensor formats configuration of base opr format
* \param config all the available configuration
* \return the operator node record
*/
OperatorNodeRecord profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const SmallVector<OprTensorFormatsConfiguration>& available_configs,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
/*!
* \brief profile opr format aware operators (like conv, deconv, conv_bias, resize, warp etc.)
*
* \param opr pointer to the operator to be profiled
* \param base_config the original opr format configuration of the operator node,
* \param config the opr format configuration to be profiled
* \param extra_attribute identifies whether to use an image object for OpenCL or to automatically pad the nhwc layout
* \return elapsed time of operator in the given opr format configuration
*/
virtual float profile_operator(const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
/*!
* \brief profile layout transform of the var node
*
* \param var pointer to the var node to be profiled
* \param base_format the original tensor formats in which the var node is
* stored
* \param available_tensor_formats the available tensor formats
* \param extra_attribute the extra attributes (options) of the problem
* \return the var node record
*/
VarNodeRecord profile_var_node(
const VarNode* var, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const;
/*!
* \brief profile layout transform of the var node
*
* \param var pointer to the var node to be profiled
* \param base_format the original tensor formats in which the var node is
* stored
* \param key type of ReformatKey, identifying the information/attributes of the layout transform
* \return elapsed time of the layout transform
*/
virtual float profile_var_node(const VarNode* var,
TensorFormats base_format,
const ReformatKey& key) const;
OprFootprint m_opr_footprint;
float m_opr_threshold; /// a threshold: when the computation of the
/// operator rebuilt in some opr format
/// configuration exceeds m_opr_threshold times
/// that of the original operator, the opr format
/// configuration is skipped (i.e. its cost is
/// treated as infinite)
float m_var_node_threshold; /// a threshold: when the memory footprint of
/// the layout transform of the var node exceeds
/// m_var_node_threshold times that of the var
/// node itself, the layout transform is skipped
/// (i.e. its cost is treated as infinite)
int m_runs; /// number of sampling runs performed by the profiler
};
/*!
* \brief a ProfilerCache that manages the profiling results of operators in
* different layouts and of layout transforms of var nodes.
*/
class ProfilerCache : public NonCopyableObj {
ProfilerCache() : m_impl{std::make_unique<InMemoryPersistentCache>()} {};
public:
using ReformatKey = ReformatManager::ReformatKey;
using ReformatAttribute = ReformatKey::Attribute;
using OprFormat = ProfilerBase::OprFormat;
class Key final : public NonCopyableObj {
std::string m_blob_storage;
std::string m_category;
struct OprKey {
const OperatorNodeBase* opr;
OprFormat opr_format;
ReformatAttribute extra_attribute;
};
struct VarKey {
const VarNode* var;
ReformatKey key;
};
union KeyImpl {
OprKey opr_key;
VarKey var_key;
KeyImpl() { std::memset(this, 0, sizeof(KeyImpl)); }
};
KeyImpl m_key_impl;
void build_blob_from_opr();
void build_blob_from_var();
void build_category(CompNode cn);
public:
Key(const OperatorNodeBase* opr, OprFormat opr_format,
ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) {
m_key_impl.opr_key = {opr, opr_format, extra_attribute};
build_blob_from_opr();
mgb_assert(
opr->node_prop().contain(
cg::OperatorNodeProp::Flag::SINGLE_COMP_NODE),
"operator with multiple comp node is not supported(opr:%s)",
opr->cname());
// here, we assume that the operator to be profiled has only one
// comp node
build_category(opr->output(0)->comp_node());
}
Key(const VarNode* var, ReformatKey key) {
m_key_impl.var_key = {var, key};
build_blob_from_var();
build_category(var->comp_node());
}
const std::string& category() const;
PersistentCache::Blob blob() const;
};
using Result = float;
public:
static ProfilerCache& inst();
ProfilerCache& set_impl(std::unique_ptr<PersistentCache> impl);
void dump_cache(const char* path);
Maybe<Result> get(const Key& key);
void put(const Key& key, Result& result);
private:
std::unique_ptr<PersistentCache> m_impl;
};
class CachedProfiler final : public ProfilerImpl {
public:
CachedProfiler(const char* path = nullptr, int runs = 10,
float opr_threshold = 2.f, float var_node_threshold = 2.f);
ProfilingResult profile(const Problem& problem) const override;
private:
float profile_operator(const OperatorNodeBase* opr,
TensorFormats base_format,
TensorFormats tensor_format,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override;
float profile_operator(const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override;
float profile_var_node(const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const override;
const char* m_path;
};
} // namespace gopt
......
This diff is suppressed by .gitattributes.
#!/usr/bin/env python3
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# To keep the profiling results used by the global layout transform unaffected
# by the CI environment, the hard-coded profiling results are stored in a cache;
# each test run reads the cached profiling results from memory and performs the
# global graph optimization according to them.
# This script converts the dumped cache files into a cache header file, from
# which the tests read their data.
# If you add tests related to the global layout transform in
# src/gopt/test/layout_transform_pass.cpp, consider using this script to
# process the profiling data:
# 1. Change `#define MGB_WITH_CACHED_TEST 1` in
#    src/gopt/test/layout_transform_pass.cpp to `#define MGB_WITH_CACHED_TEST 0`.
# 2. Build megbrain_test and run all tests related to the global layout transform:
#    ./megbrain_test --gtest_filter="*LayoutTransform*"
# 3. Pack all the cache files together with this script:
#    python3 embed_cache.py -o cache_data.h $(ls /path/to/cache/*.cache)
# 4. Revert the define from step 1 so that the profiling procedure uses the
#    cached data; then rebuild megbrain_test and verify that the tests pass.
import os.path
import logging
import hashlib
import argparse
import struct
import itertools
import sys
import subprocess
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING, format='%(asctime)-15s %(message)s')
CHAR_MAP = {i: r'{}'.format(i) for i in range(256)}
def _u32(data):
return struct.unpack('<I', data)[0]
class CacheDataGenerator:
_cache_files = None
def __init__(self, cache_files):
self._cache_files = cache_files
def _get_hash(self):
return _u32(self._hash.digest()[:4])
def gen_cache_data(self, fpath):
fname = os.path.basename(fpath)
with open(fpath, 'rb') as fcache:
cache_data = fcache.read()
cache_data = struct.unpack(
"<{}B".format(len(cache_data)), cache_data)
ret = list(map(CHAR_MAP.__getitem__, cache_data))
for i in range(50, len(ret), 50):
ret[i] = '\n' + ret[i]
return ','.join(ret)
def gen_cache_data_header(self, fout, src_map):
fout.write('// generated by embed_cache.py\n')
fout.write('#include <vector>\n')
fout.write('#include <stdint.h>\n')
for k, v in sorted(src_map.items()):
fout.write("""
static const std::vector<uint8_t> {} = {{
""".format(k.replace('.', '_')))
fout.write('{}'.format(v))
fout.write('};\n')
def invoke(self, output):
logger.info('generate cache_data.h ...')
fname2cache_data = {}
for fname in self._cache_files:
base, ext = os.path.splitext(os.path.basename(fname))
assert ext == ".cache", "ext: {}, fname {}".format(ext, fname)
assert base not in fname2cache_data, "duplicated kernel: " + base
fname2cache_data[base] = self.gen_cache_data(fname)
with open(output, 'w') as fout:
self.gen_cache_data_header(fout, fname2cache_data)
logger.info('done')
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='embed cache into cache header file',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-o', '--output', help='output source file',
required=True)
parser.add_argument('cache', help='cache files to be embedded', nargs='+')
args = parser.parse_args()
cache_generator = CacheDataGenerator(args.cache)
cache_generator.invoke(args.output)
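For illustration, embedding a dump named TestLayoutTransform.Wide.cache yields a header along these lines (byte values hypothetical); this is the cache_data.h included by the tests below:

// generated by embed_cache.py
#include <vector>
#include <stdint.h>

static const std::vector<uint8_t> TestLayoutTransform_Wide = {
108,97,121,111,117,116,95,116,114,97,110,115,102,111,114,109, /* ... 50 byte values per line ... */
};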
@@ -23,6 +23,12 @@
#include "megbrain/plugin/profiler.h"
#include "megbrain/serialization/serializer.h"
#define MGB_WITH_CACHED_TEST 1
#if MGB_WITH_CACHED_TEST
#include "./cache_data.h"
#endif
using namespace mgb;
using namespace gopt;
using namespace serialization;
@@ -53,6 +59,78 @@ size_t find_opr_num(SymbolVar endpoint) {
cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
return opr_num;
}
using OprFormat = Problem::OprFormat;
OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) {
switch (tensor_format) {
case TensorFormats::NCHW:
return OprFormat::NCHW;
case TensorFormats::NCHWc4:
return OprFormat::NCHW4;
case TensorFormats::NCHWc8:
return OprFormat::NCHW8;
case TensorFormats::NCHWc32:
return OprFormat::NCHW32;
case TensorFormats::NCHWc64:
return OprFormat::NCHW64;
case TensorFormats::NHWC:
return OprFormat::NHWC;
case TensorFormats::CHWNc4:
return OprFormat::CHWN4;
default:
mgb_throw(MegBrainError, "tensor format(%u) is not supported",
static_cast<uint32_t>(tensor_format));
}
}
class ProfilerMock : public ProfilerImpl {
public:
ProfilerMock(const uint8_t* bin, size_t size) {
mgb_assert(bin != nullptr);
ProfilerCache::inst().set_impl(
std::make_unique<InFilePersistentCache>(bin, size));
}
~ProfilerMock() {
// reset in memory cache
ProfilerCache::inst().set_impl(
std::make_unique<InMemoryPersistentCache>());
}
private:
float profile_operator(const OperatorNodeBase* opr,
TensorFormats base_format,
TensorFormats tensor_format,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override {
ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format),
extra_attribute};
auto ret = ProfilerCache::inst().get(key);
if (ret.valid())
return ret.val();
mgb_assert(false);
}
float profile_operator(const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config,
ReformatAttribute extra_attribute =
ReformatAttribute::DEFAULT) const override {
ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
std::string tmp;
tmp.reserve(key.blob().size);
auto ret = ProfilerCache::inst().get(key);
if (ret.valid())
return ret.val();
mgb_assert(false);
}
float profile_var_node(const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const override {
ProfilerCache::Key pf_key{var, key};
auto ret = ProfilerCache::inst().get(pf_key);
if (ret.valid())
return ret.val();
mgb_assert(false);
}
};
} // namespace
#if MGB_CUDA
@@ -96,15 +174,23 @@ TEST(TestLayoutTransform, Resnet18_QS8) {
OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC,
ReformatAttribute::AUTO_PADDING_NHWC};
auto ctx = std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats), attribute);
ctx->add_opr_config(
opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, OprFormat::NHWC})
.add_opr_config(
opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC,
OprFormat::CHWN4});
auto profiler = ProfilerBase::make_profiler();
std::move(opr_list), std::move(available_tensor_formats),
attribute);
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4,
OprFormat::NHWC})
.add_opr_config(opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32,
OprFormat::NHWC, OprFormat::CHWN4});
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_Resnet18_QS8.data()),
TestLayoutTransform_Resnet18_QS8.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
"TestLayoutTransform.Resnet18_QS8.cache");
#endif
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto new_output =
@@ -190,7 +276,15 @@ TEST(TestLayoutTransform, Resnet18_QS4) {
opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64,
OprFormat::NHWC, OprFormat::CHWN4});
auto profiler = ProfilerBase::make_profiler();
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_Resnet18_QS4.data()),
TestLayoutTransform_Resnet18_QS4.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
"TestLayoutTransform.Resnet18_QS4.cache");
#endif
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto new_output =
@@ -305,7 +399,15 @@ TEST(TestLayoutTransform, Detection_QS8) {
opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64,
OprFormat::NHWC, OprFormat::CHWN4});
auto profiler = ProfilerBase::make_profiler();
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_Detection_QS8.data()),
TestLayoutTransform_Detection_QS8.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
"TestLayoutTransform.Detection_QS8.cache");
#endif
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto new_outputs =
@@ -375,7 +477,15 @@ TEST(TestLayoutTransform, Detection_QS4) {
opr::PoolingForward::typeinfo(),
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64,
OprFormat::NHWC, OprFormat::CHWN4});
auto profiler = ProfilerBase::make_profiler();
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_Detection_QS4.data()),
TestLayoutTransform_Detection_QS4.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
"TestLayoutTransform.Detection_QS4.cache");
#endif
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto new_outputs =
@@ -443,10 +553,18 @@ TEST(TestLayoutTransform, Wide) {
OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC,
ReformatAttribute::DEFAULT};
auto ctx = std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats), attribute);
ctx->add_opr_config(
opr::ConvBiasForward::typeinfo(), {OprFormat::NCHW, OprFormat::NHWC});
auto profiler = ProfilerBase::make_profiler();
std::move(opr_list), std::move(available_tensor_formats),
attribute);
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
{OprFormat::NCHW, OprFormat::NHWC});
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(TestLayoutTransform_Wide.data()),
TestLayoutTransform_Wide.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
"TestLayoutTransform.Wide.cache");
#endif
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto v = gopt::GraphOptimizer{}
@@ -463,12 +581,8 @@ TEST(TestLayoutTransform, Wide) {
auto func = network.graph->compile({{sym_o, {}}});
func->execute();
gprof.to_json_full(func.get())->writeto_fpath(output_file("wide.json"));
/// check global layout transform pass: no dimshuffle expected;
/// the following check is disabled to keep CI stable.
#if 0
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(sym_o);
ASSERT_EQ(nr_dimshuffle, 0u);
#endif
auto nr_param_merge = find_opr_num<opr::MultipleDeviceTensorHolder>(sym_o);
ASSERT_EQ(nr_param_merge, 1u);
/// check first conv format
@@ -477,48 +591,6 @@
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW);
}
TEST(TestLayoutTransform, ElemwiseMultiType) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
Network network(cn);
auto x = network.add_var("x", {64, 64, 1, 2});
auto y = network.add_var("y", {64, 64, 1, 2});
x = network.add_type_cvt(x, dtype::QuantizedS4{1.f});
y = network.add_type_cvt(y, dtype::QuantizedS4{1.f});
auto x_ = network.add_type_cvt(x, dtype::Float32());
auto y_ = network.add_type_cvt(y, dtype::Float32());
auto z = network.add_elemwise(
{x_, y_}, dtype::Float32(), opr::Elemwise::Mode::FUSE_ADD_RELU);
z = network.add_type_cvt(z, dtype::QuantizedS4{1.f});
z = network.add_type_cvt(z, dtype::Float32());
auto z2 = network.add_elemwise(
{x, y}, dtype::QuantizedS4{1.f}, opr::Elemwise::Mode::FUSE_ADD_RELU);
z2 = network.add_type_cvt(z2, dtype::Float32());
HostTensorND t1;
auto func1 = network.graph->compile({make_callback_copy(z, t1)});
func1->execute();
HostTensorND t3;
auto func3 = network.graph->compile({make_callback_copy(z2, t3)});
func3->execute();
auto alter_x = opr::RelayoutFormat::make(
x, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64);
auto alter_y = opr::RelayoutFormat::make(
y, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64);
auto alter_z = network.add_elemwise(
{alter_x, alter_y}, dtype::QuantizedS4{1.f},
opr::Elemwise::Mode::FUSE_ADD_RELU);
alter_z = opr::RelayoutFormat::make(
alter_z, megdnn::param::RelayoutFormat::Mode::NCHW64_NCHW);
alter_z = network.add_type_cvt(alter_z, dtype::Float32());
HostTensorND t2;
auto func2 = network.graph->compile({make_callback_copy(alter_z, t2)});
func2->execute();
// MGB_ASSERT_TENSOR_EQ(t1, t3);
MGB_ASSERT_TENSOR_EQ(t2, t3);
}
#if CUDA_VERSION >= 10020
TEST(TestLayoutTransform, DetectionHead) {
REQUIRE_GPU(1);
@@ -600,8 +672,15 @@ TEST(TestLayoutTransform, DetectionHead) {
.add_opr_config(
opr::WarpPerspectiveForward::typeinfo(),
{OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64});
auto profiler = ProfilerBase::make_profiler();
#if MGB_WITH_CACHED_TEST
auto profiler = std::make_unique<ProfilerMock>(
static_cast<const uint8_t*>(
TestLayoutTransform_DetectionHead.data()),
TestLayoutTransform_DetectionHead.size());
#else
auto profiler = ProfilerBase::make_cached_profiler(
"TestLayoutTransform.DetectionHead.cache");
#endif
std::unique_ptr<SolverBase> solver{
new DynamicProgrammingSolver(std::move(profiler))};
auto new_out_vars =
......