Commit d56570d9 authored by Megvii Engine Team

fix(megbrain): add rdnn to copybara

GitOrigin-RevId: 7d8bf770532d385dc8a797192cddf5587b27afe0
Parent 7de1bb11
@@ -320,15 +320,12 @@ public:
input->force_assign_dev_tensor_from_tensor(dev_tensor);
- mgb_assert(input->comp_node() == dev_tensor.comp_node());
- mgb_assert(input->shape().eq_shape(layout));
- mgb_assert(input->dtype() == layout.dtype);
idx++;
}
}
void init_output_tensor(const SmallVector<Tensor*>& outputs) {
mgb_assert(m_opr->usable_output().size() == outputs.size());
::mgb::opr::intl::WorkspaceLimitHook::set_impl(
m_opr->owner_graph(), get_workspace_limit);
@@ -347,9 +344,6 @@ public:
mgb_assert(j < outputs.size());
auto&& tensor = outputs[j];
auto&& layout = tensor->layout();
- mgb_assert(var->comp_node() == tensor->comp_node());
- mgb_assert(var->shape().eq_shape(layout));
- mgb_assert(var->dtype() == layout.dtype);
if (var->m_mem_plan.chunk().owner_var != var) {
tensor->assign_from_dev_tensor(
var->m_dev_tensor); // memory forwarding
@@ -816,7 +810,6 @@ public:
// minigraph.opr()->usable_output(), but execution may use the attrs for those
// output vars, so we infer attrs for all outputs, but only return
// LogicalTensorDesc for minigraph.opr()->usable_output()
- SmallVector<LogicalTensorDesc> output_descs;
for (size_t i = 0; i < minigraph.opr()->output().size(); ++i) {
auto* var = minigraph.opr()->output()[i];
auto* shape = sess.infer(sess.output_data[i].shape_infer, true);
@@ -824,19 +817,15 @@ public:
var->shape(*shape);
}
- for (size_t i = 0; i < minigraph.output_size(); ++i) {
+ SmallVector<TensorPtr> outputs(minigraph.output_size(), {});
+ for (size_t i = 0; i < outputs.size(); i++) {
auto* ovar = minigraph.output_var(i);
mgb_assert(ovar->dtype().valid() && ovar->comp_node().valid());
mgb_assert(
ovar->shape().ndim ||
ovar->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC));
- output_descs.push_back({{ovar->shape(), ovar->dtype()}, ovar->comp_node()});
- }
- SmallVector<TensorPtr> outputs(output_descs.size(), {});
- for (size_t i = 0; i < outputs.size(); i++) {
- outputs[i] =
- Tensor::make(output_descs[i].layout, output_descs[i].comp_node);
+ outputs[i] = Tensor::make(
+ TensorLayout{ovar->shape(), ovar->dtype()}, ovar->comp_node());
}
auto raw_outputs = to_raw_ptr_array(outputs, false);
#include <limits>
#include <unordered_set>
#include "megbrain/exception.h"
#include "megbrain/rdnn/algo_chooser.h"
#include "megbrain/utils/invoke.h"
//! TODO: some megdnn oprs have to be known here because midout.h is produced;
//! fix this if there is a more graceful way.
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "megdnn/oprs/base.h"
#include "midout.h"
MIDOUT_DECL(megbrain_opr_algo_chooser)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
using namespace megdnn;
using namespace mgb;
#define APPLY(statement, ...) \
mgb::apply( \
[&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
// timeout delta added to the fastest known algorithm's time when profiling new algos
constexpr double TIMEOUT_TOLERANCE = 2;
#define CACHE_KEY_VERSION "v5"
namespace {
template <class MegDNNOpr>
struct MegDNNOpr2Typename;
#define cb(_Opr) \
template <> \
struct MegDNNOpr2Typename<megdnn::_Opr> { \
static const char* name; \
}; \
const char* MegDNNOpr2Typename<megdnn::_Opr>::name = #_Opr;
DNN_FOREACH_FASTRUN_OPR(cb)
#undef cb
template <typename Opr>
std::string profile_name(Opr* opr) {
std::string ret = std::string(::MegDNNOpr2Typename<Opr>::name) + CACHE_KEY_VERSION;
ret.append(opr->get_algorithm_set_name());
return ret;
}
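// Example (derived from the code above): for megdnn::ConvolutionForward this
// yields "ConvolutionForward" + "v5" + the opr's algorithm_set_name(), so
// bumping CACHE_KEY_VERSION invalidates previously persisted profiling caches.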
template <typename Opr>
std::string format_fixlayouts(
const typename rdnn::AlgoChooser<Opr>::FixedTensorLayouts& layouts,
size_t arity_in, size_t arity_out, const std::string& delimiter = " -> ") {
std::string ret;
if (arity_in) {
ret.append("(");
for (size_t i = 0; i < arity_in; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i].to_string() + " ");
}
ret.append(")");
}
if (arity_in && arity_out) {
ret.append(delimiter);
}
if (arity_out) {
ret.append("(");
for (size_t i = 0; i < arity_out; ++i) {
if (i) {
ret.append(", ");
}
ret.append(layouts[i + arity_in].to_string() + " ");
}
ret.append(")");
}
return ret;
}
/**
 * \brief Check whether the sub-opr list has circular dependencies.
 */
class CircularDepsChecker {
struct SearchItemStorage {
std::string data_hold;
size_t hash = 0;
SearchItemStorage(const Algorithm::SearchItem& item) {
Algorithm::serialize_write_pod(item.opr_type, data_hold);
for (auto&& layout : item.layouts) {
data_hold += layout.serialize();
}
data_hold += item.param;
}
SearchItemStorage& init_hash() {
hash = XXHash64CT::hash(data_hold.data(), data_hold.size(), 20201225);
return *this;
}
bool operator==(const SearchItemStorage& rhs) const {
return data_hold == rhs.data_hold;
}
struct Hash {
size_t operator()(const SearchItemStorage& s) const { return s.hash; }
};
};
std::unordered_set<SearchItemStorage, SearchItemStorage::Hash> m_set;
public:
void put(const megdnn::Algorithm::SearchItem& key) {
SearchItemStorage key_storage(key);
key_storage.init_hash();
mgb_assert(
m_set.find(key_storage) == m_set.end(),
"Circular dependency during flatten search space");
auto ret = m_set.insert(std::move(key_storage));
mgb_assert(ret.second);
}
void remove(const megdnn::Algorithm::SearchItem& key) {
SearchItemStorage key_storage(key);
key_storage.init_hash();
auto&& iter = m_set.find(key_storage);
mgb_assert(iter != m_set.end());
m_set.erase(iter);
}
};
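// Usage sketch (illustrative; mirrors how flatten_search_space below uses it):
//     CircularDepsChecker checker;
//     checker.put(item);      // asserts if `item` is already on the path
//     /* ... recurse into the sub-opr search items ... */
//     checker.remove(item);   // pop the item on the way back up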
///////////////// OprTypeTrait /////////////////////////////
template <megdnn::Algorithm::OprType>
struct OprFromOprTypeTrait;
template <typename Opr>
struct OprTypeFromOprTrait;
#define cb(_opr_type, _opr) \
template <> \
struct OprFromOprTypeTrait<megdnn::Algorithm::OprType::_opr_type> { \
using Opr = megdnn::_opr; \
}; \
template <> \
struct OprTypeFromOprTrait<megdnn::_opr> { \
constexpr static megdnn::Algorithm::OprType opr_type = \
megdnn::Algorithm::OprType::_opr_type; \
}
cb(MATRIX_MUL_FORWARD, MatrixMulForward);
cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward);
cb(CONVOLUTION_FORWARD, ConvolutionForward);
cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData);
cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter);
cb(CONVOLUTION3D_FORWARD, Convolution3DForward);
cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData);
cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter);
cb(LOCAL_SHARE_FORWARD, LocalShareForward);
cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData);
cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter);
cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward);
cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData);
cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter);
cb(BATCH_CONV_FORWARD, BatchConvBiasForward);
cb(CONVBIAS_FORWARD, ConvBiasForward);
cb(POOLING_FORWARD, PoolingForward);
cb(POOLING_BACKWARD, PoolingBackward);
#undef cb
// clang-format off
#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \
cb(MATRIX_MUL_FORWARD, stmt) \
cb(BATCHED_MATRIX_MUL_FORWARD, stmt) \
cb(CONVOLUTION_FORWARD, stmt) \
cb(CONVOLUTION_BACKWARD_DATA, stmt) \
cb(CONVOLUTION_BACKWARD_FILTER, stmt) \
cb(CONVOLUTION3D_FORWARD, stmt) \
cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \
cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \
cb(LOCAL_SHARE_FORWARD, stmt) \
cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \
cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \
cb(DEFORMABLE_CONV_FORWARD, stmt) \
cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \
cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \
cb(BATCH_CONV_FORWARD, stmt) \
cb(CONVBIAS_FORWARD, stmt) \
cb(POOLING_FORWARD, stmt) \
cb(POOLING_BACKWARD, stmt)
// clang-format on
#define _OPR_TYPE_CASE(_opr_type, _stmt) \
case Algorithm::OprType::_opr_type: { \
using _Opr = typename OprFromOprTypeTrait<Algorithm::OprType::_opr_type>::Opr; \
_stmt; \
break; \
}
#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \
for (size_t _item_idx = 0; _item_idx < _search_items.size(); _item_idx++) { \
auto&& _item = _search_items[_item_idx]; \
switch (_item.opr_type) { \
FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \
default: \
mgb_throw(MegBrainError, "unknown opr_type"); \
} \
}
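// Expansion sketch (follows directly from the macros above):
// FOREACH_OPR_TYPE_DISPATCH(items, stmt) expands roughly to
//     for (size_t _item_idx = 0; _item_idx < items.size(); _item_idx++) {
//         auto&& _item = items[_item_idx];
//         switch (_item.opr_type) {
//             case Algorithm::OprType::CONVOLUTION_FORWARD: {
//                 using _Opr = megdnn::ConvolutionForward;
//                 stmt;
//                 break;
//             }
//             /* ... one case per DNN_FOREACH_FASTRUN_OPR entry ... */
//             default:
//                 mgb_throw(MegBrainError, "unknown opr_type");
//         }
//     }
// so `stmt` may refer to both `_Opr` and `_item`.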
template <typename Opr>
TensorLayoutArray to_layout_array(
const typename rdnn::AlgoChooser<Opr>::FixedTensorLayouts& layouts) {
TensorLayoutArray ret;
for (auto&& layout : layouts) {
ret.push_back(layout);
}
return ret;
}
template <typename Opr>
typename rdnn::AlgoChooser<Opr>::FixedTensorLayouts to_fixed_layouts(
const TensorLayoutArray& layouts) {
typename rdnn::AlgoChooser<Opr>::FixedTensorLayouts ret;
mgb_assert(ret.size() == layouts.size());
size_t idx = 0;
for (auto&& layout : layouts) {
ret[idx++] = layout;
}
return ret;
}
/**
 * flatten search space in postorder traversal
 * The subopr search constructs a search tree:
 *
 *                A
 *             /  |  \
 *           B1   B2   C
 *        /  |  |  \
 *      D1  D2  D3  E
 *
 * We traverse the search tree in postorder:
 * D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A
 */
template <typename Opr>
std::vector<megdnn::Algorithm::SearchItem> flatten_search_space(
const typename rdnn::AlgoChooser<Opr>::AlgoChooserHelper& helper,
CircularDepsChecker& checker) {
auto&& search_item = megdnn::Algorithm::SearchItem{
OprTypeFromOprTrait<Opr>::opr_type, helper.param(),
to_layout_array<Opr>(helper.fastrun_layouts())};
checker.put(search_item);
std::vector<megdnn::Algorithm::SearchItem> ret;
for (auto algo_info : helper.get_all_candidates()) {
megdnn::Algorithm* algo = helper.get_algorithm_from_desc(algo_info.desc);
mgb_assert(algo, "Unknown algo description");
std::vector<megdnn::Algorithm::SearchItem>&& sub_items = algo->get_subopr_list(
to_layout_array<Opr>(helper.fastrun_layouts()), helper.megdnn_opr());
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(helper.comp_node());
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
typename rdnn::AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
_item.param, helper.comp_node(), helper.execution_policy(),
helper.allow_weight_preprocess(), helper.desc());
auto space = flatten_search_space<_Opr>(sub_helper, checker);
ret.insert(ret.end(), space.begin(), space.end());
});
}
ret.push_back(search_item);
checker.remove(search_item);
return ret;
}
//! serialize an algo's desc to string. format is
//! handle_type|algo_type|size_of_param|size_of_name|string_of_param|string_of_name
static void serialize_write_pod(const Algorithm::Info::Desc& val, std::string& result) {
megdnn::Algorithm::serialize_write_pod(val.handle_type, result);
megdnn::Algorithm::serialize_write_pod(val.type, result);
uint32_t param_size = val.param.size();
uint32_t name_size = val.name.size();
megdnn::Algorithm::serialize_write_pod<uint32_t>(param_size, result);
megdnn::Algorithm::serialize_write_pod<uint32_t>(name_size, result);
megdnn::Algorithm::serialize_write_pod(val.param, result);
megdnn::Algorithm::serialize_write_pod(val.name, result);
}
static Algorithm::Info::Desc deserialize_read_pod(
const std::string& data, size_t offset = 0) {
Algorithm::Info::Desc ret;
#define cb(_val, _type) \
_val = megdnn::Algorithm::deserialize_read_pod<_type>(data.data(), offset); \
offset += sizeof(_val)
cb(ret.handle_type, megdnn::Handle::HandleType);
cb(ret.type, uint32_t);
uint32_t param_size = 0;
uint32_t name_size = 0;
cb(param_size, uint32_t);
cb(name_size, uint32_t);
if (param_size > 0) {
ret.param = std::string(data.data() + offset, param_size);
offset += param_size;
}
if (name_size > 0) {
ret.name = std::string(data.data() + offset, name_size);
offset += name_size;
}
return ret;
}
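// Round-trip sketch (the two helpers above are inverses):
//     std::string buf;
//     serialize_write_pod(desc, buf);
//     Algorithm::Info::Desc back = deserialize_read_pod(buf);
// leaves `back` with the same handle_type, type, param and name as `desc`.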
} // namespace
namespace megdnn {
namespace param {
MGB_DEF_ENUM_CLASS_BIT_OPR(ExecutionPolicy::Strategy)
} // namespace param
} // namespace megdnn
namespace mgb {
namespace rdnn {
template <class Opr>
class LayoutsModifier {
using FixedTensorLayouts = typename AlgoChooser<Opr>::FixedTensorLayouts;
public:
static void on(FixedTensorLayouts&, const typename Opr::Param&, size_t) {}
private:
//! index of the batch dimension in a tensor, e.g. 3 for CHWN4
static size_t index_of_batch(const typename Opr::Param&) { return 0; }
//! indices of the inputs/outputs whose layouts contain the batch dimension, e.g. src(0) and dst(2) for conv
static std::vector<size_t> sm_indices_contain_batch;
};
template <class Opr>
std::vector<size_t> LayoutsModifier<Opr>::sm_indices_contain_batch = {};
#define DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \
template <> \
class LayoutsModifier<opr> { \
public: \
using FixedTensorLayouts = typename AlgoChooser<opr>::FixedTensorLayouts; \
static void on( \
FixedTensorLayouts& layouts, const opr::Param& param, \
size_t new_batch_size) { \
size_t batch_index = index_of_batch(param); \
for (size_t index : sm_indices_contain_batch) { \
layouts.at(index)[batch_index] = new_batch_size; \
} \
} \
\
private: \
static size_t index_of_batch(const opr::Param&) { return 0; } \
static std::vector<size_t> sm_indices_contain_batch; \
}; \
std::vector<size_t> LayoutsModifier<opr>::sm_indices_contain_batch = idxs;
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::Convolution3DForward, (std::initializer_list<size_t>{0, 2}))
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::Convolution3DBackwardData, (std::initializer_list<size_t>{1, 2}))
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::Convolution3DBackwardFilter, (std::initializer_list<size_t>{0, 1}))
DEFAULT_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::BatchedMatrixMul, (std::initializer_list<size_t>{0, 1, 2}))
#undef DEFAULT_OPR_WITHOUT_INPUT_BROADCAST
#define CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(opr, idxs) \
template <> \
class LayoutsModifier<opr> { \
public: \
using FixedTensorLayouts = typename AlgoChooser<opr>::FixedTensorLayouts; \
static void on( \
FixedTensorLayouts& layouts, const opr::Param& param, \
size_t new_batch_size) { \
size_t batch_index = index_of_batch(param); \
for (size_t index : sm_indices_contain_batch) { \
layouts.at(index)[batch_index] = new_batch_size; \
} \
} \
\
private: \
static size_t index_of_batch(const opr::Param& param) { \
if (param.format == opr::Param::Format::CHWN4) { \
return 3; \
} \
return 0; \
} \
static std::vector<size_t> sm_indices_contain_batch; \
}; \
std::vector<size_t> LayoutsModifier<opr>::sm_indices_contain_batch = idxs;
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::ConvolutionForward, (std::initializer_list<size_t>{0, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::ConvolutionBackwardData, (std::initializer_list<size_t>{1, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::ConvolutionBackwardFilter, (std::initializer_list<size_t>{0, 1}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::LocalShareForward, (std::initializer_list<size_t>{0, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::LocalShareBackwardData, (std::initializer_list<size_t>{1, 2}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::LocalShareBackwardFilter, (std::initializer_list<size_t>{0, 1}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::DeformableConvForward, (std::initializer_list<size_t>{0, 2, 3, 4}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::DeformableConvBackwardData,
(std::initializer_list<size_t>{0, 2, 3, 4, 5, 6, 7}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::DeformableConvBackwardFilter,
(std::initializer_list<size_t>{0, 1, 2, 3}))
CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST(
megdnn::BatchConvBiasForward, (std::initializer_list<size_t>{0, 1, 2, 3, 4}))
#undef CONV_LIKE_OPR_WITHOUT_INPUT_BROADCAST
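// Effect sketch (illustrative shapes): for megdnn::ConvolutionForward with
// NCHW layouts src {8,16,24,24}, filter {32,16,3,3}, dst {8,32,22,22},
// LayoutsModifier<megdnn::ConvolutionForward>::on(layouts, param, 1) rewrites
// only the batch dimension of src(0) and dst(2):
//     src {1,16,24,24}, filter {32,16,3,3}, dst {1,32,22,22}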
template <>
class LayoutsModifier<megdnn::ConvBiasForward> {
public:
using FixedTensorLayouts =
typename AlgoChooser<megdnn::ConvBiasForward>::FixedTensorLayouts;
static void on(
FixedTensorLayouts& layouts, const megdnn::ConvBiasForward::Param& param,
size_t new_batch_size) {
size_t batch_index = index_of_batch(param);
for (size_t index : sm_indices_contain_batch) {
layouts.at(index)[batch_index] = new_batch_size;
}
for (size_t index : sm_indices_contain_batch_broadcast) {
if (!check_bias_share_in_channel(layouts.at(index), param.format)) {
layouts.at(index)[batch_index] = new_batch_size;
}
}
}
private:
static std::vector<size_t> sm_indices_contain_batch;
static std::vector<size_t> sm_indices_contain_batch_broadcast;
static size_t index_of_batch(const megdnn::ConvBiasForward::Param& param) {
if (param.format == megdnn::ConvBiasForward::Param::Format::CHWN4) {
return 3;
}
return 0;
}
};
std::vector<size_t> LayoutsModifier<megdnn::ConvBiasForward>::sm_indices_contain_batch =
{0, 3, 4};
std::vector<size_t>
LayoutsModifier<megdnn::ConvBiasForward>::sm_indices_contain_batch_broadcast = {
2};
template <>
class LayoutsModifier<megdnn::MatrixMul> {
public:
using FixedTensorLayouts =
typename AlgoChooser<megdnn::MatrixMul>::FixedTensorLayouts;
static void on(
FixedTensorLayouts& layouts, const megdnn::MatrixMul::Param& param,
size_t new_batch_size) {
//! Because we do not know whether the batch size lies in dimension m
//! or dimension n, we overwrite both m and n here.
// FIXME: find a way to make mgb obtain batch size information from R or
// automatically
layouts.at(2)[0] = new_batch_size;
layouts.at(2)[1] = new_batch_size;
if (param.transposeA) {
layouts.at(0)[1] = new_batch_size;
} else {
layouts.at(0)[0] = new_batch_size;
}
if (param.transposeB) {
layouts.at(1)[0] = new_batch_size;
} else {
layouts.at(1)[1] = new_batch_size;
}
}
};
///////////////////////////// AlgoChooserHelper //////////////////////////
template <typename Opr>
AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const std::string& param_str, const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess, const AlgoChooserDesc& desc)
: m_fastrun_layouts{layouts},
m_incache_layouts{layouts},
m_dnn_opr{megdnn_opr},
m_param{param_str},
m_cn{cn},
m_execution_policy{execution_policy},
m_allow_weight_preprocess{allow_weight_preprocess},
m_desc{desc} {
auto fastrun_batch_size = desc.shared_batch_size;
if (fastrun_batch_size) {
LayoutsModifier<Opr>::on(m_incache_layouts, m_dnn_opr->param(), 0);
LayoutsModifier<Opr>::on(
m_fastrun_layouts, m_dnn_opr->param(), fastrun_batch_size);
}
if (m_desc.no_profiling_on_shape_change) {
for (size_t i = 0; i < m_incache_layouts.size(); i++) {
for (size_t j = 0; j < m_incache_layouts.at(i).ndim; j++) {
m_incache_layouts.at(i)[j] = 0;
m_incache_layouts.at(i).stride[j] = 0;
}
}
}
mgb_assert(m_fastrun_layouts.size() == layouts.size());
static_assert(
std::tuple_size<FixedTensorLayouts>::value == 2 ||
std::tuple_size<FixedTensorLayouts>::value == 3 ||
std::tuple_size<FixedTensorLayouts>::value == 4 ||
std::tuple_size<FixedTensorLayouts>::value == 5 ||
std::tuple_size<FixedTensorLayouts>::value == 8,
"Pooling assumes arity = 2 or 4,Convolution AlgoChooser assumes "
"arity = 3 , 5 or 8 (for deformable conv)");
}
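// Behavior note (restating the constructor above): when desc.shared_batch_size
// is set, the cache-key layouts (m_incache_layouts) get batch size 0 while the
// profiled layouts (m_fastrun_layouts) get batch size shared_batch_size, so one
// cache entry can be shared across batch sizes; additionally, with
// no_profiling_on_shape_change, all shapes and strides in the cache key are
// zeroed.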
template <typename Opr>
typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelper::
choose_by_heuristic(const ExecutionStrategy& selected_strategy) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("choose_by_heuristic")))
ImplExecutionPolicy policy;
auto workspace_limit =
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
auto attr = extract_algo_attribute(selected_strategy);
policy.algo = APPLY(m_dnn_opr->get_algorithm_info_heuristic(
args..., workspace_limit, attr.first, attr.second),
m_fastrun_layouts)
.desc;
Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(algo, "Unknown algo description");
std::vector<Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(to_layout_array<Opr>(m_fastrun_layouts), m_dnn_opr);
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), _item.param,
m_cn, m_execution_policy, m_allow_weight_preprocess, m_desc);
policy.sub_policy.push_back(sub_helper.choose_by_heuristic(selected_strategy));
});
return policy;
MIDOUT_E
}
template <typename Opr>
typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelper::
choose_by_profile(
const ExecutionStrategy& selected_strategy, bool enable_update) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("choose_by_profile")))
if (m_desc.no_profiling_on_shape_change) {
auto policy = m_dnn_opr->execution_policy();
if (policy.algo.valid()) {
return policy;
}
if (is_matmul<Opr>()) {
mgb_log_warn(
"choose algo by heuristic, which may cause performance "
"regression.");
return choose_by_heuristic(selected_strategy);
}
}
typename AlgoChooser<Opr>::ImplExecutionPolicy tmp_policy;
bool retrive_from_cache = true;
bool allow_log = false;
construct_execution_policy(
selected_strategy, tmp_policy, retrive_from_cache, allow_log);
if (tmp_policy.algo.valid()) {
// return policy when construction succeeded
return tmp_policy;
}
if (enable_update) {
CircularDepsChecker circular_deps_checker;
auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker);
FOREACH_OPR_TYPE_DISPATCH(search_items, {
auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
_item.param, m_cn, m_execution_policy, m_allow_weight_preprocess,
m_desc);
sub_helper.profile(selected_strategy);
});
}
typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
construct_execution_policy(selected_strategy, policy);
return policy;
MIDOUT_E
}
template <typename Opr>
std::pair<
typename AlgoChooser<Opr>::ImplAlgoDesc, Maybe<AlgoChooserProfileCache::Result>>
AlgoChooser<Opr>::AlgoChooserHelper::get_profile_result_from_cache(
const ExecutionStrategy& selected_strategy) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_profile_result_from_cache")))
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{
m_incache_layouts.data(), m_incache_layouts.size(), &origin_param,
sizeof(origin_param)};
auto&& rst = cache.get(cache_key);
if (!rst.valid())
return {{}, rst};
auto&& prof = rst.val();
if (prof.empty())
return {{}, rst};
size_t workspace_limit =
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
auto target_attr = extract_algo_attribute(selected_strategy);
bool skip_by_negative = false;
bool skip_by_workspace = false;
for (auto&& i : prof) {
auto attr_of_algo = static_cast<megdnn::Algorithm::Attribute>(i.attribute);
bool contain_attr_all_positive =
(target_attr.first == (attr_of_algo & target_attr.first));
bool contain_attr_any_negative =
static_cast<bool>(attr_of_algo & target_attr.second);
if (contain_attr_all_positive) {
if (!contain_attr_any_negative) {
if (i.workspace <= workspace_limit) {
Algorithm::Info::Desc algo_desc = deserialize_read_pod(i.algo);
return {algo_desc, rst};
}
skip_by_workspace = true;
} else {
skip_by_negative = true;
}
}
}
if (skip_by_workspace)
return {};
std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
if (skip_by_negative) {
mgb_log_error(
"opr: %s, layouts: %s, No usable algo. There are available "
"algos match "
"positive strategy(%s), but filtered by negative stategy(%s).",
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str(),
Algorithm::attribute_str(target_attr.second).c_str());
} else {
mgb_log_error(
"opr: %s, layouts: %s, No usable algo. algos read from cache "
"could not "
"satisfy positive strategy(%s)",
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str());
}
mgb_trap();
MIDOUT_E
}
template <typename Opr>
void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
const ExecutionStrategy& selected_strategy,
typename AlgoChooser<Opr>::ImplExecutionPolicy& policy, bool retrive_from_cache,
bool allow_log) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_execution_policy")))
if (!policy.algo.valid()) {
if (retrive_from_cache) {
policy.algo = get_profile_result_from_cache(selected_strategy).first;
if (!policy.algo.valid()) {
if (allow_log) {
auto target_attr = extract_algo_attribute(selected_strategy);
std::string layouts_str =
AlgoChooser::format_fixlayouts(m_fastrun_layouts);
std::string msg = ssprintf(
"(opr : %s, layouts %s, with attribute(%s) and "
"without attribute(%s)",
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str(),
Algorithm::attribute_str(target_attr.second).c_str());
mgb_log_warn(
"No algo got from cache for %s. This may be caused by a "
"mismatch between the model and the cache file, or by an "
"incomplete cache file, e.g. profiling with version1 but "
"inferencing on version2, or profiling modelA but "
"inferencing modelB",
msg.c_str());
}
return;
}
} else {
auto workspace_limit = m_desc.get_workspace_limit(
m_cn, m_execution_policy.workspace_limit);
auto attr = extract_algo_attribute(selected_strategy);
policy.algo =
APPLY(m_dnn_opr->get_algorithm_info_heuristic(
args..., workspace_limit, attr.first, attr.second),
m_fastrun_layouts)
.desc;
mgb_assert(
policy.algo.valid(),
"No algo found from heuristic with strategy %u and "
"workspace limit %zu",
static_cast<uint32_t>(selected_strategy), workspace_limit);
}
}
Algorithm* algo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(algo, "Unknown algo description");
std::vector<Algorithm::SearchItem>&& sub_items =
algo->get_subopr_list(to_layout_array<Opr>(m_fastrun_layouts), m_dnn_opr);
FOREACH_OPR_TYPE_DISPATCH(sub_items, {
auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), _item.param,
m_cn, m_execution_policy, m_allow_weight_preprocess, m_desc);
policy.sub_policy.push_back({});
sub_helper.construct_execution_policy(
selected_strategy, policy.sub_policy.back(), retrive_from_cache,
allow_log);
if (!policy.sub_policy.back().algo.valid()) {
// sub_helper.construct_execution_policy failed; clean up
// policy and return
policy = {};
return;
}
});
MIDOUT_E
}
template <typename Opr>
size_t AlgoChooser<Opr>::AlgoChooserHelper::get_workspace_size_bytes(
const ImplExecutionPolicy& policy, const FixedTensorLayouts& layouts) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_workspace_size_bytes")))
m_dnn_opr->execution_policy() = policy;
size_t result;
const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts;
if (layouts.at(0).ndim) {
layouts_ptr = &layouts;
}
if_constexpr<opr_supports_preprocess<Opr>()>(
[&](auto _) {
auto&& opr = _(m_dnn_opr);
auto prep = this->construct_fake_preprocess_filter(*layouts_ptr);
PreprocessFilter<Opr>* prep_ptr = prep.valid() ? &prep.val() : nullptr;
result = std::max(
APPLY(opr->get_preprocess_workspace_in_bytes(args...),
*layouts_ptr),
APPLY(opr->get_workspace_in_bytes(args..., prep_ptr),
*layouts_ptr));
},
/* else */
[&](auto _) {
result = APPLY(
_(m_dnn_opr)->get_workspace_in_bytes(args...), *layouts_ptr);
});
return result;
MIDOUT_E
}
template <typename Opr>
std::vector<typename AlgoChooser<Opr>::ImplAlgo> AlgoChooser<
Opr>::AlgoChooserHelper::get_all_candidates() const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("get_all_candidates")))
auto heu = choose_by_heuristic(m_execution_policy.strategy);
auto&& ret = APPLY(m_dnn_opr->get_all_algorithms_info(args...), m_fastrun_layouts);
bool found = false;
for (size_t i = 0; i < ret.size(); ++i) {
if (ret[i].desc == heu.algo) {
found = true;
std::swap(ret[i], ret[0]);
break;
}
}
Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(heu.algo);
mgb_assert(palgo, "Unknown algo description");
mgb_assert(
found,
"algo %s got by heuristic not found in "
"candidate list",
palgo->name());
return std::move(ret);
MIDOUT_E
}
template <typename Opr>
Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper::
profile_single_algo(const ImplExecutionPolicy& policy, double& timeout) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("profile_single_algo")))
typename TimedProfiler<Opr>::Param param;
// gcc8 force-checks copy size <= dest len - 1, for safety
param.execution_policy =
TimedProfiler<Opr>::Param::ExecutionPolicyBlob::serialize(policy);
param.workspace = get_workspace_size_bytes(policy);
for (int i = 0; i < arity; ++i) {
auto&& src = m_fastrun_layouts[i];
bool cond_normal = src.format.is_default() &&
(src.dtype.category() == DTypeCategory::FLOAT ||
src.dtype.category() == DTypeCategory::INT ||
src.dtype.category() == DTypeCategory::QUANTIZED);
bool cond_low_bit = src.dtype.is_low_bit() && src.format.is_lowbit_aligned() &&
(src.dtype.category() == DTypeCategory::QUANTIZED ||
src.dtype.category() == DTypeCategory::LOWBIT);
MGB_MARK_USED_VAR(cond_normal);
MGB_MARK_USED_VAR(cond_low_bit);
mgb_assert(
cond_normal || cond_low_bit, "unsupported layout in profiling: %s",
src.to_string().c_str());
param.dtypes[i] = src.dtype.enumv();
}
param.comp_node_physical = m_cn.locator();
param.comp_node_logical = m_cn.locator_logical();
mgb_assert(param.shapes.size() == m_fastrun_layouts.size());
for (size_t i = 0; i < param.shapes.size(); ++i)
param.shapes[i] = m_fastrun_layouts[i];
param.opr_param = m_dnn_opr->param();
param.allow_weight_preprocess = m_allow_weight_preprocess;
Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(palgo, "can not find algo when profile single algo");
auto rst = TimedProfiler<Opr>::profile(param, timeout);
// MIOpen conv profiles all available algos when a specific shape is
// provided for the first time, which probably adds to the result time.
// Therefore, a second profile execution is needed.
if (strncmp(palgo->name(), "MIOpen", 6) == 0) {
rst = TimedProfiler<Opr>::profile(param, timeout);
}
if (!rst.valid())
return None;
std::string algo_desc;
serialize_write_pod(policy.algo, algo_desc);
return AlgoChooserProfileCache::ResultEntry{
algo_desc, static_cast<uint32_t>(palgo->attribute()), rst.val().time,
param.workspace};
MIDOUT_E
}
template <typename Opr>
void AlgoChooser<Opr>::AlgoChooserHelper::profile(
const ExecutionStrategy& selected_strategy) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("profile")))
auto&& rst = get_profile_result_from_cache(selected_strategy);
if (rst.first.valid())
return;
AlgoChooserProfileCache::Result prof_rst;
auto target_attr = extract_algo_attribute(selected_strategy);
std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
double cur_timeout = 0;
auto workspace_limit =
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
RealTimer timer;
std::unordered_set<std::string> rst_algos;
if (rst.second.valid()) {
std::transform(
rst.second.val().begin(), rst.second.val().end(),
std::inserter(rst_algos, rst_algos.end()),
[](const AlgoChooserProfileCache::ResultEntry& result) {
return result.algo;
});
}
for (auto algo : get_all_candidates()) {
std::string desc;
serialize_write_pod(algo.desc, desc);
if (rst_algos.find(desc) != rst_algos.end()) {
continue;
}
Maybe<AlgoChooserProfileCache::ResultEntry> cur_rst;
ImplExecutionPolicy policy;
policy.algo = algo.desc;
//! check negative attributes: skip algos that contain any negative attribute
auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
if (palgo->contain_attribute_any(target_attr.second)) {
mgb_log_debug(
"skip algo %s, which matches the profile strategy required "
"'not contain attribute(%s).'",
algo.desc.name.c_str(),
Algorithm::attribute_str(target_attr.second).c_str());
continue;
}
//! check workspace limit
construct_execution_policy(selected_strategy, policy);
mgb_assert(
policy.algo.valid(),
"construct execution policy must success when profiling");
if (get_workspace_size_bytes(policy) > workspace_limit) {
continue;
}
std::string msg = ssprintf(
"profiling %s algorithm %s %s", ::MegDNNOpr2Typename<Opr>::name,
algo.desc.name.c_str(), layouts_str.c_str());
timer.reset();
MGB_TRY { cur_rst = profile_single_algo(policy, cur_timeout); }
MGB_CATCH(std::exception & exc, {
mgb_log_warn("caught exception during %s: %s", msg.c_str(), exc.what());
continue;
})
MGB_CATCH(..., {
mgb_log_warn("caught exception during %s", msg.c_str());
continue;
})
if (!cur_rst.valid()) {
mgb_log_warn(
"timeout when %s; timeout setting: %.3fsec", msg.c_str(),
cur_timeout);
continue;
}
if (!cur_timeout) {
cur_timeout = timer.get_secs() + TIMEOUT_TOLERANCE;
} else {
cur_timeout = std::min(cur_timeout, timer.get_secs() + TIMEOUT_TOLERANCE);
}
auto&& rst = cur_rst.val();
mgb_log_debug(
"%s: workspace: %zu; time: %.3gsec", msg.c_str(), rst.workspace,
rst.time);
prof_rst.push_back(rst);
}
std::string msg = ssprintf(
"no usable %s algorithm %s without attribute(%s) or could not meet "
"workspace limite requirement(%zu)",
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit);
mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
if (rst.second.valid())
prof_rst.insert(
prof_rst.end(), rst.second.val().begin(), rst.second.val().end());
FixedTensorLayouts incache_layouts = m_incache_layouts;
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{
incache_layouts.data(), incache_layouts.size(), &origin_param,
sizeof(origin_param)};
AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
cache.put(cache_key, prof_rst);
MIDOUT_E
}
template <typename Opr>
Maybe<PreprocessFilter<Opr>> AlgoChooser<Opr>::AlgoChooserHelper::
construct_fake_preprocess_filter(const FixedTensorLayouts& layouts) const {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("construct_fake_preprocess_filter")))
Maybe<PreprocessFilter<Opr>> result = None;
const FixedTensorLayouts* layouts_ptr = &m_fastrun_layouts;
if (layouts.at(0).ndim) {
layouts_ptr = &layouts;
}
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (!m_allow_weight_preprocess)
return;
auto opr = _(m_dnn_opr);
auto layouts =
APPLY(opr->deduce_preprocessed_filter_layout(args...), *layouts_ptr);
//! no preprocessed filter layouts means weight preprocessing is not needed
if (layouts.empty()) {
return;
}
//! all layouts being empty also means weight preprocessing is not needed
bool layout_valid = false;
for (auto&& layout : layouts) {
if (!layout.is_empty()) {
layout_valid = true;
}
}
if (!layout_valid) {
return;
}
result = PreprocessFilter<Opr>{};
auto& res = result.val();
res.algorithm_id = nullptr;
res.tensors.resize(layouts.size());
for (size_t i = 0; i < layouts.size(); i++) {
res.tensors[i] = megdnn::TensorND(nullptr, layouts[i]);
}
});
return result;
MIDOUT_E
}
template <typename Opr>
std::pair<AlgoAttribute, AlgoAttribute> AlgoChooser<Opr>::AlgoChooserHelper::
extract_algo_attribute(const ExecutionStrategy& strategy) const {
std::pair<AlgoAttribute, AlgoAttribute> ret =
std::make_pair(AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT);
//! from strategy
if (strategy & ExecutionStrategy::REPRODUCIBLE) {
ret.first |= AlgoAttribute::REPRODUCIBLE;
}
if (strategy & ExecutionStrategy::OPTMIZED) {
ret.second |= AlgoAttribute::NAIVE;
}
//! from graph option
// FIXME: extract the USABLE_DEPEND_ON_SHAPE attribute for
// no_profiling_on_shape_change once it is usable
if (m_desc.shared_batch_size) {
ret.second |= AlgoAttribute::USABLE_DEPEND_ON_SHAPE;
}
if (m_desc.binary_equal_between_batch) {
ret.first |= AlgoAttribute::REPRODUCIBLE;
ret.second |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
}
return ret;
}
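// Example (derived from the code above): with strategy
// REPRODUCIBLE | OPTIMIZED and default graph options, this returns
//     {REPRODUCIBLE /* positive */, NAIVE /* negative */},
// i.e. only reproducible, non-naive algorithms are considered.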
#define INST(Opr) \
template AlgoChooser<megdnn::Opr>::AlgoChooserHelper::AlgoChooserHelper( \
const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \
const std::string& param_str, const CompNode& cn, \
const megdnn::param::ExecutionPolicy& execution_policy, \
bool allow_weight_preprocess, const AlgoChooserDesc& desc); \
template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::choose_by_heuristic( \
const ExecutionStrategy& select_strategy) const; \
template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::choose_by_profile( \
const ExecutionStrategy& select_strategy, bool enable_update) const; \
template typename std::pair< \
AlgoChooser<megdnn::Opr>::ImplAlgoDesc, \
Maybe<AlgoChooserProfileCache::Result>> \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_profile_result_from_cache( \
const ExecutionStrategy& select_strategy) const; \
template void \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::construct_execution_policy( \
const ExecutionStrategy& select_strategy, \
typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& policy, \
bool retrive_from_cache, bool allow_log) const; \
template size_t \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_workspace_size_bytes( \
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& policy, \
const FixedTensorLayouts& layouts) const; \
template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo> \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::get_all_candidates() const; \
template Maybe<AlgoChooserProfileCache::ResultEntry> \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::profile_single_algo( \
const typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& policy, \
double& timeout) const; \
template std::pair<AlgoAttribute, AlgoAttribute> \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::extract_algo_attribute( \
const ExecutionStrategy& strategy) const; \
template void AlgoChooser<megdnn::Opr>::AlgoChooserHelper::profile( \
const ExecutionStrategy& selected_strategy) const;
DNN_FOREACH_FASTRUN_OPR(INST)
#undef INST
//////////////////////////////// AlgoChooser /////////////////////////////
template <typename Opr>
typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::get_policy(
const AlgoChooserHelper& helper) {
auto opr_strategy = helper.execution_policy().strategy;
auto strategy2str = [](auto strategy) {
std::string ret;
if (strategy & ExecutionStrategy::HEURISTIC) {
ret += "HEURISTIC ";
}
if (strategy & ExecutionStrategy::PROFILE) {
ret += "PROFILE ";
}
if (strategy & ExecutionStrategy::REPRODUCIBLE) {
ret += "REPRODUCIBLE ";
}
if (strategy & ExecutionStrategy::OPTIMIZED) {
ret += "OPTIMIZED ";
}
return ret;
};
mgb_log_debug("Use Stragegy :%s", strategy2str(opr_strategy).c_str());
if (opr_strategy & ExecutionStrategy::HEURISTIC) {
if (opr_strategy & ExecutionStrategy::PROFILE) {
//! this strategy chooses from the cache first, and falls back to
//! heuristic if that fails.
ImplExecutionPolicy policy = helper.choose_by_profile(opr_strategy, false);
if (!policy.algo.valid()) {
policy = helper.choose_by_heuristic(opr_strategy);
}
return policy;
} else {
return helper.choose_by_heuristic(opr_strategy);
}
}
#if MGB_ENABLE_FASTRUN
else if (opr_strategy & ExecutionStrategy::PROFILE) {
return helper.choose_by_profile(opr_strategy, true);
}
#endif
else {
mgb_throw(InternalError, "bad ExecutionPolicy strategy");
}
}
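// Dispatch summary (restating the branches above):
//     HEURISTIC           -> choose_by_heuristic
//     HEURISTIC | PROFILE -> try choose_by_profile(strategy, false) first,
//                            fall back to choose_by_heuristic
//     PROFILE             -> choose_by_profile(strategy, true), i.e. full
//                            profiling (requires MGB_ENABLE_FASTRUN)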
template <typename Opr>
std::string AlgoChooser<Opr>::format_fixlayouts(const FixedTensorLayouts& layout) {
return ::format_fixlayouts<Opr>(layout, arity_in, arity_out);
}
#define INST(Opr) \
template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \
AlgoChooser<megdnn::Opr>::get_policy(const AlgoChooserHelper& proxy); \
template std::string AlgoChooser<Opr>::format_fixlayouts( \
const FixedTensorLayouts& layout);
DNN_FOREACH_FASTRUN_OPR(INST)
#undef INST
} // namespace rdnn
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
#include "megbrain/rdnn/management.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/tensor.h"
#include "megbrain/utils/metahelper.h"
#include "megdnn/handle.h"
#include "megdnn/oprs.h"
/* ================== global functions ================== */
using namespace mgb;
using namespace mgb::opr;
namespace {
template <class Opr>
class MegDNNGlobalOprContainer final : public UserDataContainer::UserData {
MGB_TYPEINFO_OBJ_DECL;
std::shared_ptr<megdnn::Handle> m_megdnn_handle;
std::unique_ptr<Opr> m_opr;
public:
MegDNNGlobalOprContainer(CompNode cn)
: m_megdnn_handle{intl::get_megdnn_handle_shared(cn)},
m_opr{m_megdnn_handle->create_operator<Opr>()} {
mgb_assert(m_opr->is_thread_safe());
}
Opr* get() const { return m_opr.get(); }
};
template <class Opr>
MGB_TYPEINFO_OBJ_IMPL(MegDNNGlobalOprContainer<Opr>);
} // anonymous namespace
std::shared_ptr<megdnn::Handle> intl::get_megdnn_handle_shared(CompNode comp_node) {
auto& handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node));
return {handle.shared_from_this(), handle.handle()};
}
megdnn::Handle* intl::get_megdnn_handle(CompNode comp_node) {
return MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)).handle();
}
template <typename Opr>
Opr* intl::get_megdnn_global_opr(CompNode comp_node) {
using T = MegDNNGlobalOprContainer<Opr>;
auto maker = [comp_node]() { return std::make_shared<T>(comp_node); };
return CompNodeEnv::from_comp_node(comp_node).get_user_data<T>(maker).get();
}
namespace mgb {
namespace opr {
namespace intl {
#define INST(o) template o* get_megdnn_global_opr<o>(CompNode)
INST(megdnn::AddUpdate);
INST(megdnn::Relayout);
INST(megdnn::Checksum);
#undef INST
} // namespace intl
} // namespace opr
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
#include "megbrain/rdnn/profiler.h"
#include "megbrain/utils/invoke.h"
#include "megdnn/handle.h"
#include "megdnn/oprs/base.h"
#if MGB_ROCM
#include "hcc_detail/hcc_defs_prologue.h"
#include "megcore_rocm.h"
#endif
//! TODO: some megdnn oprs have to be known here because midout.h is produced;
//! fix this if there is a more graceful way.
#include "megdnn/oprs.h"
#include "midout.h"
MIDOUT_DECL(megbrain_opr_profile)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
namespace {
std::string serialize_policy(const megdnn::ExecutionPolicy& policy) {
std::string ret;
//! serialize AlgorithmDesc
megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret);
megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret);
uint32_t param_size = policy.algo.param.size();
uint32_t name_size = policy.algo.name.size();
megdnn::Algorithm::serialize_write_pod<uint32_t>(param_size, ret);
megdnn::Algorithm::serialize_write_pod<uint32_t>(name_size, ret);
ret += policy.algo.param;
ret += policy.algo.name;
//! serialize sub_policy
uint32_t size = policy.sub_policy.size();
megdnn::Algorithm::serialize_write_pod(size, ret);
for (auto&& sub : policy.sub_policy) {
ret += serialize_policy(sub);
}
return ret;
}
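// Wire-format sketch (read off the code above): a serialized ExecutionPolicy is
//     handle_type | algo_type | param_size | name_size | param | name
//     | sub_policy_count | serialized sub-policies ...
// and deserialize_policy below consumes exactly this layout.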
megdnn::ExecutionPolicy deserialize_policy(
const char* buf, uint32_t size, uint32_t& offset) {
megdnn::ExecutionPolicy ret;
#define cb(_val, _type) \
_val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \
offset += sizeof(_val)
cb(ret.algo.handle_type, megdnn::Handle::HandleType);
cb(ret.algo.type, uint32_t);
uint32_t param_size = 0;
uint32_t name_size = 0;
cb(param_size, uint32_t);
cb(name_size, uint32_t);
if (param_size > 0) {
ret.algo.param = std::string(buf + offset, param_size);
offset += param_size;
}
if (name_size > 0) {
ret.algo.name = std::string(buf + offset, name_size);
offset += name_size;
}
uint32_t nr_policy = 0;
cb(nr_policy, uint32_t);
#undef cb
for (uint32_t i = 0; i < nr_policy; i++) {
ret.sub_policy.push_back(deserialize_policy(buf, size, offset));
}
return ret;
}
} // namespace
namespace mgb {
namespace rdnn {
#define APPLY(statement, ...) \
mgb::apply( \
[&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
////////////// TimedProfiler::Param::ExecutionPolicyBlob //////////////////////
template <typename Opr>
typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob TimedProfiler<Opr>::Param::
ExecutionPolicyBlob::serialize(const megdnn::ExecutionPolicy& policy) {
ExecutionPolicyBlob ret;
std::string serialize_bin = serialize_policy(policy);
mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES);
memcpy(ret.data, serialize_bin.data(), serialize_bin.size());
ret.size = serialize_bin.size();
return ret;
}
template <typename Opr>
megdnn::ExecutionPolicy TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize()
const {
uint32_t offset = 0;
auto&& ret = deserialize_policy(data, size, offset);
mgb_assert(offset == size);
return std::move(ret);
}
#define INST(Opr) \
template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob \
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize( \
const megdnn::ExecutionPolicy& policy); \
template megdnn::ExecutionPolicy \
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() const;
DNN_FOREACH_FASTRUN_OPR(INST)
#undef INST
////////////////// TimedProfiler //////////////////////////////
template <typename Opr>
const double TimedProfiler<Opr>::timeout_setting =
TimedProfiler<Opr>::init_timeout_setting();
template <typename Opr>
double TimedProfiler<Opr>::init_timeout_setting() {
#if MGB_ENABLE_FASTRUN
sys::TimedFuncInvoker::ins().register_func(
AlgoChooserFuncId<Opr>::ID, &TimedProfiler<Opr>::prof_impl,
&TimedProfiler<Opr>::prof_init_device);
auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT");
if (to_set)
return std::stod(to_set);
#endif
return 0;
}
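// Usage note: with MGB_ENABLE_FASTRUN, the per-algo profiling timeout can be
// capped from the environment, e.g. (illustrative invocation)
//     MGB_CONV_PROFILING_TIMEOUT=10 ./some_mgb_program
// the default of 0 means no timeout.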
#define APPLY(statement, ...) \
mgb::apply( \
[&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
template <typename Opr>
void TimedProfiler<Opr>::preprocess(
const TensorLayoutArray&, const megdnn::SmallVector<DeviceTensorND>&,
UniqPtrWithCN<Opr>&, megdnn::Workspace&, std::array<TensorLayout, arity>&,
std::array<DeviceTensorND, arity_in>&, PreprocessFilter<Opr>&) {
// Opr is neither ConvBias nor Convolution; this function does nothing.
}
//! convbias
template <>
void TimedProfiler<megdnn::ConvBias>::preprocess(
const TensorLayoutArray& preprocessed_layout,
const SmallVector<DeviceTensorND>& flt_val,
UniqPtrWithCN<megdnn::ConvBias>& megdnn_opr, megdnn::Workspace& mdn_workspace,
std::array<TensorLayout, arity>& layouts,
std::array<DeviceTensorND, arity_in>& inp_val,
PreprocessFilter<megdnn::ConvBias>& prep_flt) {
if (!preprocessed_layout.empty()) {
auto&& pf = prep_flt;
pf.algorithm_id = nullptr;
pf.tensors.resize(flt_val.size());
for (size_t i = 0; i < flt_val.size(); i++) {
pf.tensors[i] = flt_val[i].as_megdnn();
}
APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
std::forward_as_tuple(
layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()),
array_skip<arity_in - 1>(layouts));
}
}
//! convolution
template <>
void TimedProfiler<megdnn::ConvolutionForward>::preprocess(
const TensorLayoutArray& preprocessed_layout,
const megdnn::SmallVector<DeviceTensorND>& flt_val,
UniqPtrWithCN<megdnn::ConvolutionForward>& megdnn_opr,
megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
std::array<DeviceTensorND, arity_in>& inp_val,
PreprocessFilter<megdnn::ConvolutionForward>& prep_flt) {
if (!preprocessed_layout.empty()) {
auto&& pf = prep_flt;
pf.algorithm_id = nullptr;
pf.tensors.resize(flt_val.size());
for (size_t i = 0; i < flt_val.size(); i++) {
pf.tensors[i] = flt_val[i].as_megdnn();
}
APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()),
array_skip<2>(layouts));
}
}
template <typename Opr>
typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
const TParam& raw_param) {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl")))
#if MGB_ROCM
bool miopen_algo_search_enabled;
megcore::getMIOpenAlgoSearchStatus(&miopen_algo_search_enabled);
mgb_assert(miopen_algo_search_enabled, "MIOpen algo search not enabled");
#endif
auto&& param = raw_param.as_single_pod<Param>();
CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical);
auto megdnn_opr = opr::intl::create_megdnn_opr<Opr>(cn);
std::array<TensorLayout, arity> layouts;
auto from_enum = [&](DTypeEnum enumv) -> DType {
switch (enumv) {
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: \
return _dt(1.0f, static_cast<uint8_t>(0))
cb(dtype::Quantized8Asymm);
cb(dtype::Quantized4Asymm);
#undef cb
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: \
return _dt(1.0f)
cb(dtype::QuantizedS8);
cb(dtype::QuantizedS16);
cb(dtype::QuantizedS32);
cb(dtype::QuantizedS4);
default:
return DType::from_enum(enumv);
#undef cb
}
};
for (int i = 0; i < arity; ++i) {
layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])};
}
megdnn_opr->param() = param.opr_param;
megdnn_opr->execution_policy() = param.execution_policy.deserialize();
// Allocate preprocessed weight buffers.
TensorLayoutArray preprocessed_layout;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (param.allow_weight_preprocess) {
preprocessed_layout = APPLY(
_(megdnn_opr)->deduce_preprocessed_filter_layout(args...), layouts);
}
});
{
// first allocate a whole chunk to avoid memory fragmentation (here we
// rely on memory allocator to reuse memory)
auto align = cn.get_mem_addr_alignment();
size_t tot_size = align;
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
DeviceTensorStorage storage{cn};
storage.ensure_size(tot_size);
}
// allocate input and output memory
std::array<DeviceTensorND, arity_in> inp_val;
std::array<DeviceTensorND, arity_out> out_val;
DeviceTensorND workspace;
for (int i = 0; i < arity_in; ++i) {
inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]);
}
for (int i = 0; i < arity_out; ++i) {
out_val[i]
.comp_node(cn)
.dtype(layouts[arity_in + i].dtype)
.resize(layouts[arity_in + i]);
}
megdnn::Workspace mdn_workspace;
// allocate workspace
if (param.workspace) {
workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace});
mdn_workspace.size = param.workspace;
mdn_workspace.raw_ptr = workspace.raw_ptr();
}
// allocate storage for preprocessed filter
SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size());
for (size_t i = 0; i < preprocessed_layout.size(); i++) {
flt_val[i] = {
cn, preprocessed_layout[i], preprocessed_layout[i].dtype,
preprocessed_layout[i].format};
}
for (int i = 0; i < arity_in; ++i) {
fill_zero_dev_tensor(inp_val[i]);
}
PreprocessFilter<Opr> prep_flt;
preprocess(
preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val,
prep_flt);
RealTimer timer;
auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER),
ev_end = cn.create_event(CompNode::Event::NEED_TIMER);
ev_start->record();
if_constexpr<opr_supports_preprocess<Opr>()>(
[&](auto _) {
auto&& opr = _(megdnn_opr);
PreprocessFilter<Opr>* pf =
preprocessed_layout.empty() ? nullptr : &prep_flt;
APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val,
out_val);
},
/* else */
[&](auto _) {
APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val,
out_val);
});
ev_end->record();
megdnn::Algorithm* algo =
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
mgb_assert(algo);
double next_report_time = 0.5;
while (!ev_end->finished()) {
if (timer.get_secs() >= next_report_time) {
#if MGB_ENABLE_GETENV
mgb_log_warn(
"profiling conv algo %s already took %.3f/%.3f secs"
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
algo->name(), timer.get_secs(), param.actual_timeout);
#else
mgb_log_warn(
"profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
timer.get_secs(), param.actual_timeout);
#endif
next_report_time = timer.get_secs() + 1;
}
using namespace std::literals;
#if !__DEPLOY_ON_XP_SP2__
std::this_thread::sleep_for(1000us);
#endif
}
// release all free blocks owned by child process,
// in order to avoid main process running out of memory
cn.try_coalesce_all_free_memory();
mgb_assert(ev_start->finished());
return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)});
MIDOUT_E
};
template <typename Opr>
Maybe<typename TimedProfiler<Opr>::Result> TimedProfiler<Opr>::profile(
const Param& param, double& timeout) {
mgb_assert(timeout >= 0);
if (!timeout) {
timeout = timeout_setting;
} else if (timeout_setting) {
timeout = std::min(timeout, timeout_setting);
}
param.actual_timeout = timeout ? timeout : std::numeric_limits<double>::infinity();
auto res = sys::TimedFuncInvoker::ins().invoke(
AlgoChooserFuncId<Opr>::ID, TParam::from_pod(const_cast<Param&>(param)),
timeout);
if (res.valid())
return res.val().template as_single_pod<Result>();
return None;
}
template <typename Opr>
void TimedProfiler<Opr>::prof_init_device(const TParam& raw_param) {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device")))
#if MGB_ROCM
megcore::enableMIOpenAlgoSearch(true);
#endif
auto&& param = raw_param.as_single_pod<Param>();
CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical);
// wait for cuda init, so its time is not counted toward the timeout
cn.sync();
MIDOUT_E
}
#define INST(Opr) \
template const double TimedProfiler<megdnn::Opr>::timeout_setting; \
template double TimedProfiler<megdnn::Opr>::init_timeout_setting(); \
template typename TimedProfiler<megdnn::Opr>::TResult \
TimedProfiler<megdnn::Opr>::prof_impl(const TParam& raw_param); \
template Maybe<typename TimedProfiler<megdnn::Opr>::Result> \
TimedProfiler<megdnn::Opr>::profile(const Param& param, double& timeout); \
template void TimedProfiler<megdnn::Opr>::prof_init_device(const TParam& raw_param);
DNN_FOREACH_FASTRUN_OPR(INST)
#undef INST
} // namespace rdnn
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
#pragma once
#include <memory>
#include "megbrain/opr/param_defs.h"
#include "megbrain/rdnn/profiler.h"
#include "megbrain/utils/persistent_cache.h"
#include "megdnn/oprs/base.h"
namespace mgb {
namespace rdnn {
//! define logical operations on the megdnn::param::ExecutionPolicy::Strategy
//! and megdnn::detail::AlgoAttribute enums
using ExecutionStrategy = megdnn::param::ExecutionPolicy::Strategy;
using AlgoAttribute = megdnn::AlgoAttribute;
/* =================== AlgoChooser =================== */
/*!
* \brief choose algorithm according to ExecutionPolicy
*
* This class only provides static methods, and the entry point is
 * AlgoChooser::get_policy. When profiling is needed, it would first try to
 * retrieve profiling stats from the cache, and run TimedProfiler when necessary
*
* \tparam Opr megdnn operator impl
*/
struct AlgoChooserDesc {
uint32_t shared_batch_size = 0;
bool binary_equal_between_batch = false;
bool no_profiling_on_shape_change = false;
using WorkspaceLimitGetter = std::function<size_t(CompNode, size_t)>;
WorkspaceLimitGetter get_workspace_limit;
};
template <typename Opr>
class AlgoChooser {
static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
static constexpr int arity_out = OprArityTrait<Opr>::arity_out;
static constexpr int arity = OprArityTrait<Opr>::arity;
using ImplAlgo = typename Opr::AlgorithmInfo;
using ImplAlgoDesc = typename Opr::AlgorithmInfo::Desc;
protected:
using ImplExecutionPolicy = megdnn::ExecutionPolicy;
public:
using FixedTensorLayouts = std::array<TensorLayout, arity>;
class AlgoChooserHelper {
//! fastrun layouts
FixedTensorLayouts m_fastrun_layouts;
//! layouts used when get and set cache item
FixedTensorLayouts m_incache_layouts;
Opr* m_dnn_opr;
std::string m_param;
CompNode m_cn;
megdnn::param::ExecutionPolicy m_execution_policy;
bool m_allow_weight_preprocess;
const AlgoChooserDesc& m_desc;
public:
MGE_WIN_DECLSPEC_FUC AlgoChooserHelper(
const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const std::string& param_str, const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess, const AlgoChooserDesc& desc);
Opr* megdnn_opr() const { return m_dnn_opr; }
const TensorLayout& inp_layout(size_t idx) const {
return m_fastrun_layouts[idx];
}
const megdnn::param::ExecutionPolicy& execution_policy() const {
return m_execution_policy;
}
CompNode comp_node() const { return m_cn; }
const std::string& param() const { return m_param; }
bool allow_weight_preprocess() const { return m_allow_weight_preprocess; }
megdnn::Algorithm* get_algorithm_from_desc(
const megdnn::Algorithm::Info::Desc& desc) const {
return m_dnn_opr->get_algorithm_from_desc(desc);
}
const FixedTensorLayouts& fastrun_layouts() const { return m_fastrun_layouts; }
const FixedTensorLayouts& incache_layouts() const { return m_incache_layouts; }
const AlgoChooserDesc& desc() const { return m_desc; }
//! construct algo chain by heuristic
ImplExecutionPolicy choose_by_heuristic(
const ExecutionStrategy& selected_strategy) const;
//! construct algo chain by profiling
ImplExecutionPolicy choose_by_profile(
const ExecutionStrategy& selected_strategy, bool enable_update) const;
//! get all profiled algorithms from cache; return invalid if none exist
std::pair<ImplAlgoDesc, Maybe<AlgoChooserProfileCache::Result>>
get_profile_result_from_cache(const ExecutionStrategy& selected_strategy) const;
/**
 * \brief construct execution policy from cache or heuristic.
 *
 * \param selected_strategy select algo which matches this strategy
 * \param[in,out] policy execution policy
 * \param retrive_from_cache retrieve the algo from cache if set to true;
 *      get it from heuristic otherwise.
 * \param allow_log print a warning log when construction fails if set to
 *      true; stay silent otherwise.
 */
void construct_execution_policy(
const ExecutionStrategy& selected_strategy, ImplExecutionPolicy& policy,
bool retrive_from_cache = true, bool allow_log = true) const;
//! get workspace size required for specific execution policy
MGE_WIN_DECLSPEC_FUC size_t get_workspace_size_bytes(
const ImplExecutionPolicy& policy,
const FixedTensorLayouts& layouts = {}) const;
//! get all candidate algos; the one chosen by choose_by_heuristic() is
//! put first
std::vector<ImplAlgo> get_all_candidates() const;
/*!
* \brief profile a single algorithm
*
* This is actually a wrapper that constructs param and call
* TimedProfiler<Opr>::profile for the actual profiling
*
* \param[in,out] timeout set the timeout, and return the actual
* timeout used during profiling
*/
Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo(
const ImplExecutionPolicy& policy, double& timeout) const;
//! profile and save to cache
void profile(const ExecutionStrategy& selected_strategy) const;
/**
* \brief extract algo attribute from execution strategy and graph
* option.
*
 * \param strategy select algo which matches this strategy
* \return pair<positive_attr, negative_attr>
*/
std::pair<AlgoAttribute, AlgoAttribute> extract_algo_attribute(
const ExecutionStrategy& strategy) const;
private:
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter(
const FixedTensorLayouts& layouts = {}) const;
};
template <typename U>
friend class AlgoChooser;
//! entry point for getting an algorithm according to execution strategy
MGE_WIN_DECLSPEC_FUC static ImplExecutionPolicy get_policy(
const AlgoChooserHelper& helper);
//! format given layouts to string
static std::string format_fixlayouts(const FixedTensorLayouts& layout);
};
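//! A usage sketch for get_policy(); ConvolutionForward is an illustrative
//! choice, and the layouts, dnn opr, serialized param string and desc are
//! assumed to be prepared by the caller:
//! \code
//!     using Chooser = AlgoChooser<megdnn::ConvolutionForward>;
//!     Chooser::FixedTensorLayouts layouts{src_layout, filter_layout, dst_layout};
//!     megdnn::param::ExecutionPolicy req;  // requested strategy, e.g. PROFILE
//!     Chooser::AlgoChooserHelper helper(
//!             layouts, dnn_opr.get(), param_str, comp_node, req,
//!             false /* allow_weight_preprocess */, desc);
//!     megdnn::ExecutionPolicy policy = Chooser::get_policy(helper);
//! \endcode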
} // namespace rdnn
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
#pragma once
#include "megbrain/comp_node.h"
#include "megdnn/handle.h"
namespace mgb {
namespace opr {
namespace intl {
//! get megdnn handle from comp node
MGE_WIN_DECLSPEC_FUC megdnn::Handle* get_megdnn_handle(CompNode comp_node);
MGE_WIN_DECLSPEC_FUC std::shared_ptr<megdnn::Handle> get_megdnn_handle_shared(
CompNode comp_node);
/*!
 * \brief get global megdnn operator associated with a computing node
* \tparam Opr megdnn operator class, must be one of:
* * AddUpdate
* * Relayout
* * Checksum
*/
template <typename Opr>
MGE_WIN_DECLSPEC_FUC Opr* get_megdnn_global_opr(CompNode comp_node);
template <class Obj>
class UniqPtrWithCN : public std::unique_ptr<Obj> {
CompNode m_cn;
public:
UniqPtrWithCN() = default;
template <class RObj>
UniqPtrWithCN(UniqPtrWithCN<RObj>&& o)
: std::unique_ptr<Obj>(std::move(o)), m_cn(o.comp_node()) {}
UniqPtrWithCN(std::unique_ptr<Obj> ptr, CompNode cn)
: std::unique_ptr<Obj>{std::move(ptr)}, m_cn{cn} {}
CompNode comp_node() const { return m_cn; }
};
//! create megdnn opr from megdnn handle in a CompNode
template <class Opr>
UniqPtrWithCN<Opr> create_megdnn_opr(CompNode comp_node) {
return {get_megdnn_handle(comp_node)->create_operator<Opr>(), comp_node};
}
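//! A minimal sketch of create_megdnn_opr(); the "xpu0" locator is
//! illustrative:
//! \code
//!     CompNode cn = CompNode::load("xpu0");
//!     auto relayout = create_megdnn_opr<megdnn::Relayout>(cn);
//!     mgb_assert(relayout.comp_node() == cn);  // opr and comp node travel together
//! \endcode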
} // namespace intl
} // namespace opr
namespace rdnn {
template <typename Obj>
using UniqPtrWithCN = opr::intl::UniqPtrWithCN<Obj>;
} // namespace rdnn
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
#pragma once
#include "megbrain/comp_node.h"
#include "megbrain/rdnn/management.h"
#include "megbrain/system.h"
#include "megbrain/tensor.h"
#include "megbrain/utils/hash_ct.h"
#include "megbrain/utils/timer.h"
#include "megdnn/basic_types.h"
#include "megdnn/oprs.h"
namespace mgb {
namespace rdnn {
// clang-format off
#define DNN_FOREACH_FASTRUN_OPR(cb) \
cb(ConvolutionForward) \
cb(ConvBiasForward) \
cb(ConvolutionBackwardData) \
cb(ConvolutionBackwardFilter) \
cb(Convolution3DForward) \
cb(Convolution3DBackwardData) \
cb(Convolution3DBackwardFilter) \
cb(LocalShareForward) \
cb(LocalShareBackwardData) \
cb(LocalShareBackwardFilter) \
cb(DeformableConvForward) \
cb(DeformableConvBackwardFilter) \
cb(DeformableConvBackwardData) \
cb(BatchConvBiasForward) \
cb(MatrixMul) \
cb(BatchedMatrixMul) \
cb(PoolingForward) \
cb(PoolingBackward)
// clang-format on
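//! DNN_FOREACH_FASTRUN_OPR is an X-macro: pass it a one-argument macro and
//! it expands once per fastrun-capable opr. A sketch that counts the oprs at
//! compile time (COUNT_ONE is a hypothetical helper):
//! \code
//!     #define COUNT_ONE(_opr) +1
//!     constexpr int NR_FASTRUN_OPR = 0 DNN_FOREACH_FASTRUN_OPR(COUNT_ONE);
//!     #undef COUNT_ONE
//! \endcode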
template <typename Opr>
constexpr bool opr_supports_preprocess() {
return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
std::is_same<Opr, megdnn::ConvBias>::value;
}
template <typename Opr>
constexpr bool opr_contain_bias() {
return std::is_same<Opr, megdnn::ConvBias>::value;
}
//! MatrixMul and BatchedMatrixMul
template <typename Opr>
constexpr bool is_matmul() {
return std::is_same<Opr, megdnn::MatrixMul>::value ||
std::is_same<Opr, megdnn::BatchedMatrixMul>::value;
}
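//! The three traits above are constexpr, so they can drive compile-time
//! branching; a sketch assuming a C++17 toolchain (the function name is
//! hypothetical):
//! \code
//!     template <typename Opr>
//!     void run_with_optional_preprocess(Opr* opr) {
//!         if constexpr (opr_supports_preprocess<Opr>()) {
//!             // only ConvolutionForward / ConvBias take this branch
//!         } else {
//!             // every other fastrun opr skips weight preprocessing
//!         }
//!     }
//! \endcode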
template <typename Opr, bool has_prep>
struct PreprocessFilterImpl {
using T = union {};
};
template <typename Opr>
struct PreprocessFilterImpl<Opr, true> {
using T = typename Opr::PreprocessedFilter;
};
template <typename Opr>
using PreprocessFilter =
typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T;
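//! PreprocessFilter<Opr> resolves to the opr's own PreprocessedFilter when
//! preprocessing is supported and to the empty placeholder union otherwise,
//! which keeps declarations uniform across oprs; a compile-time sketch
//! (needs <type_traits>):
//! \code
//!     static_assert(
//!             std::is_same<
//!                     PreprocessFilter<megdnn::ConvolutionForward>,
//!                     megdnn::ConvolutionForward::PreprocessedFilter>::value,
//!             "supported opr keeps its real filter type");
//!     static_assert(
//!             std::is_union<PreprocessFilter<megdnn::MatrixMul>>::value,
//!             "unsupported opr degrades to the empty union");
//! \endcode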
template <typename Opr>
struct AlgoChooserFuncId {};
#define DEF_FUNC_ID(func) \
template <> \
struct AlgoChooserFuncId<megdnn::func> { \
__attribute__((unused)) static constexpr sys::TimedFuncInvoker::FuncId ID = \
static_cast<sys::TimedFuncInvoker::FuncId>( \
MGB_HASH_STR("megdnn::" #func)); \
};
DNN_FOREACH_FASTRUN_OPR(DEF_FUNC_ID)
#undef DEF_FUNC_ID
/* =================== TimedProfiler =================== */
/*!
 * \brief profile a megdnn opr with given param
 *
 * This class only provides static methods, and the entry point is
 * TimedProfiler::profile; it runs the profiling in a timed environment
 * via sys::TimedFuncInvoker
 *
 * \tparam Opr megdnn opr impl
 */
template <typename Opr>
class TimedProfiler {
static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
static constexpr int arity_out = OprArityTrait<Opr>::arity_out;
static constexpr int arity = OprArityTrait<Opr>::arity;
using TensorShapeArray = std::array<megdnn::TensorShape, arity>;
public:
struct Param {
struct ExecutionPolicyBlob {
//! enlarge the max size if needed
constexpr static size_t MAX_SIZE_IN_BYTES = 10240;
char data[MAX_SIZE_IN_BYTES];
uint32_t size;
static ExecutionPolicyBlob serialize(const megdnn::ExecutionPolicy& policy);
megdnn::ExecutionPolicy deserialize() const;
};
ExecutionPolicyBlob execution_policy;
size_t workspace;
megdnn::DTypeEnum dtypes[arity];
CompNode::Locator comp_node_physical, comp_node_logical;
TensorShapeArray shapes;
typename Opr::Param opr_param;
bool allow_weight_preprocess;
//! filled by profile()
mutable double actual_timeout;
};
struct Result {
double time;
};
static Maybe<Result> profile(const Param& param, double& timeout);
private:
using TParam = sys::TimedFuncInvoker::Param;
using TResult = sys::TimedFuncInvoker::Result;
static const double timeout_setting;
static double init_timeout_setting();
static void preprocess(
const megdnn::TensorLayoutArray& preprocessed_layout,
const SmallVector<DeviceTensorND>& flt_val, UniqPtrWithCN<Opr>& megdnn_opr,
megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
std::array<DeviceTensorND, arity_in>& inp_val,
PreprocessFilter<Opr>& prep_flt);
static TResult prof_impl(const TParam& raw_param);
static void prof_init_device(const TParam& raw_param);
};
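//! A call sketch, assuming `param` was fully populated by the caller
//! (shapes, dtypes, serialized execution policy, comp node locators, ...):
//! \code
//!     using Profiler = TimedProfiler<megdnn::ConvolutionForward>;
//!     double timeout = 1.0;  // illustrative initial timeout; written back by profile()
//!     auto result = Profiler::profile(param, timeout);
//!     if (result.valid()) {
//!         // result.val().time holds the measured runtime of the profiled algo;
//!         // param.actual_timeout records the timeout actually used
//!     }
//! \endcode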
} // namespace rdnn
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}