提交 5bdc430e 编写于 作者: M Megvii Engine Team

fix(mgb/fastrun): fix megbrain fastrun memory overflow bug

GitOrigin-RevId: f56aa5a5059ef6c5ea9d51fe31d89a0d778aa702
上级 d7ddd43f
...@@ -11,7 +11,8 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo( ...@@ -11,7 +11,8 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo(
const typename mgb::rdnn::AlgoChooser<Opr>::FixedTensorLayouts& layouts, const typename mgb::rdnn::AlgoChooser<Opr>::FixedTensorLayouts& layouts,
Opr* megdnn_opr, uint32_t shared_batch_size, bool binary_equal_between_batch, Opr* megdnn_opr, uint32_t shared_batch_size, bool binary_equal_between_batch,
bool no_profiling_on_shape_change, CompNode comp_node, bool no_profiling_on_shape_change, CompNode comp_node,
megdnn::param::ExecutionPolicy execution_policy, bool allow_weight_preprocess) { megdnn::param::ExecutionPolicy execution_policy, bool allow_weight_preprocess,
SmallVector<megdnn::TensorND>* inp_tensornds = nullptr) {
megdnn::AlgorithmCache::Key cache_key( megdnn::AlgorithmCache::Key cache_key(
megdnn_opr->handle(), megdnn_opr->get_opr_type(), layouts.data(), megdnn_opr->handle(), megdnn_opr->get_opr_type(), layouts.data(),
layouts.size(), &megdnn_opr->param(), sizeof(megdnn_opr->param())); layouts.size(), &megdnn_opr->param(), sizeof(megdnn_opr->param()));
...@@ -39,7 +40,7 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo( ...@@ -39,7 +40,7 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo(
using AlgoChooserHelper = typename mgb::rdnn::AlgoChooser<Opr>::AlgoChooserHelper; using AlgoChooserHelper = typename mgb::rdnn::AlgoChooser<Opr>::AlgoChooserHelper;
AlgoChooserHelper helper( AlgoChooserHelper helper(
layouts, megdnn_opr, param_str, comp_node, execution_policy, layouts, megdnn_opr, param_str, comp_node, execution_policy,
allow_weight_preprocess, desc); allow_weight_preprocess, desc, inp_tensornds);
megdnn::ExecutionPolicy policy; megdnn::ExecutionPolicy policy;
policy = mgb::rdnn::AlgoChooser<Opr>::get_policy(helper); policy = mgb::rdnn::AlgoChooser<Opr>::get_policy(helper);
......
...@@ -141,11 +141,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -141,11 +141,8 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
def, inputs[0]->layout().ndim, inputs[0]->layout(), def, inputs[0]->layout().ndim, inputs[0]->layout(),
inputs[1]->layout()); inputs[1]->layout());
DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
using TensorND = megdnn::TensorND; using TensorND = megdnn::TensorND;
SmallVector<TensorND> inp_tensornds(inputs.size()); SmallVector<TensorND> inp_tensornds(inputs.size() + 2);
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size());
for (unsigned i = 0; i < inputs.size(); ++i) { for (unsigned i = 0; i < inputs.size(); ++i) {
inp_tensornds[i] = inputs[i]->dnn_tensor(); inp_tensornds[i] = inputs[i]->dnn_tensor();
...@@ -168,13 +165,20 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -168,13 +165,20 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
TensorLayout empty_shp({0}, inputs[0]->dtype()); TensorLayout empty_shp({0}, inputs[0]->dtype());
empty_shp.ndim = 0; empty_shp.ndim = 0;
DeviceTensorND empty_bias =
BlobManager::inst()->alloc_workspace_with_defrag(cn, empty_shp);
inp_tensornds[2] = empty_bias.as_megdnn();
inp_tensornds[3] = empty_bias.as_megdnn();
size_t sz = setup_algo<megdnn::ConvBiasForward>( size_t sz = setup_algo<megdnn::ConvBiasForward>(
{inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]}, {inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]},
dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false); dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false,
&inp_tensornds);
// alloc memory // alloc memory
DeviceTensorND empty_bias = DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(cn, empty_shp); BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
TensorLayout w_layout({sz}, dtype::Byte()); TensorLayout w_layout({sz}, dtype::Byte());
auto dnn_wk = dnn_opr.create_workspace(w_layout); auto dnn_wk = dnn_opr.create_workspace(w_layout);
...@@ -364,9 +368,6 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -364,9 +368,6 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(), def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(),
cn); cn);
DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
using TensorND = megdnn::TensorND; using TensorND = megdnn::TensorND;
SmallVector<TensorND> inp_tensornds(inputs.size()); SmallVector<TensorND> inp_tensornds(inputs.size());
TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size());
...@@ -380,7 +381,10 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -380,7 +381,10 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
size_t sz = setup_algo<megdnn::ConvolutionBackwardData>( size_t sz = setup_algo<megdnn::ConvolutionBackwardData>(
{inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false,
false, cn, convbwd.policy(), false); false, cn, convbwd.policy(), false, &inp_tensornds);
DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout);
auto wk = Blob::make(cn, sz); auto wk = Blob::make(cn, sz);
auto ptr = wk->storage().get(); auto ptr = wk->storage().get();
...@@ -542,7 +546,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -542,7 +546,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
// shape infer // shape infer
size_t sz = setup_algo<megdnn::Convolution3D>( size_t sz = setup_algo<megdnn::Convolution3D>(
{inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false,
false, cn, conv.policy(), false); false, cn, conv.policy(), false, &inp_tensornds);
// alloc memory // alloc memory
DeviceTensorND out = DeviceTensorND out =
...@@ -598,8 +602,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -598,8 +602,9 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) { SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>(); auto&& op_def = def.cast_final_safe<Convolution3DBackwardData>();
auto cn = inputs[0]->comp_node(); auto cn = inputs[0]->comp_node();
megdnn::TensorND weight = inputs[0]->dnn_tensor();
megdnn::TensorND diff = inputs[1]->dnn_tensor(); auto&& wlayout = inputs[0]->layout();
auto&& dlayout = inputs[1]->layout();
DnnOprCaller<megdnn::Convolution3DBackwardData> caller(cn); DnnOprCaller<megdnn::Convolution3DBackwardData> caller(cn);
auto&& dnn_opr = caller.op; auto&& dnn_opr = caller.op;
...@@ -608,21 +613,24 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -608,21 +613,24 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
TensorLayout& oup_layout = output_descs[0].layout; TensorLayout& oup_layout = output_descs[0].layout;
if (!validated) { if (!validated) {
megdnn::Convolution3DBackwardData::deduce_layout_impl( megdnn::Convolution3DBackwardData::deduce_layout_impl(
weight.layout, diff.layout, op_def.param(), oup_layout); wlayout, dlayout, op_def.param(), oup_layout);
} }
DeviceTensorND oup = DeviceTensorND oup =
BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout); BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout);
SmallVector<megdnn::TensorND> inp_tensornds(inputs.size());
inp_tensornds[0] = inputs[0]->dnn_tensor();
inp_tensornds[1] = inputs[1]->dnn_tensor();
size_t wk_size = setup_algo<megdnn::Convolution3DBackwardData>( size_t wk_size = setup_algo<megdnn::Convolution3DBackwardData>(
{weight.layout, diff.layout, oup_layout}, dnn_opr.get(), 0, false, false, {wlayout, dlayout, oup_layout}, dnn_opr.get(), 0, false, false, cn,
cn, op_def.policy(), false); op_def.policy(), false, &inp_tensornds);
megdnn::Workspace dnn_wk; megdnn::Workspace dnn_wk;
if (wk_size != 0) { if (wk_size != 0) {
TensorLayout w_layout({wk_size}, dtype::Byte()); TensorLayout w_layout({wk_size}, dtype::Byte());
dnn_wk = caller.create_workspace(w_layout); dnn_wk = caller.create_workspace(w_layout);
} }
dnn_opr->exec(weight, diff, oup.as_megdnn(), dnn_wk); dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], oup.as_megdnn(), dnn_wk);
return {Tensor::make(oup)}; return {Tensor::make(oup)};
} }
......
...@@ -229,12 +229,11 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -229,12 +229,11 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
inp_tensornds[0].layout = layout_a; inp_tensornds[0].layout = layout_a;
inp_tensornds[1].layout = layout_b; inp_tensornds[1].layout = layout_b;
} }
DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout);
size_t sz = setup_algo<megdnn::MatrixMul>( size_t sz = setup_algo<megdnn::MatrixMul>(
{layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, {layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
matmul.policy(), false); matmul.policy(), false, &inp_tensornds);
DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout);
TensorLayout w_layout({sz}, dtype::Byte()); TensorLayout w_layout({sz}, dtype::Byte());
auto dnn_wk = dnn_opr.create_workspace(w_layout); auto dnn_wk = dnn_opr.create_workspace(w_layout);
...@@ -470,21 +469,22 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -470,21 +469,22 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
return {Tensor::make(out)}; return {Tensor::make(out)};
} }
using TensorND = megdnn::TensorND; SmallVector<megdnn::TensorND> inp_tensornds(2u);
TensorND inp_nd1 = inp1->dnn_tensor(); inp_tensornds[0] = inp1->dnn_tensor();
inp_nd1.layout = layout1; inp_tensornds[0].layout = layout1;
TensorND inp_nd2 = inp2->dnn_tensor(); inp_tensornds[1] = inp2->dnn_tensor();
inp_nd2.layout = layout2; inp_tensornds[1].layout = layout2;
size_t sz = setup_algo<megdnn::BatchedMatrixMul>(
{layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
matmul.policy(), false, &inp_tensornds);
DeviceTensorND out = DeviceTensorND out =
BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout); BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout);
size_t sz = setup_algo<megdnn::BatchedMatrixMul>(
{layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
matmul.policy(), false);
TensorLayout w_layout({sz}, dtype::Byte()); TensorLayout w_layout({sz}, dtype::Byte());
auto dnn_wk = dnn_opr.create_workspace(w_layout); auto dnn_wk = dnn_opr.create_workspace(w_layout);
dnn_opr.op->exec(inp_nd1, inp_nd2, out.as_megdnn(), dnn_wk); dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out.as_megdnn(), dnn_wk);
shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2]; shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2];
shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1]; shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1];
......
...@@ -49,23 +49,25 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -49,23 +49,25 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
auto&& op_def = def.cast_final_safe<Pooling>(); auto&& op_def = def.cast_final_safe<Pooling>();
auto cn = inputs[0]->comp_node(); auto cn = inputs[0]->comp_node();
megdnn::TensorND inp_tensornd = inputs[0]->dnn_tensor();
DnnOprCaller<megdnn::Pooling> caller(cn); DnnOprCaller<megdnn::Pooling> caller(cn);
auto&& dnn_opr = caller.op; auto&& dnn_opr = caller.op;
dnn_opr->param() = op_def.param(); dnn_opr->param() = op_def.param();
SmallVector<megdnn::TensorND> inp_tensornds(inputs.size());
inp_tensornds[0] = inputs[0]->dnn_tensor();
TensorLayout& oup_layout = output_descs[0].layout; TensorLayout& oup_layout = output_descs[0].layout;
if (!validated) { if (!validated) {
megdnn::Pooling::deduce_layout_impl( megdnn::Pooling::deduce_layout_impl(
inp_tensornd.layout, op_def.param(), oup_layout); inp_tensornds[0].layout, op_def.param(), oup_layout);
} }
DeviceTensorND out_devtensor =
BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout);
size_t wk_size = setup_algo<megdnn::Pooling>( size_t wk_size = setup_algo<megdnn::Pooling>(
{inp_tensornd.layout, oup_layout}, dnn_opr.get(), 0, false, false, cn, {inp_tensornds[0].layout, oup_layout}, dnn_opr.get(), 0, false, false, cn,
op_def.policy(), false); op_def.policy(), false, &inp_tensornds);
DeviceTensorND out_devtensor =
BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout);
megdnn::Workspace dnn_wk; megdnn::Workspace dnn_wk;
if (wk_size) { if (wk_size) {
...@@ -73,7 +75,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor( ...@@ -73,7 +75,7 @@ SmallVector<TensorPtr> apply_on_physical_tensor(
dnn_wk = caller.create_workspace(w_layout); dnn_wk = caller.create_workspace(w_layout);
} }
dnn_opr->exec(inp_tensornd, out_devtensor.as_megdnn(), dnn_wk); dnn_opr->exec(inp_tensornds[0], out_devtensor.as_megdnn(), dnn_wk);
return {Tensor::make(out_devtensor)}; return {Tensor::make(out_devtensor)};
} }
......
...@@ -265,7 +265,8 @@ std::vector<megdnn::Algorithm::SearchItem> flatten_search_space( ...@@ -265,7 +265,8 @@ std::vector<megdnn::Algorithm::SearchItem> flatten_search_space(
typename rdnn::AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( typename rdnn::AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
_item.param, helper.comp_node(), helper.execution_policy(), _item.param, helper.comp_node(), helper.execution_policy(),
helper.allow_weight_preprocess(), helper.desc()); helper.allow_weight_preprocess(), helper.desc(),
helper.get_input());
auto space = flatten_search_space<_Opr>(sub_helper, checker); auto space = flatten_search_space<_Opr>(sub_helper, checker);
ret.insert(ret.end(), space.begin(), space.end()); ret.insert(ret.end(), space.begin(), space.end());
}); });
...@@ -488,7 +489,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper( ...@@ -488,7 +489,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
const FixedTensorLayouts& layouts, Opr* megdnn_opr, const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const std::string& param_str, const CompNode& cn, const std::string& param_str, const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy, const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess, const AlgoChooserDesc& desc) bool allow_weight_preprocess, const AlgoChooserDesc& desc,
SmallVector<megdnn::TensorND>* inputs)
: m_fastrun_layouts{layouts}, : m_fastrun_layouts{layouts},
m_incache_layouts{layouts}, m_incache_layouts{layouts},
m_dnn_opr{megdnn_opr}, m_dnn_opr{megdnn_opr},
...@@ -496,7 +498,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper( ...@@ -496,7 +498,8 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper(
m_cn{cn}, m_cn{cn},
m_execution_policy{execution_policy}, m_execution_policy{execution_policy},
m_allow_weight_preprocess{allow_weight_preprocess}, m_allow_weight_preprocess{allow_weight_preprocess},
m_desc{desc} { m_desc{desc},
m_inputs{inputs} {
auto fastrun_batch_size = desc.shared_batch_size; auto fastrun_batch_size = desc.shared_batch_size;
if (fastrun_batch_size) { if (fastrun_batch_size) {
...@@ -604,7 +607,7 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp ...@@ -604,7 +607,7 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
_item.param, m_cn, m_execution_policy, m_allow_weight_preprocess, _item.param, m_cn, m_execution_policy, m_allow_weight_preprocess,
m_desc); m_desc, m_inputs);
sub_helper.profile(selected_strategy); sub_helper.profile(selected_strategy);
}); });
} }
...@@ -868,6 +871,7 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper: ...@@ -868,6 +871,7 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
param.shapes[i] = m_fastrun_layouts[i]; param.shapes[i] = m_fastrun_layouts[i];
param.opr_param = m_dnn_opr->param(); param.opr_param = m_dnn_opr->param();
param.allow_weight_preprocess = m_allow_weight_preprocess; param.allow_weight_preprocess = m_allow_weight_preprocess;
param.inp_tensornds = m_inputs;
Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo); Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
mgb_assert(palgo, "can not find algo when profile single algo"); mgb_assert(palgo, "can not find algo when profile single algo");
...@@ -964,7 +968,9 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile( ...@@ -964,7 +968,9 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
if (!policy.algo.valid()) if (!policy.algo.valid())
continue; continue;
size_t workspace_needed = get_workspace_size_bytes(policy); size_t workspace_needed = get_workspace_size_bytes(policy);
if (data_size + workspace_needed > if (m_inputs != nullptr)
workspace_needed += data_size;
if (workspace_needed >
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) { m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
continue; continue;
} }
...@@ -1101,7 +1107,8 @@ std::pair<AlgoAttribute, AlgoAttribute> AlgoChooser<Opr>::AlgoChooserHelper:: ...@@ -1101,7 +1107,8 @@ std::pair<AlgoAttribute, AlgoAttribute> AlgoChooser<Opr>::AlgoChooserHelper::
const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \
const std::string& param_str, const CompNode& cn, \ const std::string& param_str, const CompNode& cn, \
const megdnn::param::ExecutionPolicy& execution_policy, \ const megdnn::param::ExecutionPolicy& execution_policy, \
bool allow_weight_preprocess, const AlgoChooserDesc& desc); \ bool allow_weight_preprocess, const AlgoChooserDesc& desc, \
SmallVector<megdnn::TensorND>* inputs); \
template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \ template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy \
AlgoChooser<megdnn::Opr>::AlgoChooserHelper::choose_by_heuristic( \ AlgoChooser<megdnn::Opr>::AlgoChooserHelper::choose_by_heuristic( \
const ExecutionStrategy& select_strategy) const; \ const ExecutionStrategy& select_strategy) const; \
......
...@@ -143,7 +143,7 @@ template <typename Opr> ...@@ -143,7 +143,7 @@ template <typename Opr>
void TimedProfiler<Opr>::preprocess( void TimedProfiler<Opr>::preprocess(
const TensorLayoutArray&, const megdnn::SmallVector<DeviceTensorND>&, const TensorLayoutArray&, const megdnn::SmallVector<DeviceTensorND>&,
UniqPtrWithCN<Opr>&, megdnn::Workspace&, std::array<TensorLayout, arity>&, UniqPtrWithCN<Opr>&, megdnn::Workspace&, std::array<TensorLayout, arity>&,
std::array<DeviceTensorND, arity_in>&, PreprocessFilter<Opr>&) { std::array<megdnn::TensorND, arity_in>&, PreprocessFilter<Opr>&) {
// Opr is neither convbias nor convolution. This function does nothing. // Opr is neither convbias nor convolution. This function does nothing.
} }
...@@ -154,7 +154,7 @@ void TimedProfiler<megdnn::ConvBias>::preprocess( ...@@ -154,7 +154,7 @@ void TimedProfiler<megdnn::ConvBias>::preprocess(
const SmallVector<DeviceTensorND>& flt_val, const SmallVector<DeviceTensorND>& flt_val,
UniqPtrWithCN<megdnn::ConvBias>& megdnn_opr, megdnn::Workspace& mdn_workspace, UniqPtrWithCN<megdnn::ConvBias>& megdnn_opr, megdnn::Workspace& mdn_workspace,
std::array<TensorLayout, arity>& layouts, std::array<TensorLayout, arity>& layouts,
std::array<DeviceTensorND, arity_in>& inp_val, std::array<megdnn::TensorND, arity_in>& inp_val,
PreprocessFilter<megdnn::ConvBias>& prep_flt) { PreprocessFilter<megdnn::ConvBias>& prep_flt) {
if (!preprocessed_layout.empty()) { if (!preprocessed_layout.empty()) {
auto&& pf = prep_flt; auto&& pf = prep_flt;
...@@ -164,8 +164,7 @@ void TimedProfiler<megdnn::ConvBias>::preprocess( ...@@ -164,8 +164,7 @@ void TimedProfiler<megdnn::ConvBias>::preprocess(
pf.tensors[i] = flt_val[i].as_megdnn(); pf.tensors[i] = flt_val[i].as_megdnn();
} }
APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
std::forward_as_tuple( std::forward_as_tuple(layouts[0], inp_val[1], inp_val[2]),
layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()),
array_skip<arity_in - 1>(layouts)); array_skip<arity_in - 1>(layouts));
} }
} }
...@@ -177,7 +176,7 @@ void TimedProfiler<megdnn::ConvolutionForward>::preprocess( ...@@ -177,7 +176,7 @@ void TimedProfiler<megdnn::ConvolutionForward>::preprocess(
const megdnn::SmallVector<DeviceTensorND>& flt_val, const megdnn::SmallVector<DeviceTensorND>& flt_val,
UniqPtrWithCN<megdnn::ConvolutionForward>& megdnn_opr, UniqPtrWithCN<megdnn::ConvolutionForward>& megdnn_opr,
megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts, megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
std::array<DeviceTensorND, arity_in>& inp_val, std::array<megdnn::TensorND, arity_in>& inp_val,
PreprocessFilter<megdnn::ConvolutionForward>& prep_flt) { PreprocessFilter<megdnn::ConvolutionForward>& prep_flt) {
if (!preprocessed_layout.empty()) { if (!preprocessed_layout.empty()) {
auto&& pf = prep_flt; auto&& pf = prep_flt;
...@@ -187,8 +186,7 @@ void TimedProfiler<megdnn::ConvolutionForward>::preprocess( ...@@ -187,8 +186,7 @@ void TimedProfiler<megdnn::ConvolutionForward>::preprocess(
pf.tensors[i] = flt_val[i].as_megdnn(); pf.tensors[i] = flt_val[i].as_megdnn();
} }
APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()), std::forward_as_tuple(layouts[0], inp_val[1]), array_skip<2>(layouts));
array_skip<2>(layouts));
} }
} }
...@@ -259,9 +257,13 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -259,9 +257,13 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
std::max(cn.get_free_mem(), cn.get_max_block_size_available()); std::max(cn.get_free_mem(), cn.get_max_block_size_available());
auto align = cn.get_mem_addr_alignment(); auto align = cn.get_mem_addr_alignment();
size_t tot_size = align; size_t tot_size = align;
for (int i = 0; i < arity; ++i) { for (size_t i = 0; i < arity; ++i) {
// if input tensornds are given, only consider output tensornds
if (param.inp_tensornds != nullptr) {
if (i >= (*param.inp_tensornds).size())
tot_size += layouts[i].span().high_byte + align; tot_size += layouts[i].span().high_byte + align;
} }
}
for (const auto& layout : preprocessed_layout) { for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align; tot_size += layout.span().high_byte + align;
} }
...@@ -275,20 +277,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -275,20 +277,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
#endif #endif
// allocate input and output memory // allocate input and output memory
std::array<DeviceTensorND, arity_in> inp_val; std::array<DeviceTensorND, arity_in> inp_dev;
std::array<DeviceTensorND, arity_out> out_val; std::array<DeviceTensorND, arity_out> out_dev;
std::array<megdnn::TensorND, arity_in> inp_val;
std::array<megdnn::TensorND, arity_out> out_val;
DeviceTensorND workspace; DeviceTensorND workspace;
if (param.inp_tensornds != nullptr) {
// if inp_tensornds exists, reuse it
for (int i = 0; i < arity_in; ++i) {
inp_val[i] = (*param.inp_tensornds)[i];
}
} else {
// inp_tensornds does not exist, create zero tensor with the same layout
for (int i = 0; i < arity_in; ++i) { for (int i = 0; i < arity_in; ++i) {
inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]); inp_dev[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]);
fill_zero_dev_tensor(inp_dev[i]);
inp_val[i] = inp_dev[i].as_megdnn();
}
} }
for (int i = 0; i < arity_out; ++i) { for (int i = 0; i < arity_out; ++i) {
out_val[i] out_dev[i]
.comp_node(cn) .comp_node(cn)
.dtype(layouts[arity_in + i].dtype) .dtype(layouts[arity_in + i].dtype)
.resize(layouts[arity_in + i]); .resize(layouts[arity_in + i]);
out_val[i] = out_dev[i].as_megdnn();
} }
megdnn::Workspace mdn_workspace;
megdnn::Workspace mdn_workspace;
// allocate workspace // allocate workspace
if (param.workspace) { if (param.workspace) {
workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace}); workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace});
...@@ -304,10 +320,6 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -304,10 +320,6 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
preprocessed_layout[i].format}; preprocessed_layout[i].format};
} }
for (int i = 0; i < arity_in; ++i) {
fill_zero_dev_tensor(inp_val[i]);
}
PreprocessFilter<Opr> prep_flt; PreprocessFilter<Opr> prep_flt;
preprocess( preprocess(
preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val, preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val,
...@@ -322,13 +334,12 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -322,13 +334,12 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
auto&& opr = _(megdnn_opr); auto&& opr = _(megdnn_opr);
PreprocessFilter<Opr>* pf = PreprocessFilter<Opr>* pf =
preprocessed_layout.empty() ? nullptr : &prep_flt; preprocessed_layout.empty() ? nullptr : &prep_flt;
APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val, APPLY(opr->exec(args..., pf, mdn_workspace), inp_val, out_val);
out_val);
}, },
/* else */ /* else */
[&](auto _) { [&](auto _) {
APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), APPLY(_(megdnn_opr)->exec(args..., mdn_workspace), inp_val,
inp_val, out_val); out_val);
}); });
} }
ev_start->record(); ev_start->record();
...@@ -337,13 +348,11 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -337,13 +348,11 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
auto&& opr = _(megdnn_opr); auto&& opr = _(megdnn_opr);
PreprocessFilter<Opr>* pf = PreprocessFilter<Opr>* pf =
preprocessed_layout.empty() ? nullptr : &prep_flt; preprocessed_layout.empty() ? nullptr : &prep_flt;
APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val, APPLY(opr->exec(args..., pf, mdn_workspace), inp_val, out_val);
out_val);
}, },
/* else */ /* else */
[&](auto _) { [&](auto _) {
APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val, APPLY(_(megdnn_opr)->exec(args..., mdn_workspace), inp_val, out_val);
out_val);
}); });
ev_end->record(); ev_end->record();
...@@ -370,10 +379,10 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl( ...@@ -370,10 +379,10 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
DeviceTensorStorage storage; DeviceTensorStorage storage;
for (int i = 0; i < arity_in; ++i) { for (int i = 0; i < arity_in; ++i) {
inp_val[i].reset(storage, TensorLayout{}); inp_dev[i].reset(storage, TensorLayout{});
} }
for (int i = 0; i < arity_out; ++i) { for (int i = 0; i < arity_out; ++i) {
out_val[i].reset(storage, TensorLayout{}); out_dev[i].reset(storage, TensorLayout{});
} }
for (size_t i = 0; i < preprocessed_layout.size(); i++) { for (size_t i = 0; i < preprocessed_layout.size(); i++) {
flt_val[i].reset(storage, TensorLayout{}); flt_val[i].reset(storage, TensorLayout{});
......
...@@ -60,13 +60,15 @@ public: ...@@ -60,13 +60,15 @@ public:
megdnn::param::ExecutionPolicy m_execution_policy; megdnn::param::ExecutionPolicy m_execution_policy;
bool m_allow_weight_preprocess; bool m_allow_weight_preprocess;
const AlgoChooserDesc& m_desc; const AlgoChooserDesc& m_desc;
SmallVector<megdnn::TensorND>* m_inputs;
public: public:
MGE_WIN_DECLSPEC_FUC AlgoChooserHelper( MGE_WIN_DECLSPEC_FUC AlgoChooserHelper(
const FixedTensorLayouts& layouts, Opr* megdnn_opr, const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const std::string& param_str, const CompNode& cn, const std::string& param_str, const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy, const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess, const AlgoChooserDesc& desc); bool allow_weight_preprocess, const AlgoChooserDesc& desc,
SmallVector<megdnn::TensorND>* inputs = nullptr);
Opr* megdnn_opr() const { return m_dnn_opr; } Opr* megdnn_opr() const { return m_dnn_opr; }
...@@ -93,6 +95,8 @@ public: ...@@ -93,6 +95,8 @@ public:
const AlgoChooserDesc& desc() const { return m_desc; } const AlgoChooserDesc& desc() const { return m_desc; }
SmallVector<megdnn::TensorND>* get_input() const { return m_inputs; }
//! construct algo chain by heuristic //! construct algo chain by heuristic
ImplExecutionPolicy choose_by_heuristic( ImplExecutionPolicy choose_by_heuristic(
const ExecutionStrategy& selected_strategy) const; const ExecutionStrategy& selected_strategy) const;
......
...@@ -122,6 +122,8 @@ public: ...@@ -122,6 +122,8 @@ public:
//! filled by profile() //! filled by profile()
mutable double actual_timeout; mutable double actual_timeout;
// input
SmallVector<megdnn::TensorND>* inp_tensornds;
}; };
struct Result { struct Result {
...@@ -141,7 +143,7 @@ private: ...@@ -141,7 +143,7 @@ private:
const megdnn::TensorLayoutArray& preprocessed_layout, const megdnn::TensorLayoutArray& preprocessed_layout,
const SmallVector<DeviceTensorND>& flt_val, UniqPtrWithCN<Opr>& megdnn_opr, const SmallVector<DeviceTensorND>& flt_val, UniqPtrWithCN<Opr>& megdnn_opr,
megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts, megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
std::array<DeviceTensorND, arity_in>& inp_val, std::array<megdnn::TensorND, arity_in>& inp_val,
PreprocessFilter<Opr>& prep_flt); PreprocessFilter<Opr>& prep_flt);
static TResult prof_impl(const TParam& raw_param); static TResult prof_impl(const TParam& raw_param);
static void prof_init_device(const TParam& raw_param); static void prof_init_device(const TParam& raw_param);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册