diff --git a/imperative/src/impl/algo_chooser.h b/imperative/src/impl/algo_chooser.h index 0a6eb582d87387e7ee7f5cadfcb9adfca59c9f44..078a691b649247c1aeb04440590c643ab0d2505a 100644 --- a/imperative/src/impl/algo_chooser.h +++ b/imperative/src/impl/algo_chooser.h @@ -11,7 +11,8 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo( const typename mgb::rdnn::AlgoChooser::FixedTensorLayouts& layouts, Opr* megdnn_opr, uint32_t shared_batch_size, bool binary_equal_between_batch, bool no_profiling_on_shape_change, CompNode comp_node, - megdnn::param::ExecutionPolicy execution_policy, bool allow_weight_preprocess) { + megdnn::param::ExecutionPolicy execution_policy, bool allow_weight_preprocess, + SmallVector* inp_tensornds = nullptr) { megdnn::AlgorithmCache::Key cache_key( megdnn_opr->handle(), megdnn_opr->get_opr_type(), layouts.data(), layouts.size(), &megdnn_opr->param(), sizeof(megdnn_opr->param())); @@ -39,7 +40,7 @@ MGE_WIN_DECLSPEC_FUC size_t setup_algo( using AlgoChooserHelper = typename mgb::rdnn::AlgoChooser::AlgoChooserHelper; AlgoChooserHelper helper( layouts, megdnn_opr, param_str, comp_node, execution_policy, - allow_weight_preprocess, desc); + allow_weight_preprocess, desc, inp_tensornds); megdnn::ExecutionPolicy policy; policy = mgb::rdnn::AlgoChooser::get_policy(helper); diff --git a/imperative/src/impl/ops/convolution.cpp b/imperative/src/impl/ops/convolution.cpp index 31b991d786392902fa5a71b5552fa801ebe231ae..b079e182cb288edd0a4f7ce996b20304b2b3dc77 100644 --- a/imperative/src/impl/ops/convolution.cpp +++ b/imperative/src/impl/ops/convolution.cpp @@ -141,11 +141,8 @@ SmallVector apply_on_physical_tensor( def, inputs[0]->layout().ndim, inputs[0]->layout(), inputs[1]->layout()); - DeviceTensorND out = - BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout); - using TensorND = megdnn::TensorND; - SmallVector inp_tensornds(inputs.size()); + SmallVector inp_tensornds(inputs.size() + 2); TensorLayoutArray inp_shapes(inputs.size()), 
oup_shapes(output_descs.size()); for (unsigned i = 0; i < inputs.size(); ++i) { inp_tensornds[i] = inputs[i]->dnn_tensor(); @@ -168,13 +165,20 @@ SmallVector apply_on_physical_tensor( TensorLayout empty_shp({0}, inputs[0]->dtype()); empty_shp.ndim = 0; + DeviceTensorND empty_bias = + BlobManager::inst()->alloc_workspace_with_defrag(cn, empty_shp); + + inp_tensornds[2] = empty_bias.as_megdnn(); + inp_tensornds[3] = empty_bias.as_megdnn(); + size_t sz = setup_algo( {inp_shapes[0], inp_shapes[1], empty_shp, empty_shp, oup_shapes[0]}, - dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false); + dnn_opr.op.get(), 0, false, false, cn, conv.policy(), false, + &inp_tensornds); // alloc memory - DeviceTensorND empty_bias = - BlobManager::inst()->alloc_workspace_with_defrag(cn, empty_shp); + DeviceTensorND out = + BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout); TensorLayout w_layout({sz}, dtype::Byte()); auto dnn_wk = dnn_opr.create_workspace(w_layout); @@ -364,9 +368,6 @@ SmallVector apply_on_physical_tensor( def, inputs[1]->layout().ndim, inputs[0]->layout(), inputs[1]->layout(), cn); - DeviceTensorND out = - BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout); - using TensorND = megdnn::TensorND; SmallVector inp_tensornds(inputs.size()); TensorLayoutArray inp_shapes(inputs.size()), oup_shapes(output_descs.size()); @@ -380,7 +381,10 @@ SmallVector apply_on_physical_tensor( size_t sz = setup_algo( {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, - false, cn, convbwd.policy(), false); + false, cn, convbwd.policy(), false, &inp_tensornds); + + DeviceTensorND out = + BlobManager::inst()->alloc_workspace_with_defrag(cn, out_layout); auto wk = Blob::make(cn, sz); auto ptr = wk->storage().get(); @@ -542,7 +546,7 @@ SmallVector apply_on_physical_tensor( // shape infer size_t sz = setup_algo( {inp_shapes[0], inp_shapes[1], oup_shapes[0]}, dnn_opr.op.get(), 0, false, - false, cn, conv.policy(), false); + false, cn, 
conv.policy(), false, &inp_tensornds); // alloc memory DeviceTensorND out = @@ -598,8 +602,9 @@ SmallVector apply_on_physical_tensor( SmallVector& output_descs, const bool& validated) { auto&& op_def = def.cast_final_safe(); auto cn = inputs[0]->comp_node(); - megdnn::TensorND weight = inputs[0]->dnn_tensor(); - megdnn::TensorND diff = inputs[1]->dnn_tensor(); + + auto&& wlayout = inputs[0]->layout(); + auto&& dlayout = inputs[1]->layout(); DnnOprCaller caller(cn); auto&& dnn_opr = caller.op; @@ -608,21 +613,24 @@ SmallVector apply_on_physical_tensor( TensorLayout& oup_layout = output_descs[0].layout; if (!validated) { megdnn::Convolution3DBackwardData::deduce_layout_impl( - weight.layout, diff.layout, op_def.param(), oup_layout); + wlayout, dlayout, op_def.param(), oup_layout); } DeviceTensorND oup = BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout); + SmallVector inp_tensornds(inputs.size()); + inp_tensornds[0] = inputs[0]->dnn_tensor(); + inp_tensornds[1] = inputs[1]->dnn_tensor(); size_t wk_size = setup_algo( - {weight.layout, diff.layout, oup_layout}, dnn_opr.get(), 0, false, false, - cn, op_def.policy(), false); + {wlayout, dlayout, oup_layout}, dnn_opr.get(), 0, false, false, cn, + op_def.policy(), false, &inp_tensornds); megdnn::Workspace dnn_wk; if (wk_size != 0) { TensorLayout w_layout({wk_size}, dtype::Byte()); dnn_wk = caller.create_workspace(w_layout); } - dnn_opr->exec(weight, diff, oup.as_megdnn(), dnn_wk); + dnn_opr->exec(inp_tensornds[0], inp_tensornds[1], oup.as_megdnn(), dnn_wk); return {Tensor::make(oup)}; } diff --git a/imperative/src/impl/ops/matmul.cpp b/imperative/src/impl/ops/matmul.cpp index d30d4786e10b9f7cca8266aa7fa78bd569135297..1173ac16f03e6644e009549e0a7ed4302e8c41e6 100644 --- a/imperative/src/impl/ops/matmul.cpp +++ b/imperative/src/impl/ops/matmul.cpp @@ -229,12 +229,11 @@ SmallVector apply_on_physical_tensor( inp_tensornds[0].layout = layout_a; inp_tensornds[1].layout = layout_b; } - - DeviceTensorND out = - 
BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout); size_t sz = setup_algo( {layout_a, layout_b, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, - matmul.policy(), false); + matmul.policy(), false, &inp_tensornds); + DeviceTensorND out = + BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout); TensorLayout w_layout({sz}, dtype::Byte()); auto dnn_wk = dnn_opr.create_workspace(w_layout); @@ -470,21 +469,22 @@ SmallVector apply_on_physical_tensor( return {Tensor::make(out)}; } - using TensorND = megdnn::TensorND; - TensorND inp_nd1 = inp1->dnn_tensor(); - inp_nd1.layout = layout1; - TensorND inp_nd2 = inp2->dnn_tensor(); - inp_nd2.layout = layout2; + SmallVector inp_tensornds(2u); + inp_tensornds[0] = inp1->dnn_tensor(); + inp_tensornds[0].layout = layout1; + inp_tensornds[1] = inp2->dnn_tensor(); + inp_tensornds[1].layout = layout2; + + size_t sz = setup_algo( + {layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, + matmul.policy(), false, &inp_tensornds); DeviceTensorND out = BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout); - size_t sz = setup_algo( - {layout1, layout2, dst_layout}, dnn_opr.op.get(), 0, false, false, cn, - matmul.policy(), false); TensorLayout w_layout({sz}, dtype::Byte()); auto dnn_wk = dnn_opr.create_workspace(w_layout); - dnn_opr.op->exec(inp_nd1, inp_nd2, out.as_megdnn(), dnn_wk); + dnn_opr.op->exec(inp_tensornds[0], inp_tensornds[1], out.as_megdnn(), dnn_wk); shp1[shp1.ndim - 2] = dst_layout[dst_layout.ndim - 2]; shp1[shp1.ndim - 1] = dst_layout[dst_layout.ndim - 1]; diff --git a/imperative/src/impl/ops/pooling.cpp b/imperative/src/impl/ops/pooling.cpp index b71a0d5a2167f303411b0a3bdbd4d7843a2340de..294fccc178e4564053315207dff6fd45d3b099dc 100644 --- a/imperative/src/impl/ops/pooling.cpp +++ b/imperative/src/impl/ops/pooling.cpp @@ -49,23 +49,25 @@ SmallVector apply_on_physical_tensor( auto&& op_def = def.cast_final_safe(); auto cn = inputs[0]->comp_node(); - megdnn::TensorND 
inp_tensornd = inputs[0]->dnn_tensor(); - DnnOprCaller caller(cn); auto&& dnn_opr = caller.op; dnn_opr->param() = op_def.param(); + SmallVector inp_tensornds(inputs.size()); + inp_tensornds[0] = inputs[0]->dnn_tensor(); + TensorLayout& oup_layout = output_descs[0].layout; if (!validated) { megdnn::Pooling::deduce_layout_impl( - inp_tensornd.layout, op_def.param(), oup_layout); + inp_tensornds[0].layout, op_def.param(), oup_layout); } - DeviceTensorND out_devtensor = - BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout); size_t wk_size = setup_algo( - {inp_tensornd.layout, oup_layout}, dnn_opr.get(), 0, false, false, cn, - op_def.policy(), false); + {inp_tensornds[0].layout, oup_layout}, dnn_opr.get(), 0, false, false, cn, + op_def.policy(), false, &inp_tensornds); + + DeviceTensorND out_devtensor = + BlobManager::inst()->alloc_workspace_with_defrag(cn, oup_layout); megdnn::Workspace dnn_wk; if (wk_size) { @@ -73,7 +75,7 @@ SmallVector apply_on_physical_tensor( dnn_wk = caller.create_workspace(w_layout); } - dnn_opr->exec(inp_tensornd, out_devtensor.as_megdnn(), dnn_wk); + dnn_opr->exec(inp_tensornds[0], out_devtensor.as_megdnn(), dnn_wk); return {Tensor::make(out_devtensor)}; } diff --git a/src/rdnn/impl/algo_chooser.cpp b/src/rdnn/impl/algo_chooser.cpp index 7c90f6250f83db24e4af32c261df86af26bb3601..3a496f8ba7c5183eaa216f8a218ab19b494bdd28 100644 --- a/src/rdnn/impl/algo_chooser.cpp +++ b/src/rdnn/impl/algo_chooser.cpp @@ -265,7 +265,8 @@ std::vector flatten_search_space( typename rdnn::AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), _item.param, helper.comp_node(), helper.execution_policy(), - helper.allow_weight_preprocess(), helper.desc()); + helper.allow_weight_preprocess(), helper.desc(), + helper.get_input()); auto space = flatten_search_space<_Opr>(sub_helper, checker); ret.insert(ret.end(), space.begin(), space.end()); }); @@ -488,7 +489,8 @@ 
AlgoChooser::AlgoChooserHelper::AlgoChooserHelper( const FixedTensorLayouts& layouts, Opr* megdnn_opr, const std::string& param_str, const CompNode& cn, const megdnn::param::ExecutionPolicy& execution_policy, - bool allow_weight_preprocess, const AlgoChooserDesc& desc) + bool allow_weight_preprocess, const AlgoChooserDesc& desc, + SmallVector* inputs) : m_fastrun_layouts{layouts}, m_incache_layouts{layouts}, m_dnn_opr{megdnn_opr}, @@ -496,7 +498,8 @@ AlgoChooser::AlgoChooserHelper::AlgoChooserHelper( m_cn{cn}, m_execution_policy{execution_policy}, m_allow_weight_preprocess{allow_weight_preprocess}, - m_desc{desc} { + m_desc{desc}, + m_inputs{inputs} { auto fastrun_batch_size = desc.shared_batch_size; if (fastrun_batch_size) { @@ -604,7 +607,7 @@ typename AlgoChooser::ImplExecutionPolicy AlgoChooser::AlgoChooserHelp typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), _item.param, m_cn, m_execution_policy, m_allow_weight_preprocess, - m_desc); + m_desc, m_inputs); sub_helper.profile(selected_strategy); }); } @@ -868,6 +871,7 @@ Maybe AlgoChooser::AlgoChooserHelper: param.shapes[i] = m_fastrun_layouts[i]; param.opr_param = m_dnn_opr->param(); param.allow_weight_preprocess = m_allow_weight_preprocess; + param.inp_tensornds = m_inputs; Algorithm* palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo); mgb_assert(palgo, "can not find algo when profile single algo"); @@ -964,7 +968,11 @@ void AlgoChooser::AlgoChooserHelper::profile( if (!policy.algo.valid()) continue; size_t workspace_needed = get_workspace_size_bytes(policy); - if (data_size + workspace_needed > + // without caller-provided tensors the profiler allocates the operands + // itself, so their size counts against the workspace limit + if (m_inputs == nullptr) + workspace_needed += data_size; + if (workspace_needed > m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) { continue; } @@ -1101,7 +1109,8 @@ std::pair AlgoChooser::AlgoChooserHelper:: const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ const std::string& param_str, const CompNode& cn, \ const 
megdnn::param::ExecutionPolicy& execution_policy, \ - bool allow_weight_preprocess, const AlgoChooserDesc& desc); \ + bool allow_weight_preprocess, const AlgoChooserDesc& desc, \ + SmallVector* inputs); \ template typename AlgoChooser::ImplExecutionPolicy \ AlgoChooser::AlgoChooserHelper::choose_by_heuristic( \ const ExecutionStrategy& select_strategy) const; \ diff --git a/src/rdnn/impl/profiler.cpp b/src/rdnn/impl/profiler.cpp index accf586efdc2beb5699bccfb317d2789496c4b5f..8fa427d0493509a86e20a1ffb0f5d29b0d185c10 100644 --- a/src/rdnn/impl/profiler.cpp +++ b/src/rdnn/impl/profiler.cpp @@ -143,7 +143,7 @@ template void TimedProfiler::preprocess( const TensorLayoutArray&, const megdnn::SmallVector&, UniqPtrWithCN&, megdnn::Workspace&, std::array&, - std::array&, PreprocessFilter&) { + std::array&, PreprocessFilter&) { // Opr is neither convbias nor convolution.This function do nothing. } @@ -154,7 +154,7 @@ void TimedProfiler::preprocess( const SmallVector& flt_val, UniqPtrWithCN& megdnn_opr, megdnn::Workspace& mdn_workspace, std::array& layouts, - std::array& inp_val, + std::array& inp_val, PreprocessFilter& prep_flt) { if (!preprocessed_layout.empty()) { auto&& pf = prep_flt; @@ -164,8 +164,7 @@ void TimedProfiler::preprocess( pf.tensors[i] = flt_val[i].as_megdnn(); } APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), - std::forward_as_tuple( - layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()), + std::forward_as_tuple(layouts[0], inp_val[1], inp_val[2]), array_skip(layouts)); } } @@ -177,7 +176,7 @@ void TimedProfiler::preprocess( const megdnn::SmallVector& flt_val, UniqPtrWithCN& megdnn_opr, megdnn::Workspace& mdn_workspace, std::array& layouts, - std::array& inp_val, + std::array& inp_val, PreprocessFilter& prep_flt) { if (!preprocessed_layout.empty()) { auto&& pf = prep_flt; @@ -187,8 +186,7 @@ void TimedProfiler::preprocess( pf.tensors[i] = flt_val[i].as_megdnn(); } APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace), - 
std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()), - array_skip<2>(layouts)); + std::forward_as_tuple(layouts[0], inp_val[1]), array_skip<2>(layouts)); } } @@ -259,8 +257,12 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl( std::max(cn.get_free_mem(), cn.get_max_block_size_available()); auto align = cn.get_mem_addr_alignment(); size_t tot_size = align; - for (int i = 0; i < arity; ++i) { - tot_size += layouts[i].span().high_byte + align; + for (size_t i = 0; i < arity; ++i) { + // tensors supplied by the caller are reused and need no fresh memory; + // without them every operand is allocated below and must be counted + if (param.inp_tensornds == nullptr || i >= param.inp_tensornds->size()) { + tot_size += layouts[i].span().high_byte + align; + } } for (const auto& layout : preprocessed_layout) { tot_size += layout.span().high_byte + align; @@ -275,20 +277,34 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl( #endif // allocate input and output memory - std::array inp_val; - std::array out_val; + std::array inp_dev; + std::array out_dev; + std::array inp_val; + std::array out_val; DeviceTensorND workspace; - for (int i = 0; i < arity_in; ++i) { - inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]); + + if (param.inp_tensornds != nullptr) { + // if inp_tensornds exists, then reusing it + for (int i = 0; i < arity_in; ++i) { + inp_val[i] = (*param.inp_tensornds)[i]; + } + } else { + // inp_tensornds does not exist, create zero tensor with the same layout + for (int i = 0; i < arity_in; ++i) { + inp_dev[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]); + fill_zero_dev_tensor(inp_dev[i]); + inp_val[i] = inp_dev[i].as_megdnn(); + } } for (int i = 0; i < arity_out; ++i) { - out_val[i] + out_dev[i] .comp_node(cn) .dtype(layouts[arity_in + i].dtype) .resize(layouts[arity_in + i]); + out_val[i] = out_dev[i].as_megdnn(); } - megdnn::Workspace mdn_workspace; + megdnn::Workspace mdn_workspace; // allocate workspace if (param.workspace) { 
workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace}); @@ -304,10 +320,6 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl( preprocessed_layout[i].format}; } - for (int i = 0; i < arity_in; ++i) { - fill_zero_dev_tensor(inp_val[i]); - } - PreprocessFilter prep_flt; preprocess( preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val, @@ -322,13 +334,12 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl( auto&& opr = _(megdnn_opr); PreprocessFilter* pf = preprocessed_layout.empty() ? nullptr : &prep_flt; - APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val, - out_val); + APPLY(opr->exec(args..., pf, mdn_workspace), inp_val, out_val); }, /* else */ [&](auto _) { - APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), - inp_val, out_val); + APPLY(_(megdnn_opr)->exec(args..., mdn_workspace), inp_val, + out_val); }); } ev_start->record(); @@ -337,13 +348,11 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl( auto&& opr = _(megdnn_opr); PreprocessFilter* pf = preprocessed_layout.empty() ? 
nullptr : &prep_flt; - APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val, - out_val); + APPLY(opr->exec(args..., pf, mdn_workspace), inp_val, out_val); }, /* else */ [&](auto _) { - APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val, - out_val); + APPLY(_(megdnn_opr)->exec(args..., mdn_workspace), inp_val, out_val); }); ev_end->record(); @@ -370,10 +379,10 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl( DeviceTensorStorage storage; for (int i = 0; i < arity_in; ++i) { - inp_val[i].reset(storage, TensorLayout{}); + inp_dev[i].reset(storage, TensorLayout{}); } for (int i = 0; i < arity_out; ++i) { - out_val[i].reset(storage, TensorLayout{}); + out_dev[i].reset(storage, TensorLayout{}); } for (size_t i = 0; i < preprocessed_layout.size(); i++) { flt_val[i].reset(storage, TensorLayout{}); diff --git a/src/rdnn/include/megbrain/rdnn/algo_chooser.h b/src/rdnn/include/megbrain/rdnn/algo_chooser.h index 40ff1ca0c1dcde41c2b51e1634169c9f9cebe5c4..23e4b0696fc182b62d1082160d190c9eb4d2f916 100644 --- a/src/rdnn/include/megbrain/rdnn/algo_chooser.h +++ b/src/rdnn/include/megbrain/rdnn/algo_chooser.h @@ -60,13 +60,15 @@ public: megdnn::param::ExecutionPolicy m_execution_policy; bool m_allow_weight_preprocess; const AlgoChooserDesc& m_desc; + SmallVector* m_inputs; public: MGE_WIN_DECLSPEC_FUC AlgoChooserHelper( const FixedTensorLayouts& layouts, Opr* megdnn_opr, const std::string& param_str, const CompNode& cn, const megdnn::param::ExecutionPolicy& execution_policy, - bool allow_weight_preprocess, const AlgoChooserDesc& desc); + bool allow_weight_preprocess, const AlgoChooserDesc& desc, + SmallVector* inputs = nullptr); Opr* megdnn_opr() const { return m_dnn_opr; } @@ -93,6 +95,8 @@ public: const AlgoChooserDesc& desc() const { return m_desc; } + SmallVector* get_input() const { return m_inputs; } + //! 
construct algo chain by heuristic ImplExecutionPolicy choose_by_heuristic( const ExecutionStrategy& selected_strategy) const; diff --git a/src/rdnn/include/megbrain/rdnn/profiler.h b/src/rdnn/include/megbrain/rdnn/profiler.h index 5537a70fbcd9abad1a4c3150e2a18d2c0b6f9fe3..e0a71b9ecbe2e7c5fb1e5ea8ef2e68359ad49861 100644 --- a/src/rdnn/include/megbrain/rdnn/profiler.h +++ b/src/rdnn/include/megbrain/rdnn/profiler.h @@ -122,6 +122,8 @@ public: //! filled by profile() mutable double actual_timeout; + // input + SmallVector* inp_tensornds; }; struct Result { @@ -141,7 +143,7 @@ private: const megdnn::TensorLayoutArray& preprocessed_layout, const SmallVector& flt_val, UniqPtrWithCN& megdnn_opr, megdnn::Workspace& mdn_workspace, std::array& layouts, - std::array& inp_val, + std::array& inp_val, PreprocessFilter& prep_flt); static TResult prof_impl(const TParam& raw_param); static void prof_init_device(const TParam& raw_param);