#include "layout_trans_options.h"
#include <gflags/gflags.h>
#include "megbrain/serialization/serializer.h"
#include "misc.h"
#include "models/model_lite.h"
#include "models/model_mdl.h"

namespace lar {

template <>
void GoptLayoutOption::config_model_internel<ModelLite>(
        RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
    if (runtime_param.stage == RunStage::AFTER_NETWORK_CREATED) {
        if (m_layout_transform) {
            LITE_LOG("using global layout transform optimization\n");
            if (m_layout_transform_target ==
                mgb::gopt::GraphTuningOptions::Target::CPU) {
                model->get_config().device_type = LiteDeviceType::LITE_CPU;
            }
#if LITE_WITH_CUDA
            else if (
                    m_layout_transform_target ==
                    mgb::gopt::GraphTuningOptions::Target::CUDA) {
                model->get_config().device_type = LiteDeviceType::LITE_CUDA;
            }
#endif
            LITE_LOG("enable global layout transform while loading model for lite");
            auto&& lite_network = model->get_lite_network();
            lite::Runtime::enable_global_layout_transform(lite_network);
        }
    } else if (runtime_param.stage == RunStage::GLOBAL_OPTIMIZATION) {
        if (m_layout_transform) {
            auto&& network = model->get_lite_network();
            if (!m_layout_transform_dump_file.empty()) {
                lite::Runtime::dump_layout_transform_model(
                        network, m_layout_transform_dump_file);
            }
        }
    }
}

template <>
void GoptLayoutOption::config_model_internel<ModelMdl>(
        RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
    if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
        if (m_layout_transform) {
            mgb_log_debug("update input shape for global layout transform\n");
            auto&& load_result = model->get_mdl_load_result();
            if (m_force_batch_size > 0) {
                for (auto&& i : load_result.tensor_map) {
                    auto& in = i.second;
                    mgb::TensorShape new_shape = in->shape();
                    new_shape[0] = m_force_batch_size;
                    mgb::HostTensorND new_tensor;
                    new_tensor.comp_node(mgb::CompNode::default_cpu(), true)
                            .dtype(in->dtype())
                            .resize(new_shape);
                    //! fill the resized input with dummy bytes for profiling
                    mgb::dt_byte* raw_ptr = new_tensor.raw_ptr();
                    memset((char*)raw_ptr, 1,
                           new_tensor.layout().total_nr_elems() *
                                   new_tensor.dtype().size());
                    in->copy_from(new_tensor);
                }
            }
            for (auto&& item : load_result.output_var_list) {
                if (item.shape()[0] > 1) {
                    mgb_log_warn(
                            "model may be dumped with multiple batches and will take "
                            "a long time to profile during global layout "
                            "transform!\n");
                }
            }
            //! update the output var list when the input shape may change (some pass
            //! execution time depends on the shape of the initial input)
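            //! note (descriptive, not from the original comment): each
            //! Host2DeviceCopy operator found below is rebuilt around a fresh
            //! HostTensorND that shares the original raw storage, so shape-dependent
            //! graph passes observe the (possibly re-batched) input shapes set above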
            mgb::thin_hash_table::ThinHashMap<mgb::SymbolVar, mgb::SymbolVar> varmap;
            mgb::cg::DepOprIter dep([&](mgb::cg::OperatorNodeBase* opr) {
                if (auto h2d = opr->try_cast_final<mgb::opr::Host2DeviceCopy>()) {
                    auto param = h2d->param();
                    mgb::TensorShape new_shape = h2d->host_data()->shape();
                    std::shared_ptr<mgb::HostTensorND> new_tensor =
                            std::make_shared<mgb::HostTensorND>(
                                    h2d->host_data()->comp_node(), new_shape,
                                    h2d->host_data()->dtype());
                    new_tensor->only_reset_raw_storage(h2d->host_data()->storage());
                    auto h2d_opr = mgb::opr::Host2DeviceCopy::make(
                            *h2d->owner_graph(), new_tensor, param, h2d->config());
                    varmap[h2d->output(0)] = h2d_opr;
                }
            });
            for (auto&& i : load_result.output_var_list)
                dep.add(i);
            if (!varmap.empty()) {
                auto output_vars =
                        mgb::cg::replace_vars(load_result.output_var_list, varmap);
                for (size_t i = 0; i < load_result.output_var_list.size(); ++i) {
                    output_vars[i].rename(
                            load_result.output_var_list[i].node()->name());
                }
                load_result.output_var_list = output_vars;
            }
        }
    } else if (runtime_param.stage == RunStage::GLOBAL_OPTIMIZATION) {
        if (m_layout_transform) {
            mgb_log("using global layout transform optimization\n");
            auto&& load_result = model->get_mdl_load_result();
            load_result.output_var_list = mgb::gopt::layout_transform(
                    load_result.output_var_list, m_layout_transform_target);

            if (!m_layout_transform_dump_file.empty()) {
                auto out_file = mgb::serialization::OutputFile::make_fs(
                        m_layout_transform_dump_file.c_str(), 'w');
                auto testcase_num = model->get_testcase_num();
                if (testcase_num) {
                    //! 8-byte magic marking a model packed with test cases
                    constexpr char magic[] = "mgbtest0";
                    constexpr size_t len = sizeof(magic) - 1;
                    out_file->write(magic, len);
                    out_file->write(&testcase_num, sizeof(testcase_num));
                }
                using DumpConfig = mgb::serialization::GraphDumper::DumpConfig;
                DumpConfig config{1, false, false};
                auto dumper = model->get_dumper(std::move(out_file));
                dumper->dump(load_result.output_var_list, config);

                if (testcase_num) {
                    //! re-load the packed test cases and append them after the
                    //! optimized graph
                    auto input_file = model->get_loader()->reset_file();
                    auto current_offset = input_file->tell();
                    auto loader = model->reset_loader(std::move(input_file));
                    auto testcase = loader->load(model->get_mdl_config(), false);
                    for (size_t i = 0; i < testcase_num; ++i) {
                        auto casefile = mgb::serialization::OutputFile::make_fs(
                                m_layout_transform_dump_file.c_str(), 'a');
                        auto casedumper = model->get_dumper(std::move(casefile));
                        casedumper->dump(testcase.output_var_list, config);
                        if (i != testcase_num - 1) {
                            loader = model->reset_loader();
                            testcase = loader->load(model->get_mdl_config(), false);
                        }
                    }
                    //! restore the loader to the offset it had before dumping
                    input_file = model->get_loader()->reset_file();
                    input_file->rewind();
                    input_file->skip(current_offset);
                    model->reset_loader(std::move(input_file));
                }
            }
        }
    }
}
}  // namespace lar

using namespace lar;

bool GoptLayoutOption::m_valid;

void GoptLayoutOption::update() {
    m_option_name = "gopt_layout";
    if (FLAGS_layout_transform != "cpu"
#if LITE_WITH_CUDA
        && FLAGS_layout_transform != "cuda"
#endif
    ) {
        m_layout_transform = false;
        m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::UNSPEC;
    } else {
        m_layout_transform = true;
        if (FLAGS_layout_transform == "cpu") {
            m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CPU;
        }
#if LITE_WITH_CUDA
        else if (FLAGS_layout_transform == "cuda") {
            m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CUDA;
        }
#endif
    }
    m_layout_transform_dump_file = FLAGS_layout_transform_dump;
    m_force_batch_size = FLAGS_layout_transform_batch_size;
    m_option = {
            {"layout_transform", lar::String::make("")},
    };
    std::static_pointer_cast<lar::String>(m_option["layout_transform"])
            ->set_value(FLAGS_layout_transform);
}
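// Note (descriptive): is_valid() below gates create_option(); the option object is
// only configured and returned when a layout-transform related flag is given on the
// command line, or m_valid has been set externally (see REGIST_OPTION_VALIDATER at
// the end of this file).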
bool GoptLayoutOption::is_valid() {
    bool ret = false;
    if (!FLAGS_layout_transform.empty()) {
        if (FLAGS_layout_transform != "cpu"
#if LITE_WITH_CUDA
            && FLAGS_layout_transform != "cuda"
#endif
        ) {
            mgb_assert(
                    false,
                    "unsupported target (got: %s) for global layout transform",
                    FLAGS_layout_transform.c_str());
            ret = false;
        } else {
            ret = true;
        }
    }
    ret = ret || !FLAGS_layout_transform_dump.empty();
    if (FLAGS_layout_transform_batch_size > 0) {
        mgb_assert(
                !FLAGS_layout_transform.empty(),
                "\"layout-transform-batch-size\" should be set together with "
                "\"layout-transform\"");
        ret = true;
    }
    return ret || m_valid;
}

std::shared_ptr<OptionBase> GoptLayoutOption::create_option() {
    static std::shared_ptr<GoptLayoutOption> option(new GoptLayoutOption);
    if (GoptLayoutOption::is_valid()) {
        option->update();
        return std::static_pointer_cast<OptionBase>(option);
    } else {
        return nullptr;
    }
}

void GoptLayoutOption::config_model(
        RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
    auto value = std::static_pointer_cast<lar::String>(m_option["layout_transform"])
                         ->get_value();
    if (value.empty()) {
        return;
    }
    if (value == "cpu") {
        m_layout_transform = true;
        m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CPU;
    }
#if LITE_WITH_CUDA
    else if (value == "cuda") {
        m_layout_transform = true;
        m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CUDA;
    }
#endif
    else {
        mgb_throw(
                mgb::AssertionError,
                "invalid option for global layout transform: %s", value.c_str());
    }
    CONFIG_MODEL_FUN;
}

DEFINE_string(
        layout_transform, "",
        "Enable global layout transform optimization for the computing graph. Users "
        "should specify the target device for the optimization; a series of passes "
        "is then applied to the computing graph. The passes benchmark the elapsed "
        "time of operators on different tensor layouts and select the fastest "
        "implementation for each operator. The optimization process takes some time. "
        "The default target is unspec, in which case all available implementations "
        "of the operators are profiled, so the optimization takes longer.");

DEFINE_string(
        layout_transform_dump, "",
        "The computing graph after global layout transform will be dumped to the "
        "given file path.");

DEFINE_int32(
        layout_transform_batch_size, -1,
        "The input batch size that the global layout transform optimization works "
        "on.");

REGIST_OPTION_CREATOR(gopt_layout, lar::GoptLayoutOption::create_option);
REGIST_OPTION_VALIDATER(gopt_layout, lar::GoptLayoutOption::set_valid);
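// Example invocation (a sketch: the "load_and_run" binary name and the model/dump
// file paths are assumptions; only the flags are defined above):
//   load_and_run model.mge --layout-transform cpu \
//       --layout-transform-dump optimized.mge --layout-transform-batch-size 1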