#include "layout_trans_options.h" #include #include "megbrain/serialization/serializer.h" #include "misc.h" #include "models/model_lite.h" #include "models/model_mdl.h" namespace lar { template <> void GoptLayoutOption::config_model_internel( RuntimeParam& runtime_param, std::shared_ptr model) { if (runtime_param.stage == RunStage::AFTER_NETWORK_CREATED) { if (m_layout_transform) { LITE_LOG("using global layout transform optimization"); if (m_layout_transform_target == mgb::gopt::GraphTuningOptions::Target::CPU) { model->get_config().device_type = LiteDeviceType::LITE_CPU; } #if LITE_WITH_CUDA else if ( m_layout_transform_target == mgb::gopt::GraphTuningOptions::Target::CUDA) { model->get_config().device_type = LiteDeviceType::LITE_CUDA; } #endif LITE_LOG("enable layout transform while load model for lite"); auto&& lite_network = model->get_lite_network(); lite::Runtime::enable_global_layout_transform(lite_network); } } else if (runtime_param.stage == RunStage::GLOBAL_OPTIMIZATION) { if (m_layout_transform) { auto&& network = model->get_lite_network(); if (!m_layout_transform_dump_file.empty()) { lite::Runtime::dump_layout_transform_model( network, m_layout_transform_dump_file); } } } } template <> void GoptLayoutOption::config_model_internel( RuntimeParam& runtime_param, std::shared_ptr model) { if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) { if (m_layout_transform) { auto&& load_result = model->get_mdl_load_result(); for (auto&& item : load_result.output_var_list) { if (item.shape()[0] > 1) { mgb_log_warn( " model may be dumped with multi batch and will cost lots " "of time to profile during global layout transform!!!"); } } } } else if (runtime_param.stage == RunStage::GLOBAL_OPTIMIZATION) { if (m_layout_transform) { mgb_log("using global layout transform optimization"); auto&& load_result = model->get_mdl_load_result(); auto output_vars = mgb::gopt::layout_transform( load_result.output_var_list, m_layout_transform_target); for (size_t i = 0; i < load_result.output_var_list.size(); ++i) { output_vars[i].rename(load_result.output_var_list[i].node()->name()); } load_result.output_var_list = output_vars; if (!m_layout_transform_dump_file.empty()) { auto out_file = mgb::serialization::OutputFile::make_fs( m_layout_transform_dump_file.c_str(), 'w'); auto testcase_num = model->get_testcase_num(); if (testcase_num) { const char* magic = "mgbtest0"; constexpr size_t len = sizeof(magic); out_file->write(magic, len); out_file->write(&testcase_num, sizeof(testcase_num)); } using DumpConfig = mgb::serialization::GraphDumper::DumpConfig; DumpConfig config{1, false, false}; auto dumper = model->get_dumper(std::move(out_file)); dumper->dump(load_result.output_var_list, config); if (testcase_num) { auto input_file = model->get_loader()->reset_file(); auto current_offset = input_file->tell(); auto loader = model->reset_loader(std::move(input_file)); auto testcase = loader->load(model->get_mdl_config(), false); mgb::serialization::GraphDumper::DumpConfig config{1, false, false}; for (size_t i = 0; i < testcase_num; ++i) { auto casefile = mgb::serialization::OutputFile::make_fs( m_layout_transform_dump_file.c_str(), 'a'); auto casedumper = model->get_dumper(std::move(casefile)); casedumper->dump(testcase.output_var_list, config); if (i != testcase_num - 1) { loader = model->reset_loader(); testcase = loader->load(model->get_mdl_config(), false); } } input_file = model->get_loader()->reset_file(); input_file->rewind(); input_file->skip(current_offset); model->reset_loader(std::move(input_file)); } 
                }
            }
        }
    }
}

}  // namespace lar

using namespace lar;

bool GoptLayoutOption::m_valid;

void GoptLayoutOption::update() {
    m_option_name = "gopt_layout";
    if (FLAGS_layout_transform != "cpu"
#if LITE_WITH_CUDA
        && FLAGS_layout_transform != "cuda"
#endif
    ) {
        m_layout_transform = false;
        m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::UNSPEC;
    } else {
        m_layout_transform = true;
        if (FLAGS_layout_transform == "cpu") {
            m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CPU;
        }
#if LITE_WITH_CUDA
        else if (FLAGS_layout_transform == "cuda") {
            m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CUDA;
        }
#endif
    }
    m_layout_transform_dump_file = FLAGS_layout_transform_dump;
    m_option = {
            {"layout_transform", lar::String::make("")},
    };
    std::static_pointer_cast<lar::String>(m_option["layout_transform"])
            ->set_value(FLAGS_layout_transform);
}

bool GoptLayoutOption::is_valid() {
    bool ret = false;
    if (!FLAGS_layout_transform.empty()) {
        if (FLAGS_layout_transform != "cpu"
#if LITE_WITH_CUDA
            && FLAGS_layout_transform != "cuda"
#endif
        ) {
            mgb_assert(
                    false,
                    "unsupported target(got:%s) for global layout transform",
                    FLAGS_layout_transform.c_str());
            ret = false;
        } else {
            ret = true;
        }
    }
    ret = ret || !FLAGS_layout_transform_dump.empty();
    return ret || m_valid;
}

std::shared_ptr<OptionBase> GoptLayoutOption::create_option() {
    static std::shared_ptr<GoptLayoutOption> option(new GoptLayoutOption);
    if (GoptLayoutOption::is_valid()) {
        option->update();
        return std::static_pointer_cast<OptionBase>(option);
    } else {
        return nullptr;
    }
}

void GoptLayoutOption::config_model(
        RuntimeParam& runtime_param, std::shared_ptr<ModelBase> model) {
    auto value = std::static_pointer_cast<lar::String>(m_option["layout_transform"])
                         ->get_value();
    if (value.empty()) {
        return;
    }
    if (value == "cpu") {
        m_layout_transform = true;
        m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CPU;
    }
#if LITE_WITH_CUDA
    else if (value == "cuda") {
        m_layout_transform = true;
        m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CUDA;
    }
#endif
    else {
        mgb_throw(
                mgb::AssertionError, "invalid options of global layout transform %s",
                value.c_str());
    }
    CONFIG_MODEL_FUN;
}

DEFINE_string(
        layout_transform, "",
        "Enable global layout transform optimization for the computing graph. The "
        "user should specify the device target for the optimization, and a series of "
        "passes will be applied to the computing graph. The passes benchmark the "
        "elapsed time of operators on different tensor layouts and select the "
        "fastest implementation for each operator, so the optimization itself takes "
        "some time. The default target is unspec, in which case all available "
        "implementations of the operators are profiled and the optimization takes "
        "even longer.");

DEFINE_string(
        layout_transform_dump, "",
        "The computing graph after global layout transform will be dumped to the "
        "given file path.");

REGIST_OPTION_CREATOR(gopt_layout, lar::GoptLayoutOption::create_option);

REGIST_OPTION_VALIDATER(gopt_layout, lar::GoptLayoutOption::set_valid);
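
// Example usage (a sketch, not part of this file): assuming these options are
// compiled into MegEngine's load_and_run tool and that "model.mge" /
// "transformed.mge" are placeholder paths, the two flags defined above would
// typically be combined as:
//
//     load_and_run model.mge --layout_transform cuda \
//         --layout_transform_dump transformed.mge
//
// --layout_transform selects the profiling target for the global layout
// transform; --layout_transform_dump writes the transformed graph to the given
// file path.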