network_impl.cpp

#include "lite_build_config.h"

#if LITE_BUILD_WITH_MGE
#include "common.h"
#include "lite/network.h"
#include "memory_allocator.h"
#include "network_impl.h"
#include "parse_info/parse_info_base.h"
#include "parse_model/model_parser.h"

#include "megbrain/common.h"
#include "megbrain/comp_node.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/graph.h"
#include "megbrain/graph/cg.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/tensor.h"

#if MGB_OPENCL
#include "megcore_opencl.h"
#endif

#if defined(MGB_ENABLE_CPUINFO_CHECK) && MGB_ENABLE_CPUINFO
#include "cpuinfo.h"
#endif

#include <fstream>
#include <memory>
#include <set>

using namespace lite;
using namespace mgb;

LITE_DYN_TYPE_OBJ_FINAL_IMPL(NetworkImplDft);

void NetworkImplDft::set_config(const Config& config) {
    *m_user_config = config;
    m_compnode_locator = to_compnode_locator(m_user_config->device_type);
    m_compnode_locator.device = config.device_id;
}

void NetworkImplDft::shared_weight_with(const NetworkImplBase* src_network) {
    application_config();
    const auto& src_impl = src_network->cast_final_safe<NetworkImplDft>();
    LITE_ASSERT(src_impl.m_loader, "Clone network must after the network is loaded.");
    m_load_result = src_impl.m_loader->load(m_load_config, true);

    configure_after_loaded();
}

void NetworkImplDft::application_config() {
    auto device_type = m_user_config->device_type;
    m_compnode_locator.type = to_compnode_locator(device_type).type;
    //! when the device id is not configured, configure it
    if (m_compnode_locator.device == -1) {
        m_compnode_locator.device = m_user_config->device_id;
    }
    if (m_nr_threads > 1 && device_type == LiteDeviceType::LITE_CPU) {
        m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD;
        if (m_compnode_locator.device == -1) {
            m_compnode_locator.device = m_user_config->device_id;
        }
    }
    //! model options
#define ConfigOption(mge_name, lite_name) \
    options.mge_name = m_user_config->options.lite_name;

    auto&& options = m_load_config.comp_graph->options();
    ConfigOption(graph_opt.weight_preprocess, weight_preprocess);
    ConfigOption(graph_opt.fuse_preprocess, fuse_preprocess);
    ConfigOption(fake_next_exec, fake_next_exec);
    ConfigOption(var_sanity_check_first_run, var_sanity_check_first_run);
    m_load_config.const_var_shape = m_user_config->options.const_shape;
    ConfigOption(force_dynamic_alloc, force_dynamic_alloc);
    ConfigOption(force_output_dynamic_alloc, force_output_dynamic_alloc);
    ConfigOption(
            force_output_use_user_specified_memory,
            force_output_use_user_specified_memory);
    ConfigOption(no_profiling_on_shape_change, no_profiling_on_shape_change);
    ConfigOption(graph_opt.jit, jit_level);
    ConfigOption(comp_node_seq_record_level, comp_node_seq_record_level);
    ConfigOption(graph_opt_level, graph_opt_level);
    ConfigOption(async_exec_level, async_exec_level);

#undef ConfigOption
#define ConfigOptionLayoutTransform(name) \
    if (m_user_config->options.name) {    \
        options.graph_opt.name();         \
    }
    ConfigOptionLayoutTransform(enable_nchw44);
    ConfigOptionLayoutTransform(enable_nchw44_dot);
    ConfigOptionLayoutTransform(enable_nchw88);
    ConfigOptionLayoutTransform(enable_nhwcd4);
    ConfigOptionLayoutTransform(enable_nchw4);
    ConfigOptionLayoutTransform(enable_nchw32);
    ConfigOptionLayoutTransform(enable_nchw64);
#undef ConfigOptionLayoutTransform
    if (m_user_config->has_compression) {
        m_load_config.tensor_value_loader = decompressed_tensor_value_loader;
    }

    //! if device is LITE_NONE, the compnode information is stored in model or
    //! xpu in MegEngine
    if (device_type != LiteDeviceType::LITE_DEVICE_DEFAULT) {
        m_load_config.comp_node_mapper = [this](mgb::CompNode::Locator& loc) {
            if (loc.type == mgb::CompNode::DeviceType::UNSPEC) {
                loc.type = m_compnode_locator.type;
            }
            loc.device = m_compnode_locator.device;
            //! if user set the thread number and the compnode is multithread
            if (loc.type == mgb::CompNode::DeviceType::MULTITHREAD &&
                m_nr_threads != 1) {
                loc.stream = m_nr_threads;
            } else {
                loc.stream = m_compnode_locator.stream;
            }
        };
    }
}

void NetworkImplDft::set_memory_allocator(std::shared_ptr<Allocator> user_allocator) {
    auto allocator = std::make_shared<UserStaticMemAlloc>(user_allocator);
    LITE_ASSERT(m_load_config.comp_graph);
    m_load_config.comp_graph->set_device_memory_allocator(allocator);
}

//! share the runtime memory with other network, the weights is not shared
void NetworkImplDft::share_runtime_memory_with(Network::NetworkImplBase* network_impl) {
    LITE_ASSERT(network_impl);
    LITE_ASSERT(m_load_config.comp_graph);
    m_load_config.comp_graph->share_device_memory_with(*(
            network_impl->cast_final_safe<NetworkImplDft>().m_load_config.comp_graph));
}

void NetworkImplDft::set_cpu_inplace_mode() {
    LITE_ASSERT(
            m_user_config->device_type == LiteDeviceType::LITE_CPU,
            "cpu inplace mode is only avaliable in CPU.");
    m_is_cpu_inplace_mode = true;
    if (m_compnode_locator.type == mgb::CompNode::DeviceType::CPU) {
        m_compnode_locator.device = mgb::CompNode::Locator::DEVICE_CPU_DEFAULT;
        m_user_config->device_id = mgb::CompNode::Locator::DEVICE_CPU_DEFAULT;
    } else {
        LITE_ASSERT(
                m_compnode_locator.type == CompNode::DeviceType::MULTITHREAD,
                "cpu inplace mode is only avaliable in CPU.");
        m_compnode_locator.device = mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT;
        m_user_config->device_id = mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT;
    }
}

void NetworkImplDft::set_cpu_threads_number(size_t nr_threads) {
    LITE_ASSERT(
            m_user_config->device_type == LiteDeviceType::LITE_CPU,
            "multi threads mode is only avaliable in CPU.");
    if (nr_threads > 1) {
        m_nr_threads = nr_threads;
        m_compnode_locator.type = mgb::CompNode::DeviceType::MULTITHREAD;
        if (m_is_cpu_inplace_mode) {
            m_compnode_locator.device =
                    mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT;
            m_user_config->device_id =
                    mgb::CompNode::Locator::DEVICE_MULTITHREAD_DEFAULT;
        }
        m_compnode_locator.nr_threads = nr_threads;
    }
}

void NetworkImplDft::set_runtime_thread_affinity(
        const ThreadAffinityCallback& thread_affinity_callback) {
    LITE_ASSERT(
            m_user_config->device_type == LiteDeviceType::LITE_CPU,
            "multi threads mode is only avaliable in CPU.");
    mgb::CompNode::Locator loc;
    m_load_config.comp_node_mapper(loc);
    auto cn = mgb::CompNode::load(loc);
    if (m_nr_threads > 1) {
        mgb::CompNodeEnv::from_comp_node(cn).cpu_env().set_affinity(
                thread_affinity_callback);
    } else {
        mgb::CompNodeEnv::from_comp_node(cn).cpu_env().dispatch(
                [thread_affinity_callback](void) { thread_affinity_callback(0); });
    }
}

void NetworkImplDft::set_device_id(int device_id) {
    m_compnode_locator.device = device_id;
    m_user_config->device_id = device_id;
}

void NetworkImplDft::set_stream_id(int stream_id) {
    m_compnode_locator.stream = stream_id;
}

void NetworkImplDft::use_tensorrt() {
    auto&& options = m_load_config.comp_graph->options();
    options.graph_opt.tensorrt = true;
}

//! set the callback in async model
void NetworkImplDft::set_async_callback(const AsyncCallback& callback) {
    LITE_ASSERT(!m_is_cpu_inplace_mode, "cpu inplace mode not support async mode");
    LITE_ASSERT(
            m_user_config->device_type == LiteDeviceType::LITE_CPU ||
                    m_user_config->device_type == LiteDeviceType::LITE_CUDA,
            "Now only cpu and cuda>10.0 support async mode");
    m_async = true;
    m_async_callback = std::move(callback);
}

void NetworkImplDft::make_output_spec() {
    m_output_spec.clear();
    for (auto&& out : m_network_io->outputs) {
        if (m_load_result.output_var_map.count(out.name)) {
            auto&& load_out = m_load_result.output_var_map[out.name];
            auto cb = [&out, this](const mgb::DeviceTensorND& dv) mutable {
                mgb::CompNode comp_node = dv.comp_node();
                if (out.io_type == LiteIOType::LITE_IO_SHAPE) {
                    auto mgb_layout = dv.layout();
                    out.lite_tensor->set_layout(to_lite_layout(mgb_layout));
                } else {
                    TensorHelper::implement(out.lite_tensor)
                            ->cast_final_safe<TensorImplDft>()
                            .copy_from_mge_tensor(dv);
                    out.lite_tensor->update_from_implement();
                }
                if (m_async) {
                    out.have_sync = true;
                    bool need_exec_cb = true;
                    for (auto&& j : m_network_io->outputs) {
                        if (!j.have_sync) {
                            need_exec_cb = false;
                        }
                    }
                    if (need_exec_cb) {
                        for (auto&& j : m_network_io->outputs) {
                            j.have_sync = false;
                        }
                        comp_node.add_callback([this]() { finish(); });
                    }
                }
            };
            //! if write to user-specified memory, the CallbackCaller must be nullptr.
            if (m_user_config->options.force_output_use_user_specified_memory ||
                m_user_config->options.force_output_dynamic_alloc) {
                m_output_spec.emplace_back(load_out, nullptr);
            } else {
                m_output_spec.emplace_back(load_out, std::move(cb));
            }
        } else {
            LITE_THROW(ssprintf("no output named : %s in the mode", out.name.c_str()));
        }
    }
}

void NetworkImplDft::replace_src_discrete_input_opr_pass() {
    mgb::ThinHashMap<mgb::SymbolVar, mgb::SymbolVar> out_var_map;

    auto dest_with_extra_deps =
            get_dest_vars_with_extra_deps(m_load_result.output_var_list);
    gopt::SubGraph graph{dest_with_extra_deps};
    auto rewriter = graph.make_rewriter();

    auto on_opr = [&](cg::OperatorNodeBase* opr) {
        bool replace_output = false;
        for (auto inp : opr->input()) {
            if ((inp->owner_opr()->same_type<mgb::opr::Host2DeviceCopy>() ||
                 inp->owner_opr()->same_type<mgb::opr::VolatileSharedDeviceTensor>()) &&
                inp->name() == m_user_config->discrete_input_name) {
                bool is_h2d = true;
                if (inp->owner_opr()->same_type<mgb::opr::Host2DeviceCopy>()) {
                    is_h2d = true;
                } else {
                    is_h2d = false;
                }

                SymbolVarArray srcs;
                if (is_h2d) {
                    auto h2d = inp->owner_opr();
                    for (auto&& i :
                         get_discrete_tensors(m_user_config->discrete_input_name)) {
                        auto val = TensorHelper::implement(i)
                                           ->cast_final_safe<TensorImplDft>()
                                           .m_host_tensor;
                        LITE_ASSERT(val);
                        srcs.push_back(mgb::opr::Host2DeviceCopy::make(
                                *m_load_result.graph, val, h2d->config()));
                    }
                } else {
                    auto volatiled = inp->owner_opr();
                    for (auto&& i :
                         get_discrete_tensors(m_user_config->discrete_input_name)) {
                        auto val = TensorHelper::implement(i)
                                           ->cast_final_safe<TensorImplDft>()
                                           .m_dev_tensor;
                        LITE_ASSERT(val);
                        srcs.push_back(mgb::opr::VolatileSharedDeviceTensor::make(
                                *m_load_result.graph, val, volatiled->config()));
                    }
                }

                if (opr->same_type<mgb::opr::WarpPerspective>()) {
                    auto& warp = opr->cast_final<mgb::opr::WarpPerspective>();
                    SymbolVar new_out;
                    if (opr->input().size() == 3) {
                        new_out = mgb::opr::WarpPerspective::make(
                                srcs, warp.input(1), warp.input(2), warp.param(),
                                warp.config());
                    } else {
                        LITE_ASSERT(opr->input().size() == 4);
                        new_out = mgb::opr::WarpPerspective::make(
                                srcs, warp.input(1), warp.input(2), warp.input(3),
                                warp.param(), warp.config());
                    }
                    rewriter.replace_var(
                            warp.output(0), new_out.node(),
                            "replace WarpPerspective to WarpPerspective multi src "
                            "version.");
                    replace_output = true;
                } else {
                    auto concat = mgb::opr::Concat::make(srcs, 0);
                    rewriter.replace_var(inp, concat.node(), "add a concat opr.");
                }
            }
        }
        if (!replace_output) {
            rewriter.auto_replace_outputs(opr);
        }
    };
    graph.iter(on_opr);
    rewriter.apply_inplace();
    auto new_ovar = graph.endpoint_vars();
    new_ovar.resize(m_load_result.output_var_list.size());

    for (size_t i = 0; i < new_ovar.size(); ++i) {
        out_var_map[m_load_result.output_var_list[i]] = new_ovar[i];
    }
    for (auto&& i : m_load_result.output_var_map) {
        i.second = out_var_map.at(i.second);
    }
    for (auto&& i : m_load_result.output_var_map_id) {
        i.second = out_var_map.at(i.second);
    }
    for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) {
        new_ovar[i].rename(m_load_result.output_var_list[i].node()->name());
    }
    m_load_result.output_var_list = std::move(new_ovar);
}

void NetworkImplDft::replace_dev_input_pass() {
    mgb::CompNode::Locator locator;
    m_load_config.comp_node_mapper(locator);
    //! CPU is not need use device input
    if (locator.type == mgb::CompNode::DeviceType::CPU) {
        return;
    }
    //! repalce the H2D with VolatileSharedDeviceTensor, and keep the dev tensor
    //! in m_network_io.input, user can directly change the dev tensor
    //! storage through m_network_io.input.lite_tensor->reset() befor forward
    using DeviceTensorMap =
            std::unordered_map<std::string, std::shared_ptr<mgb::DeviceTensorND>>;
    DeviceTensorMap name2dev_tensor;

    mgb::ThinHashMap<mgb::HostTensorND*, mgb::SymbolVar> host_val2var;

    //! construct host_val2var that maps from host tensor to corresponding var
    auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) {
        if (opr->same_type<mgb::opr::Host2DeviceCopy>()) {
            mgb::HostTensorND* tensor =
                    opr->cast_final<mgb::opr::Host2DeviceCopy>().host_data().get();
            host_val2var[tensor] = opr->output(0);
        }
    };
    mgb::cg::DepOprIter dep_iter{on_opr};
    for (auto i : m_load_result.output_var_list) {
        dep_iter.add(i.node()->owner_opr());
    }

    mgb::ThinHashMap<mgb::SymbolVar, mgb::SymbolVar> inp_var_map, out_var_map;

    mgb::SmallVector<std::string> to_clear;
    for (auto&& config_in : m_network_io->inputs) {
        if (!config_in.is_host) {
            auto host_val = m_load_result.tensor_map[config_in.name];
            auto dev_val = TensorHelper::implement(config_in.lite_tensor)
                                   ->cast_final_safe<TensorImplDft>()
                                   .m_dev_tensor;
            auto dev_var = mgb::opr::VolatileSharedDeviceTensor::make(
                    *m_load_result.graph, dev_val, {config_in.name});
            inp_var_map[host_val2var.at(host_val.get())] = dev_var;
            name2dev_tensor[config_in.name] = dev_val;
        }
        //! reset lite_tensor in discrete mode
        if (config_in.name == m_user_config->discrete_input_name) {
            config_in.lite_tensor.reset();
        }
    }
    auto new_ovar = mgb::cg::replace_vars(m_load_result.output_var_list, inp_var_map);
    for (size_t i = 0; i < new_ovar.size(); ++i) {
        out_var_map[m_load_result.output_var_list[i]] = new_ovar[i];
    }
    for (auto&& i : m_load_result.output_var_map) {
        i.second = out_var_map.at(i.second);
    }
    for (auto&& i : m_load_result.output_var_map_id) {
        i.second = out_var_map.at(i.second);
    }
    for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) {
        new_ovar[i].rename(m_load_result.output_var_list[i].node()->name());
    }
    m_load_result.output_var_list = std::move(new_ovar);
}

void NetworkImplDft::cross_compnode_model_detect() {
    mgb::ThinHashSet<LiteDeviceType> nr_used_device_type;
    auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) {
        for (auto j : opr->output()) {
            if (j->comp_node() != mgb::CompNode::default_cpu()) {
                nr_used_device_type.insert(
                        get_device_from_locator(j->comp_node().locator()));
            }
        }
    };
    mgb::cg::DepOprIter dep_iter{on_opr};
    for (auto i : m_load_result.output_var_list) {
        dep_iter.add(i.node()->owner_opr());
    }
    m_nr_device_type = nr_used_device_type.size();
}

void NetworkImplDft::layout_transform_optimization() {
    if (m_set_layout_transform) {
        mgb::ThinHashMap<mgb::SymbolVar, mgb::SymbolVar> out_var_map;
        auto output_var_array = mgb::gopt::layout_transform(
                m_load_result.output_var_list, m_layout_transform_target);
        m_load_result.update_output_var_list(output_var_array);
    } else if (m_user_config->auto_optimize_inference) {
        //! set model weight preprocess
        m_load_config.comp_graph->options().graph_opt.weight_preprocess = true;
        LITE_LOG(
                "weight_preprocess is enabled, this maybe use more memory when "
                "infernece.");
        //! get the current format and data type of the model
        bool is_model_nchw = true;
        //! is any convolution is int8
        bool is_model_int8 = false;
        //! is all convolution is float32
        bool is_model_float32 = true;
        float conv_cnt = 0;
        float dimshuffle_cnt = 0;

        auto detect_int8_model = [&](const VarNode* input) {
            if (input->dtype().enumv() == megdnn::DTypeEnum::QuantizedS8 ||
                input->dtype().enumv() == megdnn::DTypeEnum::Quantized8Asymm) {
                is_model_int8 = true;
                is_model_float32 = false;
            } else if (input->dtype().enumv() == megdnn::DTypeEnum::Float32) {
                is_model_float32 = (is_model_float32 && true);
            } else {
                is_model_float32 = false;
            }
        };

        cg::DepOprIter dep([&](cg::OperatorNodeBase* opr) {
            if (auto conv = opr->try_cast_final<opr::ConvolutionForward>()) {
                if (conv->param().format != megdnn::param::ConvBias::Format::NCHW) {
                    is_model_nchw = false;
                }
                conv_cnt++;
                detect_int8_model(conv->input(0));
            } else if (auto conv_bias = opr->try_cast_final<opr::ConvBias>()) {
                if (conv_bias->param().format !=
                    megdnn::param::ConvBias::Format::NCHW) {
                    is_model_nchw = false;
                }
                conv_cnt++;
                detect_int8_model(conv->input(0));
            } else if (auto dimshuffle = opr->try_cast_final<opr::Dimshuffle>()) {
                LITE_MARK_USED_VAR(dimshuffle);
                dimshuffle_cnt++;
            }
        });
        for (auto&& i : m_load_result.output_var_list)
            dep.add(i);

        float radio_dimshuffle_conv = 0;
        if (conv_cnt > 0) {
            radio_dimshuffle_conv = dimshuffle_cnt / conv_cnt;
        }
        //! format optimize can only applied on nchw model,
        //! shufflenet like model will hurt the performance when using nchw88 or nchw44
        //! format, here just heuristically decide the gate radio of
        //! dimshuffle and convolution
        if (!is_model_nchw || radio_dimshuffle_conv > 0.15f) {
            return;
        }

        //! determine the layout by the device information
        //! TODO: shufflenet like model use nchw88 or nchw44 will hurt the
        //! performance
        if (m_user_config->device_type == LITE_CPU) {
#if defined(MGB_ENABLE_CPUINFO_CHECK) && MGB_ENABLE_CPUINFO
            cpuinfo_initialize();
            //! if all convolution and matmul data type is float32
            if (is_model_float32) {
                //! if device is x86
                //! if x86 support avx, use format nchw88
                if (cpuinfo_has_x86_avx()) {
                    m_load_config.comp_graph->options().graph_opt.enable_nchw88();
                    LITE_LOG("Configure model inference with nchw88 format.");
                } else if (cpuinfo_has_x86_sse2() && !cpuinfo_has_x86_sse3()) {
                    //! if x86 only support sse2, use format nchw44
                    m_load_config.comp_graph->options().graph_opt.enable_nchw44();
                    LITE_LOG("Configure model inference with nchw44 format.");
                } else if (cpuinfo_has_arm_neon()) {
                    //! if device is arm, use format nchw44
                    m_load_config.comp_graph->options().graph_opt.enable_nchw44();
                    LITE_LOG("Configure model inference with nchw44 format.");
                }
            } else if (is_model_int8) {
                //! if date type of convolution  is int8
                //! if device is arm and support dot, use nchw44-dot format
                if (cpuinfo_has_arm_neon() && cpuinfo_has_arm_neon_dot()) {
                    m_load_config.comp_graph->options().graph_opt.enable_nchw44_dot();
                    LITE_LOG("Configure model inference with nchw44-dot format.");
                } else if (cpuinfo_has_arm_neon()) {
                    //! if device is arm and do not support dot, use nchw44 format
                    m_load_config.comp_graph->options().graph_opt.enable_nchw44();
                    LITE_LOG("Configure model inference with nchw44 format.");
                }
            }
#endif
        }
    }
}

void NetworkImplDft::load_model(
        std::shared_ptr<void> model_mem, size_t size,
        std::unordered_map<std::string, LiteAny> separate_config_map) {
    if (!m_loader) {
        m_input_file =
                mgb::serialization::InputFile::make_mem_proxy(model_mem, size, false);
        m_format = mgb::serialization::GraphLoader::identify_graph_dump_format(
                *m_input_file);
        if (!m_format.valid()) {
            LITE_THROW("invalid model format");
        }
        m_loader = mgb::serialization::GraphLoader::make(
                std::move(m_input_file), m_format.val());
    }

    //! applay the user configration to mge model
    application_config();

    //! config some flag get from json config file
    if (separate_config_map.find("device_id") != separate_config_map.end()) {
        set_device_id(separate_config_map["device_id"].safe_cast<int>());
    }
    if (separate_config_map.find("number_threads") != separate_config_map.end() &&
        separate_config_map["number_threads"].safe_cast<uint32_t>() > 1) {
        set_cpu_threads_number(
                separate_config_map["number_threads"].safe_cast<uint32_t>());
    }
    if (separate_config_map.find("enable_inplace_model") != separate_config_map.end() &&
        separate_config_map["enable_inplace_model"].safe_cast<bool>()) {
        set_cpu_inplace_mode();
    }
    if (separate_config_map.find("use_tensorrt") != separate_config_map.end() &&
        separate_config_map["use_tensorrt"].safe_cast<bool>()) {
        use_tensorrt();
    }

    m_load_result = m_loader->load(m_load_config, true);
    configure_after_loaded();
}

void NetworkImplDft::configure_after_loaded() {
    modify_exection_policy();

    layout_transform_optimization();

    //! find how many compnode the model has, this should call before update_io
    cross_compnode_model_detect();

    //! update the IO of the network
    update_io();

    //! replace the IO when there is device input or output
    compile_graph();
}

void NetworkImplDft::compile_graph() {
    replace_dev_input_pass();
    if (!m_user_config->discrete_input_name.empty()) {
        replace_src_discrete_input_opr_pass();
    }
    make_output_spec();
    m_execute_func = m_load_result.graph_compile(m_output_spec);
}

void NetworkImplDft::start() const {
    if (m_start_callback) {
        std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>>
                input_io_map;
        for (auto&& io_inner : m_network_io->inputs) {
            input_io_map[io_inner.name] = {
                    IO{io_inner.name, io_inner.is_host, io_inner.io_type,
                       io_inner.config_layout},
                    io_inner.lite_tensor};
        }
        m_start_callback(input_io_map);
    }
}

void NetworkImplDft::forward() {
    start();
    if (m_load_config.comp_graph &&
        m_user_config->options.comp_node_seq_record_level == 2) {
        m_load_config.comp_graph.reset();
    }
    LITE_ASSERT(m_execute_func, "forward must be called after network loaded.");
    m_execute_func->execute();
}

void NetworkImplDft::wait() {
    if (!m_async) {
        m_execute_func->wait();
    }
    finish();
}

void NetworkImplDft::finish() const {
    if (m_async) {
        LITE_ASSERT(m_async_callback, "The callback func must set when async mode.");
        m_async_callback();
    }
    if (m_finish_callback) {
        std::unordered_map<std::string, std::pair<IO, std::shared_ptr<Tensor>>>
                output_io_map;
        for (auto&& io_inner : m_network_io->outputs) {
            output_io_map[io_inner.name] = {
                    IO{io_inner.name, io_inner.is_host, io_inner.io_type,
                       io_inner.config_layout},
                    io_inner.lite_tensor};
        }
        m_finish_callback(output_io_map);
    }
    output_plugin_result();
}

void NetworkImplDft::set_io(const NetworkIO& network_io) {
    for (auto&& in : network_io.inputs) {
        m_network_io->inputs.emplace_back(in);
    }
    for (auto&& out : network_io.outputs) {
        m_network_io->outputs.emplace_back(out);
    }
}

void NetworkImplDft::try_infer_tensor_layout(std::shared_ptr<Tensor> tensor, Var var) {
    if (var.node()->capable_shape_infer()) {
        auto&& static_infer_mgr = m_load_config.comp_graph->static_infer_manager();
        auto shape = static_infer_mgr.infer_shape_fallible(var.node());
        if (!shape) {
            LITE_WARN(
                    "Lite infer output shape failed, maybe the model is "
                    "dynamic "
                    "shape.\n");
            LITE_ASSERT(
                    !m_user_config->options.force_output_use_user_specified_memory,
                    "force_output_use_user_specified_memory can't be used when output "
                    "shape can't be derived.");
            return;
        }
        Layout layout = to_lite_layout(TensorLayout{*shape, var.dtype()});
        tensor->set_layout(layout);
    }
}

void NetworkImplDft::update_io() {
    update_input();
    update_output();
}

void NetworkImplDft::update_input() {
    auto device_type = m_user_config->device_type;
    auto device_id = m_compnode_locator.device;
    auto stream_id = m_compnode_locator.stream;
    //! if cpu all input and output are host
    if (device_type == LiteDeviceType::LITE_CPU) {
        for (auto&& in : m_network_io->inputs) {
            in.is_host = true;
        }
    }
    //! if cross compnode model, modify the device input if it is not valid
    if (m_nr_device_type > 1) {
        for (auto&& in_tensor_iter : m_load_result.tensor_map) {
            for (auto&& config_in : m_network_io->inputs) {
                //! if tensor is set to device input
                if (in_tensor_iter.first == config_in.name && !config_in.is_host) {
                    //! if the origin compnode of the tensor is not the device,
                    //! set the input to host
                    if (get_device_from_locator(
                                in_tensor_iter.second->comp_node().locator()) ==
                        LiteDeviceType::LITE_CPU) {
                        config_in.is_host = true;
                        LITE_WARN(
                                "The input tensor %s of the cross device model "
                                "should not from device.",
                                config_in.name.c_str());
                    }
                }
            }
        }
    }
    for (auto&& in_tensor_iter : m_load_result.tensor_map) {
        bool found = false;
        for (auto&& config_in : m_network_io->inputs) {
            if (in_tensor_iter.first == config_in.name) {
                found = true;
                if (config_in.is_host) {
                    config_in.lite_tensor = std::make_shared<Tensor>(
                            device_id, stream_id, device_type, true);
                    TensorHelper::implement(config_in.lite_tensor)
                            ->cast_final_safe<TensorImplDft>()
                            .m_host_tensor = in_tensor_iter.second;
                    config_in.lite_tensor->update_from_implement();
                } else {
                    config_in.lite_tensor =
                            std::make_shared<Tensor>(device_id, stream_id, device_type);
                    config_in.lite_tensor->set_layout(
                            to_lite_layout(in_tensor_iter.second->layout()));
                }
                TensorHelper::implement(config_in.lite_tensor)
                        ->cast_final_safe<TensorImplDft>()
                        .m_record_reset =
                        m_user_config->options.comp_node_seq_record_level > 0;
                if (config_in.config_layout.ndim &&
                    !(config_in.config_layout == config_in.lite_tensor->get_layout())) {
                    config_in.lite_tensor->set_layout(config_in.config_layout);
                }
            }
        }
        if (!found) {
            IOInner io_in;
            io_in.name = in_tensor_iter.first;
            io_in.lite_tensor =
                    std::make_shared<Tensor>(device_id, stream_id, device_type, true);
            TensorHelper::implement(io_in.lite_tensor)
                    ->cast_final_safe<TensorImplDft>()
                    .m_host_tensor = in_tensor_iter.second;
            TensorHelper::implement(io_in.lite_tensor)
                    ->cast_final_safe<TensorImplDft>()
                    .m_record_reset =
                    m_user_config->options.comp_node_seq_record_level > 0;
            io_in.lite_tensor->update_from_implement();
            m_network_io->inputs.push_back(io_in);
        }
    }

    if (!m_user_config->discrete_input_name.empty()) {
        update_input_lite_tensors();
    }

    //! delete the IO that is not the network
    for (auto it = m_network_io->inputs.begin(); it != m_network_io->inputs.end();) {
        if (it->lite_tensor == nullptr) {
            LITE_LOG("%s is not the network input, ignore it.", it->name.c_str());
            it = m_network_io->inputs.erase(it);
        } else {
            it++;
        }
    }
}

//! initialization lite_tensors when input is composed of discrete multiple tensors
void NetworkImplDft::update_input_lite_tensors() {
    auto device_type = m_user_config->device_type;
    auto device_id = m_compnode_locator.device;
    auto stream_id = m_compnode_locator.stream;

    for (auto&& in_tensor_iter : m_load_result.tensor_map) {
        if (in_tensor_iter.first != m_user_config->discrete_input_name) {
            continue;
        }
        for (auto&& config_in : m_network_io->inputs) {
            if (in_tensor_iter.first == config_in.name) {
                size_t bs = in_tensor_iter.second->shape(0);
                auto shape = in_tensor_iter.second->shape();
                if (config_in.config_layout.ndim) {
                    bs = config_in.config_layout.shapes[0];
                    for (size_t i = 0; i < config_in.config_layout.ndim; ++i) {
                        shape.shape[i] = config_in.config_layout.shapes[i];
                    }
                }
                shape.shape[0] = 1;
                for (size_t i = 0; i < bs; ++i) {
                    HostTensorND tensor(
                            in_tensor_iter.second->comp_node(), shape,
                            in_tensor_iter.second->dtype(),
                            in_tensor_iter.second->format());
                    if (config_in.is_host) {
                        config_in.lite_tensors.push_back(std::make_shared<Tensor>(
                                device_id, stream_id, device_type, true));
                        TensorHelper::implement(config_in.lite_tensors[i])
                                ->cast_final_safe<TensorImplDft>()
                                .m_host_tensor = std::make_shared<HostTensorND>(tensor);
                        config_in.lite_tensors[i]->update_from_implement();
                    } else {
                        config_in.lite_tensors.push_back(std::make_shared<Tensor>(
                                device_id, stream_id, device_type));
                        config_in.lite_tensors[i]->set_layout(
                                to_lite_layout(tensor.layout()));
                    }
                    TensorHelper::implement(config_in.lite_tensors[i])
                            ->cast_final_safe<TensorImplDft>()
                            .m_record_reset =
                            m_user_config->options.comp_node_seq_record_level > 0;
                }
            }
        }
    }
}

void NetworkImplDft::update_output() {
    auto device_type = m_user_config->device_type;
    auto device_id = m_compnode_locator.device;
    auto stream_id = m_compnode_locator.stream;
    if (device_type == LiteDeviceType::LITE_CPU) {
        for (auto&& out : m_network_io->outputs) {
            out.is_host = true;
        }
    }
    //! delete the output that is not the network
    for (auto out_it = m_network_io->outputs.begin();
         out_it != m_network_io->outputs.end();) {
        if (std::find_if(
                    m_load_result.output_var_list.begin(),
                    m_load_result.output_var_list.end(), [out_it](const SymbolVar var) {
                        return var.node()->name() == out_it->name;
                    }) == m_load_result.output_var_list.end()) {
            LITE_LOG("%s is not the network output, ignore it.", out_it->name.c_str());
            out_it = m_network_io->outputs.erase(out_it);
        } else {
            out_it++;
        }
    }
    //! user config the output tensor, so only compute the config output
    if (m_compute_configured_output_only) {
        LITE_ASSERT(
                m_network_io->outputs.size() > 0,
                "compute configured output only with no configure output.");
        for (auto out_it = m_network_io->outputs.begin();
             out_it != m_network_io->outputs.end(); out_it++) {
            //! use pinned memory to copy form device
            if (out_it->is_host) {
                out_it->lite_tensor = std::make_shared<Tensor>(
                        device_id, stream_id, device_type, true);
            } else {
                out_it->lite_tensor =
                        std::make_shared<Tensor>(device_id, stream_id, device_type);
            }
            SymbolVar var;
            for (auto&& out_var : m_load_result.output_var_list) {
                if (out_var.node()->name() == out_it->name) {
                    var = out_var;
                    break;
                }
            }
            try_infer_tensor_layout(out_it->lite_tensor, var);
            output_tensor_copy_optimize(var, out_it->lite_tensor);
            TensorHelper::implement(out_it->lite_tensor)
                    ->cast_final_safe<TensorImplDft>()
                    .m_record_reset =
                    m_user_config->options.comp_node_seq_record_level > 0;
        }
        //! user not set, use default output
    } else {
        for (auto&& out : m_load_result.output_var_list) {
            std::shared_ptr<Tensor> lite_tensor = nullptr;
            auto it = std::find_if(
                    m_network_io->outputs.begin(), m_network_io->outputs.end(),
                    [&out](const IOInner io) { return io.name == out.node()->name(); });
            if (it != m_network_io->outputs.end()) {
                if (it->is_host) {
                    it->lite_tensor = std::make_shared<Tensor>(
                            device_id, stream_id, device_type, true);
                } else {
                    it->lite_tensor =
                            std::make_shared<Tensor>(device_id, stream_id, device_type);
                }
                try_infer_tensor_layout(it->lite_tensor, out);
                lite_tensor = it->lite_tensor;
            } else {
                IOInner output;
                output.name = out.node()->name();
                output.lite_tensor = std::make_shared<Tensor>(
                        device_id, stream_id, device_type, true);
                m_network_io->outputs.push_back({output});
                try_infer_tensor_layout(output.lite_tensor, out);
                lite_tensor = output.lite_tensor;
            }
            output_tensor_copy_optimize(out, lite_tensor);
            TensorHelper::implement(lite_tensor)
                    ->cast_final_safe<TensorImplDft>()
                    .m_record_reset =
                    m_user_config->options.comp_node_seq_record_level > 0;
        }
    }
}

void NetworkImplDft::output_tensor_copy_optimize(
        Var var, std::shared_ptr<Tensor> tensor) {
    LITE_ASSERT(
            !(m_user_config->options.force_output_use_user_specified_memory &&
              m_user_config->options.force_output_dynamic_alloc),
            "Can't set force_output_use_user_specified_memory and "
            "force_output_dynamic_alloc at the same time.");
    if (m_user_config->options.force_output_use_user_specified_memory) {
        bool in_record = m_user_config->options.comp_node_seq_record_level > 0;
        TensorHelper::implement(tensor)
                ->cast_final_safe<TensorImplDft>()
                .set_reset_callback([var, in_record](TensorImplDft* dft_tensor) {
                    dft_tensor->device_share_host_memory();
                    auto dv = dft_tensor->dev_tensor().get();
                    dv->comp_node(var.node()->comp_node(), true);
                    var.node()->init_mem_plan(dv);
                    if (in_record) {
                        auto&& device_tensor = var.node()->mutable_dev_tensor();
                        device_tensor.only_reset_raw_storage(dv->storage());
                    } else {
                        var.node()->reset_dev_tensor_from_tensor(*dv);
                    }
                });
    }
    if (m_user_config->options.force_output_dynamic_alloc) {
        TensorHelper::implement(tensor)
                ->cast_final_safe<TensorImplDft>()
                .set_get_memory_callback([var](TensorImplDft* dft_tensor) {
                    if (dft_tensor->is_host()) {
                        auto host_tensor = dft_tensor->m_host_tensor;
                        *host_tensor =
                                HostTensorND::make_proxy(var.node()->dev_tensor());
                    } else {
                        auto dev_tensor = dft_tensor->m_dev_tensor;
                        *dev_tensor = var.node()->dev_tensor();
                    }
                });
    }
}

std::shared_ptr<Tensor> NetworkImplDft::get_io_tensor(
        std::string io_name, LiteTensorPhase phase) {
    if (phase == LiteTensorPhase::LITE_INPUT || phase == LiteTensorPhase::LITE_IO) {
        for (auto&& config_in : m_network_io->inputs) {
            if (io_name == config_in.name) {
                if (config_in.lite_tensor) {
                    return config_in.lite_tensor;
                } else {
                    LITE_THROW(mgb::ssprintf(
                            "%s input tensor is in discrete mode, you can use "
                            "get_discrete_tensors to get this input.",
                            io_name.c_str()));
                    return nullptr;
                }
            }
        }
    }
    if (phase == LiteTensorPhase::LITE_OUTPUT || phase == LiteTensorPhase::LITE_IO) {
        for (auto&& config_out : m_network_io->outputs) {
            if (io_name == config_out.name) {
                config_out.lite_tensor->update_from_implement();
                return config_out.lite_tensor;
            }
        }
    }
    LITE_THROW(mgb::ssprintf(
            "tensor name must be %s input tensor name or the registered "
            "output tensor name if NetworkIO is set, if NetworkIO is not set, "
            "the output tensor is all the network output tensor, or the output "
            "tensor is only the registered tensor.",
            io_name.c_str()));
    return nullptr;
}

std::vector<std::shared_ptr<Tensor>> NetworkImplDft::get_discrete_tensors(
        std::string io_name, LiteTensorPhase phase) {
    if (phase == LiteTensorPhase::LITE_INPUT) {
        for (auto&& config_in : m_network_io->inputs) {
            if (io_name == config_in.name &&
                config_in.name == m_user_config->discrete_input_name) {
                return config_in.lite_tensors;
            }
        }
    }
    LITE_THROW(mgb::ssprintf(
            "tensor name must be %s input tensor name.", io_name.c_str()));
    return {};
}

std::shared_ptr<Tensor> NetworkImplDft::get_input_tensor(size_t index) {
    return get_io_tensor(get_input_name(index));
}

std::vector<std::shared_ptr<Tensor>> NetworkImplDft::get_input_tensors(size_t index) {
    return get_discrete_tensors(get_input_name(index));
}

std::shared_ptr<Tensor> NetworkImplDft::get_output_tensor(size_t index) {
    return get_io_tensor(get_output_name(index));
}

//! set opr algorithm selection strategy in the network
void NetworkImplDft::set_network_algo_policy(
        LiteAlgoSelectStrategy strategy, uint32_t shared_batch_size,
        bool binary_equal_between_batch) {
    using S = megdnn::param::ExecutionPolicy::Strategy;
    auto dst_strategy = static_cast<S>(0);
    if (static_cast<uint32_t>(strategy) & LiteAlgoSelectStrategy::LITE_ALGO_HEURISTIC) {
        dst_strategy = dst_strategy | S::HEURISTIC;
    }
    if (static_cast<uint32_t>(strategy) & LiteAlgoSelectStrategy::LITE_ALGO_PROFILE) {
        dst_strategy = dst_strategy | S::PROFILE;
    }
    if (static_cast<uint32_t>(strategy) &
        LiteAlgoSelectStrategy::LITE_ALGO_REPRODUCIBLE) {
        dst_strategy = dst_strategy | S::REPRODUCIBLE;
    }
    if (static_cast<uint32_t>(strategy) & LiteAlgoSelectStrategy::LITE_ALGO_OPTIMIZED) {
        dst_strategy = dst_strategy | S::OPTIMIZED;
    }
    if (static_cast<uint32_t>(dst_strategy) != 0)
        m_execution_policy = dst_strategy;

    auto&& fast_run_config = m_load_config.comp_graph->options().fast_run_config;
    fast_run_config.binary_equal_between_batch = binary_equal_between_batch;
    fast_run_config.shared_batch_size = shared_batch_size;

    if (m_execute_func) {
        LITE_WARN(
                "set_network_algo_policy maybe cause error after loaded "
                "network!!!!");
        modify_exection_policy();
    }
}

void NetworkImplDft::modify_exection_policy() {
    auto& vars = m_load_result.output_var_list;
    if (static_cast<uint32_t>(m_execution_policy) != 0) {
        mgb::gopt::modify_opr_algo_strategy_inplace(vars, m_execution_policy);
    }
}

//! set opr algorithm selection strategy in the network
void NetworkImplDft::set_network_algo_workspace_limit(size_t workspace_limit) {
    mgb::SymbolVarArray vars;
    for (auto i : m_output_spec) {
        vars.push_back(i.first);
    }
    mgb::gopt::set_opr_algo_workspace_limit_inplace(vars, workspace_limit);
}

//! get the input tensor name in the order of graph
std::vector<const char*> NetworkImplDft::get_all_output_name() const {
    std::vector<const char*> output_names;
    for (auto& output : m_network_io->outputs) {
        output_names.push_back(output.name.c_str());
    }
    return output_names;
}

//! get the input tensor name in the order of graph
std::vector<const char*> NetworkImplDft::get_all_input_name() const {
    std::vector<const char*> input_names;
    for (auto& input : m_load_result.tensor_map) {
        input_names.push_back(input.first.c_str());
    }
    return input_names;
}

//! get the output tensor name in the order of graph
const char* NetworkImplDft::get_output_name(size_t index) const {
    LITE_ASSERT(
            index < m_load_result.output_var_list.size(),
            "The output tensor index is large than the total outputs number.");
    return m_load_result.output_var_list[index].node()->name().c_str();
}

//! get the input tensor name in the order of graph
const char* NetworkImplDft::get_input_name(size_t index) const {
    LITE_ASSERT(
            index < m_load_result.tensor_map.size(),
            "The input tensor index is large than the total inputs number.");
    size_t i = 0;
    for (auto& input : m_load_result.tensor_map) {
        if (i == index) {
            return input.first.c_str();
        }
        i++;
    }
    LITE_THROW(ssprintf("no input tensor of index %zu.", index));
}

//! Plugin part
void NetworkImplDft::enable_profile_performance(std::string profile_json_file) {
#if MGB_ENABLE_JSON
    m_profiler = std::make_unique<mgb::GraphProfiler>(m_load_config.comp_graph.get());
    m_profiler_output_file = profile_json_file;
#else
    LITE_MARK_USED_VAR(profile_json_file);
    LITE_THROW("JSON is disable at compile time.");
#endif
}

void NetworkImplDft::enable_io_txt_dump(std::string io_txt_out_file) {
    auto iodump = std::make_unique<mgb::TextOprIODump>(
            m_load_config.comp_graph.get(), io_txt_out_file.c_str());
    iodump->print_addr(false);
    m_iodump = std::move(iodump);
}

void NetworkImplDft::enable_io_bin_dump(std::string io_bin_out_dir) {
    m_iodump = std::make_unique<mgb::BinaryOprIODump>(
            m_load_config.comp_graph.get(), io_bin_out_dir.c_str());
}

void inline NetworkImplDft::output_plugin_result() const {
#if MGB_ENABLE_JSON
    if (m_profiler && m_execute_func) {
        m_profiler->to_json_full(m_execute_func.get())
                ->writeto_fpath(m_profiler_output_file);
    }
#endif
}

void NetworkImplDft::get_static_memory_alloc_info(const std::string& log_dir) const {
#ifndef __IN_TEE_ENV__
#if MGB_ENABLE_JSON
    m_execute_func->get_static_memory_alloc_info(log_dir);
    return;
#endif
#endif
    LITE_MARK_USED_VAR(log_dir);
}

void NetworkImplDft::enable_global_layout_transform() {
    m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::UNSPEC;

    switch (m_user_config->device_type) {
        case LiteDeviceType::LITE_CPU:
            m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CPU;
            break;
        case LiteDeviceType::LITE_CUDA:
            m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::CUDA;
            break;
        default:
            m_layout_transform_target = mgb::gopt::GraphTuningOptions::Target::UNSPEC;
            LITE_WARN(
                    "lite compnode type: enum value: %d. is unspecial for layout "
                    "transform",
                    (int)(m_user_config->device_type));
    }
    m_set_layout_transform = true;
}

void NetworkImplDft::dump_layout_transform_model(std::string optimized_model_path) {
    if (m_set_layout_transform) {
        auto out_file = mgb::serialization::OutputFile::make_fs(
                optimized_model_path.c_str(), 'w');
        using DumpConfig = mgb::serialization::GraphDumper::DumpConfig;
        DumpConfig config{1, false, false};
        auto dumper = mgb::serialization::GraphDumper::make(
                std::move(out_file), m_format.val());
        dumper->dump(m_load_result.output_var_list, config);
    } else {
        LITE_THROW(
                ssprintf("dump layout transform model should call "
                         "enable_global_layout_transform before"));
    }
}

NetworkIO lite::get_model_io_info_dft(
        const std::string& model_path, const Config& config) {
    FILE* fin = fopen(model_path.c_str(), "rb");
    LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(), strerror(errno));
    fseek(fin, 0, SEEK_END);
    size_t size = ftell(fin);
    fseek(fin, 0, SEEK_SET);
    void* ptr = malloc(size);
    std::shared_ptr<void> buf{ptr, ::free};
    auto nr = fread(buf.get(), 1, size, fin);
    LITE_ASSERT(nr == size);
    fclose(fin);
    return get_model_io_info_dft(ptr, size, config);
}

NetworkIO lite::get_model_io_info_dft(
        const void* model_mem, size_t size, const Config& config) {
    std::shared_ptr<void> model{const_cast<void*>(model_mem), [](void*) {}};
    auto input_file = mgb::serialization::InputFile::make_mem_proxy(model, size, false);
    auto format =
            mgb::serialization::GraphLoader::identify_graph_dump_format(*input_file);
    if (!format.valid()) {
        LITE_THROW("invalid model format");
    }
    auto loader =
            mgb::serialization::GraphLoader::make(std::move(input_file), format.val());

    mgb::serialization::GraphLoadConfig load_config;
    load_config.comp_graph = mgb::ComputingGraph::make();
    if (config.has_compression) {
        load_config.tensor_value_loader = decompressed_tensor_value_loader;
    }
    auto compnode_locator = to_compnode_locator(config.device_type);
    load_config.comp_node_mapper = [=](mgb::CompNode::Locator& loc) {
        if (loc.type == mgb::CompNode::DeviceType::UNSPEC) {
            loc.type = compnode_locator.type;
        }
        loc.device = compnode_locator.device;
    };
    auto load_result = loader->load(load_config, true);
    NetworkIO IOs;
    for (auto&& in_tensor_iter : load_result.tensor_map) {
        IO in_io;
        in_io.name = in_tensor_iter.first;
        in_io.config_layout = to_lite_layout(in_tensor_iter.second->layout());
        IOs.inputs.push_back(in_io);
    }
    auto infer_shape = [=](mgb::cg::SymbolVar var) -> const megdnn::TensorShape* {
        auto&& static_infer_mgr = load_config.comp_graph->static_infer_manager();
        using InferType = mgb::cg::static_infer::InferType;
        if (static_infer_mgr.get_infer_type(var.node()).shape &
            (InferType::CONST | InferType::RT_STATIC)) {
            return static_infer_mgr.infer_shape_fallible(var.node());
        } else {
            return nullptr;
        }
    };
    for (auto&& out : load_result.output_var_list) {
        IO out_io;
        out_io.name = out.node()->name();
        if (auto shape = infer_shape(out)) {
            out_io.config_layout = to_lite_layout(TensorLayout{*shape, out.dtype()});
        } else {
            out_io.config_layout = to_lite_layout(TensorLayout{{}, out.dtype()});
        }
        IOs.outputs.push_back(out_io);
    }
    return IOs;
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}