From fca6c76a0ec042ab64ebca4b3418e7cf35d71808 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Fri, 5 Aug 2022 15:58:18 +0800
Subject: [PATCH] fix(lite): fix input invalid bug in lar for fitting mode

GitOrigin-RevId: 45d81c9a96dc64c4f41dd87581a5fa2bca76c2b7
---
 .gitattributes                                |   7 +
 lite/load_and_run/examples/example.sh         |   2 +-
 lite/load_and_run/examples/script/add_demo.py |   2 +-
 .../examples/script/mge_input_data.py         |   2 +-
 .../examples/script/resnet50_mge.py           |   2 +-
 lite/load_and_run/src/helpers/data_parser.h   |   2 +
 lite/load_and_run/src/helpers/json_loader.cpp |   6 +-
 lite/load_and_run/src/models/model_mdl.cpp    |  37 ++++
 lite/load_and_run/src/models/model_mdl.h      |   2 +
 .../src/options/extern_c_opr_options.cpp      |   5 +
 lite/load_and_run/src/options/io_options.cpp  | 171 ++++++++++++++----
 lite/load_and_run/src/options/io_options.h    |   5 +-
 .../src/options/layout_trans_options.cpp      |  71 +-------
 .../src/options/layout_trans_options.h        |   2 -
 .../src/options/plugin_options.cpp            |   4 +-
 .../src/options/strategy_options.cpp          |   8 +-
 .../src/strategys/strategy_fitting.cpp        |  18 +-
 .../src/strategys/strategy_normal.cpp         |   5 +-
 lite/test/test_io_options.cpp                 |  77 ++++++++
 lite/test/test_lar_options.cpp                |   6 +-
 lite/test/test_layout_options.cpp             |  27 ++-
 lite/test/test_options.h                      |  28 ++-
 src/gopt/impl/framework.cpp                   |   3 +-
 23 files changed, 352 insertions(+), 140 deletions(-)
 create mode 100644 lite/test/test_io_options.cpp

diff --git a/.gitattributes b/.gitattributes
index 14b131c07..40d05d0e3 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -23,3 +23,10 @@ imperative/python/test/unit/module/MagicMindRuntimeOprTest.GraphShapeMutable.mlu
 lite/test/resource/lite/ax_data_input.npy filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/ax_data_output.npy filter=lfs diff=lfs merge=lfs -text
 lite/test/resource/lite/ax_model.mge filter=lfs diff=lfs merge=lfs -text
+lite/test/resource/lite/add_demo_input.json filter=lfs diff=lfs merge=lfs -text
+lite/test/resource/lite/add_demo.mge filter=lfs diff=lfs merge=lfs -text
+lite/test/resource/lite/resnet50_b10.mdl filter=lfs diff=lfs merge=lfs -text
+lite/test/resource/lite/resnet50_input.npy filter=lfs diff=lfs merge=lfs -text
+lite/test/resource/lite/resnet50.mge filter=lfs diff=lfs merge=lfs -text
+lite/test/resource/lite/resnet50_uint8.mge filter=lfs diff=lfs merge=lfs -text
+lite/test/resource/lite/cat.ppm filter=lfs diff=lfs merge=lfs -text
diff --git a/lite/load_and_run/examples/example.sh b/lite/load_and_run/examples/example.sh
index ba4a3c3df..07525e807 100755
--- a/lite/load_and_run/examples/example.sh
+++ b/lite/load_and_run/examples/example.sh
@@ -112,7 +112,7 @@ function prepare_model_and_data(){
 
     #prepare mge model
     python3 script/resnet50_mge.py --dir model_source
-    python3 script/resnet50_mge.py  --dir model_source -d int8
+    python3 script/resnet50_mge.py  --dir model_source -d uint8
     python3 script/resnet50_mge.py  --dir model_source --inputs "#rand(0,255)"
 
     #make input_data
diff --git a/lite/load_and_run/examples/script/add_demo.py b/lite/load_and_run/examples/script/add_demo.py
index d0afe5cea..eeae569a8 100755
--- a/lite/load_and_run/examples/script/add_demo.py
+++ b/lite/load_and_run/examples/script/add_demo.py
@@ -43,7 +43,7 @@ if __name__ == "__main__":
     @jit.trace(symbolic=True, capture_as_const=True)
     def fun(data):
         return net(data)
-    data = tensor([3,4,5])
+    data = tensor([3.0,4.0,5.0])
     fun(data)
     if args.inputs == "":
         fun.dump(
diff --git a/lite/load_and_run/examples/script/mge_input_data.py b/lite/load_and_run/examples/script/mge_input_data.py
index 10a38817a..4240e5444 100755
--- a/lite/load_and_run/examples/script/mge_input_data.py
+++ b/lite/load_and_run/examples/script/mge_input_data.py
@@ -26,7 +26,7 @@ cv2.imwrite("input_data/cat.ppm",processed_img)
 #json 
 data_obj = {
     "shape": [1,3],
-    "type": "int32",
+    "type": "float32",
     "raw": [2,3,4]
 }
 with open("input_data/add_demo_input.json", "w") as f:
diff --git a/lite/load_and_run/examples/script/resnet50_mge.py b/lite/load_and_run/examples/script/resnet50_mge.py
index b9555cdba..1f5c5c6b2 100755
--- a/lite/load_and_run/examples/script/resnet50_mge.py
+++ b/lite/load_and_run/examples/script/resnet50_mge.py
@@ -348,4 +348,4 @@ if __name__ == "__main__":
             )
 
     else:
-        raise TypeError("dtype should be float32")
\ No newline at end of file
+        raise TypeError("dtype should be float32 or uint8")
\ No newline at end of file
diff --git a/lite/load_and_run/src/helpers/data_parser.h b/lite/load_and_run/src/helpers/data_parser.h
index c58e15c85..2b4c0c7e7 100644
--- a/lite/load_and_run/src/helpers/data_parser.h
+++ b/lite/load_and_run/src/helpers/data_parser.h
@@ -18,6 +18,8 @@ struct DataParser {
     };
     void feed(const std::string& path);
 
+    ~DataParser() { inputs.clear(); };
+
     std::unordered_map<std::string, mgb::HostTensorND> inputs;
 
 private:
diff --git a/lite/load_and_run/src/helpers/json_loader.cpp b/lite/load_and_run/src/helpers/json_loader.cpp
index 8818b9e05..9bdbf4b4c 100644
--- a/lite/load_and_run/src/helpers/json_loader.cpp
+++ b/lite/load_and_run/src/helpers/json_loader.cpp
@@ -321,10 +321,10 @@ std::unique_ptr<JsonLoader::Value> JsonLoader::load(const char* path) {
     const size_t size = ftell(fin.get());
     std::fseek(fin.get(), 0, SEEK_SET);
 
-    std::unique_ptr<char> buf(static_cast<char*>(malloc(size)));
+    std::vector<char> buf(size + 1);
 
-    auto nr = std::fread(buf.get(), 1, size, fin.get());
+    auto nr = std::fread(buf.data(), 1, size, fin.get());
     mgb_assert(nr == size);
 
-    return load(buf.get(), size);
+    return load(buf.data(), size);
 }
diff --git a/lite/load_and_run/src/models/model_mdl.cpp b/lite/load_and_run/src/models/model_mdl.cpp
index 8097e9fa0..1eb793d1c 100644
--- a/lite/load_and_run/src/models/model_mdl.cpp
+++ b/lite/load_and_run/src/models/model_mdl.cpp
@@ -179,4 +179,41 @@ std::vector<uint8_t> ModelMdl::get_model_data() {
             mgb::serialization::GraphDumper::make(std::move(out_file), m_format.val());
     dumper->dump(m_load_result.output_var_list, config);
     return out_data;
+}
+
+void ModelMdl::update_io() {
+    //! rebuild the output var list after the input shapes may have changed
+    //! (the execution time of some passes depends on the initial input shape)
+    mgb::thin_hash_table::ThinHashMap<mgb::cg::SymbolVar, mgb::cg::SymbolVar> varmap;
+    auto&& network = m_load_result;
+    std::unordered_map<void*, std::string> tensor_name_map;
+    for (auto& input : network.tensor_map) {
+        tensor_name_map.insert({input.second->raw_ptr(), input.first});
+    }
+    mgb::cg::DepOprIter dep([&](mgb::cg::OperatorNodeBase* opr) {
+        if (auto h2d = opr->try_cast_final<mgb::opr::Host2DeviceCopy>()) {
+            if (tensor_name_map.find(h2d->host_data()->raw_ptr()) !=
+                tensor_name_map.end()) {
+                //! make a new h2d opr from a copy of the current host tensor
+                std::string name = tensor_name_map[h2d->host_data()->raw_ptr()];
+                std::shared_ptr<mgb::HostTensorND> new_tensor =
+                        std::make_shared<mgb::HostTensorND>();
+                new_tensor->copy_from(*h2d->host_data());
+
+                auto h2d_opr = mgb::opr::Host2DeviceCopy::make(
+                        *h2d->owner_graph(), new_tensor, h2d->param(), h2d->config());
+                //! name the new h2d opr after its entry in tensor_map
+                h2d_opr.node()->owner_opr()->name(name);
+                varmap[h2d->output(0)] = h2d_opr;
+            }
+        }
+    });
+    //! walk the dependencies of every output to fill the replacement var map
+    for (auto&& i : network.output_var_list)
+        dep.add(i);
+    //! swap in the new h2d oprs so the related var shapes are updated
+    if (!varmap.empty()) {
+        auto output_vars = mgb::cg::replace_vars(network.output_var_list, varmap);
+        network.output_var_list = output_vars;
+    }
 }
\ No newline at end of file
diff --git a/lite/load_and_run/src/models/model_mdl.h b/lite/load_and_run/src/models/model_mdl.h
index 43674a24e..0917efbcf 100644
--- a/lite/load_and_run/src/models/model_mdl.h
+++ b/lite/load_and_run/src/models/model_mdl.h
@@ -108,6 +108,8 @@ public:
 
     std::vector<uint8_t> get_model_data() override;
 
+    void update_io();
+
 private:
     bool share_model_mem;
     std::string model_path;
diff --git a/lite/load_and_run/src/options/extern_c_opr_options.cpp b/lite/load_and_run/src/options/extern_c_opr_options.cpp
index 6be9bb373..a9c1fca22 100644
--- a/lite/load_and_run/src/options/extern_c_opr_options.cpp
+++ b/lite/load_and_run/src/options/extern_c_opr_options.cpp
@@ -18,6 +18,11 @@ void COprLibOption::config_model_internel(
                     "lite model dont't support run with external c opr "
                     "parmeter");
         }
+        if (m_c_opr_init_func != MGB_C_OPR_INIT_FUNC_STR) {
+            LITE_THROW(
+                    "lite model dont't support to set the c_opr_init_func to another "
+                    "API");
+        }
     }
 }
 template <>
diff --git a/lite/load_and_run/src/options/io_options.cpp b/lite/load_and_run/src/options/io_options.cpp
index b9fe8ef1d..72adb9bb7 100644
--- a/lite/load_and_run/src/options/io_options.cpp
+++ b/lite/load_and_run/src/options/io_options.cpp
@@ -26,32 +26,89 @@ void InputOption::config_model_internel<ModelLite>(
         auto&& parser = model->get_input_parser();
         auto&& network = model->get_lite_network();
 
-        //! datd type map from mgb data type to lite data type
-        std::map<megdnn::DTypeEnum, LiteDataType> type_map = {
-                {megdnn::DTypeEnum::Float32, LiteDataType::LITE_FLOAT},
-                {megdnn::DTypeEnum::Int32, LiteDataType::LITE_INT},
-                {megdnn::DTypeEnum::Int8, LiteDataType::LITE_INT8},
-                {megdnn::DTypeEnum::Uint8, LiteDataType::LITE_UINT8}};
-
-        for (auto& i : parser.inputs) {
-            //! get tensor information from data parser
-            auto tensor = i.second;
-            auto data_type = tensor.dtype();
-            auto tensor_shape = tensor.shape();
-            mgb::dt_byte* src = tensor.raw_ptr();
-
-            //! set lite layout
-            lite::Layout layout;
-            layout.ndim = tensor_shape.ndim;
-            for (size_t idx = 0; idx < tensor_shape.ndim; idx++) {
-                layout.shapes[idx] = tensor_shape[idx];
+        //! data type map from lite data type to mgb data type
+        std::map<LiteDataType, megdnn::DTypeEnum> type_map = {
+                {LiteDataType::LITE_FLOAT, megdnn::DTypeEnum::Float32},
+                {LiteDataType::LITE_INT, megdnn::DTypeEnum::Int32},
+                {LiteDataType::LITE_INT8, megdnn::DTypeEnum::Int8},
+                {LiteDataType::LITE_UINT8, megdnn::DTypeEnum::Uint8}};
+
+        if (m_force_batch_size > 0) {
+            LITE_WARN("force set batch size to %d", m_force_batch_size);
+            auto all_inputs_name = network->get_all_input_name();
+            for (auto& name : all_inputs_name) {
+                std::shared_ptr<lite::Tensor> input_tensor =
+                        network->get_io_tensor(name);
+                //! copy the network layout so the input data_type is preserved
+                lite::Layout layout = input_tensor->get_layout();
+                mgb::TensorShape new_shape;
+                new_shape.ndim = input_tensor->get_layout().ndim;
+                layout.ndim = input_tensor->get_layout().ndim;
+                for (size_t idx = 0; idx < new_shape.ndim; idx++) {
+                    new_shape.shape[idx] = input_tensor->get_layout().shapes[idx];
+                    layout.shapes[idx] = new_shape.shape[idx];
+                }
+                new_shape.shape[0] = m_force_batch_size;
+                layout.shapes[0] = m_force_batch_size;
+
+                //! generate a host tensor with the forced batch from the origin tensor
+                mgb::HostTensorND hv;
+                hv.comp_node(mgb::CompNode::default_cpu(), true)
+                        .dtype(megdnn::DType::from_enum(
+                                type_map[input_tensor->get_layout().data_type]))
+                        .resize(new_shape);
+                mgb::dt_byte* raw_ptr = hv.raw_ptr();
+                //! size in bytes of a single batch of input
+                size_t batch_stride = hv.dtype().size() * hv.layout().total_nr_elems() /
+                                      m_force_batch_size;
+                size_t curr_batch_size = m_force_batch_size;
+                //! repeat the origin input_tensor data until the new batch is filled
+                size_t init_batch = input_tensor->get_layout().shapes[0];
+                while (curr_batch_size > init_batch) {
+                    memcpy((char*)raw_ptr, (char*)(input_tensor->get_memory_ptr()),
+                           batch_stride * init_batch);
+                    curr_batch_size -= init_batch;
+                    raw_ptr += batch_stride * init_batch;
+                }
+                memcpy((char*)raw_ptr, (char*)(input_tensor->get_memory_ptr()),
+                       batch_stride * curr_batch_size);
+
+                input_tensor->reset(hv.raw_ptr(), layout);
+                parser.inputs[name] = std::move(hv);
+            }
-            layout.data_type = type_map[data_type.enumv()];
+        } else {
+            for (auto& i : parser.inputs) {
+                //! get tensor information from data parser
+                auto tensor = i.second;
+                auto tensor_shape = tensor.shape();
+                mgb::dt_byte* src = tensor.raw_ptr();
+                std::shared_ptr<lite::Tensor> input_tensor =
+                        network->get_io_tensor(i.first);
+                //! set lite layout
+                lite::Layout layout;
+                layout.ndim = tensor_shape.ndim;
+                for (size_t idx = 0; idx < tensor_shape.ndim; idx++) {
+                    layout.shapes[idx] = tensor_shape[idx];
+                }
+                layout.data_type = input_tensor->get_layout().data_type;
 
-            //! set network input tensor
-            std::shared_ptr<lite::Tensor> input_tensor =
-                    network->get_io_tensor(i.first);
-            input_tensor->reset(src, layout);
+                //! only a shape was given (no data): allocate a placeholder tensor
+                if (tensor.storage().empty()) {
+                    mgb::HostTensorND hv;
+                    hv.comp_node(mgb::CompNode::default_cpu(), true)
+                            .dtype(megdnn::DType::from_enum(type_map[layout.data_type]))
+                            .resize(tensor.shape());
+                    mgb::dt_byte* raw_ptr = hv.raw_ptr();
+                    //! fill every byte of the tensor with 0x01 (not the value 1)
+                    memset((char*)raw_ptr, 1,
+                           hv.layout().total_nr_elems() * hv.dtype().size());
+                    parser.inputs[i.first] = std::move(hv);
+                    input_tensor->reset(raw_ptr, layout);
+                } else {
+                    //! data was provided: point the network input tensor at it
+                    input_tensor->reset(src, layout);
+                }
+            }
         }
     }
 }
@@ -67,22 +124,58 @@ void InputOption::config_model_internel<ModelMdl>(
     } else if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
         auto&& parser = model->get_input_parser();
         auto&& network = model->get_mdl_load_result();
-        auto tensormap = network.tensor_map;
-        for (auto& i : parser.inputs) {
-            mgb_assert(
-                    tensormap.find(i.first) != tensormap.end(),
-                    "can't find tesnor named %s", i.first.c_str());
-            auto& in = tensormap.find(i.first)->second;
-            if (i.second.storage().empty()) {
+        auto&& tensormap = network.tensor_map;
+
+        if (m_force_batch_size > 0) {
+            mgb_log_warn("force set batch size to %d", m_force_batch_size);
+            for (auto& iter : tensormap) {
+                auto& in = iter.second;
                 mgb::HostTensorND hv;
+                mgb::TensorShape new_shape = in->shape();
+                new_shape[0] = m_force_batch_size;
                 hv.comp_node(mgb::CompNode::default_cpu(), true)
                         .dtype(in->dtype())
-                        .resize(i.second.shape());
+                        .resize(new_shape);
                 mgb::dt_byte* raw_ptr = hv.raw_ptr();
-                memset((char*)raw_ptr, 1, hv.layout().total_nr_elems());
+
+                //! copy given batch data into new tensor
+                size_t batch_stride = in->dtype().size() *
+                                      in->layout().total_nr_elems() / (in->shape()[0]);
+                size_t curr_batch_size = m_force_batch_size;
+
+                //! copy data from origin input_tensor
+                size_t init_batch = in->shape()[0];
+                while (curr_batch_size > init_batch) {
+                    memcpy((char*)raw_ptr, (char*)(in->raw_ptr()),
+                           batch_stride * init_batch);
+                    curr_batch_size -= init_batch;
+                    raw_ptr += batch_stride * init_batch;
+                }
+                memcpy((char*)raw_ptr, (char*)(in->raw_ptr()),
+                       batch_stride * curr_batch_size);
+                //! set input tensor
                 in->copy_from(hv);
-            } else {
-                in->copy_from(i.second);
+                parser.inputs[iter.first] = std::move(hv);
+            }
+        } else {
+            for (auto& i : parser.inputs) {
+                mgb_assert(
+                        tensormap.find(i.first) != tensormap.end(),
+                        "can't find tesnor named %s", i.first.c_str());
+                auto& in = tensormap.find(i.first)->second;
+                if (i.second.storage().empty()) {
+                    mgb::HostTensorND hv;
+                    hv.comp_node(mgb::CompNode::default_cpu(), true)
+                            .dtype(in->dtype())
+                            .resize(i.second.shape());
+                    mgb::dt_byte* raw_ptr = hv.raw_ptr();
+                    memset((char*)raw_ptr, 1,
+                           hv.layout().total_nr_elems() * hv.dtype().size());
+                    in->copy_from(hv);
+                    parser.inputs[i.first] = std::move(hv);
+                } else {
+                    in->copy_from(i.second);
+                }
             }
         }
     }
@@ -191,6 +284,7 @@ void IOdumpOption::config_model_internel<ModelMdl>(
 using namespace lar;
 
 void InputOption::update() {
+    data_path.clear();
     m_option_name = "input";
     size_t start = 0;
     auto end = FLAGS_input.find(";", start);
@@ -201,6 +295,7 @@ void InputOption::update() {
         end = FLAGS_input.find(";", start);
     }
     data_path.emplace_back(FLAGS_input.substr(start));
+    m_force_batch_size = FLAGS_batch_size;
 }
 
 std::shared_ptr<lar::OptionBase> lar::InputOption::create_option() {
@@ -283,7 +378,10 @@ void IOdumpOption::config_model(
 ////////////////////// Input gflags ////////////////////////
 DEFINE_string(
         input, "", "Set up inputs data for model --input [ file_path | data_string]");
-
+DEFINE_int32(
+        batch_size, -1,
+        "set the batch size of input(especially for global layout transform "
+        "optimization working on)");
 ////////////////////// OprIOdump gflags ////////////////////////
 
 DEFINE_string(io_dump, "", "set the io dump file path in text format");
@@ -299,4 +397,5 @@ DEFINE_string(
 DEFINE_bool(copy_to_host, false, "copy device data to host");
 
 REGIST_OPTION_CREATOR(input, lar::InputOption::create_option);
+
 REGIST_OPTION_CREATOR(iodump, lar::IOdumpOption::create_option);
diff --git a/lite/load_and_run/src/options/io_options.h b/lite/load_and_run/src/options/io_options.h
index 595e2cb81..8d1c5dfc4 100644
--- a/lite/load_and_run/src/options/io_options.h
+++ b/lite/load_and_run/src/options/io_options.h
@@ -13,7 +13,7 @@ DECLARE_bool(io_dump_stderr);
 DECLARE_string(bin_io_dump);
 DECLARE_string(bin_out_dump);
 DECLARE_bool(copy_to_host);
-
+DECLARE_int32(batch_size);
 namespace lar {
 
 /*!
@@ -22,7 +22,7 @@ namespace lar {
 class InputOption final : public OptionBase {
 public:
     //! static function for registe options
-    static bool is_valid() { return !FLAGS_input.empty(); };
+    static bool is_valid() { return !FLAGS_input.empty() || FLAGS_batch_size > 0; };
     static std::shared_ptr<OptionBase> create_option();
 
     void config_model(
@@ -40,6 +40,7 @@ private:
 
     std::string m_option_name;
     std::vector<std::string> data_path;  // data string or data file path
+    int32_t m_force_batch_size;
 };
 
 class IOdumpOption : public OptionBase {
diff --git a/lite/load_and_run/src/options/layout_trans_options.cpp b/lite/load_and_run/src/options/layout_trans_options.cpp
index 6c5d6d32c..e5aad6dd9 100644
--- a/lite/load_and_run/src/options/layout_trans_options.cpp
+++ b/lite/load_and_run/src/options/layout_trans_options.cpp
@@ -11,7 +11,7 @@ void GoptLayoutOption::config_model_internel<ModelLite>(
         RuntimeParam& runtime_param, std::shared_ptr<ModelLite> model) {
     if (runtime_param.stage == RunStage::AFTER_NETWORK_CREATED) {
         if (m_layout_transform) {
-            LITE_LOG("using global layout transform optimization\n");
+            LITE_LOG("using global layout transform optimization");
             if (m_layout_transform_target ==
                 mgb::gopt::GraphTuningOptions::Target::CPU) {
                 model->get_config().device_type = LiteDeviceType::LITE_CPU;
@@ -43,67 +43,25 @@ void GoptLayoutOption::config_model_internel<ModelMdl>(
         RuntimeParam& runtime_param, std::shared_ptr<ModelMdl> model) {
     if (runtime_param.stage == RunStage::AFTER_MODEL_LOAD) {
         if (m_layout_transform) {
-            mgb_log_debug("update input shape for global layout transform\n");
             auto&& load_result = model->get_mdl_load_result();
-            if (m_force_batch_size > 0) {
-                for (auto&& i : load_result.tensor_map) {
-                    auto& in = i.second;
-                    mgb::TensorShape new_shape = in->shape();
-                    new_shape[0] = m_force_batch_size;
-                    mgb::HostTensorND new_tensor;
-                    new_tensor.comp_node(mgb::CompNode::default_cpu(), true)
-                            .dtype(in->dtype())
-                            .resize(new_shape);
-                    mgb::dt_byte* raw_ptr = new_tensor.raw_ptr();
-                    memset((char*)raw_ptr, 1, new_tensor.layout().total_nr_elems());
-                    in->copy_from(new_tensor);
-                }
-            }
             for (auto&& item : load_result.output_var_list) {
                 if (item.shape()[0] > 1) {
                     mgb_log_warn(
                             " model may be dumped with multi batch and will cost lots "
-                            "of time to profile during global layout transform!!!\n");
-                }
-            }
-            //! update output varlist when input shape maybe change(some pass excution
-            //! time depends on the shape of init input)
-            mgb::thin_hash_table::ThinHashMap<mgb::cg::SymbolVar, mgb::cg::SymbolVar>
-                    varmap;
-            mgb::cg::DepOprIter dep([&](mgb::cg::OperatorNodeBase* opr) {
-                if (auto h2d = opr->try_cast_final<mgb::opr::Host2DeviceCopy>()) {
-                    auto param = h2d->param();
-                    mgb::TensorShape new_shape = h2d->host_data()->shape();
-                    std::shared_ptr<mgb::HostTensorND> new_tensor =
-                            std::make_shared<mgb::HostTensorND>(
-                                    h2d->host_data()->comp_node(), new_shape,
-                                    h2d->host_data()->dtype());
-                    new_tensor->only_reset_raw_storage(h2d->host_data()->storage());
-                    auto h2d_opr = mgb::opr::Host2DeviceCopy::make(
-                            *h2d->owner_graph(), new_tensor, param, h2d->config());
-                    varmap[h2d->output(0)] = h2d_opr;
-                }
-            });
-
-            for (auto&& i : load_result.output_var_list)
-                dep.add(i);
-
-            if (!varmap.empty()) {
-                auto output_vars =
-                        mgb::cg::replace_vars(load_result.output_var_list, varmap);
-                for (size_t i = 0; i < load_result.output_var_list.size(); ++i) {
-                    output_vars[i].rename(
-                            load_result.output_var_list[i].node()->name());
+                            "of time to profile during global layout transform!!!");
                 }
-                load_result.output_var_list = output_vars;
             }
         }
     } else if (runtime_param.stage == RunStage::GLOBAL_OPTIMIZATION) {
         if (m_layout_transform) {
-            mgb_log("using global layout transform optimization\n");
+            mgb_log("using global layout transform optimization");
             auto&& load_result = model->get_mdl_load_result();
-            load_result.output_var_list = mgb::gopt::layout_transform(
+            auto output_vars = mgb::gopt::layout_transform(
                     load_result.output_var_list, m_layout_transform_target);
+            for (size_t i = 0; i < load_result.output_var_list.size(); ++i) {
+                output_vars[i].rename(load_result.output_var_list[i].node()->name());
+            }
+            load_result.output_var_list = output_vars;
 
             if (!m_layout_transform_dump_file.empty()) {
                 auto out_file = mgb::serialization::OutputFile::make_fs(
@@ -176,8 +134,6 @@ void GoptLayoutOption::update() {
     }
     m_layout_transform_dump_file = FLAGS_layout_transform_dump;
 
-    m_force_batch_size = FLAGS_layout_transform_batch_size;
-
     m_option = {
             {"layout_transform", lar::String::make("")},
     };
@@ -204,14 +160,6 @@ bool GoptLayoutOption::is_valid() {
         }
     }
     ret = ret || !FLAGS_layout_transform_dump.empty();
-    if (FLAGS_layout_transform_batch_size > 0) {
-        mgb_assert(
-                FLAGS_layout_transform_batch_size > 0 &&
-                        !FLAGS_layout_transform.empty(),
-                "\"layout-transform-batch-size\" should be set with "
-                "\"layout-transform\"");
-        ret = ret || FLAGS_layout_transform_batch_size > 0;
-    }
     return ret || m_valid;
 }
 
@@ -264,8 +212,5 @@ DEFINE_string(
         "The computing graph after global layout transform will be dumped to the given "
         "file path.");
 
-DEFINE_int32(
-        layout_transform_batch_size, -1,
-        "the batch size of input for global layout transform optimization working on");
 REGIST_OPTION_CREATOR(gopt_layout, lar::GoptLayoutOption::create_option);
 REGIST_OPTION_VALIDATER(gopt_layout, lar::GoptLayoutOption::set_valid);
diff --git a/lite/load_and_run/src/options/layout_trans_options.h b/lite/load_and_run/src/options/layout_trans_options.h
index 911bb2b72..ac9a8642f 100644
--- a/lite/load_and_run/src/options/layout_trans_options.h
+++ b/lite/load_and_run/src/options/layout_trans_options.h
@@ -5,7 +5,6 @@
 #include "models/model.h"
 #include "option_base.h"
 DECLARE_string(layout_transform);
-DECLARE_int32(layout_transform_batch_size);
 DECLARE_string(layout_transform_dump);
 
 namespace lar {
@@ -41,6 +40,5 @@ private:
     mgb::gopt::GraphTuningOptions::Target m_layout_transform_target;
     static bool m_valid;
     OptionValMap m_option;
-    int32_t m_force_batch_size;
 };
 }  // namespace lar
diff --git a/lite/load_and_run/src/options/plugin_options.cpp b/lite/load_and_run/src/options/plugin_options.cpp
index d6cfa7cd0..3c0fa8d68 100644
--- a/lite/load_and_run/src/options/plugin_options.cpp
+++ b/lite/load_and_run/src/options/plugin_options.cpp
@@ -199,7 +199,7 @@ void DebugOption::format_and_print(
 
     std::stringstream ss;
     ss << table;
-    LITE_LOG("%s\n\n", ss.str().c_str());
+    LITE_LOG("\n%s\n", ss.str().c_str());
 }
 
 template <>
@@ -243,7 +243,7 @@ void DebugOption::format_and_print(
 
     std::stringstream ss;
     ss << table;
-    mgb_log("%s\n\n", ss.str().c_str());
+    mgb_log("\n%s\n", ss.str().c_str());
 }
 
 template <>
diff --git a/lite/load_and_run/src/options/strategy_options.cpp b/lite/load_and_run/src/options/strategy_options.cpp
index 4b6241e36..cb9f783c8 100644
--- a/lite/load_and_run/src/options/strategy_options.cpp
+++ b/lite/load_and_run/src/options/strategy_options.cpp
@@ -32,13 +32,19 @@ void StrategyOption::config_model(
         runtime_param.run_iter = run_iter;
         runtime_param.threads = threads;
         runtime_param.testcase_num = 1;
+    } else if (runtime_param.stage == RunStage::UPDATE_IO) {
+        if (model->type() == ModelType::MEGDL_MODEL) {
+            auto model_ptr = std::static_pointer_cast<ModelMdl>(model);
+            //! update input and output related varnode
+            model_ptr->update_io();
+        }
     } else if (runtime_param.stage == RunStage::BEFORE_OUTSPEC_SET) {
         if (model->type() == ModelType::MEGDL_MODEL) {
             auto model_ptr = std::static_pointer_cast<ModelMdl>(model);
             auto num = model_ptr->get_testcase_num();
             if (num != 0)
                 runtime_param.testcase_num = num;
-
+            //! make output specification
             model_ptr->make_output_spec();
         }
     }
diff --git a/lite/load_and_run/src/strategys/strategy_fitting.cpp b/lite/load_and_run/src/strategys/strategy_fitting.cpp
index 5f7754e66..4849a3a25 100644
--- a/lite/load_and_run/src/strategys/strategy_fitting.cpp
+++ b/lite/load_and_run/src/strategys/strategy_fitting.cpp
@@ -205,9 +205,9 @@ void OptionsTimeProfiler::profile_with_given_options(
         //! after load configure
         auto config_model_before_runing = [&]() {
             for (auto stage :
-                 {RunStage::AFTER_MODEL_LOAD, RunStage::GLOBAL_OPTIMIZATION,
-                  RunStage::BEFORE_OUTSPEC_SET, RunStage::AFTER_OUTSPEC_SET,
-                  RunStage::MODEL_RUNNING}) {
+                 {RunStage::AFTER_MODEL_LOAD, RunStage::UPDATE_IO,
+                  RunStage::GLOBAL_OPTIMIZATION, RunStage::BEFORE_OUTSPEC_SET,
+                  RunStage::AFTER_OUTSPEC_SET, RunStage::MODEL_RUNNING}) {
                 runtime_param.stage = stage;
                 stage_config_model();
             }
@@ -453,9 +453,9 @@ void FittingStrategy::dump_best_options_with_model() {
 
     //! get model binary data after optimized
     for (auto stage :
-         {RunStage::AFTER_MODEL_LOAD, RunStage::GLOBAL_OPTIMIZATION,
-          RunStage::BEFORE_OUTSPEC_SET, RunStage::AFTER_OUTSPEC_SET,
-          RunStage::MODEL_RUNNING}) {
+         {RunStage::AFTER_MODEL_LOAD, RunStage::UPDATE_IO,
+          RunStage::GLOBAL_OPTIMIZATION, RunStage::BEFORE_OUTSPEC_SET,
+          RunStage::AFTER_OUTSPEC_SET, RunStage::MODEL_RUNNING}) {
         runtime_param.stage = stage;
         stage_config_model();
     }
@@ -502,9 +502,9 @@ void FittingStrategy::AutoCleanFile::dump_model() {
     model->load_model();
     //! get model binary data after optimized
     for (auto stage :
-         {RunStage::AFTER_MODEL_LOAD, RunStage::GLOBAL_OPTIMIZATION,
-          RunStage::BEFORE_OUTSPEC_SET, RunStage::AFTER_OUTSPEC_SET,
-          RunStage::MODEL_RUNNING}) {
+         {RunStage::AFTER_MODEL_LOAD, RunStage::UPDATE_IO,
+          RunStage::GLOBAL_OPTIMIZATION, RunStage::BEFORE_OUTSPEC_SET,
+          RunStage::AFTER_OUTSPEC_SET, RunStage::MODEL_RUNNING}) {
         runtime_param.stage = stage;
         stage_config_model();
     }
diff --git a/lite/load_and_run/src/strategys/strategy_normal.cpp b/lite/load_and_run/src/strategys/strategy_normal.cpp
index 37bf037c0..c0e842ab4 100644
--- a/lite/load_and_run/src/strategys/strategy_normal.cpp
+++ b/lite/load_and_run/src/strategys/strategy_normal.cpp
@@ -53,8 +53,9 @@ void NormalStrategy::run_subline() {
     //! after load configure
     auto config_after_load = [&]() {
         for (auto stage :
-             {RunStage::AFTER_MODEL_LOAD, RunStage::GLOBAL_OPTIMIZATION,
-              RunStage::BEFORE_OUTSPEC_SET, RunStage::AFTER_OUTSPEC_SET}) {
+             {RunStage::AFTER_MODEL_LOAD, RunStage::UPDATE_IO,
+              RunStage::GLOBAL_OPTIMIZATION, RunStage::BEFORE_OUTSPEC_SET,
+              RunStage::AFTER_OUTSPEC_SET}) {
             m_runtime_param.stage = stage;
             stage_config_model();
         }
diff --git a/lite/test/test_io_options.cpp b/lite/test/test_io_options.cpp
new file mode 100644
index 000000000..ab8898a61
--- /dev/null
+++ b/lite/test/test_io_options.cpp
@@ -0,0 +1,70 @@
+#include <gtest/gtest.h>
+#include <string.h>
+#include <memory>
+#include <string>
+#include "test_options.h"
+
+using namespace lar;
+DECLARE_bool(lite);
+DECLARE_string(input);
+DECLARE_int32(batch_size);
+DECLARE_int32(iter);
+namespace {
+//! Scoped flag guards (see test_options.h). The string/int32 guards put the
+//! value given here back into the flag when they go out of scope.
+STRING_OPTION_WRAP(input, "");
+INT32_OPTION_WRAP(batch_size, -1);
+BOOL_OPTION_WRAP(lite);
+INT32_OPTION_WRAP(iter, 10);
+
+//! Run the normal strategy once per input / batch_size case. Shared by the
+//! INPUT and INPUT_LITE tests, which differ only in the flags they hold while
+//! this runs.
+void run_io_cases() {
+    {
+        //! input flag pointing at a .npy file
+        std::string model_path = "./resnet50.mge";
+        TEST_STRING_OPTION(input, "data:./resnet50_input.npy");
+    }
+    {
+        //! input flag pointing at a .json file
+        std::string model_path = "./add_demo.mge";
+        TEST_STRING_OPTION(input, "data:add_demo_input.json");
+    }
+    {
+        //! input flag pointing at a .ppm image
+        std::string model_path = "./resnet50_uint8.mge";
+        TEST_STRING_OPTION(input, "data:./cat.ppm");
+    }
+    {
+        //! input flag carrying an inline bracketed value list
+        std::string model_path = "./add_demo.mge";
+        TEST_STRING_OPTION(input, "data:[2.0,3.0,4.0]");
+    }
+    {
+        //! input flag carrying an inline brace list (presumably a shape;
+        //! TODO confirm against the input option parser)
+        std::string model_path = "./shufflenet.mge";
+        TEST_STRING_OPTION(input, "data:{2,3,224,224}");
+    }
+    {
+        //! batch_size flag set to 1, 5 and 11 in turn
+        std::string model_path = "./resnet50_b10.mdl";
+        TEST_INT32_OPTION(batch_size, 1);
+        TEST_INT32_OPTION(batch_size, 5);
+        TEST_INT32_OPTION(batch_size, 11);
+    }
+}
+}  // anonymous namespace
+
+//! All I/O cases with the iter flag held at 1.
+TEST(TestLarIO, INPUT) {
+    DEFINE_INT32_WRAP(iter, 1);
+    run_io_cases();
+}
+
+//! Same cases with the lite flag also held set.
+TEST(TestLarIO, INPUT_LITE) {
+    DEFINE_INT32_WRAP(iter, 1);
+    DEFINE_BOOL_WRAP(lite);
+    run_io_cases();
+}
diff --git a/lite/test/test_lar_options.cpp b/lite/test/test_lar_options.cpp
index 5093ccb73..9ee8c2e83 100644
--- a/lite/test/test_lar_options.cpp
+++ b/lite/test/test_lar_options.cpp
@@ -24,7 +24,7 @@ BOOL_OPTION_WRAP(cuda);
 }  // anonymous namespace
 
 TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE) {
-    DEFINE_WRAP(cpu);
+    DEFINE_BOOL_WRAP(cpu);
     std::string model_path = "./shufflenet.mge";
 
     TEST_BOOL_OPTION(optimize_for_inference);
@@ -33,7 +33,7 @@ TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE) {
 #if LITE_WITH_OPENCL
 TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE_OPENCL) {
     REQUIRE_OPENCL();
-    DEFINE_WRAP(opencl);
+    DEFINE_BOOL_WRAP(opencl);
     std::string model_path = "./shufflenet.mge";
 
     TEST_BOOL_OPTION(optimize_for_inference);
@@ -43,7 +43,7 @@ TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE_OPENCL) {
 #if LITE_WITH_CUDA
 TEST(TestLarOption, OPTIMIZE_FOR_INFERENCE_CUDA) {
     REQUIRE_CUDA();
-    DEFINE_WRAP(cuda);
+    DEFINE_BOOL_WRAP(cuda);
     std::string model_path = "./shufflenet.mge";
 
     TEST_BOOL_OPTION(optimize_for_inference);
diff --git a/lite/test/test_layout_options.cpp b/lite/test/test_layout_options.cpp
index be9f57975..e0cd2b07c 100644
--- a/lite/test/test_layout_options.cpp
+++ b/lite/test/test_layout_options.cpp
@@ -20,6 +20,7 @@ DECLARE_bool(enable_nchw64);
 DECLARE_bool(enable_nhwcd4);
 DECLARE_bool(enable_nchw44_dot);
 DECLARE_bool(fast_run);
+DECLARE_int32(iter);
 namespace {
 BOOL_OPTION_WRAP(enable_nchw4);
 BOOL_OPTION_WRAP(enable_chwn4);
@@ -30,6 +31,7 @@ BOOL_OPTION_WRAP(enable_nchw64);
 BOOL_OPTION_WRAP(enable_nhwcd4);
 BOOL_OPTION_WRAP(enable_nchw44_dot);
 BOOL_OPTION_WRAP(fast_run);
+INT32_OPTION_WRAP(iter, 10);
 
 BOOL_OPTION_WRAP(lite);
 BOOL_OPTION_WRAP(cpu);
@@ -39,7 +41,8 @@ BOOL_OPTION_WRAP(cuda);
 }  // anonymous namespace
 
 TEST(TestLarLayout, X86_CPU) {
-    DEFINE_WRAP(cpu);
+    DEFINE_INT32_WRAP(iter, 1);
+    DEFINE_BOOL_WRAP(cpu);
     std::string model_path = "./shufflenet.mge";
 
     TEST_BOOL_OPTION(enable_nchw4);
@@ -52,8 +55,9 @@ TEST(TestLarLayout, X86_CPU) {
 }
 
 TEST(TestLarLayout, X86_CPU_LITE) {
-    DEFINE_WRAP(cpu);
-    DEFINE_WRAP(lite);
+    DEFINE_INT32_WRAP(iter, 1);
+    DEFINE_BOOL_WRAP(cpu);
+    DEFINE_BOOL_WRAP(lite);
     std::string model_path = "./shufflenet.mge";
 
     TEST_BOOL_OPTION(enable_nchw4);
@@ -65,18 +69,20 @@ TEST(TestLarLayout, X86_CPU_LITE) {
 }
 
 TEST(TestLarLayoutFastRun, CPU_LITE) {
-    DEFINE_WRAP(cpu);
-    DEFINE_WRAP(lite);
+    DEFINE_INT32_WRAP(iter, 1);
+    DEFINE_BOOL_WRAP(cpu);
+    DEFINE_BOOL_WRAP(lite);
     std::string model_path = "./shufflenet.mge";
     {
-        DEFINE_WRAP(enable_nchw44);
-        DEFINE_WRAP(fast_run);
+        DEFINE_BOOL_WRAP(enable_nchw44);
+        DEFINE_BOOL_WRAP(fast_run);
         run_NormalStrategy(model_path);
     }
 }
 #if LITE_WITH_CUDA
 TEST(TestLarLayout, CUDA) {
-    DEFINE_WRAP(cuda);
+    DEFINE_INT32_WRAP(iter, 1);
+    DEFINE_BOOL_WRAP(cuda);
     std::string model_path = "./shufflenet.mge";
     TEST_BOOL_OPTION(enable_nchw4);
     TEST_BOOL_OPTION(enable_chwn4);
@@ -87,8 +93,9 @@ TEST(TestLarLayout, CUDA) {
 }
 
 TEST(TestLarLayout, CUDA_LITE) {
-    DEFINE_WRAP(cuda);
-    DEFINE_WRAP(lite);
+    DEFINE_INT32_WRAP(iter, 1);
+    DEFINE_BOOL_WRAP(cuda);
+    DEFINE_BOOL_WRAP(lite);
     std::string model_path = "./shufflenet.mge";
 
     TEST_BOOL_OPTION(enable_nchw4);
diff --git a/lite/test/test_options.h b/lite/test/test_options.h
index 81f8be47f..39501e62b 100644
--- a/lite/test/test_options.h
+++ b/lite/test/test_options.h
@@ -23,11 +23,54 @@ void run_NormalStrategy(std::string model_path);
         ~BoolOptionWrap_##option() { FLAGS_##option = false; } \
     };
 
-#define DEFINE_WRAP(option) BoolOptionWrap_##option flags_##option;
+//! Declare StringOptionWrap_<option>: a scope guard that assigns val to
+//! FLAGS_<option> when constructed and default_val when destroyed. The guard is
+//! non-copyable so a copy cannot reset the flag while the original is alive.
+#define STRING_OPTION_WRAP(option, default_val)                               \
+    struct StringOptionWrap_##option {                                        \
+        explicit StringOptionWrap_##option(const char* val) {                 \
+            FLAGS_##option = val;                                             \
+        }                                                                     \
+        ~StringOptionWrap_##option() { FLAGS_##option = default_val; }        \
+        StringOptionWrap_##option(const StringOptionWrap_##option&) = delete; \
+        StringOptionWrap_##option& operator=(                                 \
+                const StringOptionWrap_##option&) = delete;                   \
+    };
+
+//! Int32 counterpart of STRING_OPTION_WRAP, declaring Int32OptionWrap_<option>.
+#define INT32_OPTION_WRAP(option, default_val)                              \
+    struct Int32OptionWrap_##option {                                       \
+        explicit Int32OptionWrap_##option(int32_t val) {                    \
+            FLAGS_##option = val;                                           \
+        }                                                                   \
+        ~Int32OptionWrap_##option() { FLAGS_##option = default_val; }       \
+        Int32OptionWrap_##option(const Int32OptionWrap_##option&) = delete; \
+        Int32OptionWrap_##option& operator=(                                \
+                const Int32OptionWrap_##option&) = delete;                  \
+    };
+
+//! Instantiate a guard object named flags_<option>; the matching
+//! *_OPTION_WRAP(option) must already have been expanded in the including file.
+#define DEFINE_BOOL_WRAP(option) BoolOptionWrap_##option flags_##option;
+#define DEFINE_STRING_WRAP(option, value) \
+    StringOptionWrap_##option flags_##option(value);
+#define DEFINE_INT32_WRAP(option, value) Int32OptionWrap_##option flags_##option(value);
 
 #define TEST_BOOL_OPTION(option)        \
     {                                   \
-        DEFINE_WRAP(option);            \
+        DEFINE_BOOL_WRAP(option);       \
         run_NormalStrategy(model_path); \
     }
+//! Call run_NormalStrategy(model_path) while FLAGS_<option> holds value; a
+//! variable named model_path must be in scope where the macro is expanded.
+#define TEST_STRING_OPTION(option, value)  \
+    {                                      \
+        DEFINE_STRING_WRAP(option, value); \
+        run_NormalStrategy(model_path);    \
+    }
+#define TEST_INT32_OPTION(option, value)  \
+    {                                     \
+        DEFINE_INT32_WRAP(option, value); \
+        run_NormalStrategy(model_path);   \
+    }
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/gopt/impl/framework.cpp b/src/gopt/impl/framework.cpp
index becf75d2a..6a72ff9f0 100644
--- a/src/gopt/impl/framework.cpp
+++ b/src/gopt/impl/framework.cpp
@@ -64,7 +64,8 @@ OperatorNodeBase* SubGraph::Rewriter::auto_replace_outputs(OperatorNodeBase* opr
             bool v0 = out0[i]->contain_flag(VarNode::Flag::VOLATILE_CONTENT),
                  v1 = out1[i]->contain_flag(VarNode::Flag::VOLATILE_CONTENT);
             mgb_assert(v0 == v1, "%s", err_msg().c_str());
-
+            //! name the new var (out1[i]) after the var it stands in for (out0[i])
+            out1[i]->name(out0[i]->cname());
             auto&& ins = m_varmap.insert({out0[i], {true, nullptr}});
             mgb_assert(
                     ins.second || ins.first->second.first,
-- 
GitLab