From dc0ab9b64ea15a043a96b192f426eabd02dddc97 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 8 Aug 2022 15:25:07 +0800 Subject: [PATCH] feat(lite): replace warp when src is discrete input GitOrigin-RevId: 2bf7980ac6373b691081ab7be9975ec6fa57f8ae --- lite/include/lite/network.h | 17 ++ lite/lite-c/include/lite-c/network_c.h | 17 ++ lite/lite-c/src/network.cpp | 20 ++- lite/pylite/megenginelite/network.py | 33 ++++ lite/pylite/test/test_network.py | 42 +++++ lite/src/mge/network_impl.cpp | 182 +++++++++++++++++++++ lite/src/mge/network_impl.h | 17 ++ lite/src/network.cpp | 17 ++ lite/src/network_impl_base.h | 16 ++ lite/test/test_network.cpp | 90 ++++++++++ lite/test/test_network_c.cpp | 42 +++++ src/core/include/megbrain/graph/helper.h | 2 +- src/gopt/include/megbrain/gopt/framework.h | 11 +- src/opr/impl/imgproc.cpp | 12 -- src/opr/include/megbrain/opr/imgproc.h | 2 +- 15 files changed, 500 insertions(+), 20 deletions(-) diff --git a/lite/include/lite/network.h b/lite/include/lite/network.h index b06efa0d5..2b7e5abf8 100644 --- a/lite/include/lite/network.h +++ b/lite/include/lite/network.h @@ -117,6 +117,9 @@ struct LITE_API Options { * * @param auto_optimize_inference lite will detect the device information add * set the options heuristically + * + * @param discrete_input_name configure which input is composed of discrete + * multiple tensors */ struct LITE_API Config { bool has_compression = false; @@ -126,6 +129,7 @@ struct LITE_API Config { std::string bare_model_cryption_name = {}; Options options = {}; bool auto_optimize_inference = false; + std::string discrete_input_name = {}; }; /*! @@ -289,9 +293,22 @@ public: std::shared_ptr get_io_tensor( std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_IO); + /** @brief get the network input tensors which input consists of discrete multiple + * tensors, layout (1, c, h, w) + * + * @param io_name the name of the tensor + * @param phase indicate the tensor is input tensor + */ + std::vector> get_io_tensors( + std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_INPUT); + //! get the network input tensor by index std::shared_ptr get_input_tensor(size_t index); + //! get the network input tensors which input consists of discrete multiple tensors + //! by index + std::vector> get_input_tensors(size_t index); + //! get the network output tensor by index std::shared_ptr get_output_tensor(size_t index); diff --git a/lite/lite-c/include/lite-c/network_c.h b/lite/lite-c/include/lite-c/network_c.h index 7634304a3..8b7316a74 100644 --- a/lite/lite-c/include/lite-c/network_c.h +++ b/lite/lite-c/include/lite-c/network_c.h @@ -103,6 +103,9 @@ extern LITE_API const LiteOptions default_option; *\param auto_optimize_inference lite will detect the device information add * set the options heuristically + * + * \param discrete_input_name configure which input is composed of discrete + * multiple tensors */ typedef struct LiteConfig { int has_compression; @@ -112,6 +115,7 @@ typedef struct LiteConfig { const char* bare_model_cryption_name; LiteOptions options; int auto_optimize_inference; + const char* discrete_input_name; } LiteConfig; //! 
get default config @@ -298,6 +302,19 @@ LITE_API int LITE_get_io_tensor( LiteNetwork network, const char* io_name, LiteTensorPhase phase, LiteTensor* tensor); +/** + * \brief get the n'th tensor in the network input tensors whose input + * consists of discrete multiple tensors and name is io_name, layout (1, c, h, w) + * \param[in] network The loaded model + * \param[in] io_name The input name + * \param[in] n_idx The index of tensor + * \param[in] phase The tensor phase + * \param[out] tensor The IO tensor get from the network + */ +LITE_API int LITE_get_io_tensors( + LiteNetwork network, const char* io_name, size_t n_idx, LiteTensorPhase phase, + LiteTensor* tensor); + /** * \brief get the input tensor name in the order in loaded model * \param[in] network The loaded model diff --git a/lite/lite-c/src/network.cpp b/lite/lite-c/src/network.cpp index 8325c73c3..0c8ec8a4b 100644 --- a/lite/lite-c/src/network.cpp +++ b/lite/lite-c/src/network.cpp @@ -43,7 +43,8 @@ LiteConfig default_config_t = { .backend = LiteBackend::LITE_DEFAULT, .bare_model_cryption_name = nullptr, .options = default_option, - .auto_optimize_inference = false}; + .auto_optimize_inference = false, + .discrete_input_name = nullptr}; LiteConfig* default_config() { return &default_config_t; } @@ -135,6 +136,9 @@ lite::Config convert_to_lite_config(const LiteConfig c_config) { lite_config.options.enable_nchw64 = c_config.options.enable_nchw64; lite_config.auto_optimize_inference = c_config.auto_optimize_inference; + if (c_config.discrete_input_name) { + lite_config.discrete_input_name = c_config.discrete_input_name; + } return lite_config; } @@ -274,6 +278,20 @@ int LITE_get_io_tensor( LITE_CAPI_END(); } +int LITE_get_io_tensors( + LiteNetwork network, const char* io_name, size_t n_idx, LiteTensorPhase phase, + LiteTensor* tensor) { + LITE_CAPI_BEGIN(); + LITE_ASSERT(network, "The network pass to LITE api is null"); + auto io_tensors = + static_cast(network)->get_io_tensors(io_name, phase); + LITE_ASSERT( + n_idx < io_tensors.size(), "n_idx should be less than %zu", + io_tensors.size()); + *tensor = io_tensors[n_idx].get(); + LITE_CAPI_END(); +} + int LITE_get_input_name(const LiteNetwork network, size_t index, const char** name) { LITE_CAPI_BEGIN(); LITE_ASSERT(network && name, "The network pass to LITE api is null"); diff --git a/lite/pylite/megenginelite/network.py b/lite/pylite/megenginelite/network.py index 57c984ab4..390e7627c 100644 --- a/lite/pylite/megenginelite/network.py +++ b/lite/pylite/megenginelite/network.py @@ -173,6 +173,8 @@ class LiteConfig(Structure): auto_optimize_inference: lite will detect the device information add set the options heuristically + discrete_input_name: configure which input is composed of discrete multiple tensors + Examples: .. 
code-block:: @@ -193,6 +195,7 @@ class LiteConfig(Structure): ("_bare_model_cryption_name", c_char_p), ("options", LiteOptions), ("auto_optimize_inference", c_int), + ("discrete_input_name", c_char_p), ] def __init__(self, device_type=LiteDeviceType.LITE_CPU, option=None): @@ -207,6 +210,7 @@ class LiteConfig(Structure): self.has_compression = 0 self.backend = LiteBackend.LITE_DEFAULT self.auto_optimize_inference = 0 + self.discrete_input_name = c_char_p(b"") @property def bare_model_cryption_name(self): @@ -229,6 +233,7 @@ class LiteConfig(Structure): "bare_model_cryption_name": self.bare_model_cryption_name, "options": self.options, "auto_optimize_inference": self.auto_optimize_inference, + "discrete_input_name": self.discrete_input_name, } return data.__repr__() @@ -536,6 +541,10 @@ class _NetworkAPI(_LiteCObjBase): [c_char_p, c_size_t, LiteConfig, POINTER(_LiteNetworkIO)], ), ("LITE_extra_configure", [_Cnetwork, LiteExtraConfig]), + ( + "LITE_get_io_tensors", + [_Cnetwork, c_char_p, c_size_t, c_int, POINTER(_Ctensor)], + ), ] @@ -736,6 +745,30 @@ class LiteNetwork(object): tensor.update() return tensor + def get_io_tensors(self, name, n_idx, phase=LiteTensorPhase.LITE_INPUT): + """ + get the n_idx'th tensor in the network input tensors whose + input consists of discrete multiple tensors and tensor name is name + + Args: + name: the name of input tensor + n_idx: the tensor index + phase: the type of LiteTensor, this is useful to separate input tensor with the same name + + Returns: + the tensors with given name and type + """ + if type(name) == str: + c_name = c_char_p(name.encode("utf-8")) + else: + c_name = c_char_p(name) + tensor = LiteTensor(physic_construct=False) + self._api.LITE_get_io_tensors( + self._network, c_name, n_idx, phase, byref(tensor._tensor) + ) + tensor.update() + return tensor + def get_input_name(self, index): """ get the input name by the index in the network diff --git a/lite/pylite/test/test_network.py b/lite/pylite/test/test_network.py index 336c84244..8e0469924 100644 --- a/lite/pylite/test/test_network.py +++ b/lite/pylite/test/test_network.py @@ -500,3 +500,45 @@ class TestNetwork(TestShuffleNet): os.remove(fast_run_cache) os.remove(global_layout_transform_model) + + +class TestDiscreteInputNet(unittest.TestCase): + source_dir = os.getenv("LITE_TEST_RESOURCE") + data0_path = os.path.join(source_dir, "data0.npy") + data1_path = os.path.join(source_dir, "data1.npy") + data2_path = os.path.join(source_dir, "data2.npy") + model_path = os.path.join(source_dir, "test_discrete_input.mge") + data0 = np.load(data0_path) + data1 = np.load(data1_path) + data2 = np.load(data2_path) + + def do_forward(self, network, times=3): + data_name = network.get_input_name(1) + datas = [] + datas.append(network.get_io_tensors(data_name, 0)) + datas.append(network.get_io_tensors(data_name, 1)) + datas.append(network.get_io_tensors(data_name, 2)) + + datas[0].set_data_by_copy(self.data0) + datas[1].set_data_by_copy(self.data1) + datas[2].set_data_by_copy(self.data2) + for i in range(times): + network.forward() + network.wait() + + +class TestDiscreteInput(TestDiscreteInputNet): + def test_discrete_input(self): + config = LiteConfig() + config.discrete_input_name = "data".encode("utf-8") + input_io = LiteIO( + "data", + is_host=True, + io_type=LiteIOType.LITE_IO_VALUE, + layout=LiteLayout([3, 3, 224, 224]), + ) + ios = LiteNetworkIO() + ios.add_input(input_io) + network = LiteNetwork(config, ios) + network.load(self.model_path) + self.do_forward(network) diff --git 
a/lite/src/mge/network_impl.cpp b/lite/src/mge/network_impl.cpp
index 5aec5beb8..a37c9d455 100644
--- a/lite/src/mge/network_impl.cpp
+++ b/lite/src/mge/network_impl.cpp
@@ -13,6 +13,7 @@
 #include "megbrain/comp_node_env.h"
 #include "megbrain/graph.h"
 #include "megbrain/graph/cg.h"
+#include "megbrain/opr/imgproc.h"
 #include "megbrain/opr/io.h"
 #include "megbrain/opr/tensor_manip.h"
 #include "megbrain/tensor.h"
@@ -259,6 +260,88 @@ void NetworkImplDft::make_output_spec() {
     }
 }
 
+void NetworkImplDft::replace_src_discrete_input_opr_pass() {
+    mgb::ThinHashMap<mgb::SymbolVar, mgb::SymbolVar> out_var_map;
+
+    auto dest_with_extra_deps =
+            get_dest_vars_with_extra_deps(m_load_result.output_var_list);
+    gopt::SubGraph graph{dest_with_extra_deps};
+    auto rewriter = graph.make_rewriter();
+
+    auto on_opr = [&](mgb::cg::OperatorNodeBase* opr) {
+        if (opr->same_type<mgb::opr::WarpPerspective>()) {
+            bool is_h2d = true;
+            if (opr->input(0)->owner_opr()->same_type<mgb::opr::Host2DeviceCopy>())
+                is_h2d = true;
+            else if (opr->input(0)
+                             ->owner_opr()
+                             ->same_type<mgb::opr::VolatileSharedDeviceTensor>())
+                is_h2d = false;
+            else
+                return;
+
+            SymbolVarArray srcs;
+            if (is_h2d) {
+                auto h2d = opr->input(0)->owner_opr();
+                for (auto&& inp : get_io_tensors(m_user_config->discrete_input_name)) {
+                    auto val = TensorHelper::implement(inp)
+                                       ->cast_final_safe<TensorImplDft>()
+                                       .m_host_tensor;
+                    LITE_ASSERT(val);
+                    srcs.push_back(mgb::opr::Host2DeviceCopy::make(
+                            *m_load_result.graph, val, h2d->config()));
+                }
+            } else {
+                auto volatiled = opr->input(0)->owner_opr();
+                for (auto&& inp : get_io_tensors(m_user_config->discrete_input_name)) {
+                    auto val = TensorHelper::implement(inp)
+                                       ->cast_final_safe<TensorImplDft>()
+                                       .m_dev_tensor;
+                    LITE_ASSERT(val);
+                    srcs.push_back(mgb::opr::VolatileSharedDeviceTensor::make(
+                            *m_load_result.graph, val, volatiled->config()));
+                }
+            }
+
+            auto& warp = opr->cast_final<mgb::opr::WarpPerspective>();
+            SymbolVar new_out;
+            if (opr->input().size() == 3) {
+                new_out = mgb::opr::WarpPerspective::make(
+                        srcs, warp.input(1), warp.input(2), warp.param(),
+                        warp.config());
+            } else {
+                LITE_ASSERT(opr->input().size() == 4);
+                new_out = mgb::opr::WarpPerspective::make(
+                        srcs, warp.input(1), warp.input(2), warp.input(3), warp.param(),
+                        warp.config());
+            }
+            rewriter.replace_var(
+                    warp.output(0), new_out.node(),
+                    "replace WarpPerspective to WarpPerspective multi src version.");
+        } else {
+            rewriter.auto_replace_outputs(opr);
+        }
+    };
+    graph.iter(on_opr);
+    rewriter.apply_inplace();
+    auto new_ovar = graph.endpoint_vars();
+    new_ovar.resize(m_load_result.output_var_list.size());
+
+    for (size_t i = 0; i < new_ovar.size(); ++i) {
+        out_var_map[m_load_result.output_var_list[i]] = new_ovar[i];
+    }
+    for (auto&& i : m_load_result.output_var_map) {
+        i.second = out_var_map.at(i.second);
+    }
+    for (auto&& i : m_load_result.output_var_map_id) {
+        i.second = out_var_map.at(i.second);
+    }
+    for (size_t i = 0; i < m_load_result.output_var_list.size(); i++) {
+        new_ovar[i].rename(m_load_result.output_var_list[i].node()->name());
+    }
+    m_load_result.output_var_list = std::move(new_ovar);
+}
+
 void NetworkImplDft::replace_dev_input_pass() {
     mgb::CompNode::Locator locator;
     m_load_config.comp_node_mapper(locator);
@@ -528,6 +611,8 @@ void NetworkImplDft::configure_after_loaded() {
 
 void NetworkImplDft::compile_graph() {
     replace_dev_input_pass();
+    if (!m_user_config->discrete_input_name.empty())
+        replace_src_discrete_input_opr_pass();
     make_output_spec();
     m_execute_func = m_load_result.graph_compile(m_output_spec);
 }
@@ -691,6 +776,11 @@ void NetworkImplDft::update_input() {
             m_network_io->inputs.push_back(io_in);
         }
     }
+
+    if (!m_user_config->discrete_input_name.empty()) {
+        update_input_lite_tensors();
+    }
+
     //! delete the IO that is not the network
     for (auto it = m_network_io->inputs.begin(); it != m_network_io->inputs.end();) {
         if (it->lite_tensor == nullptr) {
@@ -702,6 +792,79 @@ void NetworkImplDft::update_input() {
     }
 }
 
+void NetworkImplDft::update_input_lite_tensors() {
+    auto device_type = m_user_config->device_type;
+    auto device_id = m_compnode_locator.device;
+    auto stream_id = m_compnode_locator.stream;
+
+    for (auto&& in_tensor_iter : m_load_result.tensor_map) {
+        if (in_tensor_iter.first != m_user_config->discrete_input_name) {
+            continue;
+        }
+        bool found = false;
+        for (auto&& config_in : m_network_io->inputs) {
+            if (in_tensor_iter.first == config_in.name) {
+                found = true;
+                size_t bs = in_tensor_iter.second->shape(0);
+                auto shape = in_tensor_iter.second->shape();
+                shape.shape[0] = 1;
+                if (config_in.config_layout.ndim) {
+                    bs = config_in.config_layout.shapes[0];
+                    shape.shape[1] = config_in.config_layout.shapes[1];
+                    shape.shape[2] = config_in.config_layout.shapes[2];
+                    shape.shape[3] = config_in.config_layout.shapes[3];
+                }
+                HostTensorND tensor(
+                        in_tensor_iter.second->comp_node(), shape,
+                        in_tensor_iter.second->dtype(),
+                        in_tensor_iter.second->format());
+                for (size_t i = 0; i < bs; ++i) {
+                    if (config_in.is_host) {
+                        config_in.lite_tensors.push_back(std::make_shared<Tensor>(
+                                device_id, stream_id, device_type, true));
+                        TensorHelper::implement(config_in.lite_tensors[i])
+                                ->cast_final_safe<TensorImplDft>()
+                                .m_host_tensor = std::make_shared<HostTensorND>(tensor);
+                        config_in.lite_tensors[i]->update_from_implement();
+                    } else {
+                        config_in.lite_tensors.push_back(std::make_shared<Tensor>(
+                                device_id, stream_id, device_type));
+                        config_in.lite_tensors[i]->set_layout(
+                                to_lite_layout(tensor.layout()));
+                    }
+                    TensorHelper::implement(config_in.lite_tensors[i])
+                            ->cast_final_safe<TensorImplDft>()
+                            .m_record_reset =
+                            m_user_config->options.comp_node_seq_record_level > 0;
+                }
+            }
+        }
+        if (!found) {
+            size_t bs = in_tensor_iter.second->shape(0);
+            auto shape = in_tensor_iter.second->shape();
+            shape.shape[0] = 1;
+            HostTensorND tensor(
+                    in_tensor_iter.second->comp_node(), shape,
+                    in_tensor_iter.second->dtype(), in_tensor_iter.second->format());
+            IOInner io_in;
+            io_in.name = in_tensor_iter.first;
+            for (size_t i = 0; i < bs; ++i) {
+                io_in.lite_tensors.push_back(std::make_shared<Tensor>(
+                        device_id, stream_id, device_type, true));
+                TensorHelper::implement(io_in.lite_tensors[i])
+                        ->cast_final_safe<TensorImplDft>()
+                        .m_host_tensor = std::make_shared<HostTensorND>(tensor);
+                TensorHelper::implement(io_in.lite_tensors[i])
+                        ->cast_final_safe<TensorImplDft>()
+                        .m_record_reset =
+                        m_user_config->options.comp_node_seq_record_level > 0;
+                io_in.lite_tensors[i]->update_from_implement();
+            }
+            m_network_io->inputs.push_back(io_in);
+        }
+    }
+}
+
 void NetworkImplDft::update_output() {
     auto device_type = m_user_config->device_type;
     auto device_id = m_compnode_locator.device;
@@ -855,10 +1018,29 @@ std::shared_ptr<Tensor> NetworkImplDft::get_io_tensor(
     return nullptr;
 }
 
+std::vector<std::shared_ptr<Tensor>> NetworkImplDft::get_io_tensors(
+        std::string io_name, LiteTensorPhase phase) {
+    if (phase == LiteTensorPhase::LITE_INPUT) {
+        for (auto&& config_in : m_network_io->inputs) {
+            if (io_name == config_in.name &&
+                config_in.name == m_user_config->discrete_input_name) {
+                return config_in.lite_tensors;
+            }
+        }
+    }
+    LITE_THROW(mgb::ssprintf(
+            "tensor name must be %s input tensor name.", io_name.c_str()));
+    return {};
+}
+
 std::shared_ptr<Tensor> NetworkImplDft::get_input_tensor(size_t index) {
     return get_io_tensor(get_input_name(index));
 }
 
+std::vector<std::shared_ptr<Tensor>> NetworkImplDft::get_input_tensors(size_t index) {
+    return get_io_tensors(get_input_name(index));
+}
+
 std::shared_ptr<Tensor> NetworkImplDft::get_output_tensor(size_t index) {
     return get_io_tensor(get_output_name(index));
 }
diff --git a/lite/src/mge/network_impl.h b/lite/src/mge/network_impl.h
index 02999e23e..bef304bcd 100644
--- a/lite/src/mge/network_impl.h
+++ b/lite/src/mge/network_impl.h
@@ -57,9 +57,19 @@ public:
             std::string io_name,
             LiteTensorPhase phase = LiteTensorPhase::LITE_IO) override;
 
+    //! get the network input tensors which input consists of discrete multiple tensors,
+    //! layout (1, c, h, w)
+    std::vector<std::shared_ptr<Tensor>> get_io_tensors(
+            std::string io_name,
+            LiteTensorPhase phase = LiteTensorPhase::LITE_INPUT) override;
+
     //! get the input tensor by index in the load_result tensormap
     std::shared_ptr<Tensor> get_input_tensor(size_t index) override;
 
+    //! get the network input tensors which input consists of discrete multiple tensors
+    //! by index
+    std::vector<std::shared_ptr<Tensor>> get_input_tensors(size_t index) override;
+
     //! get the output tensor by index in the load_result output_var_list
     std::shared_ptr<Tensor> get_output_tensor(size_t index) override;
 
@@ -190,6 +200,11 @@ private:
    //! VolatileSharedDeviceTensor Opr
    void replace_dev_input_pass();
 
+    //! if the input to the network is a list of tensors, this pass will replace
+    //! the opr that supports the input of a list of tensors with the corresponding
+    //! version, currently only WarpPerspective is supported
+    void replace_src_discrete_input_opr_pass();
+
    //! check whether the model is cross compnode
    void cross_compnode_model_detect();
 
@@ -199,6 +214,8 @@ private:
    void update_input();
    void update_output();
+    //! initialize lite_tensors when the input is composed of discrete multiple tensors
+    void update_input_lite_tensors();
 
    //! when the model info have loaded, update the config according the model
    //! info, finaly use it in compute graph
diff --git a/lite/src/network.cpp b/lite/src/network.cpp
index 6ee7ff4af..fad1343a6 100644
--- a/lite/src/network.cpp
+++ b/lite/src/network.cpp
@@ -127,6 +127,15 @@ std::shared_ptr<Tensor> Network::get_io_tensor(
     LITE_ERROR_HANDLER_END
 }
 
+std::vector<std::shared_ptr<Tensor>> Network::get_io_tensors(
+        std::string name, LiteTensorPhase phase) {
+    LITE_ERROR_HANDLER_BEGIN
+    LITE_ASSERT(m_loaded, "get_io_tensor should be used after model loaded.");
+    LITE_CHECK_NON_NULL_POINTER(m_impl);
+    return m_impl->get_io_tensors(name, phase);
+    LITE_ERROR_HANDLER_END
+}
+
 std::shared_ptr<Tensor> Network::get_input_tensor(size_t index) {
     LITE_ERROR_HANDLER_BEGIN
     LITE_ASSERT(m_loaded, "get_input_tensor should be used after model loaded.");
@@ -135,6 +144,14 @@ std::shared_ptr<Tensor> Network::get_input_tensor(size_t index) {
     LITE_ERROR_HANDLER_END
 }
 
+std::vector<std::shared_ptr<Tensor>> Network::get_input_tensors(size_t index) {
+    LITE_ERROR_HANDLER_BEGIN
+    LITE_ASSERT(m_loaded, "get_input_tensor should be used after model loaded.");
+    LITE_CHECK_NON_NULL_POINTER(m_impl);
+    return m_impl->get_input_tensors(index);
+    LITE_ERROR_HANDLER_END
+}
+
 std::shared_ptr<Tensor> Network::get_output_tensor(size_t index) {
     LITE_ERROR_HANDLER_BEGIN
     LITE_ASSERT(m_loaded, "get_output_tensor should be used after model loaded.");
diff --git a/lite/src/network_impl_base.h b/lite/src/network_impl_base.h
index 0760c81a7..dd1d3c751 100644
--- a/lite/src/network_impl_base.h
+++ b/lite/src/network_impl_base.h
@@ -42,6 +42,9 @@ public:
     bool have_sync = false;
     //! Real input and output data location
     std::shared_ptr<Tensor> lite_tensor = nullptr;
+    //! If the input consists of discrete multiple tensors, lite_tensors is the real
+    //! 
input data location + std::vector> lite_tensors; IOInner() = default; IOInner(const IO& io) { @@ -86,9 +89,22 @@ public: virtual std::shared_ptr get_io_tensor( std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_IO) = 0; + //! get the network input tensors which input consists of discrete multiple tensors, + //! layout (1, c, h, w) + virtual std::vector> get_io_tensors( + std::string io_name, LiteTensorPhase phase = LiteTensorPhase::LITE_INPUT) { + return {}; + } + //! get the input tensor by index in the load_result tensormap virtual std::shared_ptr get_input_tensor(size_t index) = 0; + //! get the network input tensors which input consists of discrete multiple tensors + //! by index + virtual std::vector> get_input_tensors(size_t index) { + return {}; + } + //! get the output tensor by index in the load_result output_var_list virtual std::shared_ptr get_output_tensor(size_t index) = 0; diff --git a/lite/test/test_network.cpp b/lite/test/test_network.cpp index ec3133324..d252b3c44 100644 --- a/lite/test/test_network.cpp +++ b/lite/test/test_network.cpp @@ -1387,6 +1387,96 @@ TEST(TestNetWork, DeviceAsyncExec) { } #endif + +TEST(TestNetWork, Discrete_Input) { + auto data = get_input_data("./data_b3.npy"); + auto data_0 = get_input_data("./data0.npy"); + auto data_1 = get_input_data("./data1.npy"); + auto data_2 = get_input_data("./data2.npy"); + std::string model_path = "./test_discrete_input.mge"; + + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + + std::shared_ptr network0 = std::make_shared(config); + network0->load_model(model_path); + + std::shared_ptr data_tensor = network0->get_io_tensor("data"); + data_tensor->share_memory_with(*data); + + network0->forward(); + network0->wait(); + std::shared_ptr output_tensor0 = network0->get_output_tensor(0); + + config.discrete_input_name = "data"; + NetworkIO ios; + bool is_host = true; + Layout d_ly{{3, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + ios.inputs.push_back({"data", is_host, LiteIOType::LITE_IO_VALUE, d_ly}); + + std::shared_ptr network1 = std::make_shared(config, ios); + network1->load_model(model_path); + + std::vector> data_tensors = + network1->get_io_tensors("data"); + data_tensors[0]->share_memory_with(*data_0); + data_tensors[1]->share_memory_with(*data_1); + data_tensors[2]->share_memory_with(*data_2); + + network1->forward(); + network1->wait(); + std::shared_ptr output_tensor1 = network1->get_output_tensor(0); + + compare_lite_tensor(output_tensor0, output_tensor1); +} + +TEST(TestNetWork, Discrete_Input_Device) { + auto data = get_input_data("./data_b3.npy"); + auto data_0 = get_input_data("./data0.npy"); + auto data_1 = get_input_data("./data1.npy"); + auto data_2 = get_input_data("./data2.npy"); + std::string model_path = "./test_discrete_input.mge"; + + Config config; + config.device_type = LiteDeviceType::LITE_CUDA; + + std::shared_ptr network0 = std::make_shared(config); + network0->load_model(model_path); + + std::shared_ptr data_tensor = network0->get_io_tensor("data"); + data_tensor->share_memory_with(*data); + + network0->forward(); + network0->wait(); + std::shared_ptr output_tensor0 = network0->get_output_tensor(0); + + config.discrete_input_name = "data"; + NetworkIO ios; + bool is_host = false; + Layout d_ly{{3, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT}; + ios.inputs.push_back({"data", is_host, LiteIOType::LITE_IO_VALUE, d_ly}); + + std::shared_ptr network1 = std::make_shared(config, ios); + network1->load_model(model_path); + + std::vector> data_tensors = + 
network1->get_io_tensors("data"); + auto d0_cuda = Tensor(LiteDeviceType::LITE_CUDA, d_ly); + auto d1_cuda = Tensor(LiteDeviceType::LITE_CUDA, d_ly); + auto d2_cuda = Tensor(LiteDeviceType::LITE_CUDA, d_ly); + d0_cuda.copy_from(*data_0); + d1_cuda.copy_from(*data_1); + d2_cuda.copy_from(*data_2); + data_tensors[0]->share_memory_with(d0_cuda); + data_tensors[1]->share_memory_with(d1_cuda); + data_tensors[2]->share_memory_with(d2_cuda); + + network1->forward(); + network1->wait(); + std::shared_ptr output_tensor1 = network1->get_output_tensor(0); + + compare_lite_tensor(output_tensor0, output_tensor1); +} #endif #if MGB_ATLAS || MGB_CAMBRICON diff --git a/lite/test/test_network_c.cpp b/lite/test/test_network_c.cpp index fcc5ff9a1..53fe3ee35 100644 --- a/lite/test/test_network_c.cpp +++ b/lite/test/test_network_c.cpp @@ -290,6 +290,48 @@ TEST(TestCapiNetWork, GetAllNameAhead) { ASSERT_TRUE(ios_mem.outputs->config_layout.shapes[1] == 1000); } +TEST(TestCapiNetWork, Discrete_Input) { + std::vector> datas; + datas.push_back(lite::get_input_data("./data0.npy")); + datas.push_back(lite::get_input_data("./data1.npy")); + datas.push_back(lite::get_input_data("./data2.npy")); + size_t data_length_in_byte = datas[0]->get_tensor_total_size_in_byte(); + + LiteIO input_io = default_io; + input_io.is_host = true; + input_io.name = "data"; + LiteLayout d_ly; + d_ly.ndim = 4; + d_ly.data_type = LiteDataType::LITE_FLOAT; + std::vector input_shape = {3, 3, 224, 224}; + for (size_t i = 0; i < d_ly.ndim; i++) { + d_ly.shapes[i] = input_shape[i]; + } + input_io.config_layout = d_ly; + + LiteNetworkIO network_io = *default_network_io(); + network_io.inputs = &input_io; + network_io.input_size = 1; + + LiteConfig c_config = *default_config(); + c_config.discrete_input_name = "data"; + LiteNetwork c_network; + LITE_CAPI_CHECK(LITE_make_network(&c_network, c_config, network_io)); + std::string model_path = "./test_discrete_input.mge"; + LITE_CAPI_CHECK(LITE_load_model_from_path(c_network, model_path.c_str())); + + std::vector c_data_tensors(3, nullptr); + for (size_t i = 0; i < 3; i++) { + LITE_CAPI_CHECK(LITE_get_io_tensors( + c_network, "data", i, LITE_INPUT, &c_data_tensors[i])); + LITE_CAPI_CHECK(LITE_reset_tensor_memory( + c_data_tensors[i], datas[i]->get_memory_ptr(), data_length_in_byte)); + } + + ForwardNetwork; + LITE_CAPI_CHECK(LITE_destroy_network(c_network)); +} + #if LITE_BUILD_WITH_RKNPU static int GetTop( diff --git a/src/core/include/megbrain/graph/helper.h b/src/core/include/megbrain/graph/helper.h index b01825b1c..f8c3a571c 100644 --- a/src/core/include/megbrain/graph/helper.h +++ b/src/core/include/megbrain/graph/helper.h @@ -381,7 +381,7 @@ public: }; //! shortcut for calling ExtraDependencyMerger -SymbolVarArray get_dest_vars_with_extra_deps( +MGE_WIN_DECLSPEC_FUC SymbolVarArray get_dest_vars_with_extra_deps( const SymbolVarArray& dest_vars, SpecialOprStat* sopr_stat = nullptr); } // namespace cg diff --git a/src/gopt/include/megbrain/gopt/framework.h b/src/gopt/include/megbrain/gopt/framework.h index 6ab56d7fe..f484bdfd7 100644 --- a/src/gopt/include/megbrain/gopt/framework.h +++ b/src/gopt/include/megbrain/gopt/framework.h @@ -44,13 +44,14 @@ public: //! rewrite vars in a graph class Rewriter; - SubGraph(const SymbolVarArray& endpoint_vars); + MGE_WIN_DECLSPEC_FUC SubGraph(const SymbolVarArray& endpoint_vars); //! get the associated ComputingGraph ComputingGraph* comp_graph() const { return m_comp_graph; } //! 
iterate in topology order - void iter(const Callback& cb, std::shared_ptr = nullptr) const; + MGE_WIN_DECLSPEC_FUC void iter( + const Callback& cb, std::shared_ptr = nullptr) const; //! make a Rewriter bound to this graph inline Rewriter make_rewriter(); @@ -99,7 +100,7 @@ public: * \return new operator that uses new inputs; it would be * opr if no input is changed */ - OperatorNodeBase* auto_replace_outputs(OperatorNodeBase* opr); + MGE_WIN_DECLSPEC_FUC OperatorNodeBase* auto_replace_outputs(OperatorNodeBase* opr); //! get current var: if var has been replaced, return its //! new value; otherwise return var itself @@ -119,11 +120,11 @@ public: * * \param msg see OptState::on_var_replaced */ - void replace_var(VarNode* src, VarNode* dst, const char* msg); + MGE_WIN_DECLSPEC_FUC void replace_var(VarNode* src, VarNode* dst, const char* msg); //! apply this rewriter to the owner graph and modify owner //! SubGraph inplace - void apply_inplace() const; + MGE_WIN_DECLSPEC_FUC void apply_inplace() const; }; SubGraph::Rewriter SubGraph::make_rewriter() { return {this}; diff --git a/src/opr/impl/imgproc.cpp b/src/opr/impl/imgproc.cpp index 4b5f0529c..a4bd021df 100644 --- a/src/opr/impl/imgproc.cpp +++ b/src/opr/impl/imgproc.cpp @@ -160,18 +160,6 @@ void WarpPerspectiveForward::outshape_by_symvar_do_get_output_shape( "out2d=%s", imgshp.to_string().c_str(), matshp.to_string().c_str(), oshp2d.to_string().c_str()); - if (input().size() - m_srcs_size == 2) { - mgb_assert( - m_srcs_size == matshp[0], "batchsize mismatch: img=%zu mat=%zu", - m_srcs_size, matshp[0]); - } else { - mgb_assert(input().size() - m_srcs_size == 3); - mat_idx_shp = shpinfo.shape_inp_shp.at(m_srcs_size + 1); - mgb_assert( - mat_idx_shp[0] == matshp[0] && mat_idx_shp.ndim == 1, - "invalid mat_idx shape: mat=%zu mat_idx=%s", matshp[0], - mat_idx_shp.to_string().c_str()); - } size_t height_idx = 0; if (param().format == Param::Format::NCHW) { height_idx = 2; diff --git a/src/opr/include/megbrain/opr/imgproc.h b/src/opr/include/megbrain/opr/imgproc.h index f272f2f25..cb392146d 100644 --- a/src/opr/include/megbrain/opr/imgproc.h +++ b/src/opr/include/megbrain/opr/imgproc.h @@ -22,7 +22,7 @@ namespace opr { * Impl note: this operator might have 3 or 4 inputs depending on whether * \p mat_idx is given */ -MGB_DEFINE_OPR_CLASS( +MGB_DEFINE_OPR_CLASS_WITH_EXPORT( WarpPerspectiveForward, intl::WorkspaceSizeInfer>>) // { -- GitLab
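
For reference, a minimal C++ usage sketch of the discrete-input path introduced by this patch, adapted from the new TestNetWork.Discrete_Input test case. The input name "data", the model path and the (3, 3, 224, 224) layout are placeholders, and the model is assumed to feed the named input directly into a WarpPerspective operator, which is the only operator the new pass currently rewrites:

#include <memory>

#include "lite/network.h"
#include "lite/tensor.h"

using namespace lite;

int main() {
    // Enable the discrete-input path for the input named "data" (placeholder name).
    Config config;
    config.device_type = LiteDeviceType::LITE_CUDA;
    config.discrete_input_name = "data";

    // Describe the combined layout of the discrete input: a batch of 3 images here.
    NetworkIO ios;
    bool is_host = true;
    Layout layout{{3, 3, 224, 224}, 4, LiteDataType::LITE_FLOAT};
    ios.inputs.push_back({"data", is_host, LiteIOType::LITE_IO_VALUE, layout});

    auto network = std::make_shared<Network>(config, ios);
    network->load_model("./test_discrete_input.mge");  // placeholder model path

    // One (1, c, h, w) tensor per batch element is returned for the discrete input.
    auto parts = network->get_io_tensors("data");
    for (auto& part : parts) {
        // Fill each part before running, e.g. part->share_memory_with(host_tensor)
        // or part->copy_from(host_tensor).
    }

    network->forward();
    network->wait();
    auto output = network->get_output_tensor(0);
    return 0;
}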