diff --git a/lite/include/lite/network.h b/lite/include/lite/network.h
index 1bf26b82459f8cb4cba07a2c27c89b8d05ada816..3fa6fbefcb70b229f05a332d9c91861244f4e26d 100644
--- a/lite/include/lite/network.h
+++ b/lite/include/lite/network.h
@@ -93,6 +93,7 @@ struct LITE_API Options {
     bool const_shape = false;
     bool force_dynamic_alloc = false;
     bool force_output_dynamic_alloc = false;
+    bool force_output_use_user_specified_memory = false;
     bool no_profiling_on_shape_change = false;
     uint8_t jit_level = 0;
     uint8_t comp_node_seq_record_level = 0;
diff --git a/lite/lite-c/include/lite-c/network_c.h b/lite/lite-c/include/lite-c/network_c.h
index ff838d0e815e2bfd53182c63fc6b24f08aff9246..df01fdb0c8c882c05d6f5e03438ff2326281be0d 100644
--- a/lite/lite-c/include/lite-c/network_c.h
+++ b/lite/lite-c/include/lite-c/network_c.h
@@ -83,6 +83,7 @@ typedef struct Options {
     int const_shape;
     int force_dynamic_alloc;
     int force_output_dynamic_alloc;
+    int force_output_use_user_specified_memory;
     int no_profiling_on_shape_change;
     int jit_level;
     int comp_node_seq_record_level;
diff --git a/lite/lite-c/src/network.cpp b/lite/lite-c/src/network.cpp
index 51df08cf99c10316d36628fff4013afb1af11542..49fb94248b66de39c74d6108f707b69e0c430583 100644
--- a/lite/lite-c/src/network.cpp
+++ b/lite/lite-c/src/network.cpp
@@ -29,6 +29,7 @@ const LiteOptions default_option = {
        .const_shape = false,
        .force_dynamic_alloc = false,
        .force_output_dynamic_alloc = false,
+       .force_output_use_user_specified_memory = false,
        .no_profiling_on_shape_change = false,
        .jit_level = 0,
        .comp_node_seq_record_level = 0,
@@ -122,7 +123,9 @@ lite::Config convert_to_lite_config(const LiteConfig c_config) {
     lite_config.options.var_sanity_check_first_run =
             c_config.options.var_sanity_check_first_run;
     lite_config.options.const_shape = c_config.options.const_shape;
-    lite_config.options.force_dynamic_alloc = c_config.options.const_shape;
+    lite_config.options.force_dynamic_alloc = c_config.options.force_dynamic_alloc;
+    lite_config.options.force_output_use_user_specified_memory =
+            c_config.options.force_output_use_user_specified_memory;
     lite_config.options.force_output_dynamic_alloc =
             c_config.options.force_output_dynamic_alloc;
     lite_config.options.no_profiling_on_shape_change =
diff --git a/lite/pylite/megenginelite/network.py b/lite/pylite/megenginelite/network.py
index 7324f48d142c13b39958e77b677b2c3a3f6a93ea..1106079acd4fa064986eaeaabb8addd749576efc 100644
--- a/lite/pylite/megenginelite/network.py
+++ b/lite/pylite/megenginelite/network.py
@@ -29,6 +29,7 @@ class LiteOptions(Structure):
         ("const_shape", c_int),
         ("force_dynamic_alloc", c_int),
         ("force_output_dynamic_alloc", c_int),
+        ("force_output_use_user_specified_memory", c_int),
         ("no_profiling_on_shape_change", c_int),
         ("jit_level", c_int),
         ("comp_node_seq_record_level", c_int),
@@ -52,6 +53,7 @@ class LiteOptions(Structure):
         self.const_shape = False
         self.force_dynamic_alloc = False
         self.force_output_dynamic_alloc = False
+        self.force_output_use_user_specified_memory = False
         self.no_profiling_on_shape_change = False
         self.jit_level = 0
         self.comp_node_seq_record_level = 0
@@ -67,6 +69,7 @@ class LiteOptions(Structure):
             "const_shape": bool(self.const_shape),
             "force_dynamic_alloc": bool(self.force_dynamic_alloc),
             "force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc),
+            "force_output_use_user_specified_memory": bool(self.force_output_use_user_specified_memory),
             "no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change),
             "jit_level": self.jit_level,
             "comp_node_seq_record_level": self.comp_node_seq_record_level,
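For context, this is roughly how the new option is meant to be used from the Lite C++ API: enable it in the `Config`, then point the output tensor at caller-owned memory before each `forward()`. A minimal sketch only; the model path, input name, and output layout are placeholder values borrowed from the tests added later in this patch, not part of the API.

```cpp
#include <memory>

#include "lite/network.h"
#include "lite/tensor.h"

using namespace lite;

void run_into_user_buffer() {
    Config config;
    // ask the network to write outputs directly into user-specified memory
    config.options.force_output_use_user_specified_memory = true;

    auto network = std::make_shared<Network>(config);
    network->load_model("./shufflenet.mge");  // placeholder model path

    // ... fill the input obtained via network->get_io_tensor("data") as usual ...

    // caller-owned buffer whose layout matches the statically inferred output
    Tensor user_buffer(
            LiteDeviceType::LITE_CPU,
            Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

    std::shared_ptr<Tensor> output = network->get_output_tensor(0);
    // must be done before every forward(): redirect the output to user memory
    output->reset(user_buffer.get_memory_ptr(), user_buffer.get_layout());

    network->forward();
    network->wait();
    // the result now lives in user_buffer, with no extra output copy
}
```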
diff --git a/lite/src/mge/network_impl.cpp b/lite/src/mge/network_impl.cpp
index 247c486afb132ab492fca1a67ea94ef38ad2e1fa..1cfde0fa2ad63ff69d3a63f16f67bf036599f3da 100644
--- a/lite/src/mge/network_impl.cpp
+++ b/lite/src/mge/network_impl.cpp
@@ -84,6 +84,9 @@ void NetworkImplDft::application_config() {
     m_load_config.const_var_shape = m_user_config->options.const_shape;
     ConfigOption(force_dynamic_alloc, force_dynamic_alloc);
     ConfigOption(force_output_dynamic_alloc, force_output_dynamic_alloc);
+    ConfigOption(
+            force_output_use_user_specified_memory,
+            force_output_use_user_specified_memory);
     ConfigOption(no_profiling_on_shape_change, no_profiling_on_shape_change);
     LITE_ASSERT(
             m_user_config->options.jit_level == 0 ||
@@ -250,7 +253,13 @@ void NetworkImplDft::make_output_spec() {
                 }
             }
         };
-        m_output_spec.emplace_back(load_out, std::move(cb));
+        //! if writing to user-specified memory, the CallbackCaller must be nullptr.
+        if (m_user_config->options.force_output_use_user_specified_memory ||
+            m_user_config->options.force_output_dynamic_alloc) {
+            m_output_spec.emplace_back(load_out, nullptr);
+        } else {
+            m_output_spec.emplace_back(load_out, std::move(cb));
+        }
     } else {
         LITE_THROW(ssprintf("no output named : %s in the mode", out.name.c_str()));
     }
@@ -444,8 +453,7 @@ void NetworkImplDft::set_io(const NetworkIO& network_io) {
     }
 }
 
-void NetworkImplDft::try_infer_tensor_layout(
-        std::shared_ptr<Tensor> tensor, mgb::cg::SymbolVar var) {
+void NetworkImplDft::try_infer_tensor_layout(std::shared_ptr<Tensor> tensor, Var var) {
     auto&& static_infer_mgr = m_load_config.comp_graph->static_infer_manager();
     auto infer_trait = var.node()->get_static_infer_trait();
     if (std::get<0>(infer_trait)) {
@@ -455,9 +463,13 @@ void NetworkImplDft::try_infer_tensor_layout(
                     "Lite infer output shape failed, maybe the model is "
                     "dynamic "
                     "shape.\n");
+            LITE_ASSERT(
+                    !m_user_config->options.force_output_use_user_specified_memory,
+                    "force_output_use_user_specified_memory can't be used when output "
+                    "shape can't be derived.");
             return;
         }
-        Layout layout = to_lite_layout(mgb::TensorLayout{*shape, var.dtype()});
+        Layout layout = to_lite_layout(TensorLayout{*shape, var.dtype()});
         tensor->set_layout(layout);
     }
 }
@@ -559,8 +571,7 @@ void NetworkImplDft::update_output() {
              out_it != m_network_io->outputs.end();) {
            if (std::find_if(
                        m_load_result.output_var_list.begin(),
-                       m_load_result.output_var_list.end(),
-                       [out_it](const mgb::SymbolVar var) {
+                       m_load_result.output_var_list.end(), [out_it](const SymbolVar var) {
                            return var.node()->name() == out_it->name;
                        }) == m_load_result.output_var_list.end()) {
                LITE_LOG("%s is not the network output, ignore it.", out_it->name.c_str());
@@ -584,7 +595,7 @@ void NetworkImplDft::update_output() {
            out_it->lite_tensor =
                    std::make_shared<Tensor>(device_id, stream_id, device_type);
        }
-       mgb::SymbolVar var;
+       SymbolVar var;
        for (auto&& out_var : m_load_result.output_var_list) {
            if (out_var.node()->name() == out_it->name) {
                var = out_var;
@@ -592,10 +603,12 @@ void NetworkImplDft::update_output() {
            }
        }
        try_infer_tensor_layout(out_it->lite_tensor, var);
+       output_tensor_copy_optimize(var, out_it->lite_tensor);
    }
    //! user not set, use default output
    } else {
        for (auto&& out : m_load_result.output_var_list) {
+           std::shared_ptr<Tensor> lite_tensor = nullptr;
            auto it = std::find_if(
                    m_network_io->outputs.begin(), m_network_io->outputs.end(),
                    [&out](const IOInner io) { return io.name == out.node()->name(); });
@@ -608,6 +621,7 @@ void NetworkImplDft::update_output() {
                        std::make_shared<Tensor>(device_id, stream_id, device_type);
            }
            try_infer_tensor_layout(it->lite_tensor, out);
+           lite_tensor = it->lite_tensor;
        } else {
            IOInner output;
            output.name = out.node()->name();
@@ -615,11 +629,47 @@ void NetworkImplDft::update_output() {
                    device_id, stream_id, device_type, true);
            m_network_io->outputs.push_back({output});
            try_infer_tensor_layout(output.lite_tensor, out);
+           lite_tensor = output.lite_tensor;
        }
+       output_tensor_copy_optimize(out, lite_tensor);
    }
 }
}
 
+void NetworkImplDft::output_tensor_copy_optimize(
+        Var var, std::shared_ptr<Tensor> tensor) {
+    LITE_ASSERT(
+            !(m_user_config->options.force_output_use_user_specified_memory &&
+              m_user_config->options.force_output_dynamic_alloc),
+            "Can't set force_output_use_user_specified_memory and "
+            "force_output_dynamic_alloc at the same time.");
+    if (m_user_config->options.force_output_use_user_specified_memory) {
+        TensorHelper::implement(tensor)
+                ->cast_final_safe<TensorImplDft>()
+                .set_reset_callback([var](TensorImplDft* dft_tensor) {
+                    dft_tensor->device_share_host_memory();
+                    auto dv = dft_tensor->dev_tensor().get();
+                    dv->comp_node(var.node()->comp_node(), true);
+                    var.node()->init_mem_plan(dv);
+                    var.node()->reset_dev_tensor_from_tensor(*dv);
+                });
+    }
+    if (m_user_config->options.force_output_dynamic_alloc) {
+        TensorHelper::implement(tensor)
+                ->cast_final_safe<TensorImplDft>()
+                .set_get_memory_callback([var](TensorImplDft* dft_tensor) {
+                    if (dft_tensor->is_host()) {
+                        auto host_tensor = dft_tensor->m_host_tensor;
+                        *host_tensor =
+                                HostTensorND::make_proxy(var.node()->dev_tensor());
+                    } else {
+                        auto dev_tensor = dft_tensor->m_dev_tensor;
+                        *dev_tensor = var.node()->dev_tensor();
+                    }
+                });
+    }
+}
+
 std::shared_ptr<Tensor> NetworkImplDft::get_io_tensor(
         std::string io_name, LiteTensorPhase phase) {
     if (phase == LiteTensorPhase::LITE_INPUT || phase == LiteTensorPhase::LITE_IO) {
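The get-memory callback gives `force_output_dynamic_alloc` a no-copy path as well: the Lite output tensor only becomes a proxy of the graph var's storage when its memory is first queried after `forward()`. A minimal sketch of that usage, assuming the same placeholder model as in the earlier example:

```cpp
#include <memory>

#include "lite/network.h"
#include "lite/tensor.h"

using namespace lite;

void run_with_dynamic_output() {
    Config config;
    // outputs stay dynamically allocated inside the graph; no output copy is made
    config.options.force_output_dynamic_alloc = true;

    auto network = std::make_shared<Network>(config);
    network->load_model("./shufflenet.mge");  // placeholder model path

    // ... fill the input obtained via network->get_io_tensor("data") as usual ...

    std::shared_ptr<Tensor> output = network->get_output_tensor(0);
    network->forward();
    network->wait();

    // this call fires the get-memory callback, which points the Lite tensor
    // at the var's dev tensor instead of copying it
    void* result = output->get_memory_ptr();
    (void)result;
}
```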
diff --git a/lite/src/mge/network_impl.h b/lite/src/mge/network_impl.h
index 098bb7fb170ed7b213ede00e2cc73c4ec9307221..903f92f02141921403a8677493e6da347cfde52d 100644
--- a/lite/src/mge/network_impl.h
+++ b/lite/src/mge/network_impl.h
@@ -12,6 +12,7 @@
 #pragma once
 
 #include "lite_build_config.h"
+#include "megbrain/graph.h"
 
 #if LITE_BUILD_WITH_MGE
 #include "lite/network.h"
@@ -41,6 +42,7 @@ class NetworkImplDft final : public Network::NetworkImplBase {
 public:
     NetworkImplDft() { m_load_config.comp_graph = mgb::ComputingGraph::make(); }
     using S = megdnn::param::ExecutionPolicy::Strategy;
+    using Var = mgb::cg::SymbolVar;
     //! set the config of the network, include:
     //! the inference device
     //! the other inference options, such as record_level, weight_preprocess...
@@ -207,8 +209,10 @@ private:
     void compile_graph();
 
     //! try to infer output tensor layout
-    void try_infer_tensor_layout(
-            std::shared_ptr<Tensor> tensor, mgb::cg::SymbolVar var);
+    void try_infer_tensor_layout(std::shared_ptr<Tensor> tensor, Var var);
+
+    //! optimized output tensor copy
+    void output_tensor_copy_optimize(Var var, std::shared_ptr<Tensor> tensor);
 
 private:
     bool m_async = false;
diff --git a/lite/src/mge/tensor_impl.cpp b/lite/src/mge/tensor_impl.cpp
index d9fc7ce8f9c6cc28ebf0611865b4491f90c97a61..63d38ec70f0246a55b8fb257b64f58e7eadcc543 100644
--- a/lite/src/mge/tensor_impl.cpp
+++ b/lite/src/mge/tensor_impl.cpp
@@ -149,6 +149,9 @@ Layout TensorImplDft::get_layout() const {
 }
 
 void* TensorImplDft::get_memory_ptr() const {
+    if (m_get_memory_callback) {
+        m_get_memory_callback(const_cast<TensorImplDft*>(this));
+    }
     if (is_host()) {
         return static_cast<void*>(m_host_tensor->raw_ptr());
     } else {
@@ -157,6 +160,9 @@ void* TensorImplDft::get_memory_ptr() const {
 }
 
 void* TensorImplDft::get_memory_ptr(const std::vector<size_t>& idx) const {
+    if (m_get_memory_callback) {
+        m_get_memory_callback(const_cast<TensorImplDft*>(this));
+    }
     if (is_host()) {
         auto elemsize_log = m_host_tensor->layout().dtype.size_log();
         switch (elemsize_log) {
@@ -317,6 +323,9 @@ void TensorImplDft::reset(void* prepared_data) {
         storage.reset(cn, size, raw_storage);
         m_dev_tensor->reset(storage, mge_layout);
     }
+    if (m_reset_callback) {
+        m_reset_callback(this);
+    }
 }
 
 void TensorImplDft::reset(void* prepared_data, const Layout& layout) {
@@ -430,6 +439,34 @@ void TensorImplDft::copy_from_mge_tensor(const mgb::DeviceTensorND& dv) {
     }
 }
 
+void TensorImplDft::set_reset_callback(const std::function<void(TensorImplDft*)>& cb) {
+    m_reset_callback = cb;
+}
+
+void TensorImplDft::set_get_memory_callback(
+        const std::function<void(TensorImplDft*)>& cb) {
+    m_get_memory_callback = cb;
+}
+
+void TensorImplDft::device_share_host_memory() {
+    if (is_host()) {
+        if (!m_dev_tensor) {
+            m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(
+                    m_host_tensor->comp_node(), m_host_tensor->layout());
+        }
+        if (m_host_tensor->raw_ptr() != m_dev_tensor->raw_ptr()) {
+            auto raw_storage = std::shared_ptr<mgb::dt_byte>(
+                    m_host_tensor->raw_ptr(), [](void*) {});
+            auto cn = m_host_tensor->comp_node();
+            auto mge_layout = m_host_tensor->layout();
+            size_t size = mge_layout.span().dist_byte();
+            mgb::DeviceTensorStorage storage;
+            storage.reset(cn, size, raw_storage);
+            m_dev_tensor->reset(storage, mge_layout);
+        }
+    }
+}
+
 #endif
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/lite/src/mge/tensor_impl.h b/lite/src/mge/tensor_impl.h
index 17f125323efa6209c04ceed1b9ecbbe418ca3679..a33e2bf77fb79acafb255d8623d044a00e6349ba 100644
--- a/lite/src/mge/tensor_impl.h
+++ b/lite/src/mge/tensor_impl.h
@@ -97,11 +97,22 @@ public:
     //! get host tensor
     std::shared_ptr<mgb::HostTensorND> host_tensor() const { return m_host_tensor; }
+
     //! get device tensor
     std::shared_ptr<mgb::DeviceTensorND> dev_tensor() const { return m_dev_tensor; }
+
     //! copy from mgb tensor
     void copy_from_mge_tensor(const mgb::DeviceTensorND& dv);
 
+    //! set tensor reset callback
+    void set_reset_callback(const std::function<void(TensorImplDft*)>& cb);
+
+    //! set tensor get memory callback
+    void set_get_memory_callback(const std::function<void(TensorImplDft*)>& cb);
+
+    //! share the same memory between the host and device tensor
+    void device_share_host_memory();
+
 public:
     friend class NetworkImplDft;
 
@@ -115,6 +126,8 @@ private:
     void set_mge_tensor_compnode(const mgb::CompNode& comp_node);
 
 private:
+    std::function<void(TensorImplDft*)> m_get_memory_callback;
+    std::function<void(TensorImplDft*)> m_reset_callback;
     std::shared_ptr<mgb::HostTensorND> m_host_tensor;
     std::shared_ptr<mgb::DeviceTensorND> m_dev_tensor;
 };
diff --git a/lite/src/network.cpp b/lite/src/network.cpp
index 745dbe5d23e0aa7b49653cbabec79b6b1bc27615..abe1db3037022c845344fae254879de550770395 100644
--- a/lite/src/network.cpp
+++ b/lite/src/network.cpp
@@ -153,6 +153,10 @@ std::shared_ptr<Tensor> Network::get_output_tensor(size_t index) {
 
 Network& Network::set_async_callback(const AsyncCallback& callback) {
     LITE_ERROR_HANDLER_BEGIN
+    LITE_ASSERT(
+            !m_config.options.force_output_use_user_specified_memory,
+            "Async mode can't be used with force_output_use_user_specified_memory, "
+            "which writes output data directly to user-specified memory.");
     LITE_CHECK_NON_NULL_POINTER(m_impl);
     m_impl->set_async_callback(std::move(callback));
     return *this;
diff --git a/lite/test/test_network.cpp b/lite/test/test_network.cpp
index 617f8055e4d515ecc72c7590445dd3a80fcf1ee7..f786b92c1e2b05df84b7121417b1a09395c1c148 100644
--- a/lite/test/test_network.cpp
+++ b/lite/test/test_network.cpp
@@ -397,6 +397,73 @@ TEST(TestNetWork, ResetOutput) {
     compare_lite_tensor<float>(output_tensor, result_mgb);
 }
 
+TEST(TestNetWork, OutputNoCopy) {
+    Config config;
+    config.options.force_output_use_user_specified_memory = true;
+    auto tensor = get_input_data("./input_data.npy");
+    std::string model_path = "./shufflenet.mge";
+    std::string input_name = "data";
+    auto result_mgb = mgb_lar(model_path, config, input_name, tensor);
+
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+
+    network->load_model(model_path);
+    std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);
+
+    auto src_ptr = tensor->get_memory_ptr();
+    auto src_layout = tensor->get_layout();
+    input_tensor->reset(src_ptr, src_layout);
+
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+    size_t times = 5;
+    std::vector<std::shared_ptr<Tensor>> result_tensors;
+    for (size_t i = 0; i < times; i++) {
+        auto tmp = std::make_shared<Tensor>(
+                LiteDeviceType::LITE_CPU,
+                Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});
+        result_tensors.push_back(tmp);
+    }
+
+    for (size_t i = 0; i < times; i++) {
+        void* out_data = result_tensors[i]->get_memory_ptr();
+        output_tensor->reset(out_data, result_tensors[i]->get_layout());
+
+        network->forward();
+        network->wait();
+        ASSERT_EQ(output_tensor->get_memory_ptr(), out_data);
+        compare_lite_tensor<float>(output_tensor, result_mgb);
+    }
+    for (size_t i = 0; i < times; i++) {
+        compare_lite_tensor<float>(result_tensors[i], result_mgb);
+    }
+}
+
+TEST(TestNetWork, OutputDynamicAlloc) {
+    Config config;
+    config.options.force_output_dynamic_alloc = true;
+    auto tensor = get_input_data("./input_data.npy");
+    std::string model_path = "./shufflenet.mge";
+    std::string input_name = "data";
+    auto result_mgb = mgb_lar(model_path, config, input_name, tensor);
+
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+
+    network->load_model(model_path);
+    std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);
+
+    auto src_ptr = tensor->get_memory_ptr();
+    auto src_layout = tensor->get_layout();
+    input_tensor->reset(src_ptr, src_layout);
+
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+    size_t times = 5;
+    for (size_t i = 0; i < times; i++) {
+        network->forward();
+        network->wait();
+        compare_lite_tensor<float>(output_tensor, result_mgb);
+    }
+}
+
 TEST(TestNetWork, AsyncExec) {
     Config config;
     config.options.var_sanity_check_first_run = false;
diff --git a/src/core/impl/graph/cg_impl.cpp b/src/core/impl/graph/cg_impl.cpp
index b175489dce2426ae8c3816ad0c6efedd436e3109..add192779c6609bde3be6d0531791f48cba79609 100644
--- a/src/core/impl/graph/cg_impl.cpp
+++ b/src/core/impl/graph/cg_impl.cpp
@@ -507,13 +507,12 @@ void ComputingGraphImpl::dest_var_optimize(VarNodeArray& dest_vars) {
             i->add_flag(F::NO_MEM_RECLAIM);
         }
     }
-    if (dest_vars[0]->owner_graph()->options().force_output_write_to_user_memory) {
+    if (dest_vars[0]->owner_graph()->options().force_output_use_user_specified_memory) {
         for (auto&& i : dest_vars) {
             mgb_assert(
                     !i->contain_flag(F::RT_FORCE_DYNAMIC_MEM_ALLOC),
-                    "var %s with force dynamic allocate should be set to write output "
-                    "to "
-                    "user memory",
+                    "var %s with RT_FORCE_DYNAMIC_MEM_ALLOC flag should not be "
+                    "forced to write output to user-specified memory",
                     i->cname());
             i->add_flag(
                     F::NO_SYS_MEM_ALLOC | F::NO_SYS_STATIC_MEM_ALLOC |
diff --git a/src/core/impl/graph/var_node.cpp b/src/core/impl/graph/var_node.cpp
index 5c8909b51fa77693390621ebbe0359f25c541c74..833da401d6f7c8faedd80698d8a8dd883a5bdbd9 100644
--- a/src/core/impl/graph/var_node.cpp
+++ b/src/core/impl/graph/var_node.cpp
@@ -574,6 +574,10 @@ MemAllocPlan& VarNode::init_mem_plan(const DeviceTensorND* fixed_alloc) {
     return m_mem_plan;
 }
 
+bool VarNode::is_graph_dest_varnode() {
+    return ComputingGraphImpl::downcast(owner_graph())->var_receiver(this).size() == 0;
+}
+
 VarNode& VarNode::add_flag(Flag flag) {
     modify_flag(flag, m_flag | flag);
     return *this;
@@ -582,10 +586,13 @@ VarNode& VarNode::add_flag(Flag flag) {
 void VarNode::modify_flag(Flag delta, Flag new_flag) {
     if (contain_flag(Flag::FLAG_FREEZED)) {
         mgb_assert(
-                (delta & (Flag::NO_SYS_MEM_ALLOC | Flag::NO_MEM_RECLAIM |
-                          Flag::NO_SYS_STATIC_MEM_ALLOC |
-                          Flag::RT_FORCE_DYNAMIC_MEM_ALLOC)) == delta ||
-                (new_flag & Flag::MEMORY_NO_NEED));
+                (delta & (Flag::NO_MEM_RECLAIM | Flag::NO_SYS_STATIC_MEM_ALLOC |
+                          Flag::RT_FORCE_DYNAMIC_MEM_ALLOC | Flag::MEMORY_NO_NEED)) ==
+                                delta ||
+                        is_graph_dest_varnode(),
+                "after the FLAG_FREEZED flag is set, a var can only modify the "
+                "NO_MEM_RECLAIM, NO_SYS_STATIC_MEM_ALLOC, RT_FORCE_DYNAMIC_MEM_ALLOC "
+                "and MEMORY_NO_NEED flags, unless it is a graph dest var.");
 
         mgb_assert(
                 !ComputingGraphImpl::downcast(owner_graph())
diff --git a/src/core/include/megbrain/graph/cg.h b/src/core/include/megbrain/graph/cg.h
index 9d6c846fd7cdecaab72d15b9bec2e6eb270b59c5..ae1d4a16554eb8e598b682394781da95cf1c16c6 100644
--- a/src/core/include/megbrain/graph/cg.h
+++ b/src/core/include/megbrain/graph/cg.h
@@ -421,7 +421,7 @@ public:
          * Force the output to be written to the user specified memory, which
          * can optimize the copy of output data at one time
          */
-        bool force_output_write_to_user_memory = false;
+        bool force_output_use_user_specified_memory = false;
 
         //! whether to perform var sanity check on first run
         bool var_sanity_check_first_run = true;
diff --git a/src/core/include/megbrain/graph/var_node.h b/src/core/include/megbrain/graph/var_node.h
index 611f482a1a94af430af54d243b138379ab21555a..74db600a267aa34eb72968223c6b91265c716c3f 100644
--- a/src/core/include/megbrain/graph/var_node.h
+++ b/src/core/include/megbrain/graph/var_node.h
@@ -549,6 +549,10 @@ private:
     MGE_WIN_DECLSPEC_FUC void modify_flag(Flag delta, Flag new_flag);
 
+    //! whether this var is a graph dest (output) var; if so, the
+    //! NO_SYS_MEM_ALLOC flag can still be modified after FLAG_FREEZED
+    bool is_graph_dest_varnode();
+
     MGE_WIN_DECLSPEC_FUC void assign_dev_tensor_from_tensor(
             const DeviceTensorND& value);
 
diff --git a/src/gopt/test/no_memory_copy.cpp b/src/gopt/test/no_memory_copy.cpp
index f1c5f00678c3d74a7a17246fcc6df26085630c79..5e19f76467eea022695d67c47ace357a038797c5 100644
--- a/src/gopt/test/no_memory_copy.cpp
+++ b/src/gopt/test/no_memory_copy.cpp
@@ -82,7 +82,7 @@ TEST(TestNoCopy, BasicInputNoCopy) {
 TEST(TestNoCopy, IONoCopyPtrEQ) {
     auto test_graph = TestGraph();
     auto compute_graph = test_graph.m_network->graph;
-    compute_graph->options().force_output_write_to_user_memory = true;
+    compute_graph->options().force_output_use_user_specified_memory = true;
     test_graph.create_graph();
     auto func = test_graph.compile_without_copy();
     auto&& outvar = func->get_output_vars()[0];
@@ -123,7 +123,7 @@ TEST(TestNoCopy, IONoCopyPtrEQ) {
 TEST(TestNoCopy, IONoCopyCorrect) {
     auto test_graph = TestGraph();
     auto compute_graph = test_graph.m_network->graph;
-    compute_graph->options().force_output_write_to_user_memory = true;
+    compute_graph->options().force_output_use_user_specified_memory = true;
     test_graph.create_graph();
     HostTensorND truth;
     auto func = test_graph.compile_without_copy();
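For completeness, a rough sketch of the graph-level contract behind the renamed option, mirroring what the Lite reset callback above does: once `force_output_use_user_specified_memory` is set on `ComputingGraph::options()` and the outputs are compiled with a nullptr callback, the caller has to hand every dest var a `DeviceTensorND` over its own memory before each execution. The function below is illustrative only; `func`, `out_var`, and `user_dv` are assumed to come from the surrounding application, as in the TestGraph helper used by the gopt tests.

```cpp
#include "megbrain/graph.h"

// Sketch under the assumptions stated above: redirect one graph output var to
// caller-owned memory, then run the compiled function once.
void run_step_into_user_memory(
        mgb::cg::AsyncExecutable* func, mgb::cg::VarNode* out_var,
        mgb::DeviceTensorND& user_dv) {
    // must be repeated before every execution, just like the Lite reset callback
    user_dv.comp_node(out_var->comp_node(), true);
    out_var->init_mem_plan(&user_dv);
    out_var->reset_dev_tensor_from_tensor(user_dv);

    func->execute();
    func->wait();
}
```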