diff --git a/imperative/python/megengine/jit/tracing.py b/imperative/python/megengine/jit/tracing.py
index c95f5402ff73aebff38472f065e66943ba145a07..2d3c0ca3de85ce705db9b5d86cb0ff4413df4669 100644
--- a/imperative/python/megengine/jit/tracing.py
+++ b/imperative/python/megengine/jit/tracing.py
@@ -170,9 +170,9 @@ class trace:
         self._graph = None
         self._need_reset_nodes = None
         self._lazy_eval_graph = None
-        self._lazy_eval_tensors = {}
+        self._lazy_eval_tensors = set()
         self._lazy_eval_links = None
-        self._active_tensors = {}
+        self._active_tensors = set()
         self._tensor_remaps = None
         self._inputs_to_restore = None
         self._arg_bindings = None
@@ -258,7 +258,7 @@ class trace:
             y._compiled_info = CompiledTensorProxy(h)
             y._mixin_handle = h
             outputs += [y]
-            self._active_tensors[h] = TensorWeakRef(y)
+            self._active_tensors.add(TensorWeakRef(y))
         self._output_handles.update(ohandles)
         return outputs
 
@@ -318,9 +318,9 @@ class trace:
             x._mixin_handle = h
             x._recording = True
             x._trace_mixin_info = info
-            self._active_tensors[h] = TensorWeakRef(x)
+            self._active_tensors.add(TensorWeakRef(x))
             if self._symbolic:
-                self._lazy_eval_tensors[h] = TensorWeakRef(x)
+                self._lazy_eval_tensors.add(TensorWeakRef(x))
 
         self._seq.append((op, tuple(ihandles), tuple(ohandles)))
 
@@ -345,7 +345,7 @@ class trace:
         x._recording = True
         x._trace_mixin_info = info
         if self._symbolic:
-            self._lazy_eval_tensors[h] = TensorWeakRef(x)
+            self._lazy_eval_tensors.add(TensorWeakRef(x))
         self._seq.append(("Const", tuple(), tuple(ohandles)))
 
     def _set_active(self, active: bool):
@@ -365,17 +365,14 @@ class trace:
             self._lazy_eval_links = ()
 
     def _take_escaped_tensors(self):
-        escaped_tensors = tuple(
-            filter(lambda x: x() is not None, self._active_tensors.values())
-        )
+        escaped_tensors = tuple(filter(lambda x: x() is not None, self._active_tensors))
         self._active_tensors.clear()
         return escaped_tensors
 
     def _lazy_eval(self, lazy_eval_graph, lazy_eval_tensors, lazy_eval_links):
-        lazy_eval_tensors = list(
-            filter(lambda x: x() is not None, lazy_eval_tensors.values())
-        )
-        readers = [G.OutputNode(x()._varnode).outputs[0] for x in lazy_eval_tensors]
+        lazy_eval_tensors = [x() for x in lazy_eval_tensors]
+        lazy_eval_tensors = [x for x in lazy_eval_tensors if x is not None]
+        readers = [G.OutputNode(x._varnode).outputs[0] for x in lazy_eval_tensors]
         self._apply_graph_options(lazy_eval_graph)
         lazy_eval_graph.options.graph_opt_level = self._graph_opt_level
         lazy_eval_graph._set_priority_to_id([*lazy_eval_links, *readers])
@@ -383,8 +380,8 @@ class trace:
         lazy_eval_graph()
         for r, x in zip(readers, lazy_eval_tensors):
             # get values from lazy_eval_graph and assign to lazy_eval tensor
-            x()._handle = RawTensor(r.op.get_value())._handle
-            x()._reset_varnode()
+            x._handle = RawTensor(r.op.get_value())._handle
+            x._reset_varnode()
 
     @contextlib.contextmanager
     def _setup(self):
@@ -454,13 +451,14 @@ class trace:
                 raise TraceMismatchError("premature end")
             if not self._symbolic or not self._untraced:
                 # reset output tensors
-                for x in self._active_tensors.values():
-                    if x() is not None:
-                        x()._dev_tensor()
-                        x()._reset_varnode()
-                        x()._mixin_handle = -1
-                        x()._recording = False
-                        x()._trace_mixin_info = None
+                for x in self._active_tensors.copy():
+                    strong_x = x()
+                    if strong_x is not None:
+                        strong_x._dev_tensor()
+                        strong_x._reset_varnode()
+                        strong_x._mixin_handle = -1
+                        strong_x._recording = False
+                        strong_x._trace_mixin_info = None
 
         try:
             do_enter()
@@ -482,15 +480,17 @@ class trace:
         if self._untraced:
             # conditionally reading a compiled tensor in excluded region
             # is permitted, so we have to assume every tensor might be read
-            for x in self._active_tensors.values():
-                if x():
-                    info = self._tinfo[x()._mixin_handle]
+            for x in self._active_tensors:
+                strong_x = x()
+                if strong_x:
+                    info = self._tinfo[strong_x._mixin_handle]
                     info.exported = True
                     info.data_read = True
         else:
-            for x in self._active_tensors.values():
-                if x():
-                    x()._dev_tensor()
+            for x in self._active_tensors:
+                strong_x = x()
+                if strong_x:
+                    strong_x._dev_tensor()
 
     def _apply_graph_options(self, graph):
@@ -520,7 +520,6 @@ class trace:
         graph = self._graph = G.Graph()
         graph.options.async_exec_level = 0b100
         self._apply_graph_options(graph)
-        # graph.options.graph_opt_level = 0
         need_reset_nodes = self._need_reset_nodes = []
         # links enforce ordering of I/O nodes
         in_out_links = ()
@@ -563,7 +562,7 @@ class trace:
             if not hasattr(info, "varnode"):
                 assert info.external
                 if info.bound_data:
-                    if hasattr(info, "is_const") and info.is_const:
+                    if getattr(info, "is_const", False):
                         info.varnode = graph.make_const(
                             info.bound_data.numpy(),
                             info.bound_data.dtype,
@@ -635,30 +634,12 @@ class trace:
                 opnode.reset()
 
     def __call__(self, *args, **kwargs):
-        if is_tracing():
-            return self.__wrapped__(*args, **kwargs)
         with self._setup():
             if self._capture_as_const:
                 self._process_inputs(*args, **kwargs)
             outputs = self.__wrapped__(*args, **kwargs)
             if self._capture_as_const:
                 self._process_outputs(outputs)
-
-            # outputs could be None
-            if outputs is not None:
-                list_outputs = outputs
-                if isinstance(outputs, collections.abc.Mapping):
-                    _, list_outputs = zip(*sorted(outputs.items()))
-                elif not isinstance(outputs, collections.abc.Sequence):
-                    list_outputs = (outputs,)
-
-                for o in list_outputs:
-                    # if outputs are copied, then use the newest info in trace data structure
-                    if o._copied:
-                        self._active_tensors[o._mixin_handle] = TensorWeakRef(o)
-                        if self._untraced and self._symbolic:
-                            self._lazy_eval_tensors[o._mixin_handle] = TensorWeakRef(o)
-
             return outputs
 
     def dump(
diff --git a/imperative/python/src/grad.cpp b/imperative/python/src/grad.cpp
index 80e37376bea71268f517eab0bda51301df72cae6..cffdbc5710731de1a40b4763378eb3852747ba59 100644
--- a/imperative/python/src/grad.cpp
+++ b/imperative/python/src/grad.cpp
@@ -9,11 +9,12 @@
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
 
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+
 #include "./grad.h"
 #include "megbrain/imperative/proxy_graph_detail.h"
 #include "megbrain/imperative/backward_graph_opt.h"
 #include "megbrain/imperative/ops/autogen.h"
-#include "megbrain/imperative/ops/utility.h"
 #include "megbrain/utils/mempool.h"
 
 #include "range/v3/all.hpp"
@@ -434,7 +435,8 @@ apply_result_t apply_grad(ApplyContext& ctx) {
         if (backward.output_requires_grad(i)) {
             if (backward.output_captured(i)) {
                 // avoid reference cycle [Tensor <-> GradFn]
-                outputs[i] = outputs[i]->copy();
+                static std::shared_ptr<OpDef> op = std::shared_ptr<OpDef>(new FastpathCopy());
+                outputs[i] = python::apply(op, outputs[i])[0];
             }
             // populate grad info of output tensor
             auto& grad_info = outputs[i]->m_grad_info;
diff --git a/imperative/python/src/grad.h b/imperative/python/src/grad.h
index 1780f311c73146f688af0abc68f97677ad1eb5df..a3fb58e181d68f38e13236c426af0a2e2ae08042 100644
--- a/imperative/python/src/grad.h
+++ b/imperative/python/src/grad.h
@@ -12,6 +12,7 @@
 #pragma once
 
 #include "./tensor.h"
+#include "megbrain/imperative/ops/utility.h"
 
 #include
 #include
diff --git a/imperative/python/src/grad_override.cpp b/imperative/python/src/grad_override.cpp
index ccc5a5078268b37abd7c829daf47fba7bf2ec6c5..5a54623eec02c65687e56b42a0e40f7c5333b810 100644
--- a/imperative/python/src/grad_override.cpp
+++ b/imperative/python/src/grad_override.cpp
@@ -221,6 +221,21 @@ apply_result_t removeAxis_grad_rule(ApplyContext& ctx, CustomBackward::Maker& ma
     return apply(ctx);
 }
 
+apply_result_t fastpathcopy_grad_rule(ApplyContext& ctx, CustomBackward::Maker& maker) {
+    mgb_assert(ctx.nargs == 1);
+    maker.output_size(1).output_captured(0, false);
+    maker.backward([](BackwardContext&, Tensor*const* grads, size_t ngrads) {
+        mgb_assert(ngrads == 1);
+        Tensor* grad = grads[0];
+        apply_result_t ret(1);
+        if (grad) {
+            ret[0] = grad->shared_from_this();
+        }
+        return ret;
+    });
+    return apply(ctx);
+}
+
 struct Init {
     Init() {
         auto& reg = grad_rule_registry();
@@ -231,6 +246,7 @@ struct Init {
         reg.emplace(Reduce::typeinfo(), reduce_grad_rule);
         reg.emplace(AddAxis::typeinfo(), addAxis_grad_rule);
         reg.emplace(RemoveAxis::typeinfo(), removeAxis_grad_rule);
+        reg.emplace(FastpathCopy::typeinfo(), fastpathcopy_grad_rule);
     }
 } _;
 
diff --git a/imperative/python/src/graph_rt.cpp b/imperative/python/src/graph_rt.cpp
index 15768f7ecb005c76bebc90a0ed2afee9411a2a54..6369b4903a478c67b0d7f9edba6d43de0526055c 100644
--- a/imperative/python/src/graph_rt.cpp
+++ b/imperative/python/src/graph_rt.cpp
@@ -23,6 +23,7 @@
 #include "./common.h"
 #include "./ops.h"
 #include "megbrain/gopt/inference.h"
+#include "megbrain/imperative/ops/utility.h"
 
 namespace py = pybind11;
 
diff --git a/imperative/python/src/tensor.cpp b/imperative/python/src/tensor.cpp
index 5a3da4112fb1a16d71d432d16e1f68ca297dd9ea..9ba144fc0eb5887fb051e0662bf2a935489323b2 100644
--- a/imperative/python/src/tensor.cpp
+++ b/imperative/python/src/tensor.cpp
@@ -118,9 +118,18 @@ apply_result_t apply(ApplyContext& ctx) {
         handles[i] = ctx.args[i]->m_handle.get();
     }
 
+    apply_result_t outputs;
+
+    // fast copy without really applying
+    if (ctx.op->same_type<FastpathCopy>()) {
+        mgb_assert(ctx.nargs == 1);
+        outputs.reserve(ctx.nargs);
+        outputs.emplace_back(std::make_shared<Tensor>(ctx.args[0]->m_handle));
+        return outputs;
+    }
+
     auto output_handles = interpreter_for_py->apply_op(ctx.op, handles);
 
-    apply_result_t outputs;
     outputs.reserve(output_handles.size());
     for (auto h : output_handles) {
         outputs.emplace_back(std::make_shared<Tensor>(h));
     }
@@ -303,11 +312,6 @@
 REGISTE_TENSORWRAPPER_FUNC(bool, recording)
 
 #undef REGISTE_TENSORWRAPPER_FUNC
-PyObject* TensorWrapper::copied() {
-    return py::cast(m_tensor->m_trace_info.copied).release().ptr();
-}
-
-
 #define REGISTE_TENSORWRAPPER_PYOBJECT_FUNC(member) \
     PyObject* TensorWrapper::member() { \
         if (m_tensor->m_trace_info.member) { \
@@ -841,7 +845,6 @@ void init_tensor(py::module m) {
         .def<&TensorWrapper::reset_varnode>("_reset_varnode")
         .def<&TensorWrapper::_use_cnt>("_use_cnt")
         .def_getset<&TensorWrapper::varnode>("_varnode")
-        .def_getset<&TensorWrapper::copied>("_copied")
         .def_getset<&TensorWrapper::mixin_handle, &TensorWrapper::set_mixin_handle>("_mixin_handle")
         .def_getset<&TensorWrapper::recording, &TensorWrapper::set_recording>("_recording")
         .def_getset<&TensorWrapper::handle, &TensorWrapper::set_handle>("_handle")
diff --git a/imperative/python/src/tensor.h b/imperative/python/src/tensor.h
index f2a36568935b88829a5eb8ffb7b3ede6d24a400f..2b0d0be10a708804aff4a0f5e88b6f7f2da40e50 100644
--- a/imperative/python/src/tensor.h
+++ b/imperative/python/src/tensor.h
@@ -10,6 +10,7 @@
  */
 
 #pragma once
+#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
 
 #include
diff --git a/imperative/python/src/trace.cpp b/imperative/python/src/trace.cpp
index 9571c5cda77024ba861e2017df161ce7c7a3c4a3..8f597f1b1aa2671a80d3c3aefaeeecc45c58045a 100644
--- a/imperative/python/src/trace.cpp
+++ b/imperative/python/src/trace.cpp
@@ -35,7 +35,7 @@ apply_result_t apply_trace(ApplyContext& ctx) {
 
     // assumption: python function always returns PyList
     auto tup = py::reinterpret_borrow<py::list>(ret);
-    for (auto i = 0; i < tup.size(); i++) {
+    for (size_t i = 0; i < tup.size(); i++) {
         auto pitem = tup[i].cast();
         outputs.emplace_back(std::make_shared<Tensor>(pitem));
     }
diff --git a/imperative/python/src/trace_info.h b/imperative/python/src/trace_info.h
index 11736ca7bec6a1c4a195b9ee54ba047e9c7d271d..e394d344de7ee5c52908ee41fbfd07b55882b9c2 100644
--- a/imperative/python/src/trace_info.h
+++ b/imperative/python/src/trace_info.h
@@ -17,7 +17,6 @@ namespace mgb::imperative::python {
 struct TraceInfo {
     int64_t mixin_handle = -1;
     bool recording = false;
-    bool copied = false;
 
     // refer to CompiledTensorProxy in tracing.py, works from second trace step
     PyObject* compiled_info = nullptr;
@@ -35,7 +34,6 @@ struct TraceInfo {
         compiled_info = that.compiled_info;
         Py_XINCREF(compiled_info);
 
-        copied = true;
 
         return *this;
     }
diff --git a/imperative/src/impl/ops/utility.cpp b/imperative/src/impl/ops/utility.cpp
index d6084e339a0e4343037e8a70d0ee71e86c5fcadb..72e72abc862a231d95dce3146e213b061a763a75 100644
--- a/imperative/src/impl/ops/utility.cpp
+++ b/imperative/src/impl/ops/utility.cpp
@@ -18,4 +18,18 @@ namespace mgb::imperative {
 
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(GenericPyOp);
 
+namespace { namespace fastpathcopy {
+    auto apply_on_var_node(
+            const OpDef& def,
+            const VarNodeArray& inputs) {
+        return inputs;
+    }
+
+OP_TRAIT_REG(FastpathCopy,FastpathCopy)
+    .apply_on_var_node(apply_on_var_node)
+    .fallback();
+}} // fastpathcopy
+
+MGB_DYN_TYPE_OBJ_FINAL_IMPL(FastpathCopy);
+
 } // namespace mgb::imperative
diff --git a/imperative/src/include/megbrain/imperative/ops/utility.h b/imperative/src/include/megbrain/imperative/ops/utility.h
index 609995a4ed6948df37d3fd346d46b1ffd40cfe8f..ba85f3665d259c6e52ae565498c233b9b86bd69b 100644
--- a/imperative/src/include/megbrain/imperative/ops/utility.h
+++ b/imperative/src/include/megbrain/imperative/ops/utility.h
@@ -35,4 +35,18 @@ struct GenericPyOp final : OpDefImplBase<GenericPyOp> {
     MGB_DYN_TYPE_OBJ_FINAL_DECL;
 };
 
+struct FastpathCopy final : OpDefImplBase<FastpathCopy> {
+    FastpathCopy() = default;
+
+    size_t hash() const override {
+        return mgb::hash(this->dyn_typeinfo());
+    }
+
+    bool is_same_st(const Hashable& rhs) const override {
+        return this->dyn_typeinfo() == rhs.dyn_typeinfo();
+    }
+
+    MGB_DYN_TYPE_OBJ_FINAL_DECL;
+};
+
 } // namespace mgb::imperative
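
A note on the tracing.py hunks above: _active_tensors and _lazy_eval_tensors no longer map handles to weak references; they are plain sets of TensorWeakRef, because the trace only ever needs to know which produced tensors are still alive when a step ends (see _take_escaped_tensors), never to look one up by handle. A minimal stand-alone sketch of that pattern, assuming nothing about MegEngine internals (Tracker and FakeTensor are invented names for illustration only):

    import weakref

    class Tracker:
        """Tracks objects produced during a trace step without keeping them alive."""

        def __init__(self):
            self._active = set()  # set of weakref.ref, like trace._active_tensors

        def track(self, tensor):
            # a weak reference does not extend the tensor's lifetime
            self._active.add(weakref.ref(tensor))

        def take_escaped(self):
            # keep only referents that are still alive, then reset the set
            escaped = tuple(t for t in (r() for r in self._active) if t is not None)
            self._active.clear()
            return escaped

    class FakeTensor:
        pass

    tracker = Tracker()
    x, y = FakeTensor(), FakeTensor()
    tracker.track(x)
    tracker.track(y)
    del y  # dies before the step ends, so its weak reference now resolves to None
    assert len(tracker.take_escaped()) == 1  # only x "escaped" the trace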
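
Related to the same hunks: the repeated x() dereferences were replaced by a single strong_x = x(). Holding one strong reference for the whole block keeps the tensor alive between the None check and the attribute accesses that follow it. A small illustrative sketch of the hazard being avoided (not MegEngine code):

    import weakref

    class T:
        def read(self):
            return "ok"

    def fragile(ref):
        # dereferencing twice: in principle the object could be collected
        # between the check and the second call once its last strong
        # reference is dropped elsewhere
        if ref() is not None:
            return ref().read()

    def robust(ref):
        strong = ref()  # take one strong reference up front, like strong_x = x()
        if strong is not None:
            return strong.read()

    t = T()
    r = weakref.ref(t)
    assert fragile(r) == "ok"
    assert robust(r) == "ok"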
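
Finally, the new FastpathCopy op replaces outputs[i]->copy() in apply_grad: the fast path in tensor.cpp re-wraps the input's storage handle in a fresh Tensor without touching device data, and fastpathcopy_grad_rule forwards the incoming gradient unchanged, which is what breaks the Tensor <-> GradFn reference cycle without a real copy. A rough Python analogue of that behaviour, with invented names (Box, fastpath_copy) used purely for illustration and no claim to match the real API:

    class Box:
        """Stands in for a Tensor: a thin wrapper around a shared storage handle."""

        def __init__(self, handle):
            self.handle = handle
            self.grad_fn = None  # backward hook, loosely analogous to GradFn

    def fastpath_copy(box):
        # new wrapper, same underlying handle: no data is copied, and the new
        # wrapper holds no reference back to the old wrapper's grad_fn, which
        # is what breaks the reference cycle
        out = Box(box.handle)

        def backward(grad):
            # identity rule, like fastpathcopy_grad_rule: pass the gradient through
            return grad

        out.grad_fn = backward
        return out

    a = Box(handle=object())
    b = fastpath_copy(a)
    assert b.handle is a.handle   # same storage
    assert b is not a             # distinct wrapper object
    assert b.grad_fn(1.5) == 1.5  # gradient forwarded unchanged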