Commit 6b380e89 authored by: Megvii Engine Team

feat(mge/imperative): run oss test and restore cmake list build items

GitOrigin-RevId: 11411b6964185700ead75f8f4319d28adaf64907
Parent: 03808112
@@ -247,10 +247,6 @@ if(MGE_BUILD_IMPERATIVE_RT)
set(CMAKE_CXX_STANDARD 17)
endif()
if(MGE_BUILD_IMPERATIVE_RT)
set(MGE_BUILD_SDK OFF)
endif()
if(NOT MGE_WITH_CUDA)
message("-- Disable distributed support, as CUDA is not enabled.")
set(MGE_WITH_DISTRIBUTED OFF)
@@ -697,9 +693,7 @@ if(MGE_WITH_PYTHON_MODULE)
endif()
if(MGE_WITH_TEST AND MGE_ENABLE_RTTI)
if(NOT MGE_BUILD_IMPERATIVE_RT)
add_subdirectory(test)
endif()
add_subdirectory(test)
endif()
if(TARGET mgb)
@@ -66,9 +66,7 @@ if(MGE_WITH_CUDA)
endif()
if(MGE_WITH_TEST)
if(NOT MGE_BUILD_IMPERATIVE_RT)
add_subdirectory(test)
endif()
add_subdirectory(test)
endif()
add_subdirectory(src)
Makefile
/test/imperative_test
*.so
/python/megengine/core/ops/_internal/generated_ops.py
/python/megengine/core/ops/_internal/param_defs.py
find_package(NumPy REQUIRED)
set(PACKAGE_NAME megengine)
set(PACKAGE_NAME ${PACKAGE_NAME} PARENT_SCOPE)
set(MODULE_NAME _imperative_rt)
set(MODULE_NAME ${MODULE_NAME} PARENT_SCOPE)
file(GLOB_RECURSE SRCS src/impl/*.cpp src/include/*.h python/src/*.cpp python/src/*.h)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMGB_WITH_IMPERATIVE=1")
file(GLOB_RECURSE OPR_DECL_SRCS "${PROJECT_SOURCE_DIR}/src/**/*.oprdecl")
file(GLOB_RECURSE PYTHON_SRCS python/${PACKAGE_NAME}/*.py)
list(REMOVE_ITEM PYTHON_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/ops/_internal/generated_ops.py ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/ops/_internal/param_defs.py)
file(GLOB_RECURSE ALL_HEADERS src/cpp/megbrain_pubapi.h
${PROJECT_SOURCE_DIR}/src/core/include/*
${PROJECT_SOURCE_DIR}/src/opr/include/*
${PROJECT_SOURCE_DIR}/src/serialization/include/*
${PROJECT_SOURCE_DIR}/src/plugin/include/*
${PROJECT_SOURCE_DIR}/dnn/include/*)
set(MEGENGINE_DIR ${CMAKE_CURRENT_BINARY_DIR}/python/)
set(GEN_OPS_DIR ${MEGENGINE_DIR}/${PACKAGE_NAME}/core/ops/_internal)
file(MAKE_DIRECTORY ${GEN_OPS_DIR})
set(GEN_OPS_FILE ${GEN_OPS_DIR}/generated_ops.py)
set(GEN_OP_PARAMS_FILE ${MEGENGINE_DIR}/${PACKAGE_NAME}/core/ops/_internal/param_defs.py)
set(GEN_OP_PARAMS_TEMPLATE ${CMAKE_CURRENT_SOURCE_DIR}/python/tools/ops.tpl.py)
##################### generate python opr_param_defs.py ##############
file(COPY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(READ ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py CONTENTS)
file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/opr_param_defs.py ${CONTENTS})
add_custom_command(
OUTPUT ${GEN_OPS_FILE}
COMMAND ${CMAKE_COMMAND} -E touch ${MEGENGINE_DIR}/${PACKAGE_NAME}/core/${MODULE_NAME}.so ${GEN_OPS_FILE} ${GEN_OP_PARAMS_FILE}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/${PACKAGE_NAME} ${MEGENGINE_DIR}/${PACKAGE_NAME}
COMMAND ${CMAKE_COMMAND} -E remove -f ${MEGENGINE_DIR}/${PACKAGE_NAME}/core/${MODULE_NAME}.so ${GEN_OPS_FILE} ${GEN_OP_PARAMS_FILE}
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/python/tools/gen_ops.py ${OPR_DECL_SRCS} -o ${GEN_OPS_FILE}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test ${MEGENGINE_DIR}/${PACKAGE_NAME}/test
COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_param_defs.py -t py --imperative ${CMAKE_CURRENT_BINARY_DIR}/opr_param_defs.py ${GEN_OP_PARAMS_FILE}
DEPENDS ${OPR_DECL_SRCS} ${PYTHON_SRCS} ${ALL_HEADERS} ${GEN_OP_PARAMS_TEMPLATE}
VERBATIM
)
add_custom_target(gen_opr_py DEPENDS ${GEN_OPS_FILE})
##################### generate opdef c header and python binding ##############
set(OP_DEF_HEADER_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/include)
file(MAKE_DIRECTORY ${OP_DEF_HEADER_OUT_DIR}/megbrain/imperative/opdef)
set(OP_DEF_HEADER ${OP_DEF_HEADER_OUT_DIR}/megbrain/imperative/opdef/all.h)
set(OP_DEF_PYTHON_BINDING_OUT_DIR ${MEGENGINE_DIR}/${PACKAGE_NAME}/src)
file(MAKE_DIRECTORY ${OP_DEF_PYTHON_BINDING_OUT_DIR})
set(OP_DEF_PYTHON_BINDING ${OP_DEF_PYTHON_BINDING_OUT_DIR}/opdef.inl)
set(OP_PARAM_DEF ${CMAKE_CURRENT_BINARY_DIR}/opr_param_defs.py)
set(GEN_OP_DEF_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/python/tools/gen_op_defs.py)
add_custom_command(
OUTPUT ${OP_DEF_HEADER} ${OP_DEF_PYTHON_BINDING}
COMMAND ${PYTHON_EXECUTABLE} ${GEN_OP_DEF_SCRIPT} ${OP_PARAM_DEF} ${OP_DEF_HEADER}
COMMAND ${PYTHON_EXECUTABLE} ${GEN_OP_DEF_SCRIPT} -t py ${OP_PARAM_DEF} ${OP_DEF_PYTHON_BINDING}
DEPENDS ${GEN_OP_DEF_SCRIPT} ${OP_PARAM_DEF}
VERBATIM
)
add_custom_target(gen_op_def_internal DEPENDS ${OP_DEF_HEADER} ${OP_DEF_PYTHON_BINDING})
add_library(gen_op_def INTERFACE)
target_include_directories(gen_op_def INTERFACE ${OP_DEF_HEADER_OUT_DIR} ${OP_DEF_PYTHON_BINDING_OUT_DIR})
add_dependencies(gen_op_def gen_op_def_internal)
##################### end of opdef generation #########################
set(VERSION_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/src/version.ld)
add_custom_target(_version_ld SOURCES ${VERSION_SCRIPT})
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11)
pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS})
target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT})
if (MGE_WITH_DISTRIBUTED)
message("Imperative configured to link megray")
target_link_libraries(${MODULE_NAME} PRIVATE megray)
endif()
target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR})
target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME})
target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter)
if(CXX_SUPPORT_WCLASS_MEMACCESS)
target_compile_options(${MODULE_NAME} PRIVATE "-Wno-class-memaccess")
endif()
set_target_properties(${MODULE_NAME} PROPERTIES
SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}
LIBRARY_OUTPUT_DIRECTORY ${MEGENGINE_DIR}/${PACKAGE_NAME}/core
)
add_dependencies(${MODULE_NAME} gen_opr_py _version_ld)
if(MGE_WITH_TEST AND MGE_ENABLE_RTTI)
add_subdirectory(test)
endif()
add_custom_command(
TARGET ${MODULE_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/LICENSE ${PROJECT_SOURCE_DIR}/ACKNOWLEDGMENTS ${PROJECT_BINARY_DIR}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine ${CMAKE_CURRENT_BINARY_DIR}/python/megengine
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test ${CMAKE_CURRENT_BINARY_DIR}/python/test
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py ${CMAKE_CURRENT_BINARY_DIR}/python/setup.py
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires.txt
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt
)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import os
import sys
from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
from .device import *
from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
from .serialization import load, save
from .tensor import Tensor, tensor
from .tensor_nn import Buffer, Parameter
from .version import __version__
_set_fork_exec_path_for_timed_func(
sys.executable,
os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
)
del _set_fork_exec_path_for_timed_func
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import os
import sys
from .tensor import Tensor
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import numpy as np
from ._imperative_rt import CompNode
class Device:
def __init__(self, device=None):
if device is None:
self._cn = CompNode()
elif isinstance(device, Device):
self._cn = device._cn
elif isinstance(device, CompNode):
self._cn = device
else:
self._cn = CompNode(device)
def to_c(self):
return self._cn
def __repr__(self):
return "{}({})".format(type(self).__qualname__, self)
def __str__(self):
return str(self._cn)
def __hash__(self):
return hash(str(self._cn))
def __eq__(self, rhs):
if not isinstance(rhs, Device):
rhs = Device(rhs)
return str(self._cn) == str(rhs._cn)
def device(obj):
if isinstance(obj, Device):
return obj
return Device(obj)
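# A minimal, hedged usage sketch for Device/device above (assumes a built
# ``_imperative_rt`` extension; "xpux" is the usual "any device" string):
#
#     d = Device("xpux")
#     assert device(d) is d           # device() passes Device instances through
#     assert Device(d.to_c()) == d    # CompNode and string forms compare equal by name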
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import functools
import itertools
import numpy as np
from .._imperative_rt import TensorAttr, imperative
from ..ops.builtin import Elemwise, GetVarShape, OpDef, OprAttr, Reduce, Reshape
from ..tensor.core import apply
from ..tensor.function import Function
@functools.singledispatch
def builtin_op_get_backward_fn(op: OpDef, inputs, outputs, input_requires_grad):
assert 0
_elemwise_add_param = Elemwise(mode="add").to_c().param
@builtin_op_get_backward_fn.register(OpDef)
def _(op: OpDef, inputs, outputs, input_requires_grad):
if (
isinstance(op, OprAttr)
and op.type == "Elemwise"
and op.param == _elemwise_add_param
):
grad_fn = elemwise_grad_fn
elif isinstance(op, OprAttr) and op.type == Reshape.name:
grad_fn = reshape_grad_fn
else:
grad_fn = default_grad_fn
return grad_fn(op, inputs, outputs, input_requires_grad)
@builtin_op_get_backward_fn.register(Function)
def _(op: Function, inputs, outputs, input_requires_grad):
return op.get_backward_fn(), [True,] * len(outputs)
def default_grad_fn(op, inputs, outputs, input_requires_grad):
def get_tensor_attr(x):
attr = TensorAttr()
attr.dtype = x.dtype
attr.comp_node = x.device.to_c()
return attr
output_has_grads = [True,] * len(outputs)
result = imperative.make_backward_graph(
op, list(map(get_tensor_attr, inputs)), input_requires_grad, output_has_grads
)
if result is None:
nr_inputs = len(inputs)
nr_outputs = len(outputs)
def backward(*args):
return nr_inputs * [
None,
]
return backward, nr_outputs * [False,]
backward_graph, save_for_backward_mask, input_has_grad = result
input_output_mask = save_for_backward_mask[: len(inputs + outputs)]
output_grad_mask = save_for_backward_mask[len(inputs + outputs) :]
save_for_backward = tuple(
val for val, mask in zip(inputs + outputs, input_output_mask) if mask
)
del inputs
del outputs
def backward(*args):
output_grads = tuple(val for val, mask in zip(args, output_grad_mask) if mask)
assert None not in output_grads
ret = iter(apply(backward_graph, *(save_for_backward + output_grads)))
return tuple(next(ret) if mask else None for mask in input_has_grad)
return backward, output_grad_mask
# override for elemwise
def elemwise_grad_fn(op, inputs, outputs, input_requires_grad):
assert len(inputs) == len(input_requires_grad) == 2
def get_shape(x):
(s,) = apply(GetVarShape(), x)
return s
input_shapes = [
get_shape(x) if i else None for i, x in zip(input_requires_grad, inputs)
]
def reduce_to(x, s):
(y,) = apply(Reduce(), x, s)
return y
def backward(dy):
return tuple(
reduce_to(dy, s) if i else None
for i, s in zip(input_requires_grad, input_shapes)
)
return backward, [True]
def reshape_grad_fn(op, inputs, outputs, input_requires_grad):
assert len(inputs) == len(input_requires_grad) == 2
def get_shape(x):
(s,) = apply(GetVarShape(), x)
return s
input_shapes = [
get_shape(x) if i else None for i, x in zip(input_requires_grad, inputs)
]
def reshape_to(dy, s):
(dx,) = apply(Reshape(), dy, s)
return dx
def backward(dy):
return tuple(
reshape_to(dy, s) if i else None
for i, s in zip(input_requires_grad, input_shapes)
)
return backward, [True]
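# Hedged illustration of the dispatch above (``a``, ``b``, ``c`` and ``dc`` are
# assumed tensors, with ``c = a + b``): an OprAttr whose param matches the
# elementwise add is routed to elemwise_grad_fn, everything else falls back to
# default_grad_fn via imperative.make_backward_graph.
#
#     backward, output_need_grad = builtin_op_get_backward_fn(
#         Elemwise(mode="add").to_c(), (a, b), (c,), [True, True]
#     )
#     da, db = backward(dc)   # dc is reduced back to a's and b's shapes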
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import functools
import heapq
import itertools
import typing
import weakref
import numpy as np
from ..ops.builtin import Elemwise, OpDef
from ..ops.special import Const
from ..tensor.core import TensorBase, TensorWrapperBase, apply
from ..tensor.function import Function
from ..tensor.tensor import Tensor, get_context
from . import builtin_op_utils
""" Some notes:
1. Initialize the optimizer:
for each trainable parameter:
call wrt(param, callback)
Each parameter tensor will be associated with a Tracer object saved in Tensor._extra_data
2. Tracer has one member: node, which is a VariableNode
3. VariableNode has a OpNode member: opnode
4. OpNode has these members:
a. id
b. inputs, a tuple of VariableNode
c. outputs, which are weakrefs to VariableNode
d. backward: the backward callback function
e. has_grad_fn: call has_grad_fn(opnode, reached) to check whether a grad exists
f. backward_allow_noinput: whether backward may be called even when all output gradients are None
"""
_grad_count = 0
_grad_manager_dict = weakref.WeakValueDictionary()
def get_grad_managers():
return [_grad_manager_dict[key] for key in _grad_manager_dict]
def add(a, b):
(c,) = apply(Elemwise(mode="add"), a, b)
return c
def get_tensor(x):
# use recursion to avoid infinite loop
if isinstance(x, Tensor):
return x
try:
x = x.__wrapped__
except AttributeError:
raise TypeError(type(x))
return get_tensor(x)
class Grad:
def __init__(self, name=None):
if name is None:
global _grad_count
self._name = "grad_" + str(_grad_count)
_grad_count += 1
else:
self._name = name
assert self._name not in _grad_manager_dict, "grad manager name duplicated"
_grad_manager_dict[self._name] = self
# list of all x in partial(y) / partial(x)
self.xs = []
# contains weak references to all OpNodes created during forward
# OpNode contains inputs, outputs and its backward
# ops forms the computational graph
self.ops = []
self._enabled = True
@property
def name(self):
return self._name
def wrt(self, *args: Tensor, callback=None):
""" Indicates the loss is a function of the input tensors (usually the net trainable parameters),
i.e., d (loss) / d (Tensor) != 0
callback is used to perform additional operations after gradient is obtained in backward.
e.g., copy the grad to a particular place
A VariableNode will be created and saved in the tensor's _extra_data slot.
"""
for x in map(get_tensor, args):
v = self._new_variable(x, callback=callback)
assert self not in x._extra_data
x._extra_data[self] = Tracer(v)
self.xs.append(v)
return self
def _new_variable(self, owner, opnode=None, callback=None):
return VariableNode(self, owner, opnode=opnode, callback=callback)
def _new_opnode(self, inputs, outputs):
inputs = tuple(inputs)
for i in inputs:
assert i is None or isinstance(i, VariableNode)
o = OpNode()
o.inputs = inputs
o.outputs = []
tracers = []
for i in outputs:
assert isinstance(i, Tensor)
v = self._new_variable(i, o)
o.outputs.append(weakref.ref(v))
tracers.append(Tracer(v))
self.ops.append(weakref.ref(o))
return o, tracers
def copy(self):
raise NotImplementedError
def __enter__(self):
return self
def __exit__(self, *_):
"""clear all resources"""
self._enabled = False
for o in self.ops:
o = o()
if o:
o.clear()
def __call__(self, ys, dys):
""" Defines Grad().
:param ys: outputs of forward operators, e.g., the loss tensor
:type ys: list of Tensor or TensorWrapperBase
:param dys: delta of outputs, physically equivalent to sensitivity of outputs to the loss,
e.g., one for the loss itself
:type dys: list of Tensor or TensorWrapperBase
"""
assert self._enabled
self._enabled = False
def check_wrapper():
if isinstance(dys, TensorWrapperBase):
return type(dys)
if isinstance(dys, TensorBase):
return
assert isinstance(dys, (tuple, list))
for i in dys:
if isinstance(i, TensorWrapperBase):
return type(i)
Wrapper = check_wrapper()
def aslist(x):
if isinstance(x, (Tensor, TensorWrapperBase)):
x = [x]
else:
x = list(x)
x = [i.__wrapped__ if isinstance(i, TensorWrapperBase) else i for i in x]
for i in x:
assert isinstance(i, Tensor)
return x
ys = aslist(ys)
dys = aslist(dys)
assert len(ys) == len(dys)
# ys is changed to a list of VariableNode which contains more information
# such as OpNode, callback, etc.
ys = [i._extra_data[self].node for i in ys]
# NOTE: callback is called only if grad is not None
# the OpNode sequence in backward
op_seq = []
# VariableNode -> (i, j), where i is time stamp in backward, j means jth input
last_written_to = {}
def schedule():
reached = set(ys)
# i is the time stamp in backward
i = 0
for o in self.ops[::-1]:
o = o()
if o is None:
continue
if not o.has_grad_fn(o, reached):
continue
op_seq.append(o)
for j, v in enumerate(o.inputs):
reached.add(v)
last_written_to[v] = i, j
i += 1
schedule()
# VariableNode -> Tensor
cache = {}
def initialize():
for y, dy in zip(ys, dys):
cache[y] = dy
if y not in last_written_to and y.callback:
y.callback(y.owner(), dy)
initialize()
# NOTE: None is used to mark a node has been consumed
for seqno, opnode in enumerate(op_seq):
input_nodes = opnode.inputs
output_nodes = [i() for i in opnode.outputs]
backward = opnode.backward
backward_allow_noinput = opnode.backward_allow_noinput
opnode.clear()
output_grads = []
for i in output_nodes:
if i is not None:
if i in cache:
assert cache[i] is not None
output_grads.append(cache[i])
else:
output_grads.append(None)
# read by backward, mark consumed
cache[i] = None
else:
output_grads.append(None)
if (
any([grad is not None for grad in output_grads])
or backward_allow_noinput
):
input_grads = backward(*output_grads)
else:
input_grads = [None] * len(input_nodes)
assert len(input_nodes) == len(input_grads)
for i, (v, g) in enumerate(zip(input_nodes, input_grads)):
if v is None:
continue
if v in cache:
assert cache[v]
if g is not None:
cache[v] = add(cache[v], g)
elif g is not None:
cache[v] = g
if last_written_to[v] == (seqno, i):
if v.callback:
v.callback(
v.owner(), Wrapper(cache[v]) if Wrapper else cache[v]
)
if v.opnode is None:
# won't read by backward, mark consumed
cache[v] = None
for v in cache.values():
assert v is None
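# Hedged usage sketch of the Grad manager above (``x`` and ``dy`` are assumed
# TensorWrapper-based tensors; ``x * x`` relies on the wrapper's operator
# overloading, which lives outside this file):
#
#     grads = []
#     with Grad() as grad:
#         grad.wrt(x, callback=lambda owner, g: grads.append(g))
#         y = x * x            # forward ops are recorded through tracer_apply
#         grad(y, dy)          # backward; the callback receives d(y)/d(x)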
class clearable:
__cleared = False
def __bool__(self):
return not self.__cleared
def clear(self):
self.__dict__.clear()
self.__cleared = True
class OpNode(clearable):
""" OpNode saves all the information to form the computational graph.
"""
def __init__(self):
self.id = None
self.inputs = None # Could be VariableNode
self.outputs = None # Could be VariableNode
self.backward = None
self.has_grad_fn = None
self.backward_allow_noinput = False
class VariableNode(clearable):
""" VariableNode saves OpNode and callback.
FIXME!!! Explain manager and owner
"""
def __init__(self, manager, owner, opnode=None, callback=None):
# manager is Grad type
self.manager = weakref.ref(manager)
# owner is Tensor type
self.owner = weakref.ref(owner)
self.opnode = opnode
self.callback = callback
class Tracer(clearable, TensorBase):
def __init__(self, node=None):
""" type(node) is VariableNode
"""
self.node = node
@functools.singledispatch
def check_backward_allow_noinput(op: OpDef):
return False
@functools.singledispatch
def get_op_has_grad_fn(op: OpDef):
assert 0
@get_op_has_grad_fn.register(OpDef)
def _(op: OpDef):
return default_has_grad_fn
@get_op_has_grad_fn.register(Function)
def _(op: Function):
return default_has_grad_fn
def default_has_grad_fn(opnode, reached):
for v in opnode.outputs:
if v() in reached:
return True
return False
@apply.add
def tracer_apply(op: (OpDef, Function), *args: typing.Optional[Tracer]):
args = tuple(i if isinstance(i, Tracer) else None for i in args)
input_requires_grad = list(map(bool, args))
if not any(input_requires_grad):
return
ctx = get_context()
manager = None
assert len(ctx.inputs) == len(args)
for i, j in zip(ctx.inputs, args):
if j:
j = j.node
assert i is j.owner()
if manager is None:
manager = j.manager()
assert manager
else:
assert manager is j.manager()
if not manager._enabled:
return
opnode, outputs = manager._new_opnode([i and i.node for i in args], ctx.outputs)
# register backward method
# tuple of backward functions corresponding to dy / dx_i
# None means y is not a function of x_i
opnode.backward, output_need_grad = builtin_op_utils.builtin_op_get_backward_fn(
op, ctx.inputs, ctx.outputs, input_requires_grad
)
assert len(outputs) == len(output_need_grad)
outputs = [x if y else None for (x, y) in zip(outputs, output_need_grad)]
opnode.backward_allow_noinput = check_backward_allow_noinput(op)
opnode.has_grad_fn = get_op_has_grad_fn(op)
return tuple(outputs)
@apply.add
def _(op: Const, *_: typing.Optional[Tracer]):
return None
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .generated_ops import *
from .misc_ops import *
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import warnings
from ..._imperative_rt.ops import OprAttr
from . import param_defs
def make_param(param, ptype, kwargs):
if param is not None:
if isinstance(param, ptype):
return param
param = [param]
assert len(param) == len(
ptype.__slots__
), "{} needs {} params, but {} are provided".format(
ptype, len(ptype.__slots__), len(param)
)
return ptype(*param)
ckw = {}
for i in ptype.__slots__:
val = kwargs.pop(i, ckw)
if val is not ckw:
ckw[i] = val
return ptype(**ckw)
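# Hedged illustration of the forms make_param accepts (``PType`` stands for any
# generated param struct with __slots__; the field name below is hypothetical):
#
#     make_param(existing_param, PType, {})     # already a PType: returned as-is
#     make_param(0, PType, {})                  # bare value, wrapped; slot count must match
#     make_param(None, PType, {"field": 0})     # None: fields are picked out of **kwargs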
class PodOpVisitor:
__name2subclass = {}
__c = None
name = None
param_names = []
config = None
def __init__(self, config, **params):
self.config = config
assert set(params) == set(self.param_names)
self.__dict__.update(params)
def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs) # python 3.5 does not have this
name = cls.name
if name in cls.__name2subclass:
if not issubclass(cls, cls.__name2subclass[name]):
warnings.warn("Multiple subclasses for bultin op: %s" % name)
cls.__name2subclass[name] = cls
def to_c(self):
if self.__c:
return self.__c
op = OprAttr()
op.type = self.name
if self.config is not None:
op.config = self.config
# the first 4 bytes are the TAG and have to be removed for now
op.param = b"".join(self.__dict__[k].serialize()[4:] for k in self.param_names)
self.__c = op
return op
def __eq__(self, rhs):
return self.to_c() == rhs.to_c()
def __repr__(self):
name = self.__class__.__name__
if self.__c:
return "{}(<binary data>)".format(name)
kwargs = {}
for i in self.param_names:
p = self.__dict__[i]
if isinstance(p, param_defs._ParamDefBase):
for k in p.__slots__:
v = getattr(p, k)
if isinstance(v, param_defs._EnumBase):
v = v.name
kwargs[k] = repr(v)
else:
kwargs[i] = repr(p)
if self.config:
if len(self.config.comp_node_arr) == 1:
kwargs["device"] = "'%s'" % self.config.comp_node
return "{}({})".format(
name, ", ".join("{}={}".format(k, v) for k, v in kwargs.items())
)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import collections
import ctypes
from ..._imperative_rt import OperatorNodeConfig as Config
from . import param_defs
from .helper import PodOpVisitor, make_param
__all__ = ["ConvolutionBackwardData", "Dimshuffle", "Reshape", "AxisAddRemove"]
class TensorShape:
MAX_NDIM = 7
class ConvolutionBackwardData(PodOpVisitor):
param_names = (
"param",
"execution_polity",
)
name = "ConvolutionBackwardDataV1"
def __init__(
self,
*,
param=None,
execution_polity=None,
name=None,
comp_node=None,
config=None,
dtype=None,
**kwargs
):
config = config or Config()
if name:
config.name = name
if comp_node:
config.comp_node = comp_node
if dtype:
config.dtype = dtype
self.config = config
self.param = make_param(param, param_defs.Convolution, kwargs)
self.execution_polity = make_param(
execution_polity, param_defs.ExecutionPolicy, kwargs
)
assert not kwargs, "extra kwargs: {}".format(kwargs)
class Dimshuffle(PodOpVisitor):
name = "Dimshuffle"
param_names = ("pattern",)
class Pattern(ctypes.Structure):
Pattern_Array = ctypes.c_int32 * TensorShape.MAX_NDIM
_fields_ = [
("length", ctypes.c_uint32),
("pattern", Pattern_Array),
("ndim", ctypes.c_uint32),
]
def serialize(self):
return bytes(ctypes.c_uint32(0)) + bytes(self)
def __init__(self, pattern, ndim=0):
assert isinstance(pattern, collections.Iterable)
assert len(pattern) <= TensorShape.MAX_NDIM
pattern_array = Dimshuffle.Pattern.Pattern_Array()
for idx, v in enumerate(pattern):
pattern_array[idx] = ctypes.c_int32(-1 if v == "x" else int(v))
self.pattern = Dimshuffle.Pattern(len(pattern), pattern_array, ndim)
class Reshape(PodOpVisitor):
name = "ReshapeV1"
param_names = ("unspec_axis",)
def __init__(self, unspec_axis=None):
if unspec_axis is None:
self.unspec_axis = param_defs.OptionalAxisV1()
else:
self.unspec_axis = param_defs.OptionalAxisV1(unspec_axis)
class AxisNum(ctypes.Structure):
_fields_ = [
("m_num", ctypes.c_int),
]
class AxisDesc(ctypes.Structure):
class Method(ctypes.c_int):
ADD_1 = 0
REMOVE = 1
_fields_ = [
("method", Method),
("axis", AxisNum),
]
@classmethod
def make_add(cls, axis):
return cls(cls.Method.ADD_1, AxisNum(axis))
@classmethod
def make_remove(cls, axis):
return cls(cls.Method.REMOVE, AxisNum(axis))
class AxisAddRemove(PodOpVisitor):
name = "AxisAddRemove"
param_names = ("param",)
AxisDesc = AxisDesc
class Param(ctypes.Structure):
MAX_DESC_SIZE = TensorShape.MAX_NDIM * 2
_fields_ = [("nr_desc", ctypes.c_uint32), ("desc", AxisDesc * MAX_DESC_SIZE)]
def __init__(self, *args):
super().__init__()
self.nr_desc = len(args)
for i, a in enumerate(args):
self.desc[i] = a
def serialize(self):
return bytes(ctypes.c_uint32(0)) + bytes(self)
def __init__(self, param):
assert isinstance(param, self.Param)
self.param = param
del AxisDesc
class IndexingOpBase(PodOpVisitor):
param_names = ("index_desc",)
class IndexDescMaskDump(ctypes.Structure):
class Item(ctypes.Structure):
_fields_ = [
("axis", ctypes.c_int8),
("begin", ctypes.c_bool),
("end", ctypes.c_bool),
("step", ctypes.c_bool),
("idx", ctypes.c_bool),
]
Item_Array = Item * TensorShape.MAX_NDIM
_fields_ = [("nr_item", ctypes.c_uint8), ("items", Item_Array)]
def serialize(self):
return bytes(ctypes.c_uint32(0)) + bytes(self)
def __init__(self, items):
nr_item = len(items)
assert nr_item <= TensorShape.MAX_NDIM
item_array = IndexingOpBase.IndexDescMaskDump.Item_Array()
for idx, item in enumerate(items):
assert isinstance(item, (tuple, list)) and len(item) == 5
item_array[idx] = IndexingOpBase.IndexDescMaskDump.Item(*item)
self.index_desc = IndexingOpBase.IndexDescMaskDump(nr_item, item_array)
def _gen_indexing_defs(*names):
for name in names:
globals()[name] = type(name, (IndexingOpBase,), dict(name=name))
__all__.append(name)
_gen_indexing_defs(
"Subtensor",
"SetSubtensor",
"IncrSubtensor",
"IndexingMultiAxisVec",
"IndexingSetMultiAxisVec",
"IndexingIncrMultiAxisVec",
"MeshIndexing",
"IncrMeshIndexing",
"SetMeshIndexing",
"BatchedMeshIndexing",
"BatchedIncrMeshIndexing",
"BatchedSetMeshIndexing",
)
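# Hedged note on the classes generated above: each name becomes an IndexingOpBase
# subclass whose only payload is the IndexDescMaskDump, e.g.
#
#     op = Subtensor(items=[(0, True, True, False, False)])
#
# reads "axis 0, begin and end given, no step, no single index", matching the
# 5-flag items built by unpack_getitem in the indexing module.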
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import warnings
from typing import Union
from ..._imperative_rt import OpDef, ops
from ...tensor.core import OpBase, TensorBase, TensorWrapperBase, apply
from .._internal import all_ops
from .._internal.helper import PodOpVisitor
# register OpDef as a "virtual subclass" of OpBase, so any of registered
# apply(OpBase, ...) rules could work well on OpDef
OpBase.register(OpDef)
# forward to apply(OpDef, ...)
@apply.add
def _(op: PodOpVisitor, *args: Union[TensorBase, TensorWrapperBase]):
return apply(op.to_c(), *args)
__all__ = ["OpDef", "PodOpVisitor"]
for k, v in all_ops.__dict__.items():
if isinstance(v, type) and issubclass(v, PodOpVisitor):
globals()[k] = v
__all__.append(k)
for k, v in ops.__dict__.items():
if isinstance(v, type) and issubclass(v, OpDef):
globals()[k] = v
__all__.append(k)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from ..tensor.core import OpBase, TensorBase, apply
class Const(OpBase):
def __init__(self, value=None, *, dtype=None, device=None):
self.value = value
self.dtype = dtype
self.device = device
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .tensor_wrapper import TensorWrapper as Tensor
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import collections
import functools
import inspect
import sys
import typing
from abc import ABC
import multipledispatch
class OpBase(ABC):
def __call__(self, *args):
return apply(self, *args)
class TensorBase:
pass
class TensorWrapperBase:
pass
class Dispatcher(multipledispatch.Dispatcher):
def add(self, f, g=None):
if g is None:
super().add(get_signature(f), f)
else:
super().add(f, g)
return f
def __get__(self, instance, owner=None):
if instance is not None:
return self
return functools.partial(self, instance)
if sys.version_info < (3, 6):
def parse_union(ann):
if type(ann) is not typing.UnionMeta:
return
return ann.__union_params__
elif sys.version_info < (3, 7):
def parse_union(ann):
if type(ann) is not typing._Union:
return
return ann.__args__
elif sys.version_info < (3, 8):
def parse_union(ann):
if type(ann) is not typing._GenericAlias:
if type(ann) is not typing.Union:
return
else:
if ann.__origin__ is not typing.Union:
return
return ann.__args__
else:
def parse_union(ann):
if typing.get_origin(ann) is not typing.Union:
return
return typing.get_args(ann)
def get_signature(function, op_type=None):
sig = inspect.signature(function)
types = []
for p in sig.parameters.values():
ann = p.annotation
ann = parse_union(ann) or ann
if p.kind in (
inspect.Parameter.POSITIONAL_ONLY,
inspect.Parameter.POSITIONAL_OR_KEYWORD,
):
types.append(ann)
if p.kind == inspect.Parameter.VAR_POSITIONAL:
types.append([ann])
return tuple(types)
apply = Dispatcher("apply")
OpBase.apply = apply
@apply.add
def _(op: OpBase, *args: TensorBase):
raise NotImplementedError
@apply.add
def _(op: OpBase, *args: TensorWrapperBase):
assert args
Wrapper = type(args[0])
outputs = apply(op, *(i.__wrapped__ for i in args))
assert isinstance(outputs, tuple)
return tuple(map(Wrapper, outputs))
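# Hedged illustration of how the Dispatcher routes ``apply`` on annotated
# parameter types (``_Neg`` and its rule are hypothetical, for exposition only):
#
#     class _Neg(OpBase):
#         pass
#
#     @apply.add
#     def _(op: _Neg, *args: TensorBase):
#         ...  # a real rule would compute the negation here
#
#     # apply(_Neg(), some_tensor) now dispatches on (type(op), type(args[0]))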
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import collections
from typing import Union
import numpy as np
# normal dtype related
from .._imperative_rt import bfloat16, intb1, intb2, intb4
def is_lowbit(dtype):
return (dtype is intb1) or (dtype is intb2) or (dtype is intb4)
def is_bfloat16(dtype):
return dtype is bfloat16
# quantization dtype related
_QuantDtypeMetadata = collections.namedtuple(
"QuantDtypeMetadata", ["name", "np_dtype_str", "is_unsigned", "qmin", "qmax",]
)
_metadata_dict = {
"quint8": _QuantDtypeMetadata("Quantized8Asymm", "uint8", True, 0, 255),
"qint8": _QuantDtypeMetadata("QuantizedS8", "int8", False, -128, 127),
"quint4": _QuantDtypeMetadata("Quantized4Asymm", "uint8", True, 0, 15),
"qint4": _QuantDtypeMetadata("QuantizedS4", "int8", False, -8, 7),
"qint32": _QuantDtypeMetadata(
"QuantizedS32", "int32", False, -(2 ** 31), 2 ** 31 - 1,
),
# NOTE: int2 is not supported for model dump yet
"quint2": _QuantDtypeMetadata(None, "uint8", True, 0, 3),
"qint2": _QuantDtypeMetadata(None, "int8", False, -2, 1),
}
def is_quantize(dtype):
return (
hasattr(dtype, "metadata")
and dtype.metadata is not None
and "mgb_dtype" in dtype.metadata
)
def get_scale(dtype):
assert is_quantize(dtype)
return dtype.metadata["mgb_dtype"]["scale"]
def get_zero_point(dtype):
assert is_quantize(dtype)
metadata = dtype.metadata["mgb_dtype"]
assert metadata["name"] in ("Quantized8Asymm", "Quantized4Asymm")
return metadata["zero_point"]
def _check_zero_point(zp: int, dtype_str: str):
qmin = _metadata_dict[dtype_str].qmin
qmax = _metadata_dict[dtype_str].qmax
if zp < qmin or zp > qmax:
raise ValueError(
"zero_point should be within [{}, {}] for {}".format(qmin, qmax, dtype_str)
)
def get_quantized_dtype(dtype_str: str, scale: float, zp: Union[int, None]):
r"""
Get quantized dtype with metadata attribute according to _metadata_dict.
Note that unsigned dtype must have ``zero_point`` and signed dtype must
not have ``zero_point``, to be consistent with tensor generated by calling
compiled function from `CompGraph.compile(inputs, outspec)`.
:param dtype: a string indicating which dtype to return
:param scale: a number for scale to store in dtype's metadata
:param zp: a number for zero_point to store in dtype's metadata
"""
metadata = _metadata_dict[dtype_str]
np_dtype_str = metadata.np_dtype_str
is_unsigned = metadata.is_unsigned
if is_unsigned:
if zp is None or int(zp) != zp:
raise ValueError("zero_point should be an integer")
zp = int(zp)
_check_zero_point(zp, dtype_str)
return np.dtype(
np_dtype_str,
metadata={
"mgb_dtype": {
"name": metadata.name,
"scale": float(scale),
"zero_point": zp,
}
},
)
else:
return np.dtype(
np_dtype_str,
metadata={"mgb_dtype": {"name": metadata.name, "scale": float(scale)}},
)
def quint8(scale, zero_point):
"""
Construct a quantized unsigned int8 data type with ``scale`` (float) and
``zero_point`` (uint8). The real value represented by a quint8 data type is
float_val = scale * (uint8_val - zero_point)
"""
return get_quantized_dtype("quint8", scale, zero_point)
def qint8(scale):
"""
Construct a quantized int8 data type with ``scale`` (float). The real value
represented by a qint8 data type is float_val = scale * int8_val
"""
return get_quantized_dtype("qint8", scale, None)
def qint32(scale):
"""
Construct a quantized int32 data type with ``scale`` (float). The real value
represented by a qint32 data type is float_val = scale * int32_val
"""
return get_quantized_dtype("qint32", scale, None)
def quint4(scale, zero_point):
"""
Construct a quantized unsigned int4 data type with ``scale`` (float) and
``zero_point`` (uint8). The real value represented by a quint4 data type is
float_val = scale * (uint4_val - zero_point)
"""
return get_quantized_dtype("quint4", scale, zero_point)
def qint4(scale):
"""
Construct a quantized int4 data type with ``scale`` (float). The real value
represented by a qint4 data type is float_val = scale * int4_val
"""
return get_quantized_dtype("qint4", scale, None)
def _convert_to_quantized_dtype(arr: np.ndarray, dtype: np.dtype, dtype_str: str):
metadata = _metadata_dict[dtype_str]
arr_metadata = dtype.metadata["mgb_dtype"]
if not isinstance(arr, np.ndarray):
raise ValueError("arr parameter should be instance of np.ndarray")
if not is_quantize(dtype) or arr_metadata["name"] != metadata.name:
raise ValueError("dtype parameter should be a {} dtype".format(dtype_str))
is_unsigned = metadata.is_unsigned
if is_unsigned:
scale, zp = (
arr_metadata["scale"],
arr_metadata["zero_point"],
)
return (
(np.round(arr / scale) + zp)
.clip(metadata.qmin, metadata.qmax)
.astype(dtype)
)
else:
# do not merge this with the is_unsigned branch; see ``get_quantized_dtype``
scale = arr_metadata["scale"]
return np.round(arr / scale).clip(metadata.qmin, metadata.qmax).astype(dtype)
def _convert_from_quantized_dtype(arr: np.ndarray, dtype_str: str):
metadata = _metadata_dict[dtype_str]
arr_metadata = arr.dtype.metadata["mgb_dtype"]
if not isinstance(arr, np.ndarray):
raise ValueError("arr parameter should be instance of np.ndarray")
if not is_quantize(arr.dtype) or arr_metadata["name"] != metadata.name:
raise ValueError("arr's dtype should be a {} dtype".format(dtype_str))
is_unsigned = metadata.is_unsigned
if is_unsigned:
scale, zp = (
arr_metadata["scale"],
arr_metadata["zero_point"],
)
return (arr.astype(np.float32) - zp) * scale
else:
# do not merge this with the is_unsigned branch; see ``get_quantized_dtype``
scale = arr_metadata["scale"]
return (arr.astype(np.float32)) * scale
def convert_to_quint8(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a quint8 one with specified params.
:param arr: Input ndarray.
:param q: Target data type, should be a quint8.
"""
return _convert_to_quantized_dtype(arr, q, "quint8")
def convert_from_quint8(arr: np.ndarray):
"""
Dequantize a quint8 NumPy ndarray into a float one.
:param arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, "quint8")
def convert_to_qint8(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a qint8 one with specified params.
:param arr: Input ndarray.
:param q: Target data type, should be a qint8.
"""
return _convert_to_quantized_dtype(arr, q, "qint8")
def convert_from_qint8(arr: np.ndarray):
"""
Dequantize a qint8 NumPy ndarray into a float one.
:param arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, "qint8")
def convert_to_qint32(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a qint32 one with specified params.
:param arr: Input ndarray.
:param q: Target data type, should be a qint32.
"""
return _convert_to_quantized_dtype(arr, q, "qint32")
def convert_from_qint32(arr):
"""
Dequantize a qint32 NumPy ndarray into a float one.
:param arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, "qint32")
def convert_to_quint4(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a quint4 one with specified params.
:param arr: Input ndarray.
:param q: Target data type, should be a quint4.
"""
return _convert_to_quantized_dtype(arr, q, "quint4")
def convert_from_quint4(arr: np.ndarray):
"""
Dequantize a quint4 NumPy ndarray into a float one.
:param arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, "quint4")
def convert_to_qint4(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a qint4 one with specified params.
:param arr: Input ndarray.
:param q: Target data type, should be a qint4.
"""
return _convert_to_quantized_dtype(arr, q, "qint4")
def convert_from_qint4(arr: np.ndarray):
"""
Dequantize a qint4 NumPy ndarray into a float one.
:param arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, "qint4")
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from ..ops.builtin import OpDef
from .core import TensorBase, TensorWrapperBase, apply
from .raw_tensor import RawTensor
from .tensor import Tensor, push_context
from .tensor_wrapper import TensorWrapper
class Function:
"""
Defines a block of operations with customizable differentiation.
The computation should be defined in ``forward`` method, with gradient
computation defined in ``backward`` method.
Each instance of ``Function`` should only be used once during a forward pass.
Examples:
.. testcode::
class Sigmoid(Function):
def forward(self, x):
y = 1 / (1 + F.exp(-x))
self.y = y
return y
def backward(self, output_grads):
y = self.y
return output_grads * y * (1-y)
"""
def __init__(self, *args, **kwargs):
pass
def __call__(self, *args):
ret = apply(self, *args)
if type(ret) == tuple and len(ret) == 1:
return ret[0]
return ret
def forward(self, *args, **kwargs):
"""
Applies operations to ``inputs`` and returns results. It must be overridden by all subclasses.
:param input: Input tensors.
:return: A tuple of Tensor or a single Tensor.
.. note::
This method should return a tuple of Tensor or a single Tensor representing the output
of the function.
"""
raise NotImplementedError
def backward(self, *output_grads):
"""
Compute the gradient of the forward function. It must be overridden by all subclasses.
:param output_grads: gradients of outputs that are returned by :meth:`~.function.Function.forward`
.. note::
In case when some tensors of outputs are not related to loss function, the corresponding
values in ``output_grads`` would be ``None``.
.. note::
This method should return a tuple containing the gradients of all inputs, in the same order
as the ``inputs`` argument of :meth:`~.function.Function.forward` . A ``Tensor`` could be returned
instead if there is only one input. If users want to stop the propagation of some gradients,
the corresponding returned values should be set to ``None``.
"""
raise NotImplementedError
def get_backward_fn(self):
if self.backward is None:
return None
def _backward(*output_grads):
if type(output_grads) is tuple:
_output_grads = map(TensorWrapper, output_grads)
else:
_output_grads = (TensorWrapper(output_grads),)
ret = self.backward(*_output_grads)
if type(ret) is not tuple:
ret = (ret,)
ret = tuple([i.__wrapped__ for i in ret])
return ret
return _backward
Function.apply = Function.__call__
@apply.add
def _(op: Function, *args: TensorWrapperBase):
assert args
Wrapper = type(args[0])
# compute the value for the user-defined function
extra_data_dic = {}
for arg in args:
extra_data_dic[arg.__wrapped__] = arg.__wrapped__._extra_data
arg.__wrapped__._extra_data = {}
rets = op.forward(*args)
for arg in args:
arg.__wrapped__._extra_data = extra_data_dic[arg.__wrapped__]
# update the gradient information for the user-defined function
inputs = tuple(map(lambda i: i.__wrapped__, args))
outputs = (
tuple(map(lambda i: i.__wrapped__, rets))
if type(rets) is tuple
else (rets.__wrapped__,)
)
for output in outputs:
output._extra_data = {}
with push_context() as ctx:
ctx.inputs = inputs
ctx.outputs = outputs
for k in set().union(*(i._extra_data for i in inputs if isinstance(i, Tensor))):
ctx.key = k
data = tuple(
i._extra_data.get(k) if isinstance(i, Tensor) else i for i in inputs
)
# data are instances of Tracer
# dispatched to apply.add@grad.py
rets = apply(op, *data)
if rets is not None:
assert len(outputs) == len(rets)
for t, i in zip(outputs, rets):
t._extra_data[k] = i
return tuple(map(Wrapper, outputs))
@apply.add
def _(op: Function, *args: Tensor):
raise NotImplementedError
@apply.add
def _(op: Function, *args: RawTensor):
raise NotImplementedError
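# Hedged usage sketch of the Sigmoid example from the class docstring above
# (assumes ``F.exp`` from the functional package and a TensorWrapper tensor ``x``):
#
#     sigmoid = Sigmoid()
#     y = sigmoid(x)    # forward() runs eagerly; the op is recorded for autodiff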
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import numpy as np
from ..ops import builtin
from ..ops.special import Const
from .core import TensorBase, TensorWrapperBase, apply
def remove_ellipsis(tensor, tuple_val):
ndim_sum = tensor.ndim
cur_sum = 0
pos = -1
for i_idx, i in enumerate(tuple_val):
if i is Ellipsis:
for j in tuple_val[:i_idx:-1]:
if j is Ellipsis:
raise IndexError("only one ellipsis is allowed")
pos = i_idx
else:
cur_sum += i.ndim if hasattr(i, "ndim") else 1
if pos == -1:
return tuple_val
else:
return (
tuple_val[:pos]
+ (slice(None, None, None),) * (ndim_sum - cur_sum)
+ tuple_val[pos + 1 :]
)
def check_bool_index(tensor, tuple_val):
cur_shape = tensor.shape
new_tuple_val = []
offset = 0
tdim = 0
for idx, i in enumerate(tuple_val):
if hasattr(i, "dtype") and i.dtype == np.bool_:
if i.ndim > 1:
tot = i.ndim
for j in range(i.ndim):
if cur_shape[tdim + j - offset] != i.shape[j]:
raise IndexError(
"boolean index did not match tensor along dimension {}; dimension is {} but corresponding boolean dimension is {}".format(
tdim + j, cur_shape[tdim + j - offset], i.shape[j]
)
)
i = i.reshape(-1)
cur_shape = (
cur_shape[:idx] + (i.shape[0],) + cur_shape[tdim + tot - offset :]
)
offset += 1
tensor = tensor.reshape(cur_shape)
tdim += tot
new_tuple_val.append(i)
else:
new_tuple_val.append(i)
tdim += 1
return tensor, new_tuple_val
def unpack_getitem(inp, tuple_val, *, allow_newaxis=True):
if not isinstance(tuple_val, tuple):
tuple_val = (tuple_val,)
ndim_indexed = 0
for i in tuple_val:
if not i is Ellipsis:
ndim_indexed += 1 if not hasattr(i, "ndim") else i.ndim
if ndim_indexed > inp.ndim:
raise IndexError(
"too many indices for tensor: tensor is {}-dimensional, but {} were indexed".format(
inp.ndim, ndim_indexed
)
)
tuple_val = remove_ellipsis(inp, tuple_val)
use_subtensor = True
inp, tuple_val = check_bool_index(inp, tuple_val)
def is_scalar(d):
if isinstance(d, int):
return True
if type(d).__module__ == np.__name__:
return np.isscalar(d)
# if isinstance(d, (TensorBase, TensorWrapperBase)):
# return d.shape == (1,)
return False
new_axes = []
tensors = []
items = []
cur_axis = -1
for i_idx, i in enumerate(tuple_val):
cur_axis += 1
if i is np.newaxis:
if cur_axis >= 0:
new_axes.append(cur_axis)
continue
if i is Ellipsis:
cur_axis = -1
for j in tuple_val[:i_idx:-1]:
if j is Ellipsis:
raise IndexError("only one ellipsis is allowed")
if j is np.newaxis:
new_axes.append(cur_axis)
cur_axis -= 1
continue
if (
not is_scalar(i)
and not i is np.newaxis
and not i is Ellipsis
and not isinstance(i, slice)
):
use_subtensor = False
item = [
cur_axis,
]
def is_bool_list(x):
if not isinstance(x, list):
return False
for i in x:
if not isinstance(i, bool):
return False
return True
def get_index(i):
if not isinstance(i, (TensorBase, TensorWrapperBase)):
if is_bool_list(i) or isinstance(i, np.ndarray) and i.dtype == np.bool_:
(i,) = Const(i, dtype=np.bool_, device=inp.device)(inp)
else:
(i,) = Const(i, dtype=np.int32, device=inp.device)(inp)
return i
assert isinstance(i, (TensorBase, TensorWrapperBase))
if i.dtype != np.bool_:
return i
_, ind = apply(builtin.CondTake(), i, i)
return ind
def push(v, item, tensors):
if v is None:
item.append(False)
else:
item.append(True)
v = get_index(v)
assert np.issubdtype(v.dtype, np.integer) or np.issubdtype(
v.dtype, np.bool_
), "var type in the subscript must be int or bool"
tensors.append(v)
if isinstance(i, slice):
if i.start is None and i.stop is None and i.step is None:
continue
push(i.start, item, tensors)
push(i.stop, item, tensors)
push(i.step, item, tensors)
item.append(False) # idx
else:
item += [False,] * 3 # begin, end, step
push(i, item, tensors)
assert len(item) == 5
items.append(item)
if new_axes:
raise IndexError("newaxis is not allowed here")
return inp, tensors, items, use_subtensor
def try_condtake(tensor, index):
if not hasattr(index, "dtype") or not hasattr(index, "shape"):
return []
if index.dtype != np.bool_ or index.shape != tensor.shape:
return []
if isinstance(index, np.ndarray):
(index,) = Const(index, dtype=np.bool_, device=tensor.device)(tensor)
assert isinstance(index, (TensorBase, TensorWrapperBase))
if not isinstance(tensor, (TensorWrapperBase, TensorBase)):
raise TypeError("input must be a tensor")
if tensor.device != index.device:
raise ValueError(
"ambiguous device: {} vs {}".format(tensor.device, index.device)
)
return apply(builtin.CondTake(), tensor, index)
def getitem(tensor, index):
try_result = try_condtake(tensor, index)
if len(try_result) == 2:
return try_result[0]
tensor, tensors, items, use_subtensor = unpack_getitem(tensor, index)
for v in tensors:
if v.shape[0] == 0:
(empty_tensor,) = Const([], dtype=tensor.dtype, device=tensor.device)(
tensor
)
return empty_tensor
if use_subtensor:
op = builtin.Subtensor(items=items)
else:
op = builtin.IndexingMultiAxisVec(items=items)
(result,) = apply(op, tensor, *tensors)
return result
def setitem(tensor, index, value):
org_shape = tensor.shape
try_result = try_condtake(tensor, index)
if len(try_result) == 2:
index = try_result[1]
if index.shape[0] == 0:
return tensor
tensor = tensor.reshape(-1)
if not isinstance(value, (TensorBase, TensorWrapperBase)):
op = Const(value, dtype=tensor.dtype, device=tensor.device)
(value,) = op(tensor)
tensor, tensors, items, use_subtensor = unpack_getitem(tensor, index)
for v in tensors:
if v.shape[0] == 0:
return tensor
if use_subtensor:
op = builtin.Subtensor(items=items)
else:
op = builtin.IndexingMultiAxisVec(items=items)
(tmp_result,) = apply(op, tensor, *tensors)
if value.shape != tmp_result.shape:
for i in range(min(len(value.shape), len(tmp_result.shape))):
if (
value.shape[-i - 1] != 1
and value.shape[-i - 1] != tmp_result.shape[-i - 1]
):
raise ValueError(
"cannot copy tensor with shape {} to subtensor with shape {}".format(
value.shape, tmp_result.shape
)
)
value = value.broadcast(tmp_result.shape)
if use_subtensor:
op = builtin.SetSubtensor(items=items)
else:
op = builtin.IndexingSetMultiAxisVec(items=items)
(result,) = apply(op, tensor, value, *tensors)
result = result.reshape(org_shape)
return result
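# Hedged illustration of the two dispatch paths above (``t`` is an assumed 2-D
# TensorWrapper, ``idx`` an assumed int32 index tensor): plain slices and scalars
# stay on the Subtensor path, tensor/bool indices switch to IndexingMultiAxisVec.
#
#     y = getitem(t, (slice(0, 2), 1))   # use_subtensor stays True
#     z = getitem(t, (idx,))             # use_subtensor becomes False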
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import collections
import threading
import weakref
from concurrent.futures import Future, ThreadPoolExecutor
from .. import _imperative_rt
from .._wrap import device as as_device
from ..ops.builtin import OpDef
from .core import OpBase, TensorBase, apply
class CompiledFunction:
def __init__(self, graph, function):
self._graph = graph
self._function = function
self._future = None
def execute(self, *args):
assert self._future is None
self._future = self._graph._executor.submit(self._function.execute, *args)
def wait(self):
assert self._future is not None
self._future.exception()
self._function.wait()
try:
return self._future.result()
finally:
self._future = None
def __call__(self, *args):
self.execute(*args)
return self.wait()
class Graph(_imperative_rt.ComputingGraph):
def __init__(self):
super().__init__()
self._var_cache = weakref.WeakKeyDictionary()
self._op_cache = weakref.WeakKeyDictionary()
self._executor = ThreadPoolExecutor(1)
def _wrap(self, obj):
if type(obj) is _imperative_rt.VarNode:
wrapper, cache = VarNode, self._var_cache
elif type(obj) is _imperative_rt.OperatorNode:
wrapper, cache = OpNode, self._op_cache
if obj not in cache:
cache[obj] = wrapper(obj)
return cache[obj]
def compile(self, *args):
return CompiledFunction(self, super().compile(_unwrap(args)))
class VarNode(TensorBase):
def __init__(self, node: _imperative_rt.VarNode):
self._node = node
@property
def graph(self) -> Graph:
return self._node.graph
@property
def op(self):
return self.graph._wrap(self._node.owner)
@property
def dtype(self):
return self._node.dtype
@property
def device(self):
return as_device(self._node.comp_node)
class OpNode:
def __init__(self, node: _imperative_rt.OperatorNode):
self._node = node
@property
def graph(self) -> Graph:
return self._node.graph
@property
def inputs(self):
return tuple(map(self.graph._wrap, self._node.inputs))
@property
def outputs(self):
return tuple(map(self.graph._wrap, self._node.outputs))
def _wrap(x):
if isinstance(x, collections.Sequence):
return type(x)(map(_wrap, x))
return x.graph._wrap(x)
def _unwrap(x):
if isinstance(x, collections.Sequence):
return type(x)(map(_unwrap, x))
return x._node
@apply.add
def _(op: OpDef, *args: VarNode):
outputs = _imperative_rt.invoke_op(op, _unwrap(args))
return _wrap(outputs)
def input_callback(callback, *args, device=None, dtype=None, graph=None):
outputs = _imperative_rt.input_callback(
callback, as_device(device).to_c(), dtype, _unwrap(args), graph=graph
)
value, dummy = _wrap(outputs)
return value, dummy
class InputNode(OpNode):
def __init__(self, *args: VarNode, device=None, dtype=None, graph=None):
r = _imperative_rt.DeviceTensorNDRendezvous()
if device is not None:
device = as_device(device).to_c()
outputs = _imperative_rt.input_callback(
r, device, dtype, _unwrap(args), graph=graph
)
super().__init__(outputs[0].owner)
self._rendezvous = r
def set_value(self, value):
assert isinstance(value, _imperative_rt.DeviceTensorND)
self._rendezvous.set(value)
def reset(self):
self._rendezvous.reset()
@property
def device(self):
return self.outputs[0].device
@property
def dtype(self):
return self.outputs[0].dtype
def output_callback(callback, var, *args):
args = (var,) + args
dummy = _imperative_rt.output_callback(callback, _unwrap(args))
return _wrap(dummy)
class OutputNode(OpNode):
def __init__(self, var, *args):
args = (var,) + args
r = _imperative_rt.DeviceTensorNDRendezvous()
dummy = _imperative_rt.output_callback(r, _unwrap(args))
super().__init__(dummy.owner)
self._rendezvous = r
def get_value(self):
return self._rendezvous.get()
def reset(self):
self._rendezvous.reset()
class TensorAttr:
def __init__(self, shape, dtype, device):
self.shape = shape
self.dtype = dtype
self.device = device
class AttrOutputNode(OpNode):
def __init__(self, var, *args):
args = (var,) + args
r = _imperative_rt.TensorAttrRendezvous()
dummy = _imperative_rt.attr_output_callback(r, _unwrap(args))
super().__init__(dummy.owner)
self._rendezvous = r
def get_value(self):
attr = self._rendezvous.get()
return TensorAttr(attr.shape, attr.dtype, as_device(attr.comp_node))
def reset(self):
self._rendezvous.reset()
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import functools
import numpy as np
from ..._imperative_rt import CompNode, DeviceTensorND
from ..._imperative_rt.imperative import (
_get_dev_tensor,
apply_op,
delete,
get_device,
get_dtype,
get_shape,
get_value,
put,
)
from ..._wrap import device as as_device
from ...ops.builtin import Copy, OpDef, TypeCvt
from ...ops.special import Const
from ..core import OpBase, TensorBase, apply
class RawTensor(TensorBase):
_init_cb = None
_del_cb = None
def __init__(self, handle):
self._handle = handle
if self._init_cb:
self._init_cb()
@property
def dtype(self):
return get_dtype(self._handle)
@property
def device(self):
return as_device(get_device(self._handle))
@property
def shape(self):
return get_shape(self._handle)
def numpy(self):
return get_value(self._handle)
def _dev_tensor(self):
return _get_dev_tensor(self._handle)
def __repr__(self):
return "{}({}, device='{}')".format(
type(self).__qualname__, repr(self.numpy()), self.device
)
def __del__(self):
if self._del_cb:
self._del_cb()
delete(self._handle)
@apply.add
def _(op: OpDef, *args: RawTensor):
outputs = apply_op(op, tuple(i._handle for i in args))
return tuple(map(RawTensor, outputs))
@apply.add
def _(op: Const, *args: RawTensor):
dtype = op.dtype
device = as_device(op.device).to_c()
return (as_raw_tensor(op.value, dtype=dtype, device=device),)
@functools.singledispatch
def as_raw_tensor(obj, dtype=None, device=None):
obj = np.asarray(obj, dtype=dtype)
if obj.dtype == np.float64:
obj = obj.astype(np.float32)
if obj.dtype == np.int64:
obj = obj.astype(np.int32)
return as_raw_tensor(obj, device=device)
@as_raw_tensor.register(np.ndarray)
def _(array: np.ndarray, dtype=None, device=None):
device = None if device is None else as_device(device).to_c()
return RawTensor(put(array, dtype=dtype, device=device))
@as_raw_tensor.register(RawTensor)
def _(tensor: RawTensor, dtype=None, device=None):
if dtype is not None:
dtype = np.dtype(dtype)
if dtype != tensor.dtype:
(tensor,) = apply(TypeCvt(dtype=dtype), tensor)
if device is not None:
device = as_device(device)
if device != tensor.device:
(tensor,) = apply(Copy(comp_node=device.to_c()), tensor)
return tensor
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import functools
import io
import weakref
class partial(functools.partial):
def __get__(self, instance, owner=None):
if instance is None:
return self
return functools.partial(self, instance)
def hook(f):
def decorator(impl):
return functools.update_wrapper(partial(f, impl), impl)
return decorator
def on_input(impl, value):
tensor = impl(value)
trace = get_trace()
if trace:
var = trace.get_var(tensor)
event = InputEvent(var)
trace.append(event)
return tensor
def on_read_dtype(impl, self):
trace = get_trace()
if trace:
var = trace.get_var(self)
event = ReadDtypeEvent(var)
trace.append(event)
return impl(self)
def on_read_device(impl, self):
trace = get_trace()
if trace:
var = trace.get_var(self)
event = ReadDeviceEvent(var)
trace.append(event)
return impl(self)
def on_read_shape(impl, self):
trace = get_trace()
if trace:
var = trace.get_var(self)
event = ReadShapeEvent(var)
trace.append(event)
return impl(self)
def on_read_value(impl, self):
trace = get_trace()
if trace:
var = trace.get_var(self)
event = ReadValueEvent(var)
trace.append(event)
return impl(self)
def on_builtin_op(impl, op, *args):
outputs = impl(op, *args)
trace = get_trace()
if trace:
input_vars = tuple(map(trace.get_var, args))
output_vars = outputs and tuple(map(trace.get_var, outputs))
event = OpEvent(op, input_vars, output_vars)
trace.append(event)
return outputs
def on_del(impl, self):
trace = get_trace()
if trace:
var = trace.get_var(self)
event = DelEvent(var)
trace.append(event)
return impl(self)
class Trace(list):
def __init__(self):
self._var_id = 1
self._t2v = weakref.WeakKeyDictionary()
self._v2t = weakref.WeakValueDictionary()
def get_var(self, x):
v = self._t2v.get(x)
if v:
return v
v = self._var_id
self._var_id += 1
self._t2v[x] = v
self._v2t[v] = x
return v
def __bool__(self):
return True
def __enter__(self):
global _current_trace
if hasattr(self, "_prev_trace"):
raise RuntimeError
self._prev_trace = _current_trace
_current_trace = self
return self
def __exit__(self, *_):
global _current_trace
if _current_trace is not self:
raise RuntimeError
_current_trace = self._prev_trace
del self._prev_trace
class Event:
pass
class InputEvent(Event):
def __init__(self, var):
self.var = var
class ReadEvent(Event):
def __init__(self, var):
self.var = var
class ReadDtypeEvent(ReadEvent):
pass
class ReadDeviceEvent(ReadEvent):
pass
class ReadShapeEvent(ReadEvent):
pass
class ReadValueEvent(ReadEvent):
pass
class OpEvent(Event):
def __init__(self, op, inputs, outputs):
self.op = op
self.inputs = inputs
self.outputs = outputs
class DelEvent(Event):
def __init__(self, var):
self.var = var
_current_trace = None
def get_trace() -> Trace:
global _current_trace
return _current_trace
def format_trace(trace):
buf = io.StringIO()
active_vars = set()
def write(fmt, *args, **kwargs):
print(fmt.format(*args, **kwargs), file=buf)
def init_vars(*args):
for i in args:
if i in active_vars:
continue
active_vars.add(i)
write("_{} = input()", i)
for event in trace:
if isinstance(event, InputEvent):
init_vars(event.var)
elif isinstance(event, ReadDtypeEvent):
init_vars(event.var)
write("output(_{}.dtype)", event.var)
elif isinstance(event, ReadDeviceEvent):
init_vars(event.var)
write("output(_{}.device)", event.var)
elif isinstance(event, ReadShapeEvent):
init_vars(event.var)
write("output(_{}.shape)", event.var)
elif isinstance(event, ReadValueEvent):
init_vars(event.var)
write("output(_{}.value)", event.var)
elif isinstance(event, OpEvent):
init_vars(*event.inputs)
active_vars.update(event.outputs)
ovars = ", ".join(map("_{}".format, event.outputs))
ivars = ", ".join(map("_{}".format, event.inputs))
if ovars:
write("{} = {}({})", ovars, repr(event.op), ivars)
else:
write("{}({})", repr(event.op), ivars)
elif isinstance(event, DelEvent):
init_vars(event.var)
write("del _{}", event.var)
else:
raise TypeError(type(event))
return buf.getvalue()
def compile_trace(trace):
trace = list(trace)
def static_function(f):
trace = None
@functools.wraps(f)
def wrapper(*args, **kwargs):
nonlocal trace
if trace is None:
with Trace() as trace:
return f(*args, **kwargs)
return f(*args, **kwargs)
return wrapper
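# The block below is a minimal, self-contained sketch (not part of the original
# module) showing how the event classes and format_trace compose; the integer
# ids stand in for the values normally handed out by Trace.get_var, and the
# string "relu" stands in for a real op object.
if __name__ == "__main__":
    _demo = Trace()
    _demo.append(InputEvent(1))
    _demo.append(ReadShapeEvent(1))
    _demo.append(OpEvent("relu", (1,), (2,)))
    _demo.append(DelEvent(1))
    print(format_trace(_demo))
    # prints:
    #   _1 = input()
    #   output(_1.shape)
    #   _2 = 'relu'(_1)
    #   del _1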
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import functools
import weakref
# Concepts
#
# * Internal tensor
# Tensor produced by the static sequence
#
# * External tensor
# Tensor not produced, but used as input, by the static sequence
#
# * Irrelevant tensor
# Tensor not present in input/output of any op
#
# * Escape
# An internal tensor is said to escape if it is still alive
# at the end of the sequence
# JIT-ed execution
#
# 1. read attr (dtype, device, shape)
# a. internal tensor
# read out as soon as tensor is produced
# b. external or irrelevant tensor
# fallback
#
# 2. apply op
# bind external tensors in input
#
# 3. del
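# Illustration (comments only, not part of the original file): for a recorded
# sequence such as
#     c = add(a, b)    # a, b received from outside   -> external tensors
#     d = mul(c, c)    # c produced by the sequence   -> internal tensor
#     del c            # c dies inside the sequence   -> c does not escape
#     keep(d)          # d still alive at the end     -> d escapes
# reading d.shape can be answered as soon as d is produced (case 1a above),
# while reading attributes of a or b falls back to the normal path (case 1b);
# a tensor that is never an input or output of any op is irrelevant.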
class Action:
pass
class ReadAttrAction(Action):
def __init__(self, var, name, getter):
self.var = var
self.name = name
self.getter = getter
class ReadValueAction(Action):
def __init__(self, var, getter):
self.var = var
self.getter = getter
class GetTensorAction(Action):
def __init__(self, var, getter):
self.var = var
self.getter = getter
class OpAction(Action):
def __init__(self, op, inputs, outputs, input_receivers):
self.op = op
self.inputs = inputs
self.outputs = outputs
self.input_receivers = input_receivers
class TensorAttr:
def __init__(self):
self.shape = None
self.dtype = None
self.device = None
class Bailout(Exception):
pass
class Fallback(Exception):
pass
def handle_bailout_fallback_finalize(f):
@functools.wraps(f)
def wrapper(self, impl, *args, **kwargs):
try:
return f(self, impl, *args, **kwargs)
except Bailout:
self.bailout()
except Fallback:
pass
finally:
if self.pc == len(self):
self.finalize()
return impl(*args, **kwargs)
return wrapper
class ExecTrajectory(list):
def __init__(self):
super().__init__()
self.reset()
def __bool__(self):
return True
def __enter__(self):
global _current_trajectory
if hasattr(self, "_prev_trajectory"):
raise RuntimeError
self._prev_trajectory = _current_trajectory
_current_trajectory = self
self._exited = False
return self
def __exit__(self, *exc_info):
# cleanup should be done at completion,
# which is before exiting context manager
assert self._exited == (exc_info == (None, None, None))
if not self._exited:
assert self.pc < len(self)
self.bailout()
def _exit(self):
# clean up self and the global variable
assert not self._exited
self.reset()
global _current_trajectory
if _current_trajectory is not self:
raise RuntimeError
_current_trajectory = self._prev_trajectory
del self._prev_trajectory
def reset(self):
self._exited = True
self.pc = 0
self.attr_cache = weakref.WeakKeyDictionary()
### Internal and External Tensor ###
# internal tensors are those produced by us
# external tensors are those received from outside
# during JIT-ed execution, internal tensors are just placeholders.
# var_to_tensor is the binding table for all tensors
self.var_to_tensor = {} # var -> weakref[tensor]
# tensor_to_var is the reverse binding table for internal tensors
# note that external tensors could map to >1 vars.
self.tensor_to_var = weakref.WeakKeyDictionary()
# internal tensor will be materialized if its .data is accessed from outside
# after being materialized, an internal tensor is much like an external tensor
def finalize(self):
assert self.pc == len(self)
self._exit()
def bailout(self):
self._exit()
raise NotImplementedError
def next_action(self):
assert not self._exited
assert self.pc < len(self)
return self[self.pc]
@handle_bailout_fallback_finalize
def read_attr(self, impl, tensor, name):
attrs = self.attr_cache.setdefault(tensor, TensorAttr())
value = getattr(attrs, name, None)
if value is None:
action = self.next_action()
if not isinstance(action, ReadAttrAction):
raise Bailout
if name != action.name:
raise Bailout
value = action.getter()
setattr(attrs, name, value)
return value
@handle_bailout_fallback_finalize
def read_value(self, impl, tensor):
# possibilities:
# 1. internal tensor
# 2. external tensor
# 3. irrelevant tensor (not an input / output of any op)
if tensor not in self.tensor_to_var:
raise Fallback
assert tensor._data is None
action = self.next_action()
if not isinstance(action, ReadValueAction):
raise Bailout
return action.getter()
@handle_bailout_fallback_finalize
def apply_op(self, impl, op, *args):
from . import RawTensor
action = self.next_action()
if not isinstance(action, OpAction):
raise Bailout
if len(args) != len(action.inputs):
raise Bailout
assert len(action.inputs) == len(action.input_receivers)
for v, t, r in zip(action.inputs, args, action.input_receivers):
if v in self.var_to_tensor:
assert r is None
if t is not self.var_to_tensor[v]():
raise Bailout
else:
# NOTE: not checking for aliasing (>=2 vars map to 1 tensor)
# the static execution backend must handle this
self.var_to_tensor[v] = weakref.ref(t)
r(t)
outputs = []
for v in action.outputs:
assert v not in self.var_to_tensor
t = RawTensor()
t._data_getter = functools.partial(self.get_data, v)
outputs.append(t)
self.var_to_tensor[v] = weakref.ref(t)
return tuple(outputs)
def get_data(self, var):
tensor = self.var_to_tensor[var]()
assert tensor is not None
assert tensor._data is None
assert tensor in self.tensor_to_var
action = self.next_action()
if not isinstance(action, GetTensorAction):
self.bailout()
elif action.var != var:
self.bailout()
else:
tensor._data = action.getter()
del tensor._data_getter
del self.tensor_to_var[tensor]
assert "_data_getter" not in tensor.__dict__
return tensor._data_getter()
_current_trajectory = None
def get_trajectory():
return _current_trajectory
def compile_trace(trace):
from .jit import ReadDtypeEvent, ReadDeviceEvent, ReadShapeEvent, OpEvent, DelEvent
traj = ExecTrajectory()
active_vars = set()
for event in trace:
if isinstance(event, ReadDtypeEvent):
traj.append(ReadAttrAction())
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import contextlib
import copy
from .core import Dispatcher, OpBase, TensorBase, apply
class Tensor(TensorBase):
def __init__(self, data: TensorBase):
self._data = data
# _extra_data is set up in Grad.wrt
self._extra_data = {}
self._user_data = {}
def __getattr__(self, name):
if name in self._user_data:
return self._user_data[name]
raise AttributeError(name)
def reset(self, other):
assert isinstance(other, __class__)
self.__dict__.clear()
self._data = other._data
self._extra_data = other._extra_data.copy()
self._user_data = other._user_data.copy()
def copy(self):
other = object.__new__(type(self))
other.reset(self)
return other
# tensor interface
@property
def shape(self):
return self._data.shape
@property
def dtype(self):
return self._data.dtype
@property
def device(self):
return self._data.device
def numpy(self):
return self._data.numpy()
class ApplyContext:
def __init__(self):
self.inputs = None
self.outputs = None
self.key = None
_context = None
@contextlib.contextmanager
def push_context():
global _context
backup = _context
try:
_context = ApplyContext()
yield _context
finally:
_context = backup
def get_context():
return _context
@apply.add
def tensor_apply(op: OpBase, *args: Tensor):
data = tuple(i._data if isinstance(i, Tensor) else i for i in args)
# type(Tensor._data) is RawTensor
# dispatched to apply.add@RawTensor.py if passed Tensor args
outputs = apply(op, *data)
ret = tuple(map(Tensor, outputs))
with push_context() as ctx:
ctx.inputs = args
ctx.outputs = ret
for k in set().union(*(i._extra_data for i in args if isinstance(i, Tensor))):
ctx.key = k
data = tuple(
i._extra_data.get(k) if isinstance(i, Tensor) else i for i in args
)
# data are instances of Tracer
# dispatched to apply.add@grad.py
outputs = apply(op, *data)
if outputs is not None:
assert len(outputs) == len(ret)
for t, i in zip(ret, outputs):
t._extra_data[k] = i
return ret
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import abc
import collections
import numpy as np
from ..ops import builtin
from ..ops.special import Const
from . import utils
from .core import OpBase, TensorBase, TensorWrapperBase, apply
from .indexing import getitem as _getitem
from .indexing import setitem as _setitem
from .raw_tensor import RawTensor, as_raw_tensor
from .tensor import Tensor
def _elwise(*args, mode):
op = builtin.Elemwise(mode=mode)
args = utils.convert_inputs(*args)
(result,) = apply(op, *args)
return result
def _matmul(inp1, inp2):
op = builtin.MatrixMul(
transposeA=False, transposeB=False, compute_mode="DEFAULT", format="DEFAULT"
)
inp1, inp2 = utils.convert_inputs(inp1, inp2)
(result,) = apply(op, inp1, inp2)
return result
def _transpose(data, axes):
op = builtin.Dimshuffle(axes)
(data,) = utils.convert_inputs(data)
(result,) = apply(op, data)
return result
def _broadcast(inp, shape):
shape = utils.astensor1d(shape, inp, dtype="int32", device=inp.device)
(result,) = apply(builtin.Broadcast(), inp, shape)
return result
def _reshape(x, shape):
if isinstance(shape, (TensorBase, TensorWrapperBase)):
shape = shape.numpy()
shape = tuple(map(int, shape))
unspec_axis = None
for i, s in enumerate(shape):
if s < 0:
if s != -1:
raise ValueError("expect shape[{}] >= -1, got {}".format(i, s))
if unspec_axis is not None:
raise ValueError("multiple -1 in shape: {} & {}".format(unspec_axis, i))
unspec_axis = i
# TODO: device should be None (cpu)
(shape,) = Const(shape, dtype=np.int32, device=x.device)(x)
if unspec_axis is None:
op = builtin.Reshape()
else:
op = builtin.Reshape(unspec_axis=unspec_axis)
(x,) = apply(op, x, shape)
return x
def _unary_elwise(mode):
def f(self):
return _elwise(self, mode=mode)
return f
def _binary_elwise(mode, rev=False):
if not rev:
def f(self, value):
return _elwise(self, value, mode=mode)
else:
def f(self, value):
return _elwise(value, self, mode=mode)
return f
def _logical_unary_elwise(mode, rev=False):
def f(self):
if self.dtype != np.bool_:
raise TypeError("{} requires a bool tensor".format(mode))
return _elwise(self, mode=mode)
return f
def _logical_binary_elwise(mode, rev=False):
if not rev:
def f(self, value):
if self.dtype != np.bool_ or value.dtype != np.bool_:
raise TypeError("{} requires 2 bool tensors".format(mode))
return _elwise(self, value, mode=mode)
else:
def f(self, value):
if self.dtype != np.bool_ or value.dtype != np.bool_:
raise TypeError("{} requires 2 bool tensors".format(mode))
return _elwise(value, self, mode=mode)
return f
def _reduce(mode):
def f(self, axis=None):
inp = self
if axis is None:
inp = self.flatten()
axis = 0
op = builtin.Reduce(mode=mode, axis=axis)
(result,) = utils.convert_inputs(inp)
(result,) = apply(op, result)
return result
return f
def _inplace(f):
def g(self, value):
result = f(self, value)
if result is NotImplemented:
raise NotImplementedError
self._reset(result)
return self
return g
def _todo(*_):
raise NotImplementedError
class ArrayMethodMixin(abc.ABC):
__array_priority__ = 233333
@abc.abstractmethod
def _reset(self, other):
pass
@abc.abstractproperty
def dtype(self) -> np.dtype:
pass
@abc.abstractproperty
def shape(self) -> tuple:
pass
@abc.abstractmethod
def numpy(self) -> np.ndarray:
pass
__hash__ = None  # because __eq__ deviates from the Python convention
__lt__ = lambda self, value: _elwise(self, value, mode="LT").astype("bool")
__le__ = lambda self, value: _elwise(self, value, mode="LEQ").astype("bool")
__gt__ = lambda self, value: _elwise(value, self, mode="LT").astype("bool")
__ge__ = lambda self, value: _elwise(value, self, mode="LEQ").astype("bool")
__eq__ = lambda self, value: _elwise(self, value, mode="EQ").astype("bool")
__ne__ = lambda self, value: _elwise(
_elwise(self, value, mode="EQ").astype("bool"), mode="NOT"
)
__neg__ = _unary_elwise("NEGATE")
__pos__ = lambda self: self
__abs__ = _unary_elwise("ABS")
__invert__ = _logical_unary_elwise("NOT")
__round__ = _unary_elwise("ROUND")
__trunc__ = _todo
__floor__ = _unary_elwise("FLOOR")
__ceil__ = _unary_elwise("CEIL")
__add__ = _binary_elwise("ADD")
__sub__ = _binary_elwise("SUB")
__mul__ = _binary_elwise("MUL")
__matmul__ = lambda self, other: _matmul(self, other)
__truediv__ = _binary_elwise("TRUE_DIV")
__floordiv__ = _binary_elwise("FLOOR_DIV")
__mod__ = _binary_elwise("MOD")
# __divmod__
__pow__ = _binary_elwise("POW")
__lshift__ = _binary_elwise("SHL")
__rshift__ = _binary_elwise("SHR")
__and__ = _logical_binary_elwise("AND")
__or__ = _logical_binary_elwise("OR")
__xor__ = _logical_binary_elwise("XOR")
__radd__ = _binary_elwise("ADD", rev=1)
__rsub__ = _binary_elwise("SUB", rev=1)
__rmul__ = _binary_elwise("MUL", rev=1)
__rmatmul__ = lambda self, other: _matmul(other, self)
__rtruediv__ = _binary_elwise("TRUE_DIV", rev=1)
__rfloordiv__ = _binary_elwise("FLOOR_DIV", rev=1)
__rmod__ = _binary_elwise("MOD", rev=1)
# __rdivmod__
__rpow__ = _binary_elwise("POW", rev=1)
__rlshift__ = _binary_elwise("SHL", rev=1)
__rrshift__ = _binary_elwise("SHR", rev=1)
__rand__ = _logical_binary_elwise("AND", rev=1)
__ror__ = _logical_binary_elwise("OR", rev=1)
__rxor__ = _logical_binary_elwise("XOR", rev=1)
__iadd__ = _inplace(__add__)
__isub__ = _inplace(__sub__)
__imul__ = _inplace(__mul__)
__imatmul__ = _inplace(__matmul__)
__itruediv__ = _inplace(__truediv__)
__ifloordiv__ = _inplace(__floordiv__)
__imod__ = _inplace(__mod__)
__ipow__ = _inplace(__pow__)
__ilshift__ = _inplace(__lshift__)
__irshift__ = _inplace(__rshift__)
__iand__ = _inplace(__and__)
__ior__ = _inplace(__or__)
__ixor__ = _inplace(__xor__)
__index__ = lambda self: self.item().__index__()
__bool__ = lambda self: bool(self.item())
__int__ = lambda self: int(self.item())
__float__ = lambda self: float(self.item())
__complex__ = lambda self: complex(self.item())
def __len__(self):
shape = self.shape
if shape:
return int(shape[0])
raise TypeError("ndim is 0")
def __iter__(self):
for i in range(len(self)):
yield self[i]
def __getitem__(self, index):
return _getitem(self, index)
def __setitem__(self, index, value):
if index is not Ellipsis:
value = _setitem(self, index, value)
self._reset(value)
__contains__ = _todo
@property
def ndim(self):
return len(self.shape)
@property
def size(self):
return np.prod(self.shape).item()
@property
def T(self):
return self.transpose()
def item(self, *args):
if not args:
assert self.size == 1
return self.numpy().item()
return self[args].item()
def tolist(self):
return self.numpy().tolist()
def astype(self, dtype):
return utils.astype(self, dtype)
def reshape(self, *args):
if len(args) == 1:
if isinstance(args[0], collections.Sequence):
args = args[0]
return _reshape(self, args)
def broadcast(self, *args):
if len(args) == 1:
if isinstance(args[0], collections.Sequence):
args = args[0]
return _broadcast(self, args)
def transpose(self, *args):
if not args:
args = reversed(range(self.ndim))
elif len(args) == 1:
if isinstance(args[0], collections.Sequence):
args = args[0]
return _transpose(self, args)
def flatten(self):
return self.reshape(-1)
sum = _reduce("SUM")
prod = _reduce("PRODUCT")
min = _reduce("MIN")
max = _reduce("MAX")
mean = _reduce("MEAN")
class GenericTensorWrapper(ArrayMethodMixin, TensorWrapperBase):
def __init__(self, data):
self.__wrapped__ = data
def _reset(self, other):
if not isinstance(other, __class__):
raise TypeError(type(other))
self.__wrapped__ = other.__wrapped__
return self
@property
def dtype(self):
return self.__wrapped__.dtype
@property
def shape(self):
return self.__wrapped__.shape
@property
def device(self):
return self.__wrapped__.device
def numpy(self):
return self.__wrapped__.numpy()
class TensorWrapper(GenericTensorWrapper):
def __init__(self, data, dtype=None, device=None):
if isinstance(data, TensorWrapperBase):
data = data.__wrapped__
elif not isinstance(data, TensorBase):
assert data is not None, "Cannot init a tensor with data as None"
data = Tensor(as_raw_tensor(data, dtype=dtype, device=device))
super().__init__(data)
def _reset(self, other):
if isinstance(other, TensorWrapperBase):
self.__wrapped__ = other.__wrapped__
elif isinstance(other, TensorBase):
self.__wrapped__ = other
else:
self._reset(type(self)(other, dtype=self.dtype, device=self.device))
def __repr__(self):
piece = "Tensor("
with np.printoptions(precision=4, suppress=True):
piece += "{}".format(str(self.numpy()))
if self.dtype != np.float32:
piece += ", dtype={}".format(np.dtype(self.dtype).name)
piece += ", device={}".format(self.device) + ")"
return piece
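# Illustrative usage sketch (comments only; assumes the compiled _imperative_rt
# extension is importable, so it is not executed here):
#     x = TensorWrapper([[1.0, 2.0], [3.0, 4.0]])
#     y = (x + 1).sum(axis=0)      # ArrayMethodMixin operators dispatch through apply
#     print(y.numpy(), y.shape)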
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import collections
from typing import Iterable, Union
import numpy as np
from ..ops import builtin
from ..ops.special import Const
from ..tensor.core import OpBase, TensorBase, TensorWrapperBase, apply
def dtype_promotion(raw_inputs):
def add_dtype(i):
if type(i) == int:
return np.array(i, dtype=np.int32)
if type(i) == float:
return np.array(i, dtype=np.float32)
if type(i) == bool:
return np.array(i, dtype=np.bool_)
return None
scalar_inputs = [
add_dtype(i) for i in raw_inputs if not hasattr(i, "dtype") and add_dtype(i)
]
inputs = [i for i in raw_inputs if hasattr(i, "dtype")]
assert len(scalar_inputs + inputs) > 0
dtype = np.result_type(*inputs)
dtype_all = np.result_type(*(inputs + scalar_inputs))
assert (
dtype != np.float64 and dtype != np.int64
), "unsupport dtype {} by dtype_promotion, please use explict type convert".format(
dtype
)
if dtype_all == np.bool_:
for i in raw_inputs:
if not hasattr(i, "dtype") or i.dtype != np.bool_:
raise TypeError(
"bool dtype can not be operated with an element without bool dtype"
)
if dtype_all == np.float64:
dtype_all = np.float32
return dtype_all
def get_device(inputs):
device = None
for i in inputs:
if isinstance(i, (TensorWrapperBase, TensorBase)):
if device is None:
device = i.device
elif device != i.device:
raise ValueError("ambiguous device: {} vs {}".format(device, i.device))
assert device is not None
return device
def concatenate(inputs, axis=0, *, device=None):
dtype = dtype_promotion(inputs)
device = get_device(inputs)
def convert(x):
return convert_single_value(x, inputs, dtype=dtype)
inputs = tuple(map(convert, inputs))
(result,) = apply(builtin.Concat(axis=axis, comp_node=device.to_c()), *inputs)
return result
def astype(x, dtype):
dtype = np.dtype(dtype)
if x.dtype != dtype:
(x,) = apply(builtin.TypeCvt(param=dtype), x)
return x
def convert_single_value(v, inputs, *, dtype=None, device=None):
tensors = [i for i in inputs if isinstance(i, (TensorBase, TensorWrapperBase))]
assert len(tensors) > 0
if isinstance(v, (TensorWrapperBase, TensorBase)):
v = astype(v, dtype)
else:
(v,) = Const(v, dtype=dtype, device=device)(*tensors)
return v
def convert_inputs(*args: TensorBase):
dtype = dtype_promotion(args)
device = get_device(args)
def convert(value):
if value is None:
return value
return convert_single_value(value, args, dtype=dtype, device=device)
return tuple(map(convert, args))
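# Illustrative usage (comments only): given an existing tensor `t`,
#     a, b = convert_inputs(t, 2)
# yields both operands as tensors of a common promoted dtype on t's device; the
# Python scalar 2 is wrapped as a constant tensor so that element-wise ops can
# be applied to the pair directly.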
def result_type(*args):
dtypes = []
for i in args:
if isinstance(i, (TensorWrapperBase, TensorBase)):
dtypes.append(i.dtype)
continue
try:
dtypes.append(np.dtype(i))
except TypeError:
pass
return np.result_type(*dtypes)
def isscalar(x):
try:
return x.ndim == 0
except AttributeError:
pass
return np.isscalar(x)
def astensor1d(x, *reference, dtype=None, device=None):
"""
Convert something to a 1-D tensor. Supports the following types:
* sequence of scalar literal / tensor
* numpy array
* tensor (returned as is, regardless of dtype and device)
"""
try:
ndim = x.ndim
except AttributeError:
pass
else:
if ndim != 1:
raise ValueError("ndim != 1: %d" % ndim)
if not isinstance(x, (TensorBase, TensorWrapperBase)):
(x,) = Const(x, dtype=dtype, device=device)(*reference)
return x
if not isinstance(x, collections.Sequence):
raise TypeError
if any(isinstance(i, (TensorBase, TensorWrapperBase)) for i in x):
x = concatenate(x, device=device)
if dtype is not None:
x = astype(x, dtype)
return x
(x,) = Const(x, dtype=dtype, device=device)(*reference)
return x
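# Illustrative usage sketch (comments only; `ref` stands for any existing tensor
# used as the dtype/device reference):
#     shape = astensor1d((2, 4), ref, dtype="int32", device=ref.device)  # literal -> constant tensor
#     same = astensor1d(existing_1d_tensor)                              # tensors pass through unchanged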
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .collator import Collator
from .dataloader import DataLoader
from .sampler import (
Infinite,
RandomSampler,
ReplacementSampler,
Sampler,
SequentialSampler,
)
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import binascii
import os
import queue
import subprocess
from multiprocessing import Queue
import pyarrow
import pyarrow.plasma as plasma
MGE_PLASMA_MEMORY = int(os.environ.get("MGE_PLASMA_MEMORY", 4000000000)) # 4GB
# Each process only need to start one plasma store, so we set it as a global variable.
# TODO: how to share between different processes?
MGE_PLASMA_STORE_MANAGER = None
def _clear_plasma_store():
# `_PlasmaStoreManager.__del__` will not be called automatically in subprocesses,
# so this function should be called explicitly
global MGE_PLASMA_STORE_MANAGER
if MGE_PLASMA_STORE_MANAGER is not None:
del MGE_PLASMA_STORE_MANAGER
MGE_PLASMA_STORE_MANAGER = None
class _PlasmaStoreManager:
__initialized = False
def __init__(self):
self.socket_name = "/tmp/mge_plasma_{}".format(
binascii.hexlify(os.urandom(8)).decode()
)
debug_flag = bool(os.environ.get("MGE_DATALOADER_PLASMA_DEBUG", 0))
# NOTE: this is a hack. Using `plasma_store` directly may make it hard for the
# subprocess to handle exceptions raised in `plasma-store-server`, because
# `plasma_store` is just a wrapper of `plasma-store-server` that uses
# `os.execv` to call the executable `plasma-store-server`.
cmd_path = os.path.join(pyarrow.__path__[0], "plasma-store-server")
self.plasma_store = subprocess.Popen(
[cmd_path, "-s", self.socket_name, "-m", str(MGE_PLASMA_MEMORY),],
stdout=None if debug_flag else subprocess.DEVNULL,
stderr=None if debug_flag else subprocess.DEVNULL,
)
self.__initialized = True
def __del__(self):
if self.__initialized and self.plasma_store.returncode is None:
self.plasma_store.kill()
class PlasmaShmQueue:
def __init__(self, maxsize: int = 0):
r"""Use pyarrow in-memory plasma store to implement shared memory queue.
Compared to the native `multiprocessing.Queue`, `PlasmaShmQueue` avoids pickle/unpickle
and communication overhead, leading to better performance in multi-process
application.
:type maxsize: int
:param maxsize: maximum size of the queue; ``0`` means no limit. (default: ``0``)
"""
# Lazy start the plasma store manager
global MGE_PLASMA_STORE_MANAGER
if MGE_PLASMA_STORE_MANAGER is None:
try:
MGE_PLASMA_STORE_MANAGER = _PlasmaStoreManager()
except Exception as e:
err_info = (
"Please make sure pyarrow installed correctly!\n"
"You can try reinstall pyarrow and see if you can run "
"`plasma_store -s /tmp/mge_plasma_xxx -m 1000` normally."
)
raise RuntimeError(
"Exception happened in starting plasma_store: {}\n"
"Tips: {}".format(str(e), err_info)
)
self.socket_name = MGE_PLASMA_STORE_MANAGER.socket_name
# TODO: how to catch the exception raised in `plasma.connect`?
self.client = None
# Used to store the headers (ObjectIDs) for the data.
self.queue = Queue(maxsize) # type: Queue
def put(self, data, block=True, timeout=None):
if self.client is None:
self.client = plasma.connect(self.socket_name)
try:
object_id = self.client.put(data)
except plasma.PlasmaStoreFull:
raise RuntimeError("plasma store out of memory!")
try:
self.queue.put(object_id, block, timeout)
except queue.Full:
self.client.delete([object_id])
raise queue.Full
def get(self, block=True, timeout=None):
if self.client is None:
self.client = plasma.connect(self.socket_name)
object_id = self.queue.get(block, timeout)
if not self.client.contains(object_id):
raise RuntimeError(
"ObjectID: {} not found in plasma store".format(object_id)
)
data = self.client.get(object_id)
self.client.delete([object_id])
return data
def qsize(self):
return self.queue.qsize()
def empty(self):
return self.queue.empty()
def join(self):
self.queue.join()
def disconnect_client(self):
if self.client is not None:
self.client.disconnect()
def close(self):
self.queue.close()
self.disconnect_client()
_clear_plasma_store()
def cancel_join_thread(self):
self.queue.cancel_join_thread()
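# Typical usage sketch (comments only; requires a working pyarrow plasma
# installation and is not part of the original file): the queue is created in
# the parent process and shared with worker processes, which use it much like a
# regular multiprocessing.Queue.
#     q = PlasmaShmQueue(maxsize=8)
#     q.put({"image": some_large_array})   # payload goes into the plasma shared-memory store
#     batch = q.get()                      # fetched from the store and deleted afterwards
#     q.close()                            # tear down once the last user is done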
# -*- coding: utf-8 -*-
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
# ---------------------------------------------------------------------
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#
# This file has been modified by Megvii ("Megvii Modifications").
# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved.
# ----------------------------------------------------------------------
import collections.abc
import re
import numpy as np
np_str_obj_array_pattern = re.compile(r"[aO]")
default_collate_err_msg_format = (
"default_collator: inputs must contain numpy arrays, numbers, "
"Unicode strings, bytes, dicts or lists; found {}"
)
class Collator:
r"""
Used to merge a list of samples into a mini-batch of Tensor(s). Used for batched loading from a dataset.
modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py
"""
def apply(self, inputs):
"""
input : sequence_N(tuple(CHW, C, CK))
output : tuple(NCHW, NC, NCK)
"""
elem = inputs[0]
elem_type = type(elem)
if (
elem_type.__module__ == "numpy"
and elem_type.__name__ != "str_"
and elem_type.__name__ != "string_"
):
elem = inputs[0]
if elem_type.__name__ == "ndarray":
# array of string classes and object
if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
raise TypeError(default_collate_err_msg_format.format(elem.dtype))
return np.ascontiguousarray(np.stack(inputs))
elif elem.shape == (): # scalars
return np.array(inputs)
elif isinstance(elem, float):
return np.array(inputs, dtype=np.float64)
elif isinstance(elem, int):
return np.array(inputs)
elif isinstance(elem, (str, bytes)):
return inputs
elif isinstance(elem, collections.abc.Mapping):
return {key: self.apply([d[key] for d in inputs]) for key in elem}
elif isinstance(elem, tuple) and hasattr(elem, "_fields"): # namedtuple
return elem_type(*(self.apply(samples) for samples in zip(*inputs)))
elif isinstance(elem, collections.abc.Sequence):
transposed = zip(*inputs)
return [self.apply(samples) for samples in transposed]
raise TypeError(default_collate_err_msg_format.format(elem_type))
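# Minimal self-contained sketch (not part of the original file) showing how the
# default Collator stacks a list of (image, label) samples into batched arrays.
if __name__ == "__main__":
    collator = Collator()
    samples = [
        (np.zeros((3, 4, 4), dtype=np.float32), 0),
        (np.ones((3, 4, 4), dtype=np.float32), 1),
    ]
    images, labels = collator.apply(samples)
    assert images.shape == (2, 3, 4, 4)
    assert labels.shape == (2,)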
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .meta_dataset import ArrayDataset, Dataset, MapDataset, StreamDataset
from .vision import *
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from abc import ABC, abstractmethod
from typing import Tuple
class Dataset(ABC):
r"""
An abstract class for all Datasets
"""
@abstractmethod
def __init__(self):
pass
class MapDataset(Dataset):
r"""
An abstract class for map-style datasets.
__getitem__ and __len__ methods are additionally needed.
"""
@abstractmethod
def __init__(self):
pass
@abstractmethod
def __getitem__(self, index):
pass
@abstractmethod
def __len__(self):
pass
class StreamDataset(Dataset):
r"""
An abstract class for stream datasets.
__iter__ method is additionally needed.
"""
@abstractmethod
def __init__(self):
pass
@abstractmethod
def __iter__(self):
pass
class ArrayDataset(MapDataset):
def __init__(self, *arrays):
r"""
ArrayDataset is a dataset for numpy array data; one or more numpy arrays
are needed to initialize the dataset, and the dimension representing the
sample number is expected to be the same for all of them.
"""
super().__init__()
if not all(len(arrays[0]) == len(array) for array in arrays):
raise ValueError("lengths of input arrays are inconsistent")
self.arrays = arrays
def __getitem__(self, index: int) -> Tuple:
return tuple(array[index] for array in self.arrays)
def __len__(self) -> int:
return len(self.arrays[0])
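# Minimal self-contained sketch (not part of the original file); plain sequences
# are enough to exercise the interface, while real use typically passes numpy
# arrays of equal length.
if __name__ == "__main__":
    data = [[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]
    labels = [0, 1, 0]
    dataset = ArrayDataset(data, labels)
    assert len(dataset) == 3
    assert dataset[1] == ([2.0, 3.0], 1)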
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .cifar import CIFAR10, CIFAR100
from .cityscapes import Cityscapes
from .coco import COCO
from .folder import ImageFolder
from .imagenet import ImageNet
from .meta_vision import VisionDataset
from .mnist import MNIST
from .objects365 import Objects365
from .voc import PascalVOC
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import os
import pickle
import tarfile
from typing import Tuple
import numpy as np
from ....logger import get_logger
from .meta_vision import VisionDataset
from .utils import _default_dataset_root, load_raw_data_from_url
logger = get_logger(__name__)
class CIFAR10(VisionDataset):
r""" ``Dataset`` for CIFAR10 meta data
"""
url_path = "http://www.cs.utoronto.ca/~kriz/"
raw_file_name = "cifar-10-python.tar.gz"
raw_file_md5 = "c58f30108f718f92721af3b95e74349a"
raw_file_dir = "cifar-10-batches-py"
train_batch = [
"data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]
test_batch = ["test_batch"]
meta_info = {"name": "batches.meta"}
def __init__(
self,
root: str = None,
train: bool = True,
download: bool = True,
timeout: int = 500,
):
super().__init__(root, order=("image", "image_category"))
self.timeout = timeout
# process the root path
if root is None:
self.root = self._default_root
if not os.path.exists(self.root):
os.makedirs(self.root)
else:
self.root = root
if not os.path.exists(self.root):
if download:
logger.debug(
"dir %s does not exist, will be automatically created",
self.root,
)
os.makedirs(self.root)
else:
raise ValueError("dir %s does not exist" % self.root)
self.target_file = os.path.join(self.root, self.raw_file_dir)
# check the existence of the target pickle dir; if it exists, load the
# pickle files regardless of the download flag
if os.path.exists(self.target_file):
if train:
self.arrays = self.bytes2array(self.train_batch)
else:
self.arrays = self.bytes2array(self.test_batch)
else:
if download:
self.download()
if train:
self.arrays = self.bytes2array(self.train_batch)
else:
self.arrays = self.bytes2array(self.test_batch)
else:
raise ValueError(
"dir does not contain target file %s, please set download=True"
% (self.target_file)
)
def __getitem__(self, index: int) -> Tuple:
return tuple(array[index] for array in self.arrays)
def __len__(self) -> int:
return len(self.arrays[0])
@property
def _default_root(self):
return os.path.join(_default_dataset_root(), self.__class__.__name__)
@property
def meta(self):
meta_path = os.path.join(self.root, self.raw_file_dir, self.meta_info["name"])
with open(meta_path, "rb") as f:
meta = pickle.load(f, encoding="bytes")
return meta
def download(self):
url = self.url_path + self.raw_file_name
load_raw_data_from_url(
url, self.raw_file_name, self.raw_file_md5, self.root, self.timeout
)
self.process()
def untar(self, file_path, dirs):
assert file_path.endswith(".tar.gz")
logger.debug("untar file %s to %s", file_path, dirs)
with tarfile.open(file_path) as t:
    t.extractall(path=dirs)
def bytes2array(self, filenames):
data = []
label = []
for filename in filenames:
path = os.path.join(self.root, self.raw_file_dir, filename)
logger.debug("unpickle file %s", path)
with open(path, "rb") as fo:
dic = pickle.load(fo, encoding="bytes")
batch_data = dic[b"data"].reshape(-1, 3, 32, 32).transpose((0, 2, 3, 1))
data.extend(list(batch_data[..., [2, 1, 0]]))
label.extend(dic[b"labels"])
label = np.array(label, dtype=np.int32)
return (data, label)
def process(self):
logger.info("process raw data ...")
self.untar(os.path.join(self.root, self.raw_file_name), self.root)
class CIFAR100(CIFAR10):
url_path = "http://www.cs.utoronto.ca/~kriz/"
raw_file_name = "cifar-100-python.tar.gz"
raw_file_md5 = "eb9058c3a382ffc7106e4002c42a8d85"
raw_file_dir = "cifar-100-python"
train_batch = ["train"]
test_batch = ["test"]
meta_info = {"name": "meta"}
@property
def meta(self):
meta_path = os.path.join(self.root, self.raw_file_dir, self.meta_info["name"])
with open(meta_path, "rb") as f:
meta = pickle.load(f, encoding="bytes")
return meta
def bytes2array(self, filenames):
data = []
fine_label = []
coarse_label = []
for filename in filenames:
path = os.path.join(self.root, self.raw_file_dir, filename)
logger.debug("unpickle file %s", path)
with open(path, "rb") as fo:
dic = pickle.load(fo, encoding="bytes")
batch_data = dic[b"data"].reshape(-1, 3, 32, 32).transpose((0, 2, 3, 1))
data.extend(list(batch_data[..., [2, 1, 0]]))
fine_label.extend(dic[b"fine_labels"])
coarse_label.extend(dic[b"coarse_labels"])
fine_label = np.array(fine_label, dtype=np.int32)
coarse_label = np.array(coarse_label, dtype=np.int32)
return data, fine_label, coarse_label
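# Illustrative usage (comments only; downloads the archive on first use):
#     train_set = CIFAR10(root=None, train=True, download=True)
#     image, label = train_set[0]      # HWC uint8 BGR image and int32 label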
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# ---------------------------------------------------------------------
# Part of the following code in this file refs to torchvision
# BSD 3-Clause License
#
# Copyright (c) Soumith Chintala 2016,
# All rights reserved.
# ---------------------------------------------------------------------
import json
import os
import cv2
import numpy as np
from .meta_vision import VisionDataset
class Cityscapes(VisionDataset):
r"""`Cityscapes <http://www.cityscapes-dataset.com/>`_ Dataset.
"""
supported_order = (
"image",
"mask",
"info",
)
def __init__(self, root, image_set, mode, *, order=None):
super().__init__(root, order=order, supported_order=self.supported_order)
city_root = self.root
if not os.path.isdir(city_root):
raise RuntimeError("Dataset not found or corrupted.")
self.mode = mode
self.images_dir = os.path.join(city_root, "leftImg8bit", image_set)
self.masks_dir = os.path.join(city_root, self.mode, image_set)
self.images, self.masks = [], []
# self.target_type = ["instance", "semantic", "polygon", "color"]
# for semantic segmentation
if mode == "gtFine":
valid_modes = ("train", "test", "val")
else:
valid_modes = ("train", "train_extra", "val")
for city in os.listdir(self.images_dir):
img_dir = os.path.join(self.images_dir, city)
mask_dir = os.path.join(self.masks_dir, city)
for file_name in os.listdir(img_dir):
mask_name = "{}_{}".format(
file_name.split("_leftImg8bit")[0],
self._get_target_suffix(self.mode, "semantic"),
)
self.images.append(os.path.join(img_dir, file_name))
self.masks.append(os.path.join(mask_dir, mask_name))
def __getitem__(self, index):
target = []
image = None
for k in self.order:
if k == "image":
image = cv2.imread(self.images[index], cv2.IMREAD_COLOR)
target.append(image)
elif k == "mask":
mask = cv2.imread(self.masks[index], cv2.IMREAD_GRAYSCALE)
mask = self._trans_mask(mask)
mask = mask[:, :, np.newaxis]
target.append(mask)
elif k == "info":
if image is None:
image = cv2.imread(self.images[index], cv2.IMREAD_COLOR)
info = [image.shape[0], image.shape[1], self.images[index]]
target.append(info)
else:
raise NotImplementedError
return tuple(target)
def __len__(self):
return len(self.images)
def _trans_mask(self, mask):
trans_labels = [
7,
8,
11,
12,
13,
17,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
31,
32,
33,
]
label = np.ones(mask.shape) * 255
for i, tl in enumerate(trans_labels):
label[mask == tl] = i
return label.astype(np.uint8)
def _get_target_suffix(self, mode, target_type):
if target_type == "instance":
return "{}_instanceIds.png".format(mode)
elif target_type == "semantic":
return "{}_labelIds.png".format(mode)
elif target_type == "color":
return "{}_color.png".format(mode)
else:
return "{}_polygons.json".format(mode)
def _load_json(self, path):
with open(path, "r") as file:
data = json.load(file)
return data
class_names = (
"road",
"sidewalk",
"building",
"wall",
"fence",
"pole",
"traffic light",
"traffic sign",
"vegetation",
"terrain",
"sky",
"person",
"rider",
"car",
"truck",
"bus",
"train",
"motorcycle",
"bicycle",
)
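# Illustrative usage (comments only; expects the standard Cityscapes directory
# layout under `root`):
#     ds = Cityscapes(root, "train", "gtFine", order=("image", "mask"))
#     image, mask = ds[0]              # BGR image and HxWx1 uint8 label mask (255 = ignore)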
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .meta_transform import PseudoTransform, Transform
from .vision import *
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .transform import *