diff --git a/CMakeLists.txt b/CMakeLists.txt index 39bf70cd422d898442bbac16fd9b96ceb4063c16..2bb5d3b636309abe989ee780afa9db9b2775ac9f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -247,10 +247,6 @@ if(MGE_BUILD_IMPERATIVE_RT) set(CMAKE_CXX_STANDARD 17) endif() -if(MGE_BUILD_IMPERATIVE_RT) - set(MGE_BUILD_SDK OFF) -endif() - if(NOT MGE_WITH_CUDA) message("-- Disable distributed support, as CUDA is not enabled.") set(MGE_WITH_DISTRIBUTED OFF) @@ -697,9 +693,7 @@ if(MGE_WITH_PYTHON_MODULE) endif() if(MGE_WITH_TEST AND MGE_ENABLE_RTTI) - if(NOT MGE_BUILD_IMPERATIVE_RT) - add_subdirectory(test) - endif() + add_subdirectory(test) endif() if(TARGET mgb) diff --git a/dnn/CMakeLists.txt b/dnn/CMakeLists.txt index 6bdb2681723c6652a36ad6b5033d41c80f6abe52..7e2012d1abd62e8fc67cc46cd02e5d0c17b89f4a 100644 --- a/dnn/CMakeLists.txt +++ b/dnn/CMakeLists.txt @@ -66,9 +66,7 @@ if(MGE_WITH_CUDA) endif() if(MGE_WITH_TEST) - if(NOT MGE_BUILD_IMPERATIVE_RT) - add_subdirectory(test) - endif() + add_subdirectory(test) endif() add_subdirectory(src) diff --git a/imperative/.gitignore b/imperative/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..67074a4e4ec3ca9363645acabc71228213de65e6 --- /dev/null +++ b/imperative/.gitignore @@ -0,0 +1,5 @@ +Makefile +/test/imperative_test +*.so +/python/megengine/core/ops/_internal/generated_ops.py +/python/megengine/core/ops/_internal/param_defs.py diff --git a/imperative/CMakeLists.txt b/imperative/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..55a97a20f5ea2b2b92e33bb92ca87a31d76230a5 --- /dev/null +++ b/imperative/CMakeLists.txt @@ -0,0 +1,110 @@ +find_package(NumPy REQUIRED) + +set(PACKAGE_NAME megengine) +set(PACKAGE_NAME ${PACKAGE_NAME} PARENT_SCOPE) +set(MODULE_NAME _imperative_rt) +set(MODULE_NAME ${MODULE_NAME} PARENT_SCOPE) +file(GLOB_RECURSE SRCS src/impl/*.cpp src/include/*.h python/src/*.cpp python/src/*.h) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMGB_WITH_IMPERATIVE=1") + +file(GLOB_RECURSE OPR_DECL_SRCS "${PROJECT_SOURCE_DIR}/src/**/*.oprdecl") +file(GLOB_RECURSE PYTHON_SRCS python/${PACKAGE_NAME}/*.py) +list(REMOVE_ITEM PYTHON_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/ops/_internal/generated_ops.py ${CMAKE_CURRENT_SOURCE_DIR}/python/megengine/core/ops/_internal/param_defs.py) +file(GLOB_RECURSE ALL_HEADERS src/cpp/megbrain_pubapi.h + ${PROJECT_SOURCE_DIR}/src/core/include/* + ${PROJECT_SOURCE_DIR}/src/opr/include/* + ${PROJECT_SOURCE_DIR}/src/serialization/include/* + ${PROJECT_SOURCE_DIR}/src/plugin/include/* + ${PROJECT_SOURCE_DIR}/dnn/include/*) + +set(MEGENGINE_DIR ${CMAKE_CURRENT_BINARY_DIR}/python/) +set(GEN_OPS_DIR ${MEGENGINE_DIR}/${PACKAGE_NAME}/core/ops/_internal) +file(MAKE_DIRECTORY ${GEN_OPS_DIR}) +set(GEN_OPS_FILE ${GEN_OPS_DIR}/generated_ops.py) +set(GEN_OP_PARAMS_FILE ${MEGENGINE_DIR}/${PACKAGE_NAME}/core/ops/_internal/param_defs.py) +set(GEN_OP_PARAMS_TEMPLATE ${CMAKE_CURRENT_SOURCE_DIR}/python/tools/ops.tpl.py) + +##################### generate python opr_param_defs.py ############## + +file(COPY ${PROJECT_SOURCE_DIR}/dnn/scripts/opr_param_defs.py DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) +file(READ ${PROJECT_SOURCE_DIR}/tools/param_defs/mgb_opr_param_defs.py CONTENTS) +file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/opr_param_defs.py ${CONTENTS}) + +add_custom_command( + OUTPUT ${GEN_OPS_FILE} + COMMAND ${CMAKE_COMMAND} -E touch ${MEGENGINE_DIR}/${PACKAGE_NAME}/core/${MODULE_NAME}.so ${GEN_OPS_FILE} ${GEN_OP_PARAMS_FILE} + COMMAND ${CMAKE_COMMAND} -E copy_directory 
${CMAKE_CURRENT_SOURCE_DIR}/python/${PACKAGE_NAME} ${MEGENGINE_DIR}/${PACKAGE_NAME} + COMMAND ${CMAKE_COMMAND} -E remove -f ${MEGENGINE_DIR}/${PACKAGE_NAME}/core/${MODULE_NAME}.so ${GEN_OPS_FILE} ${GEN_OP_PARAMS_FILE} + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/python/tools/gen_ops.py ${OPR_DECL_SRCS} -o ${GEN_OPS_FILE} + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test ${MEGENGINE_DIR}/${PACKAGE_NAME}/test + COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/dnn/scripts/gen_param_defs.py -t py --imperative ${CMAKE_CURRENT_BINARY_DIR}/opr_param_defs.py ${GEN_OP_PARAMS_FILE} + DEPENDS ${OPR_DECL_SRCS} ${PYTHON_SRCS} ${ALL_HEADERS} ${GEN_OP_PARAMS_TEMPLATE} + VERBATIM +) + +add_custom_target(gen_opr_py DEPENDS ${GEN_OPS_FILE}) + +##################### generate opdef c header and python binding ############## + +set(OP_DEF_HEADER_OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/include) +file(MAKE_DIRECTORY ${OP_DEF_HEADER_OUT_DIR}/megbrain/imperative/opdef) +set(OP_DEF_HEADER ${OP_DEF_HEADER_OUT_DIR}/megbrain/imperative/opdef/all.h) +set(OP_DEF_PYTHON_BINDING_OUT_DIR ${MEGENGINE_DIR}/${PACKAGE_NAME}/src) +file(MAKE_DIRECTORY ${OP_DEF_PYTHON_BINDING_OUT_DIR}) +set(OP_DEF_PYTHON_BINDING ${OP_DEF_PYTHON_BINDING_OUT_DIR}/opdef.inl) +set(OP_PARAM_DEF ${CMAKE_CURRENT_BINARY_DIR}/opr_param_defs.py) +set(GEN_OP_DEF_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/python/tools/gen_op_defs.py) + +add_custom_command( + OUTPUT ${OP_DEF_HEADER} ${OP_DEF_PYTHON_BINDING} + COMMAND ${PYTHON_EXECUTABLE} ${GEN_OP_DEF_SCRIPT} ${OP_PARAM_DEF} ${OP_DEF_HEADER} + COMMAND ${PYTHON_EXECUTABLE} ${GEN_OP_DEF_SCRIPT} -t py ${OP_PARAM_DEF} ${OP_DEF_PYTHON_BINDING} + DEPENDS ${GEN_OP_DEF_SCRIPT} ${OP_PARAM_DEF} + VERBATIM +) + +add_custom_target(gen_op_def_internal DEPENDS ${OP_DEF_HEADER} ${OP_DEF_PYTHON_BINDING}) +add_library(gen_op_def INTERFACE) +target_include_directories(gen_op_def INTERFACE ${OP_DEF_HEADER_OUT_DIR} ${OP_DEF_PYTHON_BINDING_OUT_DIR}) +add_dependencies(gen_op_def gen_op_def_internal) + +##################### end of opdef generation ######################### + +set(VERSION_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/src/version.ld) +add_custom_target(_version_ld SOURCES ${VERSION_SCRIPT}) + +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11) +pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS}) +target_link_libraries(${MODULE_NAME} PRIVATE gen_op_def megbrain megdnn -Wl,--version-script=${VERSION_SCRIPT}) +if (MGE_WITH_DISTRIBUTED) + message("Imperative configured to link megray") + target_link_libraries(${MODULE_NAME} PRIVATE megray) +endif() +target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR}) +target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) +target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter) +if(CXX_SUPPORT_WCLASS_MEMACCESS) + target_compile_options(${MODULE_NAME} PRIVATE "-Wno-class-memaccess") +endif() +set_target_properties(${MODULE_NAME} PROPERTIES + SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX} + LIBRARY_OUTPUT_DIRECTORY ${MEGENGINE_DIR}/${PACKAGE_NAME}/core +) +add_dependencies(${MODULE_NAME} gen_opr_py _version_ld) + +if(MGE_WITH_TEST AND MGE_ENABLE_RTTI) + add_subdirectory(test) +endif() + +add_custom_command( + TARGET ${MODULE_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/LICENSE ${PROJECT_SOURCE_DIR}/ACKNOWLEDGMENTS ${PROJECT_BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory 
${CMAKE_CURRENT_SOURCE_DIR}/python/megengine ${CMAKE_CURRENT_BINARY_DIR}/python/megengine
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/python/test ${CMAKE_CURRENT_BINARY_DIR}/python/test
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/setup.py ${CMAKE_CURRENT_BINARY_DIR}/python/setup.py
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires.txt
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt
+)
+
diff --git a/imperative/python/megengine/__init__.py b/imperative/python/megengine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f27cdc7270dfb0dd99f640611906e4b0d7a03757
--- /dev/null
+++ b/imperative/python/megengine/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import os
+import sys
+
+from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
+from .device import *
+from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
+from .serialization import load, save
+from .tensor import Tensor, tensor
+from .tensor_nn import Buffer, Parameter
+from .version import __version__
+
+_set_fork_exec_path_for_timed_func(
+    sys.executable,
+    os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
+)
+
+del _set_fork_exec_path_for_timed_func
diff --git a/imperative/python/megengine/core/__init__.py b/imperative/python/megengine/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e24057552a33f3d62428ae53ad0a4f17186b9e5b
--- /dev/null
+++ b/imperative/python/megengine/core/__init__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import os
+import sys
+
+from .tensor import Tensor
diff --git a/imperative/python/megengine/core/_wrap.py b/imperative/python/megengine/core/_wrap.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4bf756440cb0654cdb08917376075abe3f2e528
--- /dev/null
+++ b/imperative/python/megengine/core/_wrap.py
@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
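+# Thin Python wrapper around the C++ CompNode handle: Device normalizes
+# str / CompNode / Device inputs into one hashable, comparable object, and
+# device() below is the coercion helper used by the rest of the package.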
+import numpy as np
+
+from ._imperative_rt import CompNode
+
+
+class Device:
+    def __init__(self, device=None):
+        if device is None:
+            self._cn = CompNode()
+        elif isinstance(device, Device):
+            self._cn = device._cn
+        elif isinstance(device, CompNode):
+            self._cn = device
+        else:
+            self._cn = CompNode(device)
+
+    def to_c(self):
+        return self._cn
+
+    def __repr__(self):
+        return "{}({})".format(type(self).__qualname__, self)
+
+    def __str__(self):
+        return str(self._cn)
+
+    def __hash__(self):
+        return hash(str(self._cn))
+
+    def __eq__(self, rhs):
+        if not isinstance(rhs, Device):
+            rhs = Device(rhs)
+        return str(self._cn) == str(rhs._cn)
+
+
+def device(obj):
+    if isinstance(obj, Device):
+        return obj
+    return Device(obj)
diff --git a/imperative/python/megengine/core/autodiff/__init__.py b/imperative/python/megengine/core/autodiff/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1207b5d98cd3578bc39e9ce600a1254a434880c8
--- /dev/null
+++ b/imperative/python/megengine/core/autodiff/__init__.py
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/imperative/python/megengine/core/autodiff/builtin_op_utils.py b/imperative/python/megengine/core/autodiff/builtin_op_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..54d959d173ee437111e5d7bb79df0acb22a4edaf
--- /dev/null
+++ b/imperative/python/megengine/core/autodiff/builtin_op_utils.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
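+# Maps a builtin OpDef to its backward implementation: the generic path asks
+# imperative.make_backward_graph for a backward graph, while elemwise add and
+# reshape are special-cased with the hand-written gradients defined below.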
+import functools
+import itertools
+
+import numpy as np
+
+from .._imperative_rt import TensorAttr, imperative
+from ..ops.builtin import Elemwise, GetVarShape, OpDef, OprAttr, Reduce, Reshape
+from ..tensor.core import apply
+from ..tensor.function import Function
+
+
+@functools.singledispatch
+def builtin_op_get_backward_fn(op: OpDef, inputs, outputs, input_requires_grad):
+    assert 0
+
+
+_elemwise_add_param = Elemwise(mode="add").to_c().param
+
+
+@builtin_op_get_backward_fn.register(OpDef)
+def _(op: OpDef, inputs, outputs, input_requires_grad):
+    if (
+        isinstance(op, OprAttr)
+        and op.type == "Elemwise"
+        and op.param == _elemwise_add_param
+    ):
+        grad_fn = elemwise_grad_fn
+    elif isinstance(op, OprAttr) and op.type == Reshape.name:
+        grad_fn = reshape_grad_fn
+    else:
+        grad_fn = default_grad_fn
+    return grad_fn(op, inputs, outputs, input_requires_grad)
+
+
+@builtin_op_get_backward_fn.register(Function)
+def _(op: Function, inputs, outputs, input_requires_grad):
+    return op.get_backward_fn(), [True,] * len(outputs)
+
+
+def default_grad_fn(op, inputs, outputs, input_requires_grad):
+    def get_tensor_attr(x):
+        attr = TensorAttr()
+        attr.dtype = x.dtype
+        attr.comp_node = x.device.to_c()
+        return attr
+
+    output_has_grads = [True,] * len(outputs)
+    result = imperative.make_backward_graph(
+        op, list(map(get_tensor_attr, inputs)), input_requires_grad, output_has_grads
+    )
+    if result is None:
+        nr_inputs = len(inputs)
+        nr_outputs = len(outputs)
+
+        def backward(*args):
+            return nr_inputs * [
+                None,
+            ]
+
+        return backward, nr_outputs * [False,]
+    backward_graph, save_for_backward_mask, input_has_grad = result
+
+    input_output_mask = save_for_backward_mask[: len(inputs + outputs)]
+    output_grad_mask = save_for_backward_mask[len(inputs + outputs) :]
+    save_for_backward = tuple(
+        val for val, mask in zip(inputs + outputs, input_output_mask) if mask
+    )
+    del inputs
+    del outputs
+
+    def backward(*args):
+        output_grads = tuple(val for val, mask in zip(args, output_grad_mask) if mask)
+        assert None not in output_grads
+        ret = iter(apply(backward_graph, *(save_for_backward + output_grads)))
+        return tuple(next(ret) if mask else None for mask in input_has_grad)
+
+    return backward, output_grad_mask
+
+
+# override for elemwise
+def elemwise_grad_fn(op, inputs, outputs, input_requires_grad):
+    assert len(inputs) == len(input_requires_grad) == 2
+
+    def get_shape(x):
+        (s,) = apply(GetVarShape(), x)
+        return s
+
+    input_shapes = [
+        get_shape(x) if i else None for i, x in zip(input_requires_grad, inputs)
+    ]
+
+    def reduce_to(x, s):
+        (y,) = apply(Reduce(), x, s)
+        return y
+
+    def backward(dy):
+        return tuple(
+            reduce_to(dy, s) if i else None
+            for i, s in zip(input_requires_grad, input_shapes)
+        )
+
+    return backward, [True]
+
+
+def reshape_grad_fn(op, inputs, outputs, input_requires_grad):
+    assert len(inputs) == len(input_requires_grad) == 2
+
+    def get_shape(x):
+        (s,) = apply(GetVarShape(), x)
+        return s
+
+    input_shapes = [
+        get_shape(x) if i else None for i, x in zip(input_requires_grad, inputs)
+    ]
+
+    def reshape_to(dy, s):
+        (dx,) = apply(Reshape(), dy, s)
+        return dx
+
+    def backward(dy):
+        return tuple(
+            reshape_to(dy, s) if i else None
+            for i, s in zip(input_requires_grad, input_shapes)
+        )
+
+    return backward, [True]
diff --git a/imperative/python/megengine/core/autodiff/grad.py b/imperative/python/megengine/core/autodiff/grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b1b337685adb179f51c6f3e160507e0cbaa0a97
--- /dev/null
+++ b/imperative/python/megengine/core/autodiff/grad.py
@@ -0,0 +1,390 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import functools
+import heapq
+import itertools
+import typing
+import weakref
+
+import numpy as np
+
+from ..ops.builtin import Elemwise, OpDef
+from ..ops.special import Const
+from ..tensor.core import TensorBase, TensorWrapperBase, apply
+from ..tensor.function import Function
+from ..tensor.tensor import Tensor, get_context
+from . import builtin_op_utils
+
+""" Some notes:
+    1. Initialize the optimizer:
+        for each trainable parameter:
+            call wrt(param, callback)
+        Each parameter tensor will be associated with a Tracer object saved in Tensor._extra_data
+    2. Tracer has one member: node, which is a VariableNode
+    3. VariableNode has an OpNode member: opnode
+    4. OpNode has six members:
+        a. id
+        b. inputs, which is made of VariableNode
+        c. outputs, which are weakrefs to VariableNode
+        d. backward: the backward callback function
+        e. has_grad_fn: call has_grad_fn(opnode, reached) to check whether a grad exists
+        f. backward_allow_noinput: whether backward allows all output grads to be missing
+
+"""
+
+_grad_count = 0
+_grad_manager_dict = weakref.WeakValueDictionary()
+
+
+def get_grad_managers():
+    return [_grad_manager_dict[key] for key in _grad_manager_dict]
+
+
+def add(a, b):
+    (c,) = apply(Elemwise(mode="add"), a, b)
+    return c
+
+
+def get_tensor(x):
+    # recursively unwrap wrapper types until a raw Tensor is reached
+    if isinstance(x, Tensor):
+        return x
+    try:
+        x = x.__wrapped__
+    except AttributeError:
+        raise TypeError(type(x))
+    return get_tensor(x)
+
+
+class Grad:
+    def __init__(self, name=None):
+
+        if name is None:
+            global _grad_count
+            self._name = "grad_" + str(_grad_count)
+            _grad_count += 1
+        else:
+            self._name = name
+        assert self._name not in _grad_manager_dict, "grad manager name duplicated"
+        _grad_manager_dict[self._name] = self
+
+        # list of all x in partial(y) / partial(x)
+        self.xs = []
+
+        # contains weak references to every OpNode created during forward;
+        # an OpNode holds inputs, outputs and its backward function, so
+        # together these OpNodes form the computational graph
+        self.ops = []
+
+        self._enabled = True
+
+    @property
+    def name(self):
+        return self._name
+
+    def wrt(self, *args: Tensor, callback=None):
+        """ Indicates the loss is a function of the input tensors (usually the net trainable parameters),
+        i.e., d (loss) / d (Tensor) != 0
+
+        callback is used to perform additional operations after the gradient is obtained in backward,
+        e.g., copying the grad to a particular place.
+
+        A VariableNode will be created and saved in the tensor's _extra_data slot.
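+
+        Example:
+            grad.wrt(param, callback=lambda tensor, grad: setattr(tensor, "grad", grad))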
+ """ + + for x in map(get_tensor, args): + v = self._new_variable(x, callback=callback) + assert self not in x._extra_data + x._extra_data[self] = Tracer(v) + self.xs.append(v) + + return self + + def _new_variable(self, owner, opnode=None, callback=None): + return VariableNode(self, owner, opnode=opnode, callback=callback) + + def _new_opnode(self, inputs, outputs): + inputs = tuple(inputs) + for i in inputs: + assert i is None or isinstance(i, VariableNode) + o = OpNode() + o.inputs = inputs + o.outputs = [] + tracers = [] + for i in outputs: + assert isinstance(i, Tensor) + v = self._new_variable(i, o) + o.outputs.append(weakref.ref(v)) + tracers.append(Tracer(v)) + self.ops.append(weakref.ref(o)) + return o, tracers + + def copy(self): + raise NotImplementedError + + def __enter__(self): + return self + + def __exit__(self, *_): + """clear all resources""" + self._enabled = False + for o in self.ops: + o = o() + if o: + o.clear() + + def __call__(self, ys, dys): + """ Defines Grad(). + + :param ys: outputs of forward operators, e.g., the loss tensor + :type ys: list of Tensor or TensorWrapperBase + :param dys: delta of outputs, physically equivalent to sensitivity of outputs to the loss, + e.g., one for the loss itself + :type dys: list of Tensor or TensorWrapperBase + """ + assert self._enabled + self._enabled = False + + def check_wrapper(): + if isinstance(dys, TensorWrapperBase): + return type(dys) + if isinstance(dys, TensorBase): + return + assert isinstance(dys, (tuple, list)) + for i in dys: + if isinstance(i, TensorWrapperBase): + return type(i) + + Wrapper = check_wrapper() + + def aslist(x): + if isinstance(x, (Tensor, TensorWrapperBase)): + x = [x] + else: + x = list(x) + x = [i.__wrapped__ if isinstance(i, TensorWrapperBase) else i for i in x] + for i in x: + assert isinstance(i, Tensor) + return x + + ys = aslist(ys) + dys = aslist(dys) + assert len(ys) == len(dys) + + # ys is changed to a list of VariableNode which contains more information + # such as OpNode, callback, etc. 
+ ys = [i._extra_data[self].node for i in ys] + + # NOTE: callback is called only if grad is not None + + # the OpNode sequence in backward + op_seq = [] + + # VariableNode -> (i, j), where i is time stamp in backward, j means jth input + last_written_to = {} + + def schedule(): + reached = set(ys) + # i is the time stamp in backward + i = 0 + for o in self.ops[::-1]: + o = o() + if o is None: + continue + + if not o.has_grad_fn(o, reached): + continue + op_seq.append(o) + for j, v in enumerate(o.inputs): + reached.add(v) + last_written_to[v] = i, j + i += 1 + + schedule() + + # VariableNode -> Tensor + cache = {} + + def initialize(): + for y, dy in zip(ys, dys): + cache[y] = dy + if y not in last_written_to and y.callback: + y.callback(y.owner(), dy) + + initialize() + + # NOTE: None is used to mark a node has been consumed + + for seqno, opnode in enumerate(op_seq): + input_nodes = opnode.inputs + output_nodes = [i() for i in opnode.outputs] + backward = opnode.backward + backward_allow_noinput = opnode.backward_allow_noinput + opnode.clear() + + output_grads = [] + for i in output_nodes: + if i is not None: + if i in cache: + assert cache[i] is not None + output_grads.append(cache[i]) + else: + output_grads.append(None) + # read by backward, mark consumed + cache[i] = None + else: + output_grads.append(None) + if ( + any([grad is not None for grad in output_grads]) + or backward_allow_noinput + ): + input_grads = backward(*output_grads) + else: + input_grads = [None] * len(input_nodes) + + assert len(input_nodes) == len(input_grads) + for i, (v, g) in enumerate(zip(input_nodes, input_grads)): + if v is None: + continue + if v in cache: + assert cache[v] + if g is not None: + cache[v] = add(cache[v], g) + elif g is not None: + cache[v] = g + if last_written_to[v] == (seqno, i): + if v.callback: + v.callback( + v.owner(), Wrapper(cache[v]) if Wrapper else cache[v] + ) + if v.opnode is None: + # won't read by backward, mark consumed + cache[v] = None + + for v in cache.values(): + assert v is None + + +class clearable: + __cleared = False + + def __bool__(self): + return not self.__cleared + + def clear(self): + self.__dict__.clear() + self.__cleared = True + + +class OpNode(clearable): + """ OpNode saves all the information to form the computational graph. + """ + + def __init__(self): + self.id = None + self.inputs = None # Could be VariableNode + self.outputs = None # Could be VariableNode + self.backward = None + self.has_grad_fn = None + self.backward_allow_noinput = False + + +class VariableNode(clearable): + """ VariableNode saves OpNode and callback. + FIXME!!! 
Explain manager and owner + """ + + def __init__(self, manager, owner, opnode=None, callback=None): + # manager is Grad type + self.manager = weakref.ref(manager) + # owner is Tensor type + self.owner = weakref.ref(owner) + self.opnode = opnode + self.callback = callback + + +class Tracer(clearable, TensorBase): + def __init__(self, node=None): + """ type(node) is VariableNode + """ + self.node = node + + +@functools.singledispatch +def check_backward_allow_noinput(op: OpDef): + return False + + +@functools.singledispatch +def get_op_has_grad_fn(op: OpDef): + assert 0 + + +@get_op_has_grad_fn.register(OpDef) +def _(op: OpDef): + return default_has_grad_fn + + +@get_op_has_grad_fn.register(Function) +def _(op: Function): + return default_has_grad_fn + + +def default_has_grad_fn(opnode, reached): + for v in opnode.outputs: + if v() in reached: + return True + return False + + +@apply.add +def tracer_apply(op: (OpDef, Function), *args: typing.Optional[Tracer]): + args = tuple(i if isinstance(i, Tracer) else None for i in args) + input_requires_grad = list(map(bool, args)) + if not any(input_requires_grad): + return + + ctx = get_context() + manager = None + assert len(ctx.inputs) == len(args) + for i, j in zip(ctx.inputs, args): + if j: + j = j.node + assert i is j.owner() + if manager is None: + manager = j.manager() + assert manager + else: + assert manager is j.manager() + + if not manager._enabled: + return + + opnode, outputs = manager._new_opnode([i and i.node for i in args], ctx.outputs) + + # register backward method + # tuple of backward functions corresponding to dy / dx_i + # None means y is not a function of x_i + opnode.backward, output_need_grad = builtin_op_utils.builtin_op_get_backward_fn( + op, ctx.inputs, ctx.outputs, input_requires_grad + ) + + assert len(outputs) == len(output_need_grad) + outputs = [x if y else None for (x, y) in zip(outputs, output_need_grad)] + + opnode.backward_allow_noinput = check_backward_allow_noinput(op) + + opnode.has_grad_fn = get_op_has_grad_fn(op) + + return tuple(outputs) + + +@apply.add +def _(op: Const, *_: typing.Optional[Tracer]): + return None diff --git a/imperative/python/megengine/core/ops/__init__.py b/imperative/python/megengine/core/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1207b5d98cd3578bc39e9ce600a1254a434880c8 --- /dev/null +++ b/imperative/python/megengine/core/ops/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/imperative/python/megengine/core/ops/_internal/__init__.py b/imperative/python/megengine/core/ops/_internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1207b5d98cd3578bc39e9ce600a1254a434880c8 --- /dev/null +++ b/imperative/python/megengine/core/ops/_internal/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
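Taken together, grad.py implements a callback-driven backward pass: wrt() tags the leaves, tracer_apply records one OpNode per forward op, and Grad.__call__ replays those OpNodes in reverse. A minimal usage sketch follows; it is hypothetical and not part of the patch, assuming the built package exposes megengine.tensor and that the tensor wrapper supports * and .sum():

```python
import numpy as np

import megengine as mge
from megengine.core.autodiff.grad import Grad

w = mge.tensor(np.random.randn(3).astype("float32"))
x = mge.tensor(np.ones(3, dtype="float32"))

grads = {}
with Grad() as grad:
    # callback receives (owning tensor, gradient) once the grad is final
    grad.wrt(w, callback=lambda t, g: grads.__setitem__("w", g))
    loss = (w * x).sum()
    # one dy per y; a ones tensor is the sensitivity of the scalar loss to itself
    grad(loss, mge.tensor(np.ones((), dtype="float32")))

print(grads["w"])  # d(loss)/d(w), equal to x for this loss
```

Because gradients are delivered through callbacks as soon as last_written_to marks them final, grad() returns nothing; results are captured by side effect.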
diff --git a/imperative/python/megengine/core/ops/_internal/all_ops.py b/imperative/python/megengine/core/ops/_internal/all_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..f1627ee978cbcc459535da8ba1af93821be6e46a --- /dev/null +++ b/imperative/python/megengine/core/ops/_internal/all_ops.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .generated_ops import * +from .misc_ops import * diff --git a/imperative/python/megengine/core/ops/_internal/enum36.py b/imperative/python/megengine/core/ops/_internal/enum36.py new file mode 100644 index 0000000000000000000000000000000000000000..1fb4bb6f424ddc270a35b3412917882929a18517 --- /dev/null +++ b/imperative/python/megengine/core/ops/_internal/enum36.py @@ -0,0 +1,929 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import sys +from functools import reduce +from operator import or_ as _or_ +from types import DynamicClassAttribute, MappingProxyType + +# try _collections first to reduce startup cost +try: + from _collections import OrderedDict +except ImportError: + from collections import OrderedDict + + +__all__ = [ + "EnumMeta", + "Enum", + "IntEnum", + "Flag", + "IntFlag", + "auto", + "unique", +] + + +def _is_descriptor(obj): + """Returns True if obj is a descriptor, False otherwise.""" + return ( + hasattr(obj, "__get__") or hasattr(obj, "__set__") or hasattr(obj, "__delete__") + ) + + +def _is_dunder(name): + """Returns True if a __dunder__ name, False otherwise.""" + return ( + name[:2] == name[-2:] == "__" + and name[2:3] != "_" + and name[-3:-2] != "_" + and len(name) > 4 + ) + + +def _is_sunder(name): + """Returns True if a _sunder_ name, False otherwise.""" + return ( + name[0] == name[-1] == "_" + and name[1:2] != "_" + and name[-2:-1] != "_" + and len(name) > 2 + ) + + +def _make_class_unpicklable(cls): + """Make the given class un-picklable.""" + + def _break_on_call_reduce(self, proto): + raise TypeError("%r cannot be pickled" % self) + + cls.__reduce_ex__ = _break_on_call_reduce + cls.__module__ = "" + + +_auto_null = object() + + +class auto: + """ + Instances are replaced with an appropriate value in Enum class suites. + """ + + value = _auto_null + + +class _EnumDict(dict): + """Track enum member order and ensure member names are not reused. + + EnumMeta will use the names found in self._member_names as the + enumeration member names. + + """ + + def __init__(self): + super().__init__() + self._member_names = [] + self._last_values = [] + + def __setitem__(self, key, value): + """Changes anything not dundered or not a descriptor. + + If an enum member name is used twice, an error is raised; duplicate + values are not checked for. + + Single underscore (sunder) names are reserved. 
+ + """ + if _is_sunder(key): + if key not in ( + "_order_", + "_create_pseudo_member_", + "_generate_next_value_", + "_missing_", + ): + raise ValueError("_names_ are reserved for future Enum use") + if key == "_generate_next_value_": + setattr(self, "_generate_next_value", value) + elif _is_dunder(key): + if key == "__order__": + key = "_order_" + elif key in self._member_names: + # descriptor overwriting an enum? + raise TypeError("Attempted to reuse key: %r" % key) + elif not _is_descriptor(value): + if key in self: + # enum overwriting a descriptor? + raise TypeError("%r already defined as: %r" % (key, self[key])) + if isinstance(value, auto): + if value.value == _auto_null: + value.value = self._generate_next_value( + key, 1, len(self._member_names), self._last_values[:] + ) + value = value.value + self._member_names.append(key) + self._last_values.append(value) + super().__setitem__(key, value) + + +# Dummy value for Enum as EnumMeta explicitly checks for it, but of course +# until EnumMeta finishes running the first time the Enum class doesn't exist. +# This is also why there are checks in EnumMeta like `if Enum is not None` +Enum = None + + +class EnumMeta(type): + """Metaclass for Enum""" + + @classmethod + def __prepare__(metacls, cls, bases): + # create the namespace dict + enum_dict = _EnumDict() + # inherit previous flags and _generate_next_value_ function + member_type, first_enum = metacls._get_mixins_(bases) + if first_enum is not None: + enum_dict["_generate_next_value_"] = getattr( + first_enum, "_generate_next_value_", None + ) + return enum_dict + + def __new__(metacls, cls, bases, classdict): + # an Enum class is final once enumeration items have been defined; it + # cannot be mixed with other types (int, float, etc.) if it has an + # inherited __new__ unless a new __new__ is defined (or the resulting + # class will fail). + member_type, first_enum = metacls._get_mixins_(bases) + __new__, save_new, use_args = metacls._find_new_( + classdict, member_type, first_enum + ) + + # save enum items into separate mapping so they don't get baked into + # the new class + enum_members = {k: classdict[k] for k in classdict._member_names} + for name in classdict._member_names: + del classdict[name] + + # adjust the sunders + _order_ = classdict.pop("_order_", None) + + # check for illegal enum names (any others?) + invalid_names = set(enum_members) & { + "mro", + } + if invalid_names: + raise ValueError( + "Invalid enum member name: {0}".format(",".join(invalid_names)) + ) + + # create a default docstring if one has not been provided + if "__doc__" not in classdict: + classdict["__doc__"] = "An enumeration." + + # create our new Enum type + enum_class = super().__new__(metacls, cls, bases, classdict) + enum_class._member_names_ = [] # names in definition order + enum_class._member_map_ = OrderedDict() # name->value map + enum_class._member_type_ = member_type + + # save attributes from super classes so we know if we can take + # the shortcut of storing members in the class dict + base_attributes = {a for b in enum_class.mro() for a in b.__dict__} + + # Reverse value->name map for hashable values. + enum_class._value2member_map_ = {} + + # If a custom type is mixed into the Enum, and it does not know how + # to pickle itself, pickle.dumps will succeed but pickle.loads will + # fail. Rather than have the error show up later and possibly far + # from the source, sabotage the pickle protocol for this class so + # that pickle.dumps also fails. 
+ # + # However, if the new class implements its own __reduce_ex__, do not + # sabotage -- it's on them to make sure it works correctly. We use + # __reduce_ex__ instead of any of the others as it is preferred by + # pickle over __reduce__, and it handles all pickle protocols. + if "__reduce_ex__" not in classdict: + if member_type is not object: + methods = ( + "__getnewargs_ex__", + "__getnewargs__", + "__reduce_ex__", + "__reduce__", + ) + if not any(m in member_type.__dict__ for m in methods): + _make_class_unpicklable(enum_class) + + # instantiate them, checking for duplicates as we go + # we instantiate first instead of checking for duplicates first in case + # a custom __new__ is doing something funky with the values -- such as + # auto-numbering ;) + for member_name in classdict._member_names: + value = enum_members[member_name] + if not isinstance(value, tuple): + args = (value,) + else: + args = value + if member_type is tuple: # special case for tuple enums + args = (args,) # wrap it one more time + if not use_args: + enum_member = __new__(enum_class) + if not hasattr(enum_member, "_value_"): + enum_member._value_ = value + else: + enum_member = __new__(enum_class, *args) + if not hasattr(enum_member, "_value_"): + if member_type is object: + enum_member._value_ = value + else: + enum_member._value_ = member_type(*args) + value = enum_member._value_ + enum_member._name_ = member_name + enum_member.__objclass__ = enum_class + enum_member.__init__(*args) + # If another member with the same value was already defined, the + # new member becomes an alias to the existing one. + for name, canonical_member in enum_class._member_map_.items(): + if canonical_member._value_ == enum_member._value_: + enum_member = canonical_member + break + else: + # Aliases don't appear in member names (only in __members__). + enum_class._member_names_.append(member_name) + # performance boost for any member that would not shadow + # a DynamicClassAttribute + if member_name not in base_attributes: + setattr(enum_class, member_name, enum_member) + # now add to _member_map_ + enum_class._member_map_[member_name] = enum_member + try: + # This may fail if value is not hashable. We can't add the value + # to the map, and by-value lookups for this value will be + # linear. + enum_class._value2member_map_[value] = enum_member + except TypeError: + pass + + # double check that repr and friends are not the mixin's or various + # things break (such as pickle) + for name in ("__repr__", "__str__", "__format__", "__reduce_ex__"): + class_method = getattr(enum_class, name) + obj_method = getattr(member_type, name, None) + enum_method = getattr(first_enum, name, None) + if obj_method is not None and obj_method is class_method: + setattr(enum_class, name, enum_method) + + # replace any other __new__ with our own (as long as Enum is not None, + # anyway) -- again, this is to support pickle + if Enum is not None: + # if the user defined their own __new__, save it before it gets + # clobbered in case they subclass later + if save_new: + enum_class.__new_member__ = __new__ + enum_class.__new__ = Enum.__new__ + + # py3 support for definition order (helps keep py2/py3 code in sync) + if _order_ is not None: + if isinstance(_order_, str): + _order_ = _order_.replace(",", " ").split() + if _order_ != enum_class._member_names_: + raise TypeError("member order does not match _order_") + + return enum_class + + def __bool__(self): + """ + classes/types should always be True. 
+ """ + return True + + def __call__( + cls, value, names=None, *, module=None, qualname=None, type=None, start=1 + ): + """Either returns an existing member, or creates a new enum class. + + This method is used both when an enum class is given a value to match + to an enumeration member (i.e. Color(3)) and for the functional API + (i.e. Color = Enum('Color', names='RED GREEN BLUE')). + + When used for the functional API: + + `value` will be the name of the new class. + + `names` should be either a string of white-space/comma delimited names + (values will start at `start`), or an iterator/mapping of name, value pairs. + + `module` should be set to the module this class is being created in; + if it is not set, an attempt to find that module will be made, but if + it fails the class will not be picklable. + + `qualname` should be set to the actual location this class can be found + at in its module; by default it is set to the global scope. If this is + not correct, unpickling will fail in some circumstances. + + `type`, if set, will be mixed in as the first base class. + + """ + if names is None: # simple value lookup + return cls.__new__(cls, value) + # otherwise, functional API: we're creating a new Enum type + return cls._create_( + value, names, module=module, qualname=qualname, type=type, start=start + ) + + def __contains__(cls, member): + return isinstance(member, cls) and member._name_ in cls._member_map_ + + def __delattr__(cls, attr): + # nicer error message when someone tries to delete an attribute + # (see issue19025). + if attr in cls._member_map_: + raise AttributeError("%s: cannot delete Enum member." % cls.__name__) + super().__delattr__(attr) + + def __dir__(self): + return [ + "__class__", + "__doc__", + "__members__", + "__module__", + ] + self._member_names_ + + def __getattr__(cls, name): + """Return the enum member matching `name` + + We use __getattr__ instead of descriptors or inserting into the enum + class' __dict__ in order to support `name` and `value` being both + properties for enum members (which live in the class' __dict__) and + enum members themselves. + + """ + if _is_dunder(name): + raise AttributeError(name) + try: + return cls._member_map_[name] + except KeyError: + raise AttributeError(name) from None + + def __getitem__(cls, name): + return cls._member_map_[name] + + def __iter__(cls): + return (cls._member_map_[name] for name in cls._member_names_) + + def __len__(cls): + return len(cls._member_names_) + + @property + def __members__(cls): + """Returns a mapping of member name->value. + + This mapping lists all enum members, including aliases. Note that this + is a read-only view of the internal mapping. + + """ + return MappingProxyType(cls._member_map_) + + def __repr__(cls): + return "" % cls.__name__ + + def __reversed__(cls): + return (cls._member_map_[name] for name in reversed(cls._member_names_)) + + def __setattr__(cls, name, value): + """Block attempts to reassign Enum members. + + A simple assignment to the class namespace only changes one of the + several possible ways to get an Enum member from the Enum class, + resulting in an inconsistent Enumeration. + + """ + member_map = cls.__dict__.get("_member_map_", {}) + if name in member_map: + raise AttributeError("Cannot reassign members.") + super().__setattr__(name, value) + + def _create_( + cls, class_name, names=None, *, module=None, qualname=None, type=None, start=1 + ): + """Convenience method to create a new Enum class. 
+ + `names` can be: + + * A string containing member names, separated either with spaces or + commas. Values are incremented by 1 from `start`. + * An iterable of member names. Values are incremented by 1 from `start`. + * An iterable of (member name, value) pairs. + * A mapping of member name -> value pairs. + + """ + metacls = cls.__class__ + bases = (cls,) if type is None else (type, cls) + _, first_enum = cls._get_mixins_(bases) + classdict = metacls.__prepare__(class_name, bases) + + # special processing needed for names? + if isinstance(names, str): + names = names.replace(",", " ").split() + if isinstance(names, (tuple, list)) and names and isinstance(names[0], str): + original_names, names = names, [] + last_values = [] + for count, name in enumerate(original_names): + value = first_enum._generate_next_value_( + name, start, count, last_values[:] + ) + last_values.append(value) + names.append((name, value)) + + # Here, names is either an iterable of (name, value) or a mapping. + for item in names: + if isinstance(item, str): + member_name, member_value = item, names[item] + else: + member_name, member_value = item + classdict[member_name] = member_value + enum_class = metacls.__new__(metacls, class_name, bases, classdict) + + # TODO: replace the frame hack if a blessed way to know the calling + # module is ever developed + if module is None: + try: + module = sys._getframe(2).f_globals["__name__"] + except (AttributeError, ValueError) as exc: + pass + if module is None: + _make_class_unpicklable(enum_class) + else: + enum_class.__module__ = module + if qualname is not None: + enum_class.__qualname__ = qualname + + return enum_class + + @staticmethod + def _get_mixins_(bases): + """Returns the type for creating enum members, and the first inherited + enum class. + + bases: the tuple of bases that was given to __new__ + + """ + if not bases: + return object, Enum + + # double check that we are not subclassing a class with existing + # enumeration members; while we're at it, see if any other data + # type has been mixed in so we can use the correct __new__ + member_type = first_enum = None + for base in bases: + if base is not Enum and issubclass(base, Enum) and base._member_names_: + raise TypeError("Cannot extend enumerations") + # base is now the last base in bases + if not issubclass(base, Enum): + raise TypeError( + "new enumerations must be created as " + "`ClassName([mixin_type,] enum_type)`" + ) + + # get correct mix-in type (either mix-in type of Enum subclass, or + # first base if last base is Enum) + if not issubclass(bases[0], Enum): + member_type = bases[0] # first data type + first_enum = bases[-1] # enum type + else: + for base in bases[0].__mro__: + # most common: (IntEnum, int, Enum, object) + # possible: (, , + # , , + # ) + if issubclass(base, Enum): + if first_enum is None: + first_enum = base + else: + if member_type is None: + member_type = base + + return member_type, first_enum + + @staticmethod + def _find_new_(classdict, member_type, first_enum): + """Returns the __new__ to be used for creating the enum members. + + classdict: the class dictionary given to __new__ + member_type: the data type whose __new__ will be used by default + first_enum: enumeration to check for an overriding __new__ + + """ + # now find the correct __new__, checking to see of one was defined + # by the user; also check earlier enum classes in case a __new__ was + # saved as __new_member__ + __new__ = classdict.get("__new__", None) + + # should __new__ be saved as __new_member__ later? 
+ save_new = __new__ is not None + + if __new__ is None: + # check all possibles for __new_member__ before falling back to + # __new__ + for method in ("__new_member__", "__new__"): + for possible in (member_type, first_enum): + target = getattr(possible, method, None) + if target not in { + None, + None.__new__, + object.__new__, + Enum.__new__, + }: + __new__ = target + break + if __new__ is not None: + break + else: + __new__ = object.__new__ + + # if a non-object.__new__ is used then whatever value/tuple was + # assigned to the enum member name will be passed to __new__ and to the + # new enum member's __init__ + if __new__ is object.__new__: + use_args = False + else: + use_args = True + + return __new__, save_new, use_args + + +class Enum(metaclass=EnumMeta): + """Generic enumeration. + + Derive from this class to define new enumerations. + + """ + + def __new__(cls, value): + # all enum instances are actually created during class construction + # without calling this method; this method is called by the metaclass' + # __call__ (i.e. Color(3) ), and by pickle + if type(value) is cls: + # For lookups like Color(Color.RED) + return value + # by-value search for a matching enum member + # see if it's in the reverse mapping (for hashable values) + try: + if value in cls._value2member_map_: + return cls._value2member_map_[value] + except TypeError: + # not there, now do long search -- O(n) behavior + for member in cls._member_map_.values(): + if member._value_ == value: + return member + # still not found -- try _missing_ hook + return cls._missing_(value) + + def _generate_next_value_(name, start, count, last_values): + for last_value in reversed(last_values): + try: + return last_value + 1 + except TypeError: + pass + else: + return start + + @classmethod + def _missing_(cls, value): + raise ValueError("%r is not a valid %s" % (value, cls.__name__)) + + def __repr__(self): + return "<%s.%s: %r>" % (self.__class__.__name__, self._name_, self._value_) + + def __str__(self): + return "%s.%s" % (self.__class__.__name__, self._name_) + + def __dir__(self): + added_behavior = [ + m + for cls in self.__class__.mro() + for m in cls.__dict__ + if m[0] != "_" and m not in self._member_map_ + ] + return ["__class__", "__doc__", "__module__"] + added_behavior + + def __format__(self, format_spec): + # mixed-in Enums should use the mixed-in type's __format__, otherwise + # we can get strange results with the Enum name showing up instead of + # the value + + # pure Enum branch + if self._member_type_ is object: + cls = str + val = str(self) + # mix-in branch + else: + cls = self._member_type_ + val = self._value_ + return cls.__format__(val, format_spec) + + def __hash__(self): + return hash(self._name_) + + def __reduce_ex__(self, proto): + return self.__class__, (self._value_,) + + # DynamicClassAttribute is used to provide access to the `name` and + # `value` properties of enum members while keeping some measure of + # protection from modification, while still allowing for an enumeration + # to have members named `name` and `value`. This works because enumeration + # members are not set directly on the enum class -- __getattr__ is + # used to look them up. 
+ + @DynamicClassAttribute + def name(self): + """The name of the Enum member.""" + return self._name_ + + @DynamicClassAttribute + def value(self): + """The value of the Enum member.""" + return self._value_ + + @classmethod + def _convert(cls, name, module, filter, source=None): + """ + Create a new Enum subclass that replaces a collection of global constants + """ + # convert all constants from source (or module) that pass filter() to + # a new Enum called name, and export the enum and its members back to + # module; + # also, replace the __reduce_ex__ method so unpickling works in + # previous Python versions + module_globals = vars(sys.modules[module]) + if source: + source = vars(source) + else: + source = module_globals + # We use an OrderedDict of sorted source keys so that the + # _value2member_map is populated in the same order every time + # for a consistent reverse mapping of number to name when there + # are multiple names for the same number rather than varying + # between runs due to hash randomization of the module dictionary. + members = [(name, source[name]) for name in source.keys() if filter(name)] + try: + # sort by value + members.sort(key=lambda t: (t[1], t[0])) + except TypeError: + # unless some values aren't comparable, in which case sort by name + members.sort(key=lambda t: t[0]) + cls = cls(name, members, module=module) + cls.__reduce_ex__ = _reduce_ex_by_name + module_globals.update(cls.__members__) + module_globals[name] = cls + return cls + + +class IntEnum(int, Enum): + """Enum where members are also (and must be) ints""" + + +def _reduce_ex_by_name(self, proto): + return self.name + + +class Flag(Enum): + """Support for flags""" + + def _generate_next_value_(name, start, count, last_values): + """ + Generate the next value when not given. + + name: the name of the member + start: the initital start value or None + count: the number of existing members + last_value: the last value assigned or None + """ + if not count: + return start if start is not None else 1 + for last_value in reversed(last_values): + try: + high_bit = _high_bit(last_value) + break + except Exception: + raise TypeError("Invalid Flag value: %r" % last_value) from None + return 2 ** (high_bit + 1) + + @classmethod + def _missing_(cls, value): + original_value = value + if value < 0: + value = ~value + possible_member = cls._create_pseudo_member_(value) + if original_value < 0: + possible_member = ~possible_member + return possible_member + + @classmethod + def _create_pseudo_member_(cls, value): + """ + Create a composite member iff value contains only members. 
+ """ + pseudo_member = cls._value2member_map_.get(value, None) + if pseudo_member is None: + # verify all bits are accounted for + _, extra_flags = _decompose(cls, value) + if extra_flags: + raise ValueError("%r is not a valid %s" % (value, cls.__name__)) + # construct a singleton enum pseudo-member + pseudo_member = object.__new__(cls) + pseudo_member._name_ = None + pseudo_member._value_ = value + # use setdefault in case another thread already created a composite + # with this value + pseudo_member = cls._value2member_map_.setdefault(value, pseudo_member) + return pseudo_member + + def __contains__(self, other): + if not isinstance(other, self.__class__): + return NotImplemented + return other._value_ & self._value_ == other._value_ + + def __repr__(self): + cls = self.__class__ + if self._name_ is not None: + return "<%s.%s: %r>" % (cls.__name__, self._name_, self._value_) + members, uncovered = _decompose(cls, self._value_) + return "<%s.%s: %r>" % ( + cls.__name__, + "|".join([str(m._name_ or m._value_) for m in members]), + self._value_, + ) + + def __str__(self): + cls = self.__class__ + if self._name_ is not None: + return "%s.%s" % (cls.__name__, self._name_) + members, uncovered = _decompose(cls, self._value_) + if len(members) == 1 and members[0]._name_ is None: + return "%s.%r" % (cls.__name__, members[0]._value_) + else: + return "%s.%s" % ( + cls.__name__, + "|".join([str(m._name_ or m._value_) for m in members]), + ) + + def __bool__(self): + return bool(self._value_) + + def __or__(self, other): + if not isinstance(other, self.__class__): + return NotImplemented + return self.__class__(self._value_ | other._value_) + + def __and__(self, other): + if not isinstance(other, self.__class__): + return NotImplemented + return self.__class__(self._value_ & other._value_) + + def __xor__(self, other): + if not isinstance(other, self.__class__): + return NotImplemented + return self.__class__(self._value_ ^ other._value_) + + def __invert__(self): + members, uncovered = _decompose(self.__class__, self._value_) + inverted_members = [ + m + for m in self.__class__ + if m not in members and not m._value_ & self._value_ + ] + inverted = reduce(_or_, inverted_members, self.__class__(0)) + return self.__class__(inverted) + + +class IntFlag(int, Flag): + """Support for integer-based Flags""" + + @classmethod + def _missing_(cls, value): + if not isinstance(value, int): + raise ValueError("%r is not a valid %s" % (value, cls.__name__)) + new_member = cls._create_pseudo_member_(value) + return new_member + + @classmethod + def _create_pseudo_member_(cls, value): + pseudo_member = cls._value2member_map_.get(value, None) + if pseudo_member is None: + need_to_create = [value] + # get unaccounted for bits + _, extra_flags = _decompose(cls, value) + # timer = 10 + while extra_flags: + # timer -= 1 + bit = _high_bit(extra_flags) + flag_value = 2 ** bit + if ( + flag_value not in cls._value2member_map_ + and flag_value not in need_to_create + ): + need_to_create.append(flag_value) + if extra_flags == -flag_value: + extra_flags = 0 + else: + extra_flags ^= flag_value + for value in reversed(need_to_create): + # construct singleton pseudo-members + pseudo_member = int.__new__(cls, value) + pseudo_member._name_ = None + pseudo_member._value_ = value + # use setdefault in case another thread already created a composite + # with this value + pseudo_member = cls._value2member_map_.setdefault(value, pseudo_member) + return pseudo_member + + def __or__(self, other): + if not isinstance(other, 
(self.__class__, int)): + return NotImplemented + result = self.__class__(self._value_ | self.__class__(other)._value_) + return result + + def __and__(self, other): + if not isinstance(other, (self.__class__, int)): + return NotImplemented + return self.__class__(self._value_ & self.__class__(other)._value_) + + def __xor__(self, other): + if not isinstance(other, (self.__class__, int)): + return NotImplemented + return self.__class__(self._value_ ^ self.__class__(other)._value_) + + __ror__ = __or__ + __rand__ = __and__ + __rxor__ = __xor__ + + def __invert__(self): + result = self.__class__(~self._value_) + return result + + +def _high_bit(value): + """returns index of highest bit, or -1 if value is zero or negative""" + return value.bit_length() - 1 + + +def unique(enumeration): + """Class decorator for enumerations ensuring unique member values.""" + duplicates = [] + for name, member in enumeration.__members__.items(): + if name != member.name: + duplicates.append((name, member.name)) + if duplicates: + alias_details = ", ".join( + ["%s -> %s" % (alias, name) for (alias, name) in duplicates] + ) + raise ValueError( + "duplicate values found in %r: %s" % (enumeration, alias_details) + ) + return enumeration + + +def _decompose(flag, value): + """Extract all members from the value.""" + # _decompose is only called if the value is not named + not_covered = value + negative = value < 0 + # issue29167: wrap accesses to _value2member_map_ in a list to avoid race + # conditions between iterating over it and having more psuedo- + # members added to it + if negative: + # only check for named flags + flags_to_check = [ + (m, v) + for v, m in list(flag._value2member_map_.items()) + if m.name is not None + ] + else: + # check for named flags and powers-of-two flags + flags_to_check = [ + (m, v) + for v, m in list(flag._value2member_map_.items()) + if m.name is not None or _power_of_two(v) + ] + members = [] + for member, member_value in flags_to_check: + if member_value and member_value & value == member_value: + members.append(member) + not_covered &= ~member_value + if not members and value in flag._value2member_map_: + members.append(flag._value2member_map_[value]) + members.sort(key=lambda m: m._value_, reverse=True) + if len(members) > 1 and members[0].value == value: + # we have the breakdown, don't need the value member itself + members.pop(0) + return members, not_covered + + +def _power_of_two(value): + if value < 1: + return False + return value == 2 ** _high_bit(value) diff --git a/imperative/python/megengine/core/ops/_internal/helper.py b/imperative/python/megengine/core/ops/_internal/helper.py new file mode 100644 index 0000000000000000000000000000000000000000..52af3aa0a6499621bbf2ee2a2b329cd32ffaf6bd --- /dev/null +++ b/imperative/python/megengine/core/ops/_internal/helper.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import warnings + +from ..._imperative_rt.ops import OprAttr +from . 
import param_defs + + +def make_param(param, ptype, kwargs): + if param is not None: + if isinstance(param, ptype): + return param + + param = [param] + assert len(param) == len( + ptype.__slots__ + ), "{} needs {} params, but {} are provided".format( + ptype, len(ptype.__slots__), len(param) + ) + return ptype(*param) + + ckw = {} + for i in ptype.__slots__: + val = kwargs.pop(i, ckw) + if val is not ckw: + ckw[i] = val + return ptype(**ckw) + + +class PodOpVisitor: + __name2subclass = {} + __c = None + + name = None + param_names = [] + config = None + + def __init__(self, config, **params): + self.config = config + assert set(params) == set(self.param_names) + self.__dict__.update(params) + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) # python 3.5 does not have this + name = cls.name + if name in cls.__name2subclass: + if not issubclass(cls, cls.__name2subclass[name]): + warnings.warn("Multiple subclasses for bultin op: %s" % name) + cls.__name2subclass[name] = cls + + def to_c(self): + if self.__c: + return self.__c + op = OprAttr() + op.type = self.name + if self.config is not None: + op.config = self.config + # first 4 bytes is TAG, has to remove them currently + op.param = b"".join(self.__dict__[k].serialize()[4:] for k in self.param_names) + self.__c = op + return op + + def __eq__(self, rhs): + return self.to_c() == rhs.to_c() + + def __repr__(self): + name = self.__class__.__name__ + + if self.__c: + return "{}()".format(name) + + kwargs = {} + for i in self.param_names: + p = self.__dict__[i] + if isinstance(p, param_defs._ParamDefBase): + for k in p.__slots__: + v = getattr(p, k) + if isinstance(v, param_defs._EnumBase): + v = v.name + kwargs[k] = repr(v) + else: + kwargs[i] = repr(p) + if self.config: + if len(self.config.comp_node_arr) == 1: + kwargs["device"] = "'%s'" % self.config.comp_node + return "{}({})".format( + name, ", ".join("{}={}".format(k, v) for k, v in kwargs.items()) + ) diff --git a/imperative/python/megengine/core/ops/_internal/misc_ops.py b/imperative/python/megengine/core/ops/_internal/misc_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..e02ddee95c8a693df7f39cbc492f1152ebd27bcd --- /dev/null +++ b/imperative/python/megengine/core/ops/_internal/misc_ops.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import collections +import ctypes + +from ..._imperative_rt import OperatorNodeConfig as Config +from . 
import param_defs +from .helper import PodOpVisitor, make_param + +__all__ = ["ConvolutionBackwardData", "Dimshuffle", "Reshape", "AxisAddRemove"] + + +class TensorShape: + MAX_NDIM = 7 + + +class ConvolutionBackwardData(PodOpVisitor): + param_names = ( + "param", + "execution_polity", + ) + name = "ConvolutionBackwardDataV1" + + def __init__( + self, + *, + param=None, + execution_polity=None, + name=None, + comp_node=None, + config=None, + dtype=None, + **kwargs + ): + config = config or Config() + if name: + config.name = name + if comp_node: + config.comp_node = comp_node + if dtype: + config.dtype = dtype + self.config = config + + self.param = make_param(param, param_defs.Convolution, kwargs) + self.execution_polity = make_param( + execution_polity, param_defs.ExecutionPolicy, kwargs + ) + assert not kwargs, "extra kwargs: {}".format(kwargs) + + +class Dimshuffle(PodOpVisitor): + name = "Dimshuffle" + param_names = ("pattern",) + + class Pattern(ctypes.Structure): + Pattern_Array = ctypes.c_int32 * TensorShape.MAX_NDIM + _fields_ = [ + ("length", ctypes.c_uint32), + ("pattern", Pattern_Array), + ("ndim", ctypes.c_uint32), + ] + + def serialize(self): + return bytes(ctypes.c_uint32(0)) + bytes(self) + + def __init__(self, pattern, ndim=0): + assert isinstance(pattern, collections.Iterable) + assert len(pattern) <= TensorShape.MAX_NDIM + pattern_array = Dimshuffle.Pattern.Pattern_Array() + for idx, v in enumerate(pattern): + pattern_array[idx] = ctypes.c_int32(-1 if v == "x" else int(v)) + self.pattern = Dimshuffle.Pattern(len(pattern), pattern_array, ndim) + + +class Reshape(PodOpVisitor): + name = "ReshapeV1" + param_names = ("unspec_axis",) + + def __init__(self, unspec_axis=None): + if unspec_axis is None: + self.unspec_axis = param_defs.OptionalAxisV1() + else: + self.unspec_axis = param_defs.OptionalAxisV1(unspec_axis) + + +class AxisNum(ctypes.Structure): + _fields_ = [ + ("m_num", ctypes.c_int), + ] + + +class AxisDesc(ctypes.Structure): + class Method(ctypes.c_int): + ADD_1 = 0 + REMOVE = 1 + + _fields_ = [ + ("method", Method), + ("axis", AxisNum), + ] + + @classmethod + def make_add(cls, axis): + return cls(cls.Method.ADD_1, AxisNum(axis)) + + @classmethod + def make_remove(cls, axis): + return cls(cls.Method.REMOVE, AxisNum(axis)) + + +class AxisAddRemove(PodOpVisitor): + name = "AxisAddRemove" + param_names = ("param",) + + AxisDesc = AxisDesc + + class Param(ctypes.Structure): + MAX_DESC_SIZE = TensorShape.MAX_NDIM * 2 + + _fields_ = [("nr_desc", ctypes.c_uint32), ("desc", AxisDesc * MAX_DESC_SIZE)] + + def __init__(self, *args): + super().__init__() + self.nr_desc = len(args) + for i, a in enumerate(args): + self.desc[i] = a + + def serialize(self): + return bytes(ctypes.c_uint32(0)) + bytes(self) + + def __init__(self, param): + assert isinstance(param, self.Param) + self.param = param + + +del AxisDesc + + +class IndexingOpBase(PodOpVisitor): + param_names = ("index_desc",) + + class IndexDescMaskDump(ctypes.Structure): + class Item(ctypes.Structure): + _fields_ = [ + ("axis", ctypes.c_int8), + ("begin", ctypes.c_bool), + ("end", ctypes.c_bool), + ("step", ctypes.c_bool), + ("idx", ctypes.c_bool), + ] + + Item_Array = Item * TensorShape.MAX_NDIM + + _fields_ = [("nr_item", ctypes.c_uint8), ("items", Item_Array)] + + def serialize(self): + return bytes(ctypes.c_uint32(0)) + bytes(self) + + def __init__(self, items): + nr_item = len(items) + assert nr_item <= TensorShape.MAX_NDIM + item_array = IndexingOpBase.IndexDescMaskDump.Item_Array() + for idx, item in 
enumerate(items): + assert isinstance(item, (tuple, list)) and len(item) == 5 + item_array[idx] = IndexingOpBase.IndexDescMaskDump.Item(*item) + self.index_desc = IndexingOpBase.IndexDescMaskDump(nr_item, item_array) + + +def _gen_indexing_defs(*names): + for name in names: + globals()[name] = type(name, (IndexingOpBase,), dict(name=name)) + __all__.append(name) + + +_gen_indexing_defs( + "Subtensor", + "SetSubtensor", + "IncrSubtensor", + "IndexingMultiAxisVec", + "IndexingSetMultiAxisVec", + "IndexingIncrMultiAxisVec", + "MeshIndexing", + "IncrMeshIndexing", + "SetMeshIndexing", + "BatchedMeshIndexing", + "BatchedIncrMeshIndexing", + "BatchedSetMeshIndexing", +) diff --git a/imperative/python/megengine/core/ops/builtin/__init__.py b/imperative/python/megengine/core/ops/builtin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4656cbd2890bed2a80e50b54470f1c5f63357267 --- /dev/null +++ b/imperative/python/megengine/core/ops/builtin/__init__.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import warnings +from typing import Union + +from ..._imperative_rt import OpDef, ops +from ...tensor.core import OpBase, TensorBase, TensorWrapperBase, apply +from .._internal import all_ops +from .._internal.helper import PodOpVisitor + +# register OpDef as a "virtual subclass" of OpBase, so any of registered +# apply(OpBase, ...) rules could work well on OpDef +OpBase.register(OpDef) + +# forward to apply(OpDef, ...) +@apply.add +def _(op: PodOpVisitor, *args: Union[TensorBase, TensorWrapperBase]): + return apply(op.to_c(), *args) + + +__all__ = ["OpDef", "PodOpVisitor"] + +for k, v in all_ops.__dict__.items(): + if isinstance(v, type) and issubclass(v, PodOpVisitor): + globals()[k] = v + __all__.append(k) + +for k, v in ops.__dict__.items(): + if isinstance(v, type) and issubclass(v, OpDef): + globals()[k] = v + __all__.append(k) diff --git a/imperative/python/megengine/core/ops/special.py b/imperative/python/megengine/core/ops/special.py new file mode 100644 index 0000000000000000000000000000000000000000..e427c8f592bd07ce6ec4ee248137f097fe890b00 --- /dev/null +++ b/imperative/python/megengine/core/ops/special.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
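For reference, the `_gen_indexing_defs` helper above relies on nothing more than `type()` to stamp out one `IndexingOpBase` subclass per op name. A minimal standalone sketch of that pattern (the `Base` class and names here are placeholders, not MegEngine API):

```python
# Sketch of the dynamic-class pattern used by _gen_indexing_defs: each op
# name becomes a subclass created with type(). Base stands in for
# IndexingOpBase; nothing here imports MegEngine.
class Base:
    name = None

    def __repr__(self):
        return "{}()".format(self.name)


def gen_defs(namespace, *names):
    for n in names:
        # type(name, bases, attrs) builds the class object at runtime
        namespace[n] = type(n, (Base,), dict(name=n))


ns = {}
gen_defs(ns, "Subtensor", "SetSubtensor")
assert ns["Subtensor"]().name == "Subtensor"
print(ns["Subtensor"]())  # Subtensor()
```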
+from ..tensor.core import OpBase, TensorBase, apply + + +class Const(OpBase): + def __init__(self, value=None, *, dtype=None, device=None): + self.value = value + self.dtype = dtype + self.device = device diff --git a/imperative/python/megengine/core/tensor/__init__.py b/imperative/python/megengine/core/tensor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e008c110f3e4280d7ac40b666c80a6b38f89a940 --- /dev/null +++ b/imperative/python/megengine/core/tensor/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .tensor_wrapper import TensorWrapper as Tensor diff --git a/imperative/python/megengine/core/tensor/core.py b/imperative/python/megengine/core/tensor/core.py new file mode 100644 index 0000000000000000000000000000000000000000..3a09f5246fc7acf68670a68b9fb1d06dafe7feb5 --- /dev/null +++ b/imperative/python/megengine/core/tensor/core.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import collections +import functools +import inspect +import sys +import typing +from abc import ABC + +import multipledispatch + + +class OpBase(ABC): + def __call__(self, *args): + return apply(self, *args) + + +class TensorBase: + pass + + +class TensorWrapperBase: + pass + + +class Dispatcher(multipledispatch.Dispatcher): + def add(self, f, g=None): + if g is None: + super().add(get_signature(f), f) + else: + super().add(f, g) + + return f + + def __get__(self, instance, owner=None): + if instance is not None: + return self + return functools.partial(self, instance) + + +if sys.version_info < (3, 6): + + def parse_union(ann): + if type(ann) is not typing.UnionMeta: + return + return ann.__union_params__ + + +elif sys.version_info < (3, 7): + + def parse_union(ann): + if type(ann) is not typing._Union: + return + return ann.__args__ + + +elif sys.version_info < (3, 8): + + def parse_union(ann): + if type(ann) is not typing._GenericAlias: + if type(ann) is not typing.Union: + return + else: + if ann.__origin__ is not typing.Union: + return + return ann.__args__ + + +else: + + def parse_union(ann): + if typing.get_origin(ann) is not typing.Union: + return + return typing.get_args(ann) + + +def get_signature(function, op_type=None): + sig = inspect.signature(function) + types = [] + for p in sig.parameters.values(): + ann = p.annotation + ann = parse_union(ann) or ann + if p.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ): + types.append(ann) + if p.kind == inspect.Parameter.VAR_POSITIONAL: + types.append([ann]) + return tuple(types) + + +apply = Dispatcher("apply") + +OpBase.apply = apply + + +@apply.add +def _(op: OpBase, *args: TensorBase): + raise NotImplementedError + + +@apply.add +def _(op: OpBase, *args: TensorWrapperBase): + assert args + Wrapper = type(args[0]) + outputs = apply(op, *(i.__wrapped__ for i in 
args)) + assert isinstance(outputs, tuple) + return tuple(map(Wrapper, outputs)) diff --git a/imperative/python/megengine/core/tensor/dtype.py b/imperative/python/megengine/core/tensor/dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..85c22bb7a7258b170cd90c700363e18bce6170f6 --- /dev/null +++ b/imperative/python/megengine/core/tensor/dtype.py @@ -0,0 +1,289 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +import collections +from typing import Union + +import numpy as np + +# normal dtype related +from .._imperative_rt import bfloat16, intb1, intb2, intb4 + + +def is_lowbit(dtype): + return (dtype is intb1) or (dtype is intb2) or (dtype is intb4) + + +def is_bfloat16(dtype): + return dtype is bfloat16 + + +# quantization dtype related +_QuantDtypeMetadata = collections.namedtuple( + "QuantDtypeMetadata", ["name", "np_dtype_str", "is_unsigned", "qmin", "qmax",] +) + +_metadata_dict = { + "quint8": _QuantDtypeMetadata("Quantized8Asymm", "uint8", True, 0, 255), + "qint8": _QuantDtypeMetadata("QuantizedS8", "int8", False, -128, 127), + "quint4": _QuantDtypeMetadata("Quantized4Asymm", "uint8", True, 0, 15), + "qint4": _QuantDtypeMetadata("QuantizedS4", "int8", False, -8, 7), + "qint32": _QuantDtypeMetadata( + "QuantizedS32", "int32", False, -(2 ** 31), 2 ** 31 - 1, + ), + # NOTE: int2 is not supported for model dump yet + "quint2": _QuantDtypeMetadata(None, "uint8", True, 0, 3), + "qint2": _QuantDtypeMetadata(None, "int8", False, -2, 1), +} + + +def is_quantize(dtype): + return ( + hasattr(dtype, "metadata") + and dtype.metadata is not None + and "mgb_dtype" in dtype.metadata + ) + + +def get_scale(dtype): + assert is_quantize(dtype) + return dtype.metadata["mgb_dtype"]["scale"] + + +def get_zero_point(dtype): + assert is_quantize(dtype) + metadata = dtype.metadata["mgb_dtype"] + assert metadata["name"] in ("Quantized8Asymm", "Quantized4Asymm") + return metadata["zero_point"] + + +def _check_zero_point(zp: int, dtype_str: str): + qmin = _metadata_dict[dtype_str].qmin + qmax = _metadata_dict[dtype_str].qmax + if zp < qmin or zp > qmax: + raise ValueError( + "zero_point should be within [{}, {}] for {}".format(qmin, qmax, dtype_str) + ) + + +def get_quantized_dtype(dtype_str: str, scale: float, zp: Union[int, None]): + r""" + Get quantized dtype with metadata attribute according to _metadata_dict. + + Note that unsigned dtype must have ``zero_point`` and signed dtype must + not have ``zero_point``, to be consitent with tensor generated by calling + compiled function from `CompGraph.compile(inputs, outspec)`. 
+
+    :param dtype_str: a string indicating which dtype to return
+    :param scale: a number for scale to store in dtype's metadata
+    :param zp: a number for zero_point to store in dtype's metadata
+    """
+    metadata = _metadata_dict[dtype_str]
+    np_dtype_str = metadata.np_dtype_str
+    is_unsigned = metadata.is_unsigned
+    if is_unsigned:
+        if zp is None or int(zp) != zp:
+            raise ValueError("zero_point should be an integer")
+        zp = int(zp)
+        _check_zero_point(zp, dtype_str)
+        return np.dtype(
+            np_dtype_str,
+            metadata={
+                "mgb_dtype": {
+                    "name": metadata.name,
+                    "scale": float(scale),
+                    "zero_point": zp,
+                }
+            },
+        )
+    else:
+        return np.dtype(
+            np_dtype_str,
+            metadata={"mgb_dtype": {"name": metadata.name, "scale": float(scale)}},
+        )
+
+
+def quint8(scale, zero_point):
+    """
+    Construct a quantized unsigned int8 data type with ``scale`` (float) and
+    ``zero_point`` (uint8). The real value represented by a quint8 data type is
+    float_val = scale * (uint8_val - zero_point)
+    """
+    return get_quantized_dtype("quint8", scale, zero_point)
+
+
+def qint8(scale):
+    """
+    Construct a quantized int8 data type with ``scale`` (float). The real value
+    represented by a qint8 data type is float_val = scale * int8_val
+    """
+    return get_quantized_dtype("qint8", scale, None)
+
+
+def qint32(scale):
+    """
+    Construct a quantized int32 data type with ``scale`` (float). The real value
+    represented by a qint32 data type is float_val = scale * int32_val
+    """
+    return get_quantized_dtype("qint32", scale, None)
+
+
+def quint4(scale, zero_point):
+    """
+    Construct a quantized unsigned int4 data type with ``scale`` (float) and
+    ``zero_point`` (uint8). The real value represented by a quint4 data type is
+    float_val = scale * (uint4_val - zero_point)
+    """
+    return get_quantized_dtype("quint4", scale, zero_point)
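The carrier for all of this is NumPy's dtype `metadata` mapping, which `get_quantized_dtype` attaches at construction time. A short self-contained sketch of the mechanism, with hypothetical scale and zero-point values:

```python
import numpy as np

# NumPy dtypes can carry an immutable `metadata` mapping; this is the
# mechanism get_quantized_dtype relies on to tag a plain uint8/int8 dtype
# with quantization parameters. Values below are illustrative only.
dt = np.dtype(
    "uint8",
    metadata={"mgb_dtype": {"name": "Quantized8Asymm",
                            "scale": 0.1,
                            "zero_point": 128}},
)
assert dt.metadata["mgb_dtype"]["scale"] == 0.1

# the real value represented by a raw value q is scale * (q - zero_point)
q = np.uint8(138)
print(0.1 * (int(q) - 128))  # 1.0
```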
+def qint4(scale):
+    """
+    Construct a quantized int4 data type with ``scale`` (float). The real value
+    represented by a qint4 data type is float_val = scale * int4_val
+    """
+    return get_quantized_dtype("qint4", scale, None)
+
+
+def _convert_to_quantized_dtype(arr: np.ndarray, dtype: np.dtype, dtype_str: str):
+    metadata = _metadata_dict[dtype_str]
+    arr_metadata = dtype.metadata["mgb_dtype"]
+    if not isinstance(arr, np.ndarray):
+        raise ValueError("arr parameter should be instance of np.ndarray")
+    if not is_quantize(dtype) or arr_metadata["name"] != metadata.name:
+        raise ValueError("dtype parameter should be a {} dtype".format(dtype_str))
+    is_unsigned = metadata.is_unsigned
+    if is_unsigned:
+        scale, zp = (
+            arr_metadata["scale"],
+            arr_metadata["zero_point"],
+        )
+        return (
+            (np.round(arr / scale) + zp)
+            .clip(metadata.qmin, metadata.qmax)
+            .astype(dtype)
+        )
+    else:
+        # don't try to combine this with the is_unsigned branch; see
+        # ``get_quantized_dtype``
+        scale = arr_metadata["scale"]
+        return np.round(arr / scale).clip(metadata.qmin, metadata.qmax).astype(dtype)
+
+
+def _convert_from_quantized_dtype(arr: np.ndarray, dtype_str: str):
+    metadata = _metadata_dict[dtype_str]
+    arr_metadata = arr.dtype.metadata["mgb_dtype"]
+    if not isinstance(arr, np.ndarray):
+        raise ValueError("arr parameter should be instance of np.ndarray")
+    if not is_quantize(arr.dtype) or arr_metadata["name"] != metadata.name:
+        raise ValueError("arr's dtype should be a {} dtype".format(dtype_str))
+    is_unsigned = metadata.is_unsigned
+    if is_unsigned:
+        scale, zp = (
+            arr_metadata["scale"],
+            arr_metadata["zero_point"],
+        )
+        return (arr.astype(np.float32) - zp) * scale
+    else:
+        # don't try to combine this with the is_unsigned branch; see
+        # ``get_quantized_dtype``
+        scale = arr_metadata["scale"]
+        return (arr.astype(np.float32)) * scale
+
+
+def convert_to_quint8(arr: np.ndarray, q: np.dtype):
+    """
+    Quantize a float NumPy ndarray into a quint8 one with specified params.
+
+    :param arr: Input ndarray.
+    :param q: Target data type, should be a quint8.
+    """
+    return _convert_to_quantized_dtype(arr, q, "quint8")
+
+
+def convert_from_quint8(arr: np.ndarray):
+    """
+    Dequantize a quint8 NumPy ndarray into a float one.
+
+    :param arr: Input ndarray.
+    """
+    return _convert_from_quantized_dtype(arr, "quint8")
+
+
+def convert_to_qint8(arr: np.ndarray, q: np.dtype):
+    """
+    Quantize a float NumPy ndarray into a qint8 one with specified params.
+
+    :param arr: Input ndarray.
+    :param q: Target data type, should be a qint8.
+    """
+    return _convert_to_quantized_dtype(arr, q, "qint8")
+
+
+def convert_from_qint8(arr: np.ndarray):
+    """
+    Dequantize a qint8 NumPy ndarray into a float one.
+
+    :param arr: Input ndarray.
+    """
+    return _convert_from_quantized_dtype(arr, "qint8")
+
+
+def convert_to_qint32(arr: np.ndarray, q: np.dtype):
+    """
+    Quantize a float NumPy ndarray into a qint32 one with specified params.
+
+    :param arr: Input ndarray.
+    :param q: Target data type, should be a qint32.
+    """
+    return _convert_to_quantized_dtype(arr, q, "qint32")
+
+
+def convert_from_qint32(arr: np.ndarray):
+    """
+    Dequantize a qint32 NumPy ndarray into a float one.
+
+    :param arr: Input ndarray.
+    """
+    return _convert_from_quantized_dtype(arr, "qint32")
+
+
+def convert_to_quint4(arr: np.ndarray, q: np.dtype):
+    """
+    Quantize a float NumPy ndarray into a quint4 one with specified params.
+
+    :param arr: Input ndarray.
+    :param q: Target data type, should be a quint4.
+    """
+    return _convert_to_quantized_dtype(arr, q, "quint4")
+
+
+def convert_from_quint4(arr: np.ndarray):
+    """
+    Dequantize a quint4 NumPy ndarray into a float one.
+
+    :param arr: Input ndarray.
+    """
+    return _convert_from_quantized_dtype(arr, "quint4")
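The round/clip/rescale arithmetic inside `_convert_to_quantized_dtype` and `_convert_from_quantized_dtype` can be checked in isolation. A standalone sketch of the quint8 round trip, with scale and zero-point assumed for illustration:

```python
import numpy as np

# Standalone sketch of the quint8 round trip implemented above by
# convert_to_quint8 / convert_from_quint8; scale=0.5, zero_point=128
# are assumed values, not defaults.
scale, zp, qmin, qmax = 0.5, 128, 0, 255
x = np.array([-1.0, 0.0, 0.25, 100.0], dtype=np.float32)

q = (np.round(x / scale) + zp).clip(qmin, qmax).astype(np.uint8)
y = (q.astype(np.float32) - zp) * scale

print(q)  # [126 128 128 255]
print(y)  # [-1.   0.   0.  63.5] -- 100.0 saturates at qmax
```

Note how 100.0 clips to the representable range, so dequantization recovers 63.5 rather than the original value; that saturation is exactly what `clip(metadata.qmin, metadata.qmax)` encodes.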
+def convert_to_qint4(arr: np.ndarray, q: np.dtype):
+    """
+    Quantize a float NumPy ndarray into a qint4 one with specified params.
+
+    :param arr: Input ndarray.
+    :param q: Target data type, should be a qint4.
+    """
+    return _convert_to_quantized_dtype(arr, q, "qint4")
+
+
+def convert_from_qint4(arr: np.ndarray):
+    """
+    Dequantize a qint4 NumPy ndarray into a float one.
+
+    :param arr: Input ndarray.
+    """
+    return _convert_from_quantized_dtype(arr, "qint4")
diff --git a/imperative/python/megengine/core/tensor/function.py b/imperative/python/megengine/core/tensor/function.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cbb3d56e2ba9fb91a15c64d24ab7873eaffbc05
--- /dev/null
+++ b/imperative/python/megengine/core/tensor/function.py
@@ -0,0 +1,158 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from ..ops.builtin import OpDef
+from .core import TensorBase, TensorWrapperBase, apply
+from .raw_tensor import RawTensor
+from .tensor import Tensor, push_context
+from .tensor_wrapper import TensorWrapper
+
+
+class Function:
+    """
+    Defines a block of operations with customizable differentiation.
+
+    The computation should be defined in the ``forward`` method, with gradient
+    computation defined in the ``backward`` method.
+
+    Each instance of ``Function`` should be used only once during the forward pass.
+
+    Examples:
+
+    .. testcode::
+
+        class Sigmoid(Function):
+            def forward(self, x):
+                y = 1 / (1 + F.exp(-x))
+                self.y = y
+                return y
+
+            def backward(self, output_grads):
+                y = self.y
+                return output_grads * y * (1 - y)
+
+    """
+
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def __call__(self, *args):
+        ret = apply(self, *args)
+        if type(ret) == tuple and len(ret) == 1:
+            return ret[0]
+        return ret
+
+    def forward(self, *args, **kwargs):
+        """
+        Applies operations to ``inputs`` and returns results. It must be overridden by all subclasses.
+
+        :param input: Input tensors.
+        :return: A tuple of Tensor or a single Tensor.
+
+        .. note::
+
+            This method should return a tuple of Tensor or a single Tensor representing the output
+            of the function.
+        """
+        raise NotImplementedError
+
+    def backward(self, *output_grads):
+        """
+        Computes the gradient of the forward function. It must be overridden by all subclasses.
+
+        :param output_grads: gradients of outputs that are returned by :meth:`~.function.Function.forward`
+
+        .. note::
+
+            In case some tensors of outputs are not related to the loss function, the corresponding
+            values in ``output_grads`` would be ``None``.
+
+        .. note::
+
+            This method should return a tuple containing the gradients of all inputs, in the same order
+            as the ``inputs`` argument of :meth:`~.function.Function.forward` . A ``Tensor`` could be returned
+            instead if there is only one input. If users want to stop the propagation of some gradients,
+            the corresponding returned values should be set to ``None`` .
+ + """ + raise NotImplementedError + + def get_backward_fn(self): + if self.backward is None: + return None + + def _backward(*output_grads): + if type(output_grads) is tuple: + _output_grads = map(TensorWrapper, output_grads) + else: + _output_grads = (TensorWrapper(output_grads),) + ret = self.backward(*_output_grads) + if type(ret) is not tuple: + ret = (ret,) + ret = tuple([i.__wrapped__ for i in ret]) + return ret + + return _backward + + +Function.apply = Function.__call__ + + +@apply.add +def _(op: Function, *args: TensorWrapperBase): + assert args + Wrapper = type(args[0]) + + # compute the value for self define function + extra_data_dic = {} + for arg in args: + extra_data_dic[arg.__wrapped__] = arg.__wrapped__._extra_data + arg.__wrapped__._extra_data = {} + + rets = op.forward(*args) + + for arg in args: + arg.__wrapped__._extra_data = extra_data_dic[arg.__wrapped__] + + # update the gradient information for self define function + inputs = tuple(map(lambda i: i.__wrapped__, args)) + outputs = ( + tuple(map(lambda i: i.__wrapped__, rets)) + if type(rets) is tuple + else (rets.__wrapped__,) + ) + + for output in outputs: + output._extra_data = {} + + with push_context() as ctx: + ctx.inputs = inputs + ctx.outputs = outputs + for k in set().union(*(i._extra_data for i in inputs if isinstance(i, Tensor))): + ctx.key = k + data = tuple( + i._extra_data.get(k) if isinstance(i, Tensor) else i for i in inputs + ) + # data are instances of Tracer + # dispatched to apply.add@grad.py + rets = apply(op, *data) + if rets is not None: + assert len(outputs) == len(rets) + for t, i in zip(outputs, rets): + t._extra_data[k] = i + + return tuple(map(Wrapper, outputs)) + + +@apply.add +def _(op: Function, *args: Tensor): + raise NotImplementedError + + +@apply.add +def _(op: Function, *args: RawTensor): + raise NotImplementedError diff --git a/imperative/python/megengine/core/tensor/indexing.py b/imperative/python/megengine/core/tensor/indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..2c612c9b4899c774d50423f76e2f83c577dccee8 --- /dev/null +++ b/imperative/python/megengine/core/tensor/indexing.py @@ -0,0 +1,251 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
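The indexing module that begins below lowers Python `__getitem__`/`__setitem__` syntax onto the `Subtensor`/`IndexingMultiAxisVec` family of ops. Each indexed axis is described by a five-flag mask (axis, has-begin, has-end, has-step, has-idx), with the concrete scalars and tensors collected into a separate argument list. A simplified standalone sketch of that encoding, with hypothetical inputs (the real `unpack_getitem` below also handles ellipsis, newaxis, and boolean masks):

```python
# Sketch of the per-axis descriptor built by unpack_getitem: each indexed
# axis yields [axis, has_begin, has_end, has_step, has_idx], and the actual
# index operands are collected separately.
def describe(index):
    items, tensors = [], []
    for axis, i in enumerate(index):
        if isinstance(i, slice):
            item = [axis,
                    i.start is not None,
                    i.stop is not None,
                    i.step is not None,
                    False]
            tensors += [v for v in (i.start, i.stop, i.step) if v is not None]
        else:  # integer index on this axis
            item = [axis, False, False, False, True]
            tensors.append(i)
        items.append(item)
    return items, tensors


print(describe((slice(1, None), 2)))
# ([[0, True, False, False, False], [1, False, False, False, True]], [1, 2])
```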
+import numpy as np + +from ..ops import builtin +from ..ops.special import Const +from .core import TensorBase, TensorWrapperBase, apply + + +def remove_ellipsis(tensor, tuple_val): + ndim_sum = tensor.ndim + cur_sum = 0 + pos = -1 + for i_idx, i in enumerate(tuple_val): + if i is Ellipsis: + for j in tuple_val[:i_idx:-1]: + if j is Ellipsis: + raise IndexError("only one ellipsis is allowed") + pos = i_idx + else: + cur_sum += i.ndim if hasattr(i, "ndim") else 1 + if pos == -1: + return tuple_val + else: + return ( + tuple_val[:pos] + + (slice(None, None, None),) * (ndim_sum - cur_sum) + + tuple_val[pos + 1 :] + ) + + +def check_bool_index(tensor, tuple_val): + cur_shape = tensor.shape + new_tuple_val = [] + offset = 0 + tdim = 0 + for idx, i in enumerate(tuple_val): + if hasattr(i, "dtype") and i.dtype == np.bool_: + if i.ndim > 1: + tot = i.ndim + for j in range(i.ndim): + if cur_shape[tdim + j - offset] != i.shape[j]: + raise IndexError( + "boolean index did not match tensor along dimension {}; dimension is {} but corresponding boolean dimension is {}".format( + tdim + j, cur_shape[tdim + j - offset], i.shape[j] + ) + ) + i = i.reshape(-1) + cur_shape = ( + cur_shape[:idx] + (i.shape[0],) + cur_shape[tdim + tot - offset :] + ) + offset += 1 + tensor = tensor.reshape(cur_shape) + tdim += tot + new_tuple_val.append(i) + else: + new_tuple_val.append(i) + tdim += 1 + return tensor, new_tuple_val + + +def unpack_getitem(inp, tuple_val, *, allow_newaxis=True): + if not isinstance(tuple_val, tuple): + tuple_val = (tuple_val,) + ndim_indexed = 0 + for i in tuple_val: + if not i is Ellipsis: + ndim_indexed += 1 if not hasattr(i, "ndim") else i.ndim + if ndim_indexed > inp.ndim: + raise IndexError( + "too many indices for tensor: tensor is {}-dimensional, but {} were indexed".format( + inp.ndim, ndim_indexed + ) + ) + + tuple_val = remove_ellipsis(inp, tuple_val) + use_subtensor = True + inp, tuple_val = check_bool_index(inp, tuple_val) + + def is_scalar(d): + if isinstance(i, int): + return True + if type(d).__module__ == np.__name__: + return np.isscalar(d) + # if isinstance(d, (TensorBase, TensorWrapperBase)): + # return d.shape == (1,) + return False + + new_axes = [] + tensors = [] + items = [] + cur_axis = -1 + for i_idx, i in enumerate(tuple_val): + cur_axis += 1 + if i is np.newaxis: + if cur_axis >= 0: + new_axes.append(cur_axis) + continue + + if i is Ellipsis: + cur_axis = -1 + for j in tuple_val[:i_idx:-1]: + if j is Ellipsis: + raise IndexError("only one ellipsis is allowed") + if j is np.newaxis: + new_axes.append(cur_axis) + cur_axis -= 1 + continue + + if ( + not is_scalar(i) + and not i is np.newaxis + and not i is Ellipsis + and not isinstance(i, slice) + ): + use_subtensor = False + + item = [ + cur_axis, + ] + + def is_bool_list(x): + if not isinstance(x, list): + return False + for i in x: + if not isinstance(i, bool): + return False + return True + + def get_index(i): + if not isinstance(i, (TensorBase, TensorWrapperBase)): + if is_bool_list(i) or isinstance(i, np.ndarray) and i.dtype == np.bool_: + (i,) = Const(i, dtype=np.bool_, device=inp.device)(inp) + else: + (i,) = Const(i, dtype=np.int32, device=inp.device)(inp) + return i + assert isinstance(i, (TensorBase, TensorWrapperBase)) + if i.dtype != np.bool_: + return i + _, ind = apply(builtin.CondTake(), i, i) + return ind + + def push(v, item, tensors): + if v is None: + item.append(False) + else: + item.append(True) + v = get_index(v) + assert np.issubdtype(v.dtype, np.integer) or np.issubdtype( + v.dtype, np.bool + ), 
"var type in the subscript must be int or bool" + tensors.append(v) + + if isinstance(i, slice): + if i.start is None and i.stop is None and i.step is None: + continue + push(i.start, item, tensors) + push(i.stop, item, tensors) + push(i.step, item, tensors) + item.append(False) # idx + else: + item += [False,] * 3 # begin, end, stop + push(i, item, tensors) + assert len(item) == 5 + items.append(item) + if new_axes: + raise IndexError("newaxis is not allowed here") + return inp, tensors, items, use_subtensor + + +def try_condtake(tensor, index): + if not hasattr(index, "dtype") or not hasattr(index, "shape"): + return [] + if index.dtype != np.bool_ or index.shape != tensor.shape: + return [] + if isinstance(index, np.ndarray): + (i,) = Const(i, dtype=np.bool_, device=inp.device)(inp) + assert isinstance(index, (TensorBase, TensorWrapperBase)) + if not isinstance(tensor, (TensorWrapperBase, TensorBase)): + raise TypeError("input must be a tensor") + if tensor.device != index.device: + raise ValueError( + "ambiguous device: {} vs {}".format(tensor.device, index.device) + ) + return apply(builtin.CondTake(), tensor, index) + + +def getitem(tensor, index): + try_result = try_condtake(tensor, index) + if len(try_result) == 2: + return try_result[0] + tensor, tensors, items, use_subtensor = unpack_getitem(tensor, index) + for v in tensors: + if v.shape[0] == 0: + (empty_tensor,) = Const([], dtype=tensor.dtype, device=tensor.device)( + tensor + ) + return empty_tensor + if use_subtensor: + op = builtin.Subtensor(items=items) + else: + op = builtin.IndexingMultiAxisVec(items=items) + (result,) = apply(op, tensor, *tensors) + return result + + +def setitem(tensor, index, value): + org_shape = tensor.shape + try_result = try_condtake(tensor, index) + if len(try_result) == 2: + index = try_result[1] + if index.shape[0] == 0: + return tensor + tensor = tensor.reshape(-1) + if not isinstance(value, (TensorBase, TensorWrapperBase)): + op = Const(value, dtype=tensor.dtype, device=tensor.device) + (value,) = op(tensor) + tensor, tensors, items, use_subtensor = unpack_getitem(tensor, index) + for v in tensors: + if v.shape[0] == 0: + return tensor + if use_subtensor: + op = builtin.Subtensor(items=items) + else: + op = builtin.IndexingMultiAxisVec(items=items) + (tmp_result,) = apply(op, tensor, *tensors) + if value.shape != tmp_result.shape: + for i in range(min(len(value.shape), len(tmp_result.shape))): + if ( + value.shape[-i - 1] != 1 + and value.shape[-i - 1] != tmp_result.shape[-i - 1] + ): + raise ValueError( + "cannot copy tensor with shape {} to subtensor with shape {}".format( + value.shape, tmp_result.shape + ) + ) + value = value.broadcast(tmp_result.shape) + if use_subtensor: + op = builtin.SetSubtensor(items=items) + else: + op = builtin.IndexingSetMultiAxisVec(items=items) + (result,) = apply(op, tensor, value, *tensors) + result = result.reshape(org_shape) + return result diff --git a/imperative/python/megengine/core/tensor/megbrain_graph.py b/imperative/python/megengine/core/tensor/megbrain_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..86f7bcc11bfff0450cca6b1bd618ae7543908e6d --- /dev/null +++ b/imperative/python/megengine/core/tensor/megbrain_graph.py @@ -0,0 +1,196 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import collections +import threading +import weakref +from concurrent.futures import Future, ThreadPoolExecutor + +from .. import _imperative_rt +from .._wrap import device as as_device +from ..ops.builtin import OpDef +from .core import OpBase, TensorBase, apply + + +class CompiledFunction: + def __init__(self, graph, function): + self._graph = graph + self._function = function + self._future = None + + def execute(self, *args): + assert self._future is None + self._future = self._graph._executor.submit(self._function.execute, *args) + + def wait(self): + assert self._future is not None + self._future.exception() + self._function.wait() + try: + return self._future.result() + finally: + self._future = None + + def __call__(self, *args): + self.execute(*args) + return self.wait() + + +class Graph(_imperative_rt.ComputingGraph): + def __init__(self): + super().__init__() + self._var_cache = weakref.WeakKeyDictionary() + self._op_cache = weakref.WeakKeyDictionary() + self._executor = ThreadPoolExecutor(1) + + def _wrap(self, obj): + if type(obj) is _imperative_rt.VarNode: + wrapper, cache = VarNode, self._var_cache + elif type(obj) is _imperative_rt.OperatorNode: + wrapper, cache = OpNode, self._op_cache + if obj not in cache: + cache[obj] = wrapper(obj) + return cache[obj] + + def compile(self, *args): + return CompiledFunction(self, super().compile(_unwrap(args))) + + +class VarNode(TensorBase): + def __init__(self, node: _imperative_rt.VarNode): + self._node = node + + @property + def graph(self) -> Graph: + return self._node.graph + + @property + def op(self): + return self.graph._wrap(self._node.owner) + + @property + def dtype(self): + return self._node.dtype + + @property + def device(self): + return as_device(self._node.comp_node) + + +class OpNode: + def __init__(self, node: _imperative_rt.OperatorNode): + self._node = node + + @property + def graph(self) -> Graph: + return self._node.graph + + @property + def inputs(self): + return tuple(map(self.graph._wrap, self._node.inputs)) + + @property + def outputs(self): + return tuple(map(self.graph._wrap, self._node.outputs)) + + +def _wrap(x): + if isinstance(x, collections.Sequence): + return type(x)(map(_wrap, x)) + return x.graph._wrap(x) + + +def _unwrap(x): + if isinstance(x, collections.Sequence): + return type(x)(map(_unwrap, x)) + return x._node + + +@apply.add +def _(op: OpDef, *args: VarNode): + outputs = _imperative_rt.invoke_op(op, _unwrap(args)) + return _wrap(outputs) + + +def input_callback(callback, *args, device=None, dtype=None, graph=None): + outputs = _imperative_rt.input_callback( + callback, as_device(device).to_c(), dtype, _unwrap(args), graph=graph + ) + value, dummy = _wrap(outputs) + return value, dummy + + +class InputNode(OpNode): + def __init__(self, *args: VarNode, device=None, dtype=None, graph=None): + r = _imperative_rt.DeviceTensorNDRendezvous() + if device is not None: + device = as_device(device).to_c() + outputs = _imperative_rt.input_callback( + r, device, dtype, _unwrap(args), graph=graph + ) + super().__init__(outputs[0].owner) + self._rendezvous = r + + def set_value(self, value): + assert isinstance(value, _imperative_rt.DeviceTensorND) + self._rendezvous.set(value) + + def reset(self): + self._rendezvous.reset() + + @property + def device(self): + return self.outputs[0].device + 
+ @property + def dtype(self): + return self.outputs[0].dtype + + +def output_callback(callback, var, *args): + args = (var,) + args + dummy = _imperative_rt.output_callback(callback, _unwrap(args)) + return _wrap(dummy) + + +class OutputNode(OpNode): + def __init__(self, var, *args): + args = (var,) + args + r = _imperative_rt.DeviceTensorNDRendezvous() + dummy = _imperative_rt.output_callback(r, _unwrap(args)) + super().__init__(dummy.owner) + self._rendezvous = r + + def get_value(self): + return self._rendezvous.get() + + def reset(self): + self._rendezvous.reset() + + +class TensorAttr: + def __init__(self, shape, dtype, device): + self.shape = shape + self.dtype = dtype + self.device = device + + +class AttrOutputNode(OpNode): + def __init__(self, var, *args): + args = (var,) + args + r = _imperative_rt.TensorAttrRendezvous() + dummy = _imperative_rt.attr_output_callback(r, _unwrap(args)) + super().__init__(dummy.owner) + self._rendezvous = r + + def get_value(self): + attr = self._rendezvous.get() + return TensorAttr(attr.shape, attr.dtype, as_device(attr.comp_node)) + + def reset(self): + self._rendezvous.reset() diff --git a/imperative/python/megengine/core/tensor/raw_tensor/__init__.py b/imperative/python/megengine/core/tensor/raw_tensor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..decca86df20b78a6de30341fe20353ebec60373f --- /dev/null +++ b/imperative/python/megengine/core/tensor/raw_tensor/__init__.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
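`InputNode` and `OutputNode` above both hand a `DeviceTensorNDRendezvous` to the C++ side: one end deposits a tensor, the other blocks until it arrives, and `reset` rearms the slot for the next run. A standalone Python sketch of that rendezvous pattern (not the MegEngine class, which lives in C++):

```python
import threading

# Sketch of the rendezvous hand-off used by InputNode.set_value /
# OutputNode.get_value: one side deposits a value, the other blocks
# until it arrives; reset() rearms the slot for the next iteration.
class Rendezvous:
    def __init__(self):
        self._cond = threading.Condition()
        self._value = None
        self._set = False

    def set(self, value):
        with self._cond:
            self._value, self._set = value, True
            self._cond.notify_all()

    def get(self):
        with self._cond:
            self._cond.wait_for(lambda: self._set)
            return self._value

    def reset(self):
        with self._cond:
            self._value, self._set = None, False


r = Rendezvous()
threading.Thread(target=lambda: r.set(42)).start()
print(r.get())  # 42
```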
+import functools + +import numpy as np + +from ..._imperative_rt import CompNode, DeviceTensorND +from ..._imperative_rt.imperative import ( + _get_dev_tensor, + apply_op, + delete, + get_device, + get_dtype, + get_shape, + get_value, + put, +) +from ..._wrap import device as as_device +from ...ops.builtin import Copy, OpDef, TypeCvt +from ...ops.special import Const +from ..core import OpBase, TensorBase, apply + + +class RawTensor(TensorBase): + + _init_cb = None + _del_cb = None + + def __init__(self, handle): + self._handle = handle + if self._init_cb: + self._init_cb() + + @property + def dtype(self): + return get_dtype(self._handle) + + @property + def device(self): + return as_device(get_device(self._handle)) + + @property + def shape(self): + return get_shape(self._handle) + + def numpy(self): + return get_value(self._handle) + + def _dev_tensor(self): + return _get_dev_tensor(self._handle) + + def __repr__(self): + return "{}({}, device='{}')".format( + type(self).__qualname__, repr(self.numpy()), self.device + ) + + def __del__(self): + if self._del_cb: + self._del_cb() + delete(self._handle) + + +@apply.add +def _(op: OpDef, *args: RawTensor): + outputs = apply_op(op, tuple(i._handle for i in args)) + return tuple(map(RawTensor, outputs)) + + +@apply.add +def _(op: Const, *args: RawTensor): + dtype = op.dtype + device = as_device(op.device).to_c() + return (as_raw_tensor(op.value, dtype=dtype, device=device),) + + +@functools.singledispatch +def as_raw_tensor(obj, dtype=None, device=None): + obj = np.asarray(obj, dtype=dtype) + if obj.dtype == np.float64: + obj = obj.astype(np.float32) + if obj.dtype == np.int64: + obj = obj.astype(np.int32) + return as_raw_tensor(obj, device=device) + + +@as_raw_tensor.register(np.ndarray) +def _(array: np.ndarray, dtype=None, device=None): + device = None if device is None else as_device(device).to_c() + return RawTensor(put(array, dtype=dtype, device=device)) + + +@as_raw_tensor.register(RawTensor) +def _(tensor: RawTensor, dtype=None, device=None): + if dtype is not None: + dtype = np.dtype(dtype) + if dtype != tensor.dtype: + (tensor,) = apply(TypeCvt(dtype=dtype), tensor) + if device is not None: + device = as_device(device) + if device != tensor.device: + (tensor,) = apply(Copy(comp_node=device.to_c()), tensor) + return tensor diff --git a/imperative/python/megengine/core/tensor/raw_tensor/jit.py b/imperative/python/megengine/core/tensor/raw_tensor/jit.py new file mode 100644 index 0000000000000000000000000000000000000000..091b3789d2764662d953e67caf7b28847f85de4f --- /dev/null +++ b/imperative/python/megengine/core/tensor/raw_tensor/jit.py @@ -0,0 +1,251 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
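The `as_raw_tensor` conversion that closes the module above is a `functools.singledispatch` function: the generic overload normalizes arbitrary objects through `np.asarray` (narrowing float64/int64, since the runtime computes in 32-bit types) and then re-dispatches to the `np.ndarray` overload. A self-contained sketch of the same dispatch structure, with the device upload replaced by a stub:

```python
import functools

import numpy as np

# Sketch of the as_raw_tensor dispatch above: generic objects are routed
# through np.asarray, with float64/int64 narrowed to float32/int32 before
# the ndarray overload takes over. The real code wraps the array in
# RawTensor(put(...)) instead of returning it.
@functools.singledispatch
def convert(obj, dtype=None):
    arr = np.asarray(obj, dtype=dtype)
    if arr.dtype == np.float64:
        arr = arr.astype(np.float32)
    if arr.dtype == np.int64:
        arr = arr.astype(np.int32)
    return convert(arr)


@convert.register(np.ndarray)
def _(arr, dtype=None):
    return arr  # stub for the device upload


print(convert([1.0, 2.0]).dtype)  # float32
print(convert([1, 2]).dtype)      # int32
```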
+import functools +import io +import weakref + + +class partial(functools.partial): + def __get__(self, instance, owner=None): + if instance is None: + return self + return functools.partial(self, instance) + + +def hook(f): + def decorator(impl): + return functools.update_wrapper(partial(f, impl), impl) + + return decorator + + +def on_input(impl, value): + tensor = impl(value) + trace = get_trace() + if trace: + var = trace.get_var(tensor) + event = InputEvent(var) + trace.append(event) + return tensor + + +def on_read_dtype(impl, self): + trace = get_trace() + if trace: + var = trace.get_var(self) + event = ReadDtypeEvent(var) + trace.append(event) + + return impl(self) + + +def on_read_device(impl, self): + trace = get_trace() + if trace: + var = trace.get_var(self) + event = ReadDeviceEvent(var) + trace.append(event) + + return impl(self) + + +def on_read_shape(impl, self): + trace = get_trace() + if trace: + var = trace.get_var(self) + event = ReadShapeEvent(var) + trace.append(event) + + return impl(self) + + +def on_read_value(impl, self): + trace = get_trace() + if trace: + var = trace.get_var(self) + event = ReadValueEvent(var) + trace.append(event) + + return impl(self) + + +def on_builtin_op(impl, op, *args): + outputs = impl(op, *args) + + trace = get_trace() + if trace: + input_vars = tuple(map(trace.get_var, args)) + output_vars = outputs and tuple(map(trace.get_var, outputs)) + event = OpEvent(op, input_vars, output_vars) + trace.append(event) + + return outputs + + +def on_del(impl, self): + trace = get_trace() + if trace: + var = trace.get_var(self) + event = DelEvent(var) + trace.append(event) + + return impl(self) + + +class Trace(list): + def __init__(self): + self._var_id = 1 + self._t2v = weakref.WeakKeyDictionary() + self._v2t = weakref.WeakValueDictionary() + + def get_var(self, x): + v = self._t2v.get(x) + if v: + return v + v = self._var_id + self._var_id += 1 + self._t2v[x] = v + self._v2t[v] = x + return v + + def __bool__(self): + return True + + def __enter__(self): + global _current_trace + if hasattr(self, "_prev_trace"): + raise RuntimeError + self._prev_trace = _current_trace + _current_trace = self + return self + + def __exit__(self, *_): + global _current_trace + if _current_trace is not self: + raise RuntimeError + _current_trace = self._prev_trace + del self._prev_trace + + +class Event: + pass + + +class InputEvent(Event): + def __init__(self, var): + self.var = var + + +class ReadEvent(Event): + def __init__(self, var): + self.var = var + + +class ReadDtypeEvent(ReadEvent): + pass + + +class ReadDeviceEvent(ReadEvent): + pass + + +class ReadShapeEvent(ReadEvent): + pass + + +class ReadValueEvent(ReadEvent): + pass + + +class OpEvent(Event): + def __init__(self, op, inputs, outputs): + self.op = op + self.inputs = inputs + self.outputs = outputs + + +class DelEvent(Event): + def __init__(self, var): + self.var = var + + +_current_trace = None + + +def get_trace() -> Trace: + global _current_trace + return _current_trace + + +def format_trace(trace): + buf = io.StringIO() + active_vars = set() + + def write(fmt, *args, **kwargs): + print(fmt.format(*args, **kwargs), file=buf) + + def init_vars(*args): + for i in args: + if i in active_vars: + continue + active_vars.add(i) + write("_{} = input()", i) + + for event in trace: + if isinstance(event, InputEvent): + init_vars(event.var) + elif isinstance(event, ReadDtypeEvent): + init_vars(event.var) + write("output(_{}.dtype)", event.var) + elif isinstance(event, ReadDeviceEvent): + init_vars(event.var) + 
write("output(_{}.device)", event.var) + elif isinstance(event, ReadShapeEvent): + init_vars(event.var) + write("output(_{}.shape)", event.var) + elif isinstance(event, ReadValueEvent): + init_vars(event.var) + write("output(_{}.dtype)", event.var) + elif isinstance(event, ReadValueEvent): + init_vars(event.var) + write("output(_{}.value)", event.var) + elif isinstance(event, OpEvent): + init_vars(*event.inputs) + active_vars.update(event.outputs) + ovars = ", ".join(map("_{}".format, event.outputs)) + ivars = ", ".join(map("_{}".format, event.inputs)) + if ovars: + write("{} = {}({})", ovars, repr(event.op), ivars) + else: + write("{}({})", repr(event.op), ivars) + elif isinstance(event, DelEvent): + init_vars(event.var) + write("del _{}", event.var) + else: + raise TypeError(type(event)) + + return buf.getvalue() + + +def compile_trace(trace): + trace = list(trace) + + +def static_function(f): + trace = None + + @functools.wraps(f) + def wrapper(*args, **kwargs): + nonlocal trace + if trace is None: + with Trace() as trace: + return f(*args, **kwargs) + return f(*args, **kwargs) + + return wrapper diff --git a/imperative/python/megengine/core/tensor/raw_tensor/trace_exec.py b/imperative/python/megengine/core/tensor/raw_tensor/trace_exec.py new file mode 100644 index 0000000000000000000000000000000000000000..d16a6ef0642d3226312618ef0ccf00ed2f3b33e4 --- /dev/null +++ b/imperative/python/megengine/core/tensor/raw_tensor/trace_exec.py @@ -0,0 +1,263 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import functools +import weakref + +# Concepts +# +# * Internal tensor +# Tensor produced by the static sequence +# +# * External tensor +# Tensor not produced, but used as input, by the static sequence +# +# * Irrelevant tensor +# Tensor not present in input/output of any op +# +# * Escape +# An internal tensor is said to escape if it is still alive +# at the end of the sequence + +# JIT-ed execution +# +# 1. read attr (dtype, device, shape) +# a. internal tensor +# read out as soon as tensor is produced +# b. external or irrelevant tensor +# fallback +# +# 2. apply op +# bind external tensors in input +# +# 3. 
del + + +class Action: + pass + + +class ReadAttrAction(Action): + def __init__(self, var, name, getter): + self.var = var + self.name = name + self.getter = getter + + +class ReadValueAction(Action): + def __init__(self, var, getter): + self.var = var + self.getter = getter + + +class GetTensorAction(Action): + def __init__(self, var, getter): + self.var = var + self.getter = getter + + +class OpAction(Action): + def __init__(self, op, inputs, outputs, input_receivers): + self.op = op + self.inputs = inputs + self.outputs = outputs + self.input_receivers = input_receivers + + +class TensorAttr: + def __init__(self): + self.shape = None + self.dtype = None + self.device = None + + +class Bailout(Exception): + pass + + +class Fallback(Exception): + pass + + +def handle_bailout_fallback_finalize(f): + @functools.wraps(f) + def wrapper(self, impl, *args, **kwargs): + try: + return f(*args, **kwargs) + except Bailout: + self.bailout() + except Fallback: + pass + finally: + if self.pc == len(self): + self.finalize() + return impl(*args, **kwargs) + + return wrapper + + +class ExecTrajectory(list): + def __init__(self): + super().__init__() + self.reset() + + def __bool__(self): + return True + + def __enter__(self): + global _current_trajectory + if hasattr(self, "_prev_trajectory"): + raise RuntimeError + self._prev_trajectory = _current_trajectory + _current_trajectory = self + self._exited = False + return self + + def __exit__(self, *exc_info): + # cleanup should be done at completion, + # which is before exiting context manager + assert self._exited == (exc_info == (None, None, None)) + if not self._exited: + assert self.pc < len(self) + self.bailout() + + def _exit(self): + # clean up self and global varaible + assert not self._exited + self.reset() + + global _current_trajectory + if _current_trajectory is not self: + raise RuntimeError + _current_trajectory = self._prev_trajectory + del self._prev_trajectory + + def reset(self): + self._exited = True + self.pc = 0 + self.attr_cache = weakref.WeakKeyDictionary() + + ### Internal and External Tensor ### + # internal tensors are those produced by us + # external tensors are those received from outside + # during JIT-ed execution, internal tensors are just placeholders. + # var_to_tensor is the binding table for all tensors + self.var_to_tensor = {} # var -> weakref[tensor] + # tensor_to_var is the reverse binding table for internal tensors + # note that external tensors could map to >1 vars. + self.tensor_to_var = weakref.WeakKeyDictionary() + # internal tensor will be materialized if its .data is accessed from outside + # after being meterialized, an intern tensor is much like an external tensor + + def finalize(self): + assert self.pc == len(self) + self._exit() + + def bailout(self): + self._exit() + raise NotImplementedError + + def next_action(self): + assert not self._exited + assert self.pc < len(self) + return self[self.pc] + + @handle_bailout_fallback_finalize + def read_attr(self, tensor, name): + attrs = self.attr_cache.setdefault(tensor, TensorAttr()) + value = getattr(attrs, name, None) + if value is None: + action = self.next_action() + if not isinstance(action, ReadAttrAction): + raise Bailout + if name != action.name: + raise Bailout + value = action.getter() + setattr(attrs, name, value) + return value + + @handle_bailout_fallback_finalize + def read_value(self, impl, tensor): + # possibilities: + # 1. internal tensor + # 2. external tensor + # 3. 
irrelevant tensor (not an input / output of any op)
+        if tensor not in self.tensor_to_var:
+            raise Fallback
+        assert tensor._data is None
+        action = self.next_action()
+        if not isinstance(action, ReadValueAction):
+            raise Bailout
+        return action.getter()
+
+    @handle_bailout_fallback_finalize
+    def apply_op(self, impl, op, *args):
+        from . import RawTensor
+
+        action = self.next_action()
+        if not isinstance(action, OpAction):
+            raise Bailout
+        if len(args) != len(action.inputs):
+            raise Bailout
+        assert len(action.inputs) == len(action.input_receivers)
+
+        for v, t, r in zip(action.inputs, args, action.input_receivers):
+            if v in self.var_to_tensor:
+                assert r is None
+                if t is not self.var_to_tensor[v]():
+                    raise Bailout
+            else:
+                # NOTE: not checking for aliasing (>=2 vars map to 1 tensor)
+                # the static execution backend must handle this
+                self.var_to_tensor[v] = weakref.ref(t)
+                r(t)
+
+        outputs = []
+        for v in action.outputs:
+            assert v not in self.var_to_tensor
+            t = RawTensor()
+            t._data_getter = functools.partial(self.get_data, v)
+            outputs.append(t)
+            self.var_to_tensor[v] = weakref.ref(t)
+
+        return tuple(outputs)
+
+    def get_data(self, var):
+        tensor = self.var_to_tensor[var]()
+        assert tensor is not None
+        assert tensor._data is None
+        assert tensor in self.tensor_to_var
+        action = self.next_action()
+        if not isinstance(action, GetTensorAction):
+            self.bailout()
+        elif action.var != var:
+            self.bailout()
+        else:
+            tensor._data = action.getter()
+            del tensor._data_getter
+            del self.tensor_to_var[tensor]
+        assert "_data_getter" not in tensor.__dict__
+        return tensor._data
+
+
+_current_trajectory = None
+
+
+def get_trajectory():
+    return _current_trajectory
+
+
+def compile_trace(trace):
+    from .jit import ReadDtypeEvent, ReadDeviceEvent, ReadShapeEvent, OpEvent, DelEvent
+
+    traj = ExecTrajectory()
+    active_vars = set()
+
+    for event in trace:
+        if isinstance(event, ReadDtypeEvent):
+            traj.append(ReadAttrAction())
diff --git a/imperative/python/megengine/core/tensor/tensor.py b/imperative/python/megengine/core/tensor/tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f2ff9d78121e9f4bb9746ac97494e9949e84c29
--- /dev/null
+++ b/imperative/python/megengine/core/tensor/tensor.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
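The essence of `ExecTrajectory` above is replay with a program counter: each thing the program actually does is matched against the next recorded `Action`, and any mismatch raises `Bailout` to abandon the fast path. A minimal standalone sketch of that control flow, with string tokens standing in for the real `Action` objects:

```python
# Sketch of the replay-and-bailout idea behind ExecTrajectory: recorded
# actions are matched one by one against what the program actually does;
# any divergence abandons the fast path.
class Bailout(Exception):
    pass


def replay(recorded, observed):
    pc = 0
    for op in observed:
        if pc >= len(recorded) or recorded[pc] != op:
            raise Bailout("diverged at step {}".format(pc))
        pc += 1
    return pc == len(recorded)  # True iff the trajectory completed


print(replay(["add", "mul"], ["add", "mul"]))  # True
try:
    replay(["add", "mul"], ["add", "sub"])
except Bailout as e:
    print(e)  # diverged at step 1
```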
+import contextlib +import copy + +from .core import Dispatcher, OpBase, TensorBase, apply + + +class Tensor(TensorBase): + def __init__(self, data: TensorBase): + self._data = data + # _extra_data is set up in Grad.wrt + self._extra_data = {} + self._user_data = {} + + def __getattr__(self, name): + if name in self._user_data: + return self._user_data[name] + raise AttributeError(name) + + def reset(self, other): + assert isinstance(other, __class__) + self.__dict__.clear() + self._data = other.data + self._extra_data = other._extra_data.copy() + self._user_data = other._user_data.copy() + + def copy(self): + other = object.__new__(type(self)) + other.reset(self) + return other + + # tensor interface + + @property + def shape(self): + return self._data.shape + + @property + def dtype(self): + return self._data.dtype + + @property + def device(self): + return self._data.device + + def numpy(self): + return self._data.numpy() + + +class ApplyContext: + def __init__(self): + self.inputs = None + self.outputs = None + self.key = None + + +_context = None + + +@contextlib.contextmanager +def push_context(): + global _context + backup = _context + try: + _context = ApplyContext() + yield _context + finally: + _context = backup + + +def get_context(): + return _context + + +@apply.add +def tensor_apply(op: OpBase, *args: Tensor): + data = tuple(i._data if isinstance(i, Tensor) else i for i in args) + # type(Tensor._data) is RawTensor + # dispached to apply.add@RawTensor.py if passed Tensor args + outputs = apply(op, *data) + ret = tuple(map(Tensor, outputs)) + + with push_context() as ctx: + ctx.inputs = args + ctx.outputs = ret + for k in set().union(*(i._extra_data for i in args if isinstance(i, Tensor))): + ctx.key = k + data = tuple( + i._extra_data.get(k) if isinstance(i, Tensor) else i for i in args + ) + # data are instances of Tracer + # dispatched to apply.add@grad.py + outputs = apply(op, *data) + if outputs is not None: + assert len(outputs) == len(ret) + for t, i in zip(ret, outputs): + t._extra_data[k] = i + + return ret diff --git a/imperative/python/megengine/core/tensor/tensor_wrapper.py b/imperative/python/megengine/core/tensor/tensor_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..011e7d1183db67dd129a82051e42a160082295ae --- /dev/null +++ b/imperative/python/megengine/core/tensor/tensor_wrapper.py @@ -0,0 +1,367 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import abc +import collections + +import numpy as np + +from ..ops import builtin +from ..ops.special import Const +from . 
import utils +from .core import OpBase, TensorBase, TensorWrapperBase, apply +from .indexing import getitem as _getitem +from .indexing import setitem as _setitem +from .raw_tensor import RawTensor, as_raw_tensor +from .tensor import Tensor + + +def _elwise(*args, mode): + op = builtin.Elemwise(mode=mode) + args = utils.convert_inputs(*args) + (result,) = apply(op, *args) + return result + + +def _matmul(inp1, inp2): + op = builtin.MatrixMul( + transposeA=False, transposeB=False, compute_mode="DEFAULT", format="DEFAULT" + ) + inp1, inp2 = utils.convert_inputs(inp1, inp2) + (result,) = apply(op, inp1, inp2) + return result + + +def _transpose(data, axes): + op = builtin.Dimshuffle(axes) + (data,) = utils.convert_inputs(data) + (result,) = apply(op, data) + return result + + +def _broadcast(inp, shape): + shape = utils.astensor1d(shape, inp, dtype="int32", device=inp.device) + (result,) = apply(builtin.Broadcast(), inp, shape) + return result + + +def _reshape(x, shape): + if isinstance(shape, (TensorBase, TensorWrapperBase)): + shape = shape.numpy() + shape = tuple(map(int, shape)) + unspec_axis = None + for i, s in enumerate(shape): + if s < 0: + if s != -1: + raise ValueError("expect shape[{}] >= -1, got {}".format(i, s)) + if unspec_axis is not None: + raise ValueError("multiple -1 in shape: {} & {}".format(unspec_axis, i)) + unspec_axis = i + + # TODO: device should be None (cpu) + (shape,) = Const(shape, dtype=np.int32, device=x.device)(x) + if unspec_axis is None: + op = builtin.Reshape() + else: + op = builtin.Reshape(unspec_axis=unspec_axis) + (x,) = apply(op, x, shape) + return x + + +def _unary_elwise(mode): + def f(self): + return _elwise(self, mode=mode) + + return f + + +def _binary_elwise(mode, rev=False): + if not rev: + + def f(self, value): + return _elwise(self, value, mode=mode) + + else: + + def f(self, value): + return _elwise(value, self, mode=mode) + + return f + + +def _logical_unary_elwise(mode, rev=False): + def f(self): + if self.dtype != np.bool_: + raise TypeError("{} requires a bool tensor".format(mode)) + return _elwise(self, mode=mode) + + return f + + +def _logical_binary_elwise(mode, rev=False): + if not rev: + + def f(self, value): + if self.dtype != np.bool_ or value.dtype != np.bool_: + raise TypeError("{} requires 2 bool tensors".format(mode)) + return _elwise(self, value, mode=mode) + + else: + + def f(self, value): + if self.dtype != np.bool_ or value.dtype != np.bool_: + raise TypeError("{} requires 2 bool tensors".format(mode)) + return _elwise(value, self, mode=mode) + + return f + + +def _reduce(mode): + def f(self, axis=None): + inp = self + if axis is None: + inp = self.flatten() + axis = 0 + op = builtin.Reduce(mode=mode, axis=axis) + (result,) = utils.convert_inputs(inp) + (result,) = apply(op, result) + return result + + return f + + +def _inplace(f): + def g(self, value): + result = f(self, value) + if result is NotImplemented: + raise NotImplementedError + self._reset(result) + return self + + return g + + +def _todo(*_): + raise NotImplementedError + + +class ArrayMethodMixin(abc.ABC): + + __array_priority__ = 233333 + + @abc.abstractmethod + def _reset(self, other): + pass + + @abc.abstractproperty + def dtype(self) -> np.dtype: + pass + + @abc.abstractproperty + def shape(self) -> tuple: + pass + + @abc.abstractmethod + def numpy(self) -> np.ndarray: + pass + + __hash__ = None # due to __eq__ diviates from python convention + + __lt__ = lambda self, value: _elwise(self, value, mode="LT").astype("bool") + __le__ = lambda self, value: 
_elwise(self, value, mode="LEQ").astype("bool") + __gt__ = lambda self, value: _elwise(value, self, mode="LT").astype("bool") + __ge__ = lambda self, value: _elwise(value, self, mode="LEQ").astype("bool") + __eq__ = lambda self, value: _elwise(self, value, mode="EQ").astype("bool") + __ne__ = lambda self, value: _elwise( + _elwise(self, value, mode="EQ").astype("bool"), mode="NOT" + ) + + __neg__ = _unary_elwise("NEGATE") + __pos__ = lambda self: self + __abs__ = _unary_elwise("ABS") + __invert__ = _logical_unary_elwise("NOT") + __round__ = _unary_elwise("ROUND") + __trunc__ = _todo + __floor__ = _unary_elwise("FLOOR") + __ceil__ = _unary_elwise("CEIL") + + __add__ = _binary_elwise("ADD") + __sub__ = _binary_elwise("SUB") + __mul__ = _binary_elwise("MUL") + __matmul__ = lambda self, other: _matmul(self, other) + __truediv__ = _binary_elwise("TRUE_DIV") + __floordiv__ = _binary_elwise("FLOOR_DIV") + __mod__ = _binary_elwise("MOD") + # __divmode__ + __pow__ = _binary_elwise("POW") + __lshift__ = _binary_elwise("SHL") + __rshift__ = _binary_elwise("SHR") + __and__ = _logical_binary_elwise("AND") + __or__ = _logical_binary_elwise("OR") + __xor__ = _logical_binary_elwise("XOR") + + __radd__ = _binary_elwise("ADD", rev=1) + __rsub__ = _binary_elwise("SUB", rev=1) + __rmul__ = _binary_elwise("MUL", rev=1) + __rmatmul__ = lambda self, other: _matmul(other, self) + __rtruediv__ = _binary_elwise("TRUE_DIV", rev=1) + __rfloordiv__ = _binary_elwise("FLOOR_DIV", rev=1) + __rmod__ = _binary_elwise("MOD", rev=1) + # __rdivmode__ + __rpow__ = _binary_elwise("POW", rev=1) + __rlshift__ = _binary_elwise("SHL", rev=1) + __rrshift__ = _binary_elwise("SHR", rev=1) + __rand__ = _logical_binary_elwise("AND", rev=1) + __ror__ = _logical_binary_elwise("OR", rev=1) + __rxor__ = _logical_binary_elwise("XOR", rev=1) + + __iadd__ = _inplace(__add__) + __isub__ = _inplace(__sub__) + __imul__ = _inplace(__mul__) + __imatmul__ = _inplace(__matmul__) + __itruediv__ = _inplace(__truediv__) + __ifloordiv__ = _inplace(__floordiv__) + __imod__ = _inplace(__mod__) + __ipow__ = _inplace(__pow__) + __ilshift__ = _inplace(__lshift__) + __irshift__ = _inplace(__rshift__) + __iand__ = _inplace(__and__) + __ior__ = _inplace(__or__) + __ixor__ = _inplace(__xor__) + + __index__ = lambda self: self.item().__index__() + __bool__ = lambda self: bool(self.item()) + __int__ = lambda self: int(self.item()) + __float__ = lambda self: float(self.item()) + __complex__ = lambda self: complex(self.item()) + + def __len__(self): + shape = self.shape + if shape: + return int(shape[0]) + raise TypeError("ndim is 0") + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + def __getitem__(self, index): + return _getitem(self, index) + + def __setitem__(self, index, value): + if index is not Ellipsis: + value = _setitem(self, index, value) + self._reset(value) + + __contains__ = _todo + + @property + def ndim(self): + return len(self.shape) + + @property + def size(self): + return np.prod(self.shape).item() + + @property + def T(self): + return self.transpose() + + def item(self, *args): + if not args: + assert self.size == 1 + return self.numpy().item() + return self[args].item() + + def tolist(self): + return self.numpy().tolist() + + def astype(self, dtype): + return utils.astype(self, dtype) + + def reshape(self, *args): + if len(args) == 1: + if isinstance(args[0], collections.Sequence): + args = args[0] + return _reshape(self, args) + + def broadcast(self, *args): + if len(args) == 1: + if isinstance(args[0], 
collections.Sequence): + args = args[0] + return _broadcast(self, args) + + def transpose(self, *args): + if not args: + args = reversed(range(self.ndim)) + elif len(args) == 1: + if isinstance(args[0], collections.Sequence): + args = args[0] + return _transpose(self, args) + + def flatten(self): + return self.reshape(-1) + + sum = _reduce("SUM") + prod = _reduce("PRODUCT") + min = _reduce("MIN") + max = _reduce("MAX") + mean = _reduce("MEAN") + + +class GenericTensorWrapper(ArrayMethodMixin, TensorWrapperBase): + def __init__(self, data): + self.__wrapped__ = data + + def _reset(self, other): + if not isinstance(other, __class__): + raise TypeError(type(other)) + self.__wrapped__ = other.__wrapped__ + return self + + @property + def dtype(self): + return self.__wrapped__.dtype + + @property + def shape(self): + return self.__wrapped__.shape + + @property + def device(self): + return self.__wrapped__.device + + def numpy(self): + return self.__wrapped__.numpy() + + +class TensorWrapper(GenericTensorWrapper): + def __init__(self, data, dtype=None, device=None): + if isinstance(data, TensorWrapperBase): + data = data.__wrapped__ + elif not isinstance(data, TensorBase): + assert data is not None, "Cannot init a tensor with data as None" + data = Tensor(as_raw_tensor(data, dtype=dtype, device=device)) + super().__init__(data) + + def _reset(self, other): + if isinstance(other, TensorWrapperBase): + self.__wrapped__ = other.__wrapped__ + elif isinstance(other, TensorBase): + self.__wrapped__ = other + else: + self._reset(type(self)(other, dtype=self.dtype, device=self.device)) + + def __repr__(self): + piece = "Tensor(" + with np.printoptions(precision=4, suppress=True): + piece += "{}".format(str(self.numpy())) + if self.dtype != np.float32: + piece += ", dtype={}".format(np.dtype(self.dtype).name) + piece += ", device={}".format(self.device) + ")" + return piece diff --git a/imperative/python/megengine/core/tensor/utils.py b/imperative/python/megengine/core/tensor/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a059ff8dfd116621ae5c16835357a182e01f0477 --- /dev/null +++ b/imperative/python/megengine/core/tensor/utils.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
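+#
+# Note on dtype_promotion below: promotion follows numpy's result_type, except
+# that float64/int64 results are rejected for tensor inputs, and a float64
+# result caused only by python scalars is demoted to float32. A hypothetical
+# sketch of the intended behaviour:
+#
+#     dtype_promotion([float32_tensor, 1.0])  # -> float32, not float64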
+import collections +from typing import Iterable, Union + +import numpy as np + +from ..ops import builtin +from ..ops.special import Const +from ..tensor.core import OpBase, TensorBase, TensorWrapperBase, apply + + +def dtype_promotion(raw_inputs): + def add_dtype(i): + if type(i) == int: + return np.array(i, dtype=np.int32) + if type(i) == float: + return np.array(i, dtype=np.float32) + if type(i) == bool: + return np.array(i, dtype=np.bool_) + return None + + scalar_inputs = [ + add_dtype(i) for i in raw_inputs if not hasattr(i, "dtype") and add_dtype(i) + ] + inputs = [i for i in raw_inputs if hasattr(i, "dtype")] + assert len(scalar_inputs + inputs) > 0 + dtype = np.result_type(*inputs) + dtype_all = np.result_type(*(inputs + scalar_inputs)) + assert ( + dtype != np.float64 and dtype != np.int64 + ), "unsupport dtype {} by dtype_promotion, please use explict type convert".format( + dtype + ) + if dtype_all == np.bool_: + for i in raw_inputs: + if not hasattr(i, "dtype") or i.dtype != np.bool_: + raise TypeError( + "bool dtype can not be operated with an element without bool dtype" + ) + if dtype_all == np.float64: + dtype_all = np.float32 + return dtype_all + + +def get_device(inputs): + device = None + for i in inputs: + if isinstance(i, (TensorWrapperBase, TensorBase)): + if device is None: + device = i.device + elif device != i.device: + raise ValueError("ambiguous device: {} vs {}".format(device, i.device)) + assert device is not None + return device + + +def concatenate(inputs, axis=0, *, device=None): + dtype = dtype_promotion(inputs) + device = get_device(inputs) + + def convert(x): + return convert_single_value(x, inputs, dtype=dtype) + + inputs = tuple(map(convert, inputs)) + (result,) = apply(builtin.Concat(axis=axis, comp_node=device.to_c()), *inputs) + return result + + +def astype(x, dtype): + dtype = np.dtype(dtype) + if x.dtype != dtype: + (x,) = apply(builtin.TypeCvt(param=dtype), x) + return x + + +def convert_single_value(v, inputs, *, dtype=None, device=None): + tensors = [i for i in inputs if isinstance(i, (TensorBase, TensorWrapperBase))] + assert len(tensors) > 0 + if isinstance(v, (TensorWrapperBase, TensorBase)): + v = astype(v, dtype) + else: + (v,) = Const(v, dtype=dtype, device=device)(*tensors) + return v + + +def convert_inputs(*args: TensorBase): + dtype = dtype_promotion(args) + device = get_device(args) + + def convert(value): + if value is None: + return value + return convert_single_value(value, args, dtype=dtype, device=device) + + return tuple(map(convert, args)) + + +def result_type(*args): + dtypes = [] + for i in args: + if isinstance(i, (TensorWrapperBase, TensorBase)): + dtypes.append(i.dtype) + continue + try: + dtypes.append(np.dtype(i)) + except TypeError: + pass + return np.result_type(*dtypes) + + +def isscalar(x): + try: + return x.ndim == 0 + except: + pass + return np.isscalar(x) + + +def astensor1d(x, *reference, dtype=None, device=None): + """ + Convert something to 1D tensor. 
The following types are supported:
+    * sequence of scalar literal / tensor
+    * numpy array
+    * tensor (returned as is, regardless of dtype and device)
+    """
+    try:
+        ndim = x.ndim
+    except AttributeError:
+        pass
+    else:
+        if ndim != 1:
+            raise ValueError("ndim != 1: %d" % ndim)
+        if not isinstance(x, (TensorBase, TensorWrapperBase)):
+            (x,) = Const(x, dtype=dtype, device=device)(*reference)
+        return x
+
+    if not isinstance(x, collections.Sequence):
+        raise TypeError
+
+    if any(isinstance(i, (TensorBase, TensorWrapperBase)) for i in x):
+        x = concatenate(x, device=device)
+        if dtype is not None:
+            x = astype(x, dtype)
+        return x
+
+    (x,) = Const(x, dtype=dtype, device=device)(*reference)
+    return x
diff --git a/imperative/python/megengine/data/__init__.py b/imperative/python/megengine/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b1e0d556e66b1c71124389df749537b4cc7452c
--- /dev/null
+++ b/imperative/python/megengine/data/__init__.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .collator import Collator
+from .dataloader import DataLoader
+from .sampler import (
+    Infinite,
+    RandomSampler,
+    ReplacementSampler,
+    Sampler,
+    SequentialSampler,
+)
diff --git a/imperative/python/megengine/data/_queue.py b/imperative/python/megengine/data/_queue.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9e328c65c56e4f4ba736b510176677b6c735c32
--- /dev/null
+++ b/imperative/python/megengine/data/_queue.py
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import binascii
+import os
+import queue
+import subprocess
+from multiprocessing import Queue
+
+import pyarrow
+import pyarrow.plasma as plasma
+
+MGE_PLASMA_MEMORY = int(os.environ.get("MGE_PLASMA_MEMORY", 4000000000))  # 4GB
+
+# Each process only needs to start one plasma store, so we set it as a global variable.
+# TODO: how to share between different processes?
+MGE_PLASMA_STORE_MANAGER = None
+
+
+def _clear_plasma_store():
+    # `_PlasmaStoreManager.__del__` will not be called automatically in a subprocess,
+    # so this function should be called explicitly
+    global MGE_PLASMA_STORE_MANAGER
+    if MGE_PLASMA_STORE_MANAGER is not None:
+        del MGE_PLASMA_STORE_MANAGER
+        MGE_PLASMA_STORE_MANAGER = None
+
+
+class _PlasmaStoreManager:
+    __initialized = False
+
+    def __init__(self):
+        self.socket_name = "/tmp/mge_plasma_{}".format(
+            binascii.hexlify(os.urandom(8)).decode()
+        )
+        debug_flag = bool(os.environ.get("MGE_DATALOADER_PLASMA_DEBUG", 0))
+        # NOTE: this is a hack. Using `plasma_store` directly may make it difficult
+        # for the subprocess to handle exceptions raised in `plasma-store-server`,
+        # because `plasma_store` is just a wrapper of `plasma-store-server`, which
+        # uses `os.execv` to call the executable `plasma-store-server`.
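+        # The Popen below therefore launches the server binary directly, roughly:
+        #     plasma-store-server -s /tmp/mge_plasma_<hex> -m <MGE_PLASMA_MEMORY>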
+ cmd_path = os.path.join(pyarrow.__path__[0], "plasma-store-server") + self.plasma_store = subprocess.Popen( + [cmd_path, "-s", self.socket_name, "-m", str(MGE_PLASMA_MEMORY),], + stdout=None if debug_flag else subprocess.DEVNULL, + stderr=None if debug_flag else subprocess.DEVNULL, + ) + self.__initialized = True + + def __del__(self): + if self.__initialized and self.plasma_store.returncode is None: + self.plasma_store.kill() + + +class PlasmaShmQueue: + def __init__(self, maxsize: int = 0): + r"""Use pyarrow in-memory plasma store to implement shared memory queue. + + Compared to native `multiprocess.Queue`, `PlasmaShmQueue` avoid pickle/unpickle + and communication overhead, leading to better performance in multi-process + application. + + :type maxsize: int + :param maxsize: maximum size of the queue, `None` means no limit. (default: ``None``) + """ + + # Lazy start the plasma store manager + global MGE_PLASMA_STORE_MANAGER + if MGE_PLASMA_STORE_MANAGER is None: + try: + MGE_PLASMA_STORE_MANAGER = _PlasmaStoreManager() + except Exception as e: + err_info = ( + "Please make sure pyarrow installed correctly!\n" + "You can try reinstall pyarrow and see if you can run " + "`plasma_store -s /tmp/mge_plasma_xxx -m 1000` normally." + ) + raise RuntimeError( + "Exception happened in starting plasma_store: {}\n" + "Tips: {}".format(str(e), err_info) + ) + + self.socket_name = MGE_PLASMA_STORE_MANAGER.socket_name + + # TODO: how to catch the exception happened in `plasma.connect`? + self.client = None + + # Used to store the header for the data.(ObjectIDs) + self.queue = Queue(maxsize) # type: Queue + + def put(self, data, block=True, timeout=None): + if self.client is None: + self.client = plasma.connect(self.socket_name) + try: + object_id = self.client.put(data) + except plasma.PlasmaStoreFull: + raise RuntimeError("plasma store out of memory!") + try: + self.queue.put(object_id, block, timeout) + except queue.Full: + self.client.delete([object_id]) + raise queue.Full + + def get(self, block=True, timeout=None): + if self.client is None: + self.client = plasma.connect(self.socket_name) + object_id = self.queue.get(block, timeout) + if not self.client.contains(object_id): + raise RuntimeError( + "ObjectID: {} not found in plasma store".format(object_id) + ) + data = self.client.get(object_id) + self.client.delete([object_id]) + return data + + def qsize(self): + return self.queue.qsize() + + def empty(self): + return self.queue.empty() + + def join(self): + self.queue.join() + + def disconnect_client(self): + if self.client is not None: + self.client.disconnect() + + def close(self): + self.queue.close() + self.disconnect_client() + _clear_plasma_store() + + def cancel_join_thread(self): + self.queue.cancel_join_thread() diff --git a/imperative/python/megengine/data/collator.py b/imperative/python/megengine/data/collator.py new file mode 100644 index 0000000000000000000000000000000000000000..952fc39881eb092c5caa786e83ca443f0a1d818b --- /dev/null +++ b/imperative/python/megengine/data/collator.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016- Facebook, Inc (Adam Paszke) +# Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +# Copyright (c) 2011-2013 NYU (Clement Farabet) +# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, 
Jason Weston)
+# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
+# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
+# ---------------------------------------------------------------------
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+# This file has been modified by Megvii ("Megvii Modifications").
+# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved.
+# ----------------------------------------------------------------------
+import collections.abc
+import re
+
+import numpy as np
+
+np_str_obj_array_pattern = re.compile(r"[aO]")
+default_collate_err_msg_format = (
+    "default_collator: inputs must contain numpy arrays, numbers, "
+    "Unicode strings, bytes, dicts or lists; found {}"
+)
+
+
+class Collator:
+    r"""
+    Used for merging a list of samples into a mini-batch of Tensor(s),
+    when loading batched data from a dataset.
+    Modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py
+    """
+
+    def apply(self, inputs):
+        """
+        input : sequence_N(tuple(CHW, C, CK))
+        output : tuple(NCHW, NC, NCK)
+        """
+        elem = inputs[0]
+        elem_type = type(elem)
+        if (
+            elem_type.__module__ == "numpy"
+            and elem_type.__name__ != "str_"
+            and elem_type.__name__ != "string_"
+        ):
+            elem = inputs[0]
+            if elem_type.__name__ == "ndarray":
+                # array of string classes and object
+                if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                    raise TypeError(default_collate_err_msg_format.format(elem.dtype))
+
+                return np.ascontiguousarray(np.stack(inputs))
+            elif elem.shape == ():  # scalars
+                return np.array(inputs)
+        elif isinstance(elem, float):
+            return np.array(inputs, dtype=np.float64)
+        elif isinstance(elem, int):
+            return np.array(inputs)
+        elif isinstance(elem, (str, bytes)):
+            return inputs
+        elif isinstance(elem, collections.abc.Mapping):
+            return {key: self.apply([d[key] for d in inputs]) for key in elem}
+        elif isinstance(elem, tuple) and hasattr(elem, "_fields"):  # namedtuple
+            return elem_type(*(self.apply(samples) for samples in zip(*inputs)))
+        elif isinstance(elem, collections.abc.Sequence):
+            transposed = zip(*inputs)
+            return [self.apply(samples) for samples in transposed]
+
+        raise TypeError(default_collate_err_msg_format.format(elem_type))
diff --git a/imperative/python/megengine/data/dataloader.py b/imperative/python/megengine/data/dataloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fd3482df727ddc84970bce239ebfd60990db6e1
--- /dev/null
+++ b/imperative/python/megengine/data/dataloader.py
@@ -0,0 +1,500 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
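+#
+# A minimal usage sketch (the dataset and batch size are illustrative):
+#
+#     dataset = ArrayDataset(images, labels)
+#     dataloader = DataLoader(
+#         dataset, sampler=SequentialSampler(dataset, batch_size=64), num_workers=4
+#     )
+#     for batch_images, batch_labels in dataloader:
+#         ...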
+import collections
+import math
+import multiprocessing
+import queue
+import random
+import time
+
+import numpy as np
+
+from ..logger import get_logger
+from ..random.rng import _random_seed_generator
+from .collator import Collator
+from .dataset import Dataset
+from .sampler import Sampler, SequentialSampler
+from .transform import PseudoTransform, Transform
+
+logger = get_logger(__name__)
+
+
+MP_QUEUE_GET_TIMEOUT = 5
+
+
+class DataLoader:
+    __initialized = False
+
+    def __init__(
+        self,
+        dataset: Dataset,
+        sampler: Sampler = None,
+        transform: Transform = None,
+        collator: Collator = None,
+        num_workers: int = 0,
+        timeout: int = 0,
+        divide: bool = False,
+    ):
+        r"""Provides a convenient way to iterate on a given dataset.
+
+        `DataLoader` combines a dataset with a sampler, a transform and a collator,
+        making it flexible to continually fetch minibatches from a dataset.
+
+        :type dataset: Dataset
+        :param dataset: dataset from which to load the minibatch.
+        :type sampler: Sampler
+        :param sampler: defines the strategy to sample data from the dataset.
+        :type transform: Transform
+        :param transform: defines the transforming strategy for a sampled batch.
+            (default: ``None``)
+        :type collator: Collator
+        :param collator: defines the merging strategy for a transformed batch.
+            (default: ``None``)
+        :type num_workers: int
+        :param num_workers: the number of sub-processes used to load, transform and
+            collate the batch. ``0`` means loading in a single process. (default: ``0``)
+        :type timeout: int
+        :param timeout: if positive, the timeout value (in seconds) for collecting a
+            batch from workers. (default: ``0``)
+        :type divide: bool
+        :param divide: defines the paralleling strategy in multi-processing mode.
+            ``True`` means one batch is divided into :attr:`num_workers` pieces, and
+            the workers process these pieces in parallel. ``False`` means
+            different sub-processes process different batches. (default: ``False``)
+        """
+
+        if num_workers < 0:
+            raise ValueError("num_workers should not be negative")
+
+        if timeout < 0:
+            raise ValueError("timeout should not be negative")
+
+        if divide and num_workers <= 1:
+            raise ValueError("divide should not be set to True when num_workers <= 1")
+
+        self.dataset = dataset
+        self.num_workers = num_workers
+        self.timeout = timeout
+
+        self.divide = divide
+
+        if sampler is None:
+            self.sampler = SequentialSampler(dataset, batch_size=1, drop_last=False)
+        else:
+            self.sampler = sampler
+
+        if divide:
+            if self.sampler.batch_size <= self.num_workers:
+                raise ValueError(
+                    "batch size must not be smaller than num_workers in divide mode."
+                )
+            elif self.sampler.batch_size % self.num_workers:
+                logger.warning(
+                    "batch size is not divisible by num_workers, may lose performance in divide mode."
+ ) + + if transform is None: + self.transform = PseudoTransform() + else: + self.transform = transform + + if collator is None: + self.collator = Collator() + else: + self.collator = collator + + self.__initialized = True + + def __iter__(self): + if self.num_workers == 0: + return _SerialDataLoaderIter(self) + else: + return _ParallelDataLoaderIter(self) + + def __len__(self): + return len(self.sampler) + + +class _BaseDataLoaderIter: + def __init__(self, loader): + self.dataset = loader.dataset + self.sampler = loader.sampler + self.seed = _random_seed_generator().__next__() + self.transform = loader.transform + self.collator = loader.collator + self.num_workers = loader.num_workers + self.timeout = loader.timeout + self.divide = loader.divide + self.num_processed = 0 + + def _get_next_batch(self): + raise NotImplementedError + + def __len__(self): + return len(self.sampler) + + def __iter__(self): + return self + + def __next__(self): + if self.num_processed >= len(self): + raise StopIteration + minibatch = self._get_next_batch() + self.num_processed += 1 + return minibatch + + +class _SerialDataLoaderIter(_BaseDataLoaderIter): + def __init__(self, loader): + super(_SerialDataLoaderIter, self).__init__(loader) + self.indices_iter = iter(self.sampler) + + def _get_next_batch(self): + indices = next(self.indices_iter) + items = [self.dataset[idx] for idx in indices] + trans_items = self.transform.apply_batch(items) + return self.collator.apply(trans_items) + + +class _ParallelDataLoaderIter(_BaseDataLoaderIter): + __initialized = False + + def __init__(self, loader): + super(_ParallelDataLoaderIter, self).__init__(loader) + + self.task_queues = [ + multiprocessing.Queue(maxsize=2) for _ in range(self.num_workers) + ] + + self.feed_batch_idx = multiprocessing.Value("i", 0) + self.target_batch_idx = multiprocessing.Value("i", 0) + self.shutdown_flag = multiprocessing.Value("i", 0) + + self.trans_data_queues = [ + multiprocessing.Queue(maxsize=1) for _ in range(self.num_workers) + ] + + # use shared-memory queue implemented by pyarrow plasma store. + from ._queue import PlasmaShmQueue + + self.batch_queue = PlasmaShmQueue(maxsize=2) + + self.task_feeding_worker = multiprocessing.Process( + target=_task_feeding_loop, + args=( + iter(self.sampler), + self.task_queues, + self.num_workers, + self.divide, + self.shutdown_flag, + self.feed_batch_idx, + ), + daemon=True, + ) + self.task_feeding_worker.start() + + self.workers = [] + for worker_id in range(self.num_workers): + worker = multiprocessing.Process( + target=_worker_loop, + args=( + self.dataset, + self.task_queues[worker_id], + self.trans_data_queues[worker_id], + self.transform, + self.seed + worker_id + 1, + self.shutdown_flag, + ), + daemon=True, + ) + worker.start() + self.workers.append(worker) + + if self.divide: + self.data_collecting_worker = multiprocessing.Process( + target=_data_gathering_loop, + args=( + self.trans_data_queues, + self.batch_queue, + self.collator, + len(self), + self.num_workers, + self.shutdown_flag, + self.target_batch_idx, + ), + daemon=True, + ) + else: + self.data_collecting_worker = multiprocessing.Process( + target=_data_selecting_loop, + args=( + self.trans_data_queues, + self.batch_queue, + self.collator, + len(self), + self.num_workers, + self.shutdown_flag, + self.target_batch_idx, + ), + daemon=True, + ) + self.data_collecting_worker.start() + + self.__initialized = True + + def _check_workers(self): + # Check the status of each worker. 
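+        # A worker that is dead with a nonzero exitcode has crashed; raise
+        # immediately instead of blocking forever on a queue it will never fill.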
+        if not self.data_collecting_worker.is_alive():
+            exitcode = self.data_collecting_worker.exitcode
+            if exitcode != 0:
+                raise RuntimeError("data collecting worker died. {}".format(exitcode))
+
+        if not self.task_feeding_worker.is_alive():
+            exitcode = self.task_feeding_worker.exitcode
+            if exitcode != 0:
+                raise RuntimeError("task feeding worker died. {}".format(exitcode))
+
+        for worker_id, worker in enumerate(self.workers):
+            if not worker.is_alive():
+                exitcode = worker.exitcode
+                if exitcode != 0:
+                    raise RuntimeError("worker:{} died. {}".format(worker_id, exitcode))
+
+        logger.debug("all workers are alive.")
+
+    def _try_get_next_batch(self):
+        start_time = time.time()
+        while True:
+            self._check_workers()
+            try:
+                return self.batch_queue.get(timeout=1)
+            except queue.Empty:
+                logger.debug("batch queue empty!")
+                waited_time = time.time() - start_time
+                if self.timeout > 0:
+                    if waited_time > self.timeout:
+                        raise RuntimeError("get_next_batch timeout!")
+
+    def _get_next_batch(self):
+        batch_data = self._try_get_next_batch()
+        return batch_data
+
+    def _shutdown(self):
+        with self.shutdown_flag.get_lock():
+            self.shutdown_flag.value = 1
+
+        if self.task_feeding_worker.is_alive():
+            self.task_feeding_worker.terminate()
+            self.task_feeding_worker.join()
+
+        if self.data_collecting_worker.is_alive():
+            self.data_collecting_worker.terminate()
+            self.data_collecting_worker.join()
+
+        for worker in self.workers:
+            if worker.is_alive():
+                worker.terminate()
+                worker.join()
+
+        for q in self.trans_data_queues:
+            q.cancel_join_thread()
+            q.close()
+
+        for q in self.task_queues:
+            q.cancel_join_thread()
+            q.close()
+
+        self.batch_queue.cancel_join_thread()
+        self.batch_queue.close()
+
+    def __del__(self):
+        if self.__initialized:
+            self._shutdown()
+
+
+def _task_feeding_loop(
+    indices_iter, task_queues, num_workers, divide, shutdown_flag, feed_batch_idx
+):
+    # Feed the indices into the task queues
+    while True:
+        if shutdown_flag.value == 1:
+            break
+        batch_idx = feed_batch_idx.value
+        try:
+            indices = next(indices_iter)
+        except StopIteration:
+            break
+        if divide:
+            # make sure all task_queues are ready for put
+            while any([q.full() for q in task_queues]):
+                if shutdown_flag.value == 1:
+                    return
+            # divide into small pieces, feed to different workers.
+            sub_num = math.ceil(len(indices) / num_workers)
+            for worker_id in range(num_workers):
+                sub_indices = indices[worker_id * sub_num : (worker_id + 1) * sub_num]
+                task_queues[worker_id].put((batch_idx, sub_indices))
+        else:
+            # distribute tasks to different workers uniformly.
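+            # e.g. with num_workers=4, batches 0,1,2,3,4,... go to workers 0,1,2,3,0,...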
+ target_id = batch_idx % num_workers + while task_queues[target_id].full(): + if shutdown_flag.value == 1: + return + task_queues[target_id].put((batch_idx, indices)) + with feed_batch_idx.get_lock(): + feed_batch_idx.value += 1 + + +def _worker_loop(dataset, task_queue, trans_data_queue, transform, seed, shutdown_flag): + # Get dataset items and do the transform + random.seed(seed) + np.random.seed(seed) + while True: + if shutdown_flag.value == 1: + break + try: + batch_idx, indices = task_queue.get(timeout=MP_QUEUE_GET_TIMEOUT) + except queue.Empty: + continue + if len(indices) > 0: + items = [dataset[idx] for idx in indices] + trans_items = transform.apply_batch(items) + else: + # in case of incomplete last batch + trans_items = () + while True: + try: + trans_data_queue.put((batch_idx, trans_items), timeout=1) + break + except queue.Full: + if shutdown_flag.value == 1: + break + logger.debug("batch part queue is full!") + + +def _data_gathering_loop( + trans_data_queues, + batch_queue, + collator, + length, + num_workers, + shutdown_flag, + target_idx, +): + # Gathering the small pieces of batch data into full batch data + while True: + if shutdown_flag.value == 1: + break + + target_batch_idx = target_idx.value + + if target_batch_idx >= length: + break + + full_trans_items = [] + for worker_id in range(num_workers): + while True: + try: + batch_idx, trans_items = trans_data_queues[worker_id].get( + timeout=MP_QUEUE_GET_TIMEOUT + ) + break + except queue.Empty: + if shutdown_flag.value == 1: + break + logger.debug( + "worker:{} data queue get timeout! target batch idx:{}".format( + worker_id, target_batch_idx + ) + ) + if batch_idx != target_batch_idx: + raise RuntimeError( + "Unexperted batch_idx in data gathering loop. worker_id:{}.".format( + worker_id + ) + ) + else: + full_trans_items.extend(trans_items) + + # Merge different parts into a batch. + full_batch = collator.apply(full_trans_items) + + while True: + try: + batch_queue.put(full_batch, timeout=1) + break + except queue.Full: + if shutdown_flag.value == 1: + break + logger.debug("batch queue is full!") + + with target_idx.get_lock(): + target_idx.value += 1 + + batch_queue.disconnect_client() + + +def _data_selecting_loop( + trans_data_queues, + batch_queue, + collator, + length, + num_workers, + shutdown_flag, + target_idx, +): + # Make sure that batch is generated exactly with the same order as generated indices + while True: + if shutdown_flag.value == 1: + break + + target_batch_idx = target_idx.value + + if target_batch_idx >= length: + break + + target_worker_id = target_batch_idx % num_workers + while True: + try: + batch_idx, trans_items = trans_data_queues[target_worker_id].get( + timeout=MP_QUEUE_GET_TIMEOUT + ) + batch_data = collator.apply(trans_items) + break + except queue.Empty: + if shutdown_flag.value == 1: + break + logger.debug( + "worker:{} data queue get timeout! 
target batch idx:{}".format(
+                        target_worker_id, target_batch_idx
+                    )
+                )
+
+        if batch_idx != target_batch_idx:
+            raise RuntimeError(
+                "batch_idx {} mismatch the target_batch_idx {}".format(
+                    batch_idx, target_batch_idx
+                )
+            )
+
+        while True:
+            try:
+                batch_queue.put(batch_data, timeout=1)
+                break
+            except queue.Full:
+                if shutdown_flag.value == 1:
+                    break
+                logger.debug("batch queue is full!")
+
+        with target_idx.get_lock():
+            target_idx.value += 1
+
+    batch_queue.disconnect_client()
diff --git a/imperative/python/megengine/data/dataset/__init__.py b/imperative/python/megengine/data/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b70d22111ba33a749a8c90491b2db52a700ed44
--- /dev/null
+++ b/imperative/python/megengine/data/dataset/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .meta_dataset import ArrayDataset, Dataset, MapDataset, StreamDataset
+from .vision import *
diff --git a/imperative/python/megengine/data/dataset/meta_dataset.py b/imperative/python/megengine/data/dataset/meta_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..4415a4274d883ce97a3d9c4102f90323a9c60820
--- /dev/null
+++ b/imperative/python/megengine/data/dataset/meta_dataset.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from abc import ABC, abstractmethod
+from typing import Tuple
+
+
+class Dataset(ABC):
+    r"""
+    An abstract class for all datasets.
+    """
+
+    @abstractmethod
+    def __init__(self):
+        pass
+
+
+class MapDataset(Dataset):
+    r"""
+    An abstract class for map-style data;
+    __getitem__ and __len__ methods are additionally needed.
+    """
+
+    @abstractmethod
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def __getitem__(self, index):
+        pass
+
+    @abstractmethod
+    def __len__(self):
+        pass
+
+
+class StreamDataset(Dataset):
+    r"""
+    An abstract class for stream-style data;
+    an __iter__ method is additionally needed.
+    """
+
+    @abstractmethod
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def __iter__(self):
+        pass
+
+
+class ArrayDataset(MapDataset):
+    def __init__(self, *arrays):
+        r"""
+        ArrayDataset is a dataset for numpy array data; one or more numpy arrays
+        are needed to initiate the dataset, and the dimensions representing the
+        sample number are expected to be the same.
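+
+        A minimal sketch of the intended use (array contents are illustrative):
+
+        .. code-block:: python
+
+            import numpy as np
+
+            data = np.random.random((10, 3, 32, 32)).astype(np.float32)
+            label = np.random.randint(0, 10, 10).astype(np.int32)
+            dataset = ArrayDataset(data, label)
+            image, image_category = dataset[0]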
+ """ + super().__init__() + if not all(len(arrays[0]) == len(array) for array in arrays): + raise ValueError("lengths of input arrays are inconsistent") + self.arrays = arrays + + def __getitem__(self, index: int) -> Tuple: + return tuple(array[index] for array in self.arrays) + + def __len__(self) -> int: + return len(self.arrays[0]) diff --git a/imperative/python/megengine/data/dataset/vision/__init__.py b/imperative/python/megengine/data/dataset/vision/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dd2b0fc302dac854b8880a7894a090ddd3a18f08 --- /dev/null +++ b/imperative/python/megengine/data/dataset/vision/__init__.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .cifar import CIFAR10, CIFAR100 +from .cityscapes import Cityscapes +from .coco import COCO +from .folder import ImageFolder +from .imagenet import ImageNet +from .meta_vision import VisionDataset +from .mnist import MNIST +from .objects365 import Objects365 +from .voc import PascalVOC diff --git a/imperative/python/megengine/data/dataset/vision/cifar.py b/imperative/python/megengine/data/dataset/vision/cifar.py new file mode 100644 index 0000000000000000000000000000000000000000..9ce73688969d707c48245a83dce30759c33bc561 --- /dev/null +++ b/imperative/python/megengine/data/dataset/vision/cifar.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import os +import pickle +import tarfile +from typing import Tuple + +import numpy as np + +from ....logger import get_logger +from .meta_vision import VisionDataset +from .utils import _default_dataset_root, load_raw_data_from_url + +logger = get_logger(__name__) + + +class CIFAR10(VisionDataset): + r""" ``Dataset`` for CIFAR10 meta data + """ + + url_path = "http://www.cs.utoronto.ca/~kriz/" + raw_file_name = "cifar-10-python.tar.gz" + raw_file_md5 = "c58f30108f718f92721af3b95e74349a" + raw_file_dir = "cifar-10-batches-py" + train_batch = [ + "data_batch_1", + "data_batch_2", + "data_batch_3", + "data_batch_4", + "data_batch_5", + ] + test_batch = ["test_batch"] + meta_info = {"name": "batches.meta"} + + def __init__( + self, + root: str = None, + train: bool = True, + download: bool = True, + timeout: int = 500, + ): + super().__init__(root, order=("image", "image_category")) + + self.timeout = timeout + + # process the root path + if root is None: + self.root = self._default_root + if not os.path.exists(self.root): + os.makedirs(self.root) + else: + self.root = root + if not os.path.exists(self.root): + if download: + logger.debug( + "dir %s does not exist, will be automatically created", + self.root, + ) + os.makedirs(self.root) + else: + raise ValueError("dir %s does not exist" % self.root) + + self.target_file = os.path.join(self.root, self.raw_file_dir) + + # check existence of target pickle dir, if exists load the + # pickle file no matter what download is set + if os.path.exists(self.target_file): + if train: + self.arrays = self.bytes2array(self.train_batch) + else: + self.arrays = self.bytes2array(self.test_batch) + else: + if download: + self.download() + if train: + self.arrays = self.bytes2array(self.train_batch) + else: + self.arrays = self.bytes2array(self.test_batch) + else: + raise ValueError( + "dir does not contain target file %s, please set download=True" + % (self.target_file) + ) + + def __getitem__(self, index: int) -> Tuple: + return tuple(array[index] for array in self.arrays) + + def __len__(self) -> int: + return len(self.arrays[0]) + + @property + def _default_root(self): + return os.path.join(_default_dataset_root(), self.__class__.__name__) + + @property + def meta(self): + meta_path = os.path.join(self.root, self.raw_file_dir, self.meta_info["name"]) + with open(meta_path, "rb") as f: + meta = pickle.load(f, encoding="bytes") + return meta + + def download(self): + url = self.url_path + self.raw_file_name + load_raw_data_from_url( + url, self.raw_file_name, self.raw_file_md5, self.root, self.timeout + ) + self.process() + + def untar(self, file_path, dirs): + assert file_path.endswith(".tar.gz") + logger.debug("untar file %s to %s", file_path, dirs) + t = tarfile.open(file_path) + t.extractall(path=dirs) + + def bytes2array(self, filenames): + data = [] + label = [] + for filename in filenames: + path = os.path.join(self.root, self.raw_file_dir, filename) + logger.debug("unpickle file %s", path) + with open(path, "rb") as fo: + dic = pickle.load(fo, encoding="bytes") + batch_data = dic[b"data"].reshape(-1, 3, 32, 32).transpose((0, 2, 3, 1)) + data.extend(list(batch_data[..., [2, 1, 0]])) + label.extend(dic[b"labels"]) + label = np.array(label, dtype=np.int32) + return (data, label) + + def process(self): + logger.info("process raw data ...") + self.untar(os.path.join(self.root, self.raw_file_name), self.root) + + +class CIFAR100(CIFAR10): + url_path = "http://www.cs.utoronto.ca/~kriz/" + raw_file_name = "cifar-100-python.tar.gz" + raw_file_md5 = 
"eb9058c3a382ffc7106e4002c42a8d85" + raw_file_dir = "cifar-100-python" + train_batch = ["train"] + test_batch = ["test"] + meta_info = {"name": "meta"} + + @property + def meta(self): + meta_path = os.path.join(self.root, self.raw_file_dir, self.meta_info["name"]) + with open(meta_path, "rb") as f: + meta = pickle.load(f, encoding="bytes") + return meta + + def bytes2array(self, filenames): + data = [] + fine_label = [] + coarse_label = [] + for filename in filenames: + path = os.path.join(self.root, self.raw_file_dir, filename) + logger.debug("unpickle file %s", path) + with open(path, "rb") as fo: + dic = pickle.load(fo, encoding="bytes") + batch_data = dic[b"data"].reshape(-1, 3, 32, 32).transpose((0, 2, 3, 1)) + data.extend(list(batch_data[..., [2, 1, 0]])) + fine_label.extend(dic[b"fine_labels"]) + coarse_label.extend(dic[b"coarse_labels"]) + fine_label = np.array(fine_label, dtype=np.int32) + coarse_label = np.array(coarse_label, dtype=np.int32) + return data, fine_label, coarse_label diff --git a/imperative/python/megengine/data/dataset/vision/cityscapes.py b/imperative/python/megengine/data/dataset/vision/cityscapes.py new file mode 100644 index 0000000000000000000000000000000000000000..aa05ac92f5d814a5b936cf4bbe7fffaccedbc838 --- /dev/null +++ b/imperative/python/megengine/data/dataset/vision/cityscapes.py @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# --------------------------------------------------------------------- +# Part of the following code in this file refs to torchvision +# BSD 3-Clause License +# +# Copyright (c) Soumith Chintala 2016, +# All rights reserved. +# --------------------------------------------------------------------- +import json +import os + +import cv2 +import numpy as np + +from .meta_vision import VisionDataset + + +class Cityscapes(VisionDataset): + r"""`Cityscapes `_ Dataset. 
+ """ + + supported_order = ( + "image", + "mask", + "info", + ) + + def __init__(self, root, image_set, mode, *, order=None): + super().__init__(root, order=order, supported_order=self.supported_order) + + city_root = self.root + if not os.path.isdir(city_root): + raise RuntimeError("Dataset not found or corrupted.") + + self.mode = mode + self.images_dir = os.path.join(city_root, "leftImg8bit", image_set) + self.masks_dir = os.path.join(city_root, self.mode, image_set) + self.images, self.masks = [], [] + # self.target_type = ["instance", "semantic", "polygon", "color"] + + # for semantic segmentation + if mode == "gtFine": + valid_modes = ("train", "test", "val") + else: + valid_modes = ("train", "train_extra", "val") + + for city in os.listdir(self.images_dir): + img_dir = os.path.join(self.images_dir, city) + mask_dir = os.path.join(self.masks_dir, city) + for file_name in os.listdir(img_dir): + mask_name = "{}_{}".format( + file_name.split("_leftImg8bit")[0], + self._get_target_suffix(self.mode, "semantic"), + ) + self.images.append(os.path.join(img_dir, file_name)) + self.masks.append(os.path.join(mask_dir, mask_name)) + + def __getitem__(self, index): + target = [] + for k in self.order: + if k == "image": + image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) + target.append(image) + elif k == "mask": + mask = cv2.imread(self.masks[index], cv2.IMREAD_GRAYSCALE) + mask = self._trans_mask(mask) + mask = mask[:, :, np.newaxis] + target.append(mask) + elif k == "info": + if image is None: + image = cv2.imread(self.images[index], cv2.IMREAD_COLOR) + info = [image.shape[0], image.shape[1], self.images[index]] + target.append(info) + else: + raise NotImplementedError + + return tuple(target) + + def __len__(self): + return len(self.images) + + def _trans_mask(self, mask): + trans_labels = [ + 7, + 8, + 11, + 12, + 13, + 17, + 19, + 20, + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 31, + 32, + 33, + ] + label = np.ones(mask.shape) * 255 + for i, tl in enumerate(trans_labels): + label[mask == tl] = i + return label.astype(np.uint8) + + def _get_target_suffix(self, mode, target_type): + if target_type == "instance": + return "{}_instanceIds.png".format(mode) + elif target_type == "semantic": + return "{}_labelIds.png".format(mode) + elif target_type == "color": + return "{}_color.png".format(mode) + else: + return "{}_polygons.json".format(mode) + + def _load_json(self, path): + with open(path, "r") as file: + data = json.load(file) + return data + + class_names = ( + "road", + "sidewalk", + "building", + "wall", + "fence", + "pole", + "traffic light", + "traffic sign", + "vegetation", + "terrain", + "sky", + "person", + "rider", + "car", + "truck", + "bus", + "train", + "motorcycle", + "bicycle", + ) diff --git a/imperative/python/megengine/data/dataset/vision/coco.py b/imperative/python/megengine/data/dataset/vision/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d247e52b4f6567d03dd390864ef5b9c1ee4f600c --- /dev/null +++ b/imperative/python/megengine/data/dataset/vision/coco.py @@ -0,0 +1,366 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# --------------------------------------------------------------------- +# Part of the following code in this file refs to maskrcnn-benchmark +# MIT License +# +# Copyright (c) 2018 Facebook +# --------------------------------------------------------------------- +import json +import os +from collections import defaultdict + +import cv2 +import numpy as np + +from .meta_vision import VisionDataset + +min_keypoints_per_image = 10 + + +def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + +def has_valid_annotation(anno, order): + # if it"s empty, there is no annotation + if len(anno) == 0: + return False + if "boxes" in order or "boxes_category" in order: + if "bbox" not in anno[0]: + return False + if "keypoints" in order: + if "keypoints" not in anno[0]: + return False + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) < min_keypoints_per_image: + return False + return True + + +class COCO(VisionDataset): + r"""`MS COCO `_ Dataset. + """ + + supported_order = ( + "image", + "boxes", + "boxes_category", + "keypoints", + # TODO: need to check + # "polygons", + "info", + ) + + def __init__( + self, root, ann_file, remove_images_without_annotations=False, *, order=None + ): + super().__init__(root, order=order, supported_order=self.supported_order) + + with open(ann_file, "r") as f: + dataset = json.load(f) + + self.imgs = dict() + for img in dataset["images"]: + # for saving memory + if "license" in img: + del img["license"] + if "coco_url" in img: + del img["coco_url"] + if "date_captured" in img: + del img["date_captured"] + if "flickr_url" in img: + del img["flickr_url"] + self.imgs[img["id"]] = img + + self.img_to_anns = defaultdict(list) + for ann in dataset["annotations"]: + # for saving memory + if ( + "boxes" not in self.order + and "boxes_category" not in self.order + and "bbox" in ann + ): + del ann["bbox"] + if "polygons" not in self.order and "segmentation" in ann: + del ann["segmentation"] + self.img_to_anns[ann["image_id"]].append(ann) + + self.cats = dict() + for cat in dataset["categories"]: + self.cats[cat["id"]] = cat + + self.ids = list(sorted(self.imgs.keys())) + + # filter images without detection annotations + if remove_images_without_annotations: + ids = [] + for img_id in self.ids: + anno = self.img_to_anns[img_id] + # filter crowd annotations + anno = [obj for obj in anno if obj["iscrowd"] == 0] + anno = [ + obj for obj in anno if obj["bbox"][2] > 0 and obj["bbox"][3] > 0 + ] + if has_valid_annotation(anno, order): + ids.append(img_id) + self.img_to_anns[img_id] = anno + else: + del self.imgs[img_id] + del self.img_to_anns[img_id] + self.ids = ids + + self.json_category_id_to_contiguous_id = { + v: i + 1 for i, v in enumerate(self.cats.keys()) + } + + self.contiguous_category_id_to_json_id = { + v: k for k, v in self.json_category_id_to_contiguous_id.items() + } + + def __getitem__(self, index): + img_id = self.ids[index] + anno = self.img_to_anns[img_id] + + target = [] + for k in self.order: + if k == "image": + file_name = self.imgs[img_id]["file_name"] + path = os.path.join(self.root, file_name) + image = cv2.imread(path, cv2.IMREAD_COLOR) + target.append(image) + elif k == "boxes": + boxes = [obj["bbox"] for obj in anno] + boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4) + # transfer boxes from xywh to xyxy + boxes[:, 2:] += boxes[:, :2] + target.append(boxes) + elif k == "boxes_category": + 
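+                # map raw COCO category ids to contiguous ids starting from 1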
boxes_category = [obj["category_id"] for obj in anno] + boxes_category = [ + self.json_category_id_to_contiguous_id[c] for c in boxes_category + ] + boxes_category = np.array(boxes_category, dtype=np.int32) + target.append(boxes_category) + elif k == "keypoints": + keypoints = [obj["keypoints"] for obj in anno] + keypoints = np.array(keypoints, dtype=np.float32).reshape( + -1, len(self.keypoint_names), 3 + ) + target.append(keypoints) + elif k == "polygons": + polygons = [obj["segmentation"] for obj in anno] + polygons = [ + [np.array(p, dtype=np.float32).reshape(-1, 2) for p in ps] + for ps in polygons + ] + target.append(polygons) + elif k == "info": + info = self.imgs[img_id] + info = [info["height"], info["width"], info["file_name"]] + target.append(info) + else: + raise NotImplementedError + + return tuple(target) + + def __len__(self): + return len(self.ids) + + def get_img_info(self, index): + img_id = self.ids[index] + img_info = self.imgs[img_id] + return img_info + + class_names = ( + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + "keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", + ) + + classes_originID = { + "person": 1, + "bicycle": 2, + "car": 3, + "motorcycle": 4, + "airplane": 5, + "bus": 6, + "train": 7, + "truck": 8, + "boat": 9, + "traffic light": 10, + "fire hydrant": 11, + "stop sign": 13, + "parking meter": 14, + "bench": 15, + "bird": 16, + "cat": 17, + "dog": 18, + "horse": 19, + "sheep": 20, + "cow": 21, + "elephant": 22, + "bear": 23, + "zebra": 24, + "giraffe": 25, + "backpack": 27, + "umbrella": 28, + "handbag": 31, + "tie": 32, + "suitcase": 33, + "frisbee": 34, + "skis": 35, + "snowboard": 36, + "sports ball": 37, + "kite": 38, + "baseball bat": 39, + "baseball glove": 40, + "skateboard": 41, + "surfboard": 42, + "tennis racket": 43, + "bottle": 44, + "wine glass": 46, + "cup": 47, + "fork": 48, + "knife": 49, + "spoon": 50, + "bowl": 51, + "banana": 52, + "apple": 53, + "sandwich": 54, + "orange": 55, + "broccoli": 56, + "carrot": 57, + "hot dog": 58, + "pizza": 59, + "donut": 60, + "cake": 61, + "chair": 62, + "couch": 63, + "potted plant": 64, + "bed": 65, + "dining table": 67, + "toilet": 70, + "tv": 72, + "laptop": 73, + "mouse": 74, + "remote": 75, + "keyboard": 76, + "cell phone": 77, + "microwave": 78, + "oven": 79, + "toaster": 80, + "sink": 81, + "refrigerator": 82, + "book": 84, + "clock": 85, + "vase": 86, + "scissors": 87, + "teddy bear": 88, + "hair drier": 89, + "toothbrush": 90, + } + + keypoint_names = ( + "nose", + "left_eye", + "right_eye", + "left_ear", + "right_ear", + "left_shoulder", + "right_shoulder", + "left_elbow", + 
"right_elbow", + "left_wrist", + "right_wrist", + "left_hip", + "right_hip", + "left_knee", + "right_knee", + "left_ankle", + "right_ankle", + ) diff --git a/imperative/python/megengine/data/dataset/vision/folder.py b/imperative/python/megengine/data/dataset/vision/folder.py new file mode 100644 index 0000000000000000000000000000000000000000..7124ef56e050c83cb521ef11955bc69e6a3e42a0 --- /dev/null +++ b/imperative/python/megengine/data/dataset/vision/folder.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +# BSD 3-Clause License + +# Copyright (c) Soumith Chintala 2016, +# All rights reserved. +# --------------------------------------------------------------------- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# +# This file has been modified by Megvii ("Megvii Modifications"). +# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved. +# --------------------------------------------------------------------- +import os +from typing import Dict, List, Tuple + +import cv2 +import numpy as np + +from .meta_vision import VisionDataset +from .utils import is_img + + +class ImageFolder(VisionDataset): + def __init__(self, root: str, check_valid_func=None, class_name: bool = False): + r""" + ImageFolder is a class for loading image data and labels from a organized folder. + + the folder is expected to be organized as followed + root/cls/xxx.img_ext + + labels are indices of sorted classes in the root directory + + :param root: root directory of an image folder + :param loader: a function used to load image from path, + if ``None``, default function that loads + images with PILwill be called + :param check_valid_func: a function used to check if files in folder are + expected image files, if ``None``, default function + that checks file extensions will be called + :param class_name: if ``True``, return class name instead of class index + + """ + super().__init__(root, order=("image", "image_category")) + + self.root = root + + if check_valid_func is not None: + self.check_valid = check_valid_func + else: + self.check_valid = is_img + + self.class_name = class_name + + self.class_dict = self.collect_class() + self.samples = self.collect_samples() + + def collect_samples(self) -> List: + samples = [] + directory = os.path.expanduser(self.root) + for key in sorted(self.class_dict.keys()): + d = os.path.join(directory, key) + if not os.path.isdir(d): + continue + for r, _, filename in sorted(os.walk(d, followlinks=True)): + for name in sorted(filename): + path = os.path.join(r, name) + if self.check_valid(path): + if self.class_name: + samples.append((path, key)) + else: + samples.append((path, self.class_dict[key])) + return samples + + def collect_class(self) -> Dict: + classes = [d.name for d in os.scandir(self.root) if d.is_dir()] + classes.sort() + return {classes[i]: np.int32(i) for i in range(len(classes))} + + def __getitem__(self, index: int) -> Tuple: + path, label = self.samples[index] + img = cv2.imread(path, cv2.IMREAD_COLOR) + return img, label + + def __len__(self): + return len(self.samples) diff --git a/imperative/python/megengine/data/dataset/vision/imagenet.py b/imperative/python/megengine/data/dataset/vision/imagenet.py new file mode 100644 index 
diff --git a/imperative/python/megengine/data/dataset/vision/imagenet.py b/imperative/python/megengine/data/dataset/vision/imagenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..94c2396cf7497a95236b5f0ef0fa66d8c5a5e4dd
--- /dev/null
+++ b/imperative/python/megengine/data/dataset/vision/imagenet.py
@@ -0,0 +1,248 @@
+# -*- coding: utf-8 -*-
+# BSD 3-Clause License
+#
+# Copyright (c) Soumith Chintala 2016,
+# All rights reserved.
+# ---------------------------------------------------------------------
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#
+# This file has been modified by Megvii ("Megvii Modifications").
+# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved.
+# ---------------------------------------------------------------------
+import os
+import shutil
+
+from tqdm import tqdm
+
+from ....distributed.group import is_distributed
+from ....logger import get_logger
+from ....serialization import load, save
+from .folder import ImageFolder
+from .utils import _default_dataset_root, calculate_md5, untar, untargz
+
+logger = get_logger(__name__)
+
+
+class ImageNet(ImageFolder):
+    r"""
+    Load ImageNet from raw tar files or from an already prepared folder. The expected layout is
+
+    .. code-block:: bash
+
+        ${root}/
+        |           [REQUIRED TAR FILES]
+        |-  ILSVRC2012_img_train.tar
+        |-  ILSVRC2012_img_val.tar
+        |-  ILSVRC2012_devkit_t12.tar.gz
+        |           [OPTIONAL IMAGE FOLDERS]
+        |-  train/cls/xxx.${img_ext}
+        |-  val/cls/xxx.${img_ext}
+        |-  ILSVRC2012_devkit_t12/data/meta.mat
+        |-  ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt
+
+    If the image folders don't exist, the raw tar files are required so that they
+    can be extracted and processed.
+    """
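Before the class attributes below, a usage sketch of ``ImageNet`` (the ``root`` path is a placeholder):

```python
from megengine.data.dataset.vision.imagenet import ImageNet

# Reuses ${root}/train or ${root}/val if present; otherwise extracts the tars.
# Extraction is refused in distributed mode, so prepare in a single process first.
train_set = ImageNet(root="/data/imagenet", train=True)
image, label = train_set[0]
```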
+ """ + + raw_file_meta = { + "train": ("ILSVRC2012_img_train.tar", "1d675b47d978889d74fa0da5fadfb00e"), + "val": ("ILSVRC2012_img_val.tar", "29b22e2961454d5413ddabcf34fc5622"), + "devkit": ("ILSVRC2012_devkit_t12.tar.gz", "fa75699e90414af021442c21a62c3abf"), + } # ImageNet raw files + default_train_dir = "train" + default_val_dir = "val" + default_devkit_dir = "ILSVRC2012_devkit_t12" + + def __init__(self, root: str = None, train: bool = True, **kwargs): + r""" + initialization: + + * if ``root`` contains ``self.target_folder`` depent on ``train``: + + * initialize ImageFolder with target_folder + + * else: + + * if all raw files are in ``root``: + + * parse ``self.target_folder`` from raw files + * initialize ImageFolder with ``self.target_folder`` + + * else: + + * raise error + + :param root: root directory of imagenet data, if root is ``None``, used default_dataset_root + :param train: if ``True``, load the train split, otherwise load the validation split + """ + + # process the root path + if root is None: + self.root = self._default_root + else: + self.root = root + + if not os.path.exists(self.root): + raise FileNotFoundError("dir %s does not exist" % self.root) + + self.devkit_dir = os.path.join(self.root, self.default_devkit_dir) + + if not os.path.exists(self.devkit_dir): + logger.warning("devkit directory %s does not exists", self.devkit_dir) + self._prepare_devkit() + + self.train = train + + if train: + self.target_folder = os.path.join(self.root, self.default_train_dir) + else: + self.target_folder = os.path.join(self.root, self.default_val_dir) + + if not os.path.exists(self.target_folder): + logger.warning( + "expected image folder %s does not exist, try to load from raw file", + self.target_folder, + ) + if not self.check_raw_file(): + raise FileNotFoundError( + "expected image folder %s does not exist, and raw files do not exist in %s" + % (self.target_folder, self.root) + ) + elif is_distributed(): + raise RuntimeError( + "extracting raw file shouldn't be done in distributed mode, use single process instead" + ) + elif train: + self._prepare_train() + else: + self._prepare_val() + + super().__init__(self.target_folder, **kwargs) + + @property + def _default_root(self): + return os.path.join(_default_dataset_root(), self.__class__.__name__) + + @property + def valid_ground_truth(self): + groud_truth_path = os.path.join( + self.devkit_dir, "data", "ILSVRC2012_validation_ground_truth.txt" + ) + if os.path.exists(groud_truth_path): + with open(groud_truth_path, "r") as f: + val_labels = f.readlines() + return [int(val_label) for val_label in val_labels] + else: + raise FileNotFoundError( + "valid ground truth file %s does not exist" % groud_truth_path + ) + + @property + def meta(self): + try: + return load(os.path.join(self.devkit_dir, "meta.pkl")) + except FileNotFoundError: + import scipy.io + + meta_path = os.path.join(self.devkit_dir, "data", "meta.mat") + if not os.path.exists(meta_path): + raise FileNotFoundError("meta file %s does not exist" % meta_path) + meta = scipy.io.loadmat(meta_path, squeeze_me=True)["synsets"] + nums_children = list(zip(*meta))[4] + meta = [ + meta[idx] + for idx, num_children in enumerate(nums_children) + if num_children == 0 + ] + idcs, wnids, classes = list(zip(*meta))[:3] + classes = [tuple(clss.split(", ")) for clss in classes] + idx_to_wnid = dict(zip(idcs, wnids)) + wnid_to_classes = dict(zip(wnids, classes)) + logger.info( + "saving cached meta file to %s", + os.path.join(self.devkit_dir, "meta.pkl"), + ) + save( + (idx_to_wnid, 
+
+    def check_raw_file(self) -> bool:
+        return all(
+            [
+                os.path.exists(os.path.join(self.root, value[0]))
+                for _, value in self.raw_file_meta.items()
+            ]
+        )
+
+    def _organize_val_data(self):
+        id2wnid = self.meta[0]
+        val_idcs = self.valid_ground_truth
+        val_wnids = [id2wnid[idx] for idx in val_idcs]
+
+        val_images = sorted(
+            [
+                os.path.join(self.target_folder, image)
+                for image in os.listdir(self.target_folder)
+            ]
+        )
+
+        logger.debug("mkdir for val set wnids")
+        for wnid in set(val_wnids):
+            os.makedirs(os.path.join(self.root, self.default_val_dir, wnid))
+
+        logger.debug("mv val images into wnids dir")
+        for wnid, img_file in tqdm(zip(val_wnids, val_images)):
+            shutil.move(
+                img_file,
+                os.path.join(
+                    self.root, self.default_val_dir, wnid, os.path.basename(img_file)
+                ),
+            )
+
+    def _prepare_val(self):
+        assert not self.train
+        raw_filename, checksum = self.raw_file_meta["val"]
+        raw_file = os.path.join(self.root, raw_filename)
+        logger.info("checksum val tar file %s ...", raw_file)
+        assert (
+            calculate_md5(raw_file) == checksum
+        ), "checksum mismatch, {} may be damaged".format(raw_file)
+        logger.info("extract val tar file... this may take 10-20 minutes")
+        untar(raw_file, self.target_folder)
+        self._organize_val_data()
+
+    def _prepare_train(self):
+        assert self.train
+        raw_filename, checksum = self.raw_file_meta["train"]
+        raw_file = os.path.join(self.root, raw_filename)
+        logger.info("checksum train tar file %s ...", raw_file)
+        assert (
+            calculate_md5(raw_file) == checksum
+        ), "checksum mismatch, {} may be damaged".format(raw_file)
+        logger.info("extract train tar file... this may take several hours")
+        untar(raw_file, self.target_folder)
+        paths = [
+            os.path.join(self.target_folder, child_dir)
+            for child_dir in os.listdir(self.target_folder)
+        ]
+        for path in tqdm(paths):
+            untar(path, os.path.splitext(path)[0], remove=True)
+
+    def _prepare_devkit(self):
+        raw_filename, checksum = self.raw_file_meta["devkit"]
+        raw_file = os.path.join(self.root, raw_filename)
+        logger.info("checksum devkit tar file %s ...", raw_file)
+        assert (
+            calculate_md5(raw_file) == checksum
+        ), "checksum mismatch, {} may be damaged".format(raw_file)
+        logger.info("extract devkit file...")
+        untargz(raw_file)
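``_organize_val_data`` above depends on the flat val images sorting into the same order as the ground-truth labels; reduced to its core, the move step looks like this toy sketch (not part of the patch):

```python
import os
import shutil

def organize_val(val_dir, val_wnids):
    # Toy version of the step above: the i-th sorted image belongs to val_wnids[i].
    images = sorted(os.listdir(val_dir))
    assert len(images) == len(val_wnids)
    for wnid, name in zip(val_wnids, images):
        os.makedirs(os.path.join(val_dir, wnid), exist_ok=True)
        shutil.move(os.path.join(val_dir, name), os.path.join(val_dir, wnid, name))
```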
diff --git a/imperative/python/megengine/data/dataset/vision/meta_vision.py b/imperative/python/megengine/data/dataset/vision/meta_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d03d3eda5451a05039f513034f32444004db218
--- /dev/null
+++ b/imperative/python/megengine/data/dataset/vision/meta_vision.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import collections.abc
+import os
+
+from ..meta_dataset import MapDataset
+
+
+class VisionDataset(MapDataset):
+    _repr_indent = 4
+
+    def __init__(self, root, *, order=None, supported_order=None):
+        if isinstance(root, (str, bytes)):
+            root = os.path.expanduser(root)
+        self.root = root
+
+        if order is None:
+            order = ("image",)
+        if not isinstance(order, collections.abc.Sequence):
+            raise ValueError(
+                "order should be a sequence, but got order={}".format(order)
+            )
+
+        if supported_order is not None:
+            assert isinstance(supported_order, collections.abc.Sequence)
+            for k in order:
+                if k not in supported_order:
+                    raise NotImplementedError("{} is unsupported data type".format(k))
+        self.order = order
+
+    def __getitem__(self, index):
+        raise NotImplementedError
+
+    def __len__(self):
+        raise NotImplementedError
diff --git a/imperative/python/megengine/data/dataset/vision/mnist.py b/imperative/python/megengine/data/dataset/vision/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e89a3140556bf9449f4fdadf1bb6e6b73b1f6ad
--- /dev/null
+++ b/imperative/python/megengine/data/dataset/vision/mnist.py
@@ -0,0 +1,197 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import gzip
+import os
+import struct
+from typing import Tuple
+
+import numpy as np
+from tqdm import tqdm
+
+from ....logger import get_logger
+from .meta_vision import VisionDataset
+from .utils import _default_dataset_root, load_raw_data_from_url
+
+logger = get_logger(__name__)
+
+
+class MNIST(VisionDataset):
+    r""" ``Dataset`` for MNIST meta data
+    """
+
+    url_path = "http://yann.lecun.com/exdb/mnist/"
+    """
+    url prefix for downloading raw files
+    """
+    raw_file_name = [
+        "train-images-idx3-ubyte.gz",
+        "train-labels-idx1-ubyte.gz",
+        "t10k-images-idx3-ubyte.gz",
+        "t10k-labels-idx1-ubyte.gz",
+    ]
+    """
+    raw file names of both the training set and the test set (10k)
+    """
+    raw_file_md5 = [
+        "f68b3c2dcbeaaa9fbdd348bbdeb94873",
+        "d53e105ee54ea40749a09fcbcd1e9432",
+        "9fb629c4189551a2d022fa330f9573f3",
+        "ec29112dd5afa0611ce80d1b7f02629c",
+    ]
+    """
+    md5 checksums for verifying raw files
+    """
+
+    def __init__(
+        self,
+        root: str = None,
+        train: bool = True,
+        download: bool = True,
+        timeout: int = 500,
+    ):
+        r"""
+        :param root: path for mnist dataset downloading or loading, if ``None``,
+            set ``root`` to the ``_default_root``
+        :param train: if ``True``, load the training set, otherwise load the test set
+        :param download: if raw files do not exist and ``download`` is ``True``,
+            download raw files and process them, otherwise raise ValueError; default is ``True``
+        :param timeout: HTTP read timeout (in seconds) used when downloading raw files
+
+        """
+        super().__init__(root, order=("image", "image_category"))
+
+        self.timeout = timeout
+
+        # process the root path
+        if root is None:
+            self.root = self._default_root
+            if not os.path.exists(self.root):
+                os.makedirs(self.root)
+        else:
+            self.root = root
+            if not os.path.exists(self.root):
+                if download:
+                    logger.debug(
+                        "dir %s does not exist, will be automatically created",
+                        self.root,
+                    )
+                    os.makedirs(self.root)
+                else:
+                    raise ValueError("dir %s does not exist" % self.root)
+
+        if self._check_raw_files():
+            self.process(train)
+        elif download:
+            self.download()
+            self.process(train)
+        else:
+            raise ValueError(
+                "root
does not contain valid raw files, please set download=True" + ) + + def __getitem__(self, index: int) -> Tuple: + return tuple(array[index] for array in self.arrays) + + def __len__(self) -> int: + return len(self.arrays[0]) + + @property + def _default_root(self): + return os.path.join(_default_dataset_root(), self.__class__.__name__) + + @property + def meta(self): + return self._meta_data + + def _check_raw_files(self): + return all( + [ + os.path.exists(os.path.join(self.root, path)) + for path in self.raw_file_name + ] + ) + + def download(self): + for file_name, md5 in zip(self.raw_file_name, self.raw_file_md5): + url = self.url_path + file_name + load_raw_data_from_url(url, file_name, md5, self.root, self.timeout) + + def process(self, train): + # load raw files and transform them into meta data and datasets Tuple(np.array) + logger.info("process the raw files of %s set...", "train" if train else "test") + if train: + meta_data_images, images = parse_idx3( + os.path.join(self.root, self.raw_file_name[0]) + ) + meta_data_labels, labels = parse_idx1( + os.path.join(self.root, self.raw_file_name[1]) + ) + else: + meta_data_images, images = parse_idx3( + os.path.join(self.root, self.raw_file_name[2]) + ) + meta_data_labels, labels = parse_idx1( + os.path.join(self.root, self.raw_file_name[3]) + ) + + self._meta_data = { + "images": meta_data_images, + "labels": meta_data_labels, + } + self.arrays = (images, labels.astype(np.int32)) + + +def parse_idx3(idx3_file): + # parse idx3 file to meta data and data in numpy array (images) + logger.debug("parse idx3 file %s ...", idx3_file) + assert idx3_file.endswith(".gz") + with gzip.open(idx3_file, "rb") as f: + bin_data = f.read() + + # parse meta data + offset = 0 + fmt_header = ">iiii" + magic, imgs, height, width = struct.unpack_from(fmt_header, bin_data, offset) + meta_data = {"magic": magic, "imgs": imgs, "height": height, "width": width} + + # parse images + image_size = height * width + offset += struct.calcsize(fmt_header) + fmt_image = ">" + str(image_size) + "B" + images = [] + bar = tqdm(total=meta_data["imgs"], ncols=80) + for image in struct.iter_unpack(fmt_image, bin_data[offset:]): + images.append(np.array(image, dtype=np.uint8).reshape((height, width, 1))) + bar.update() + bar.close() + return meta_data, images + + +def parse_idx1(idx1_file): + # parse idx1 file to meta data and data in numpy array (labels) + logger.debug("parse idx1 file %s ...", idx1_file) + assert idx1_file.endswith(".gz") + with gzip.open(idx1_file, "rb") as f: + bin_data = f.read() + + # parse meta data + offset = 0 + fmt_header = ">ii" + magic, imgs = struct.unpack_from(fmt_header, bin_data, offset) + meta_data = {"magic": magic, "imgs": imgs} + + # parse labels + offset += struct.calcsize(fmt_header) + fmt_image = ">B" + labels = np.empty(imgs, dtype=int) + bar = tqdm(total=meta_data["imgs"], ncols=80) + for i, label in enumerate(struct.iter_unpack(fmt_image, bin_data[offset:])): + labels[i] = label[0] + bar.update() + bar.close() + return meta_data, labels diff --git a/imperative/python/megengine/data/dataset/vision/objects365.py b/imperative/python/megengine/data/dataset/vision/objects365.py new file mode 100644 index 0000000000000000000000000000000000000000..7c1481bac99fa2af82fb8d93856b7815024373c9 --- /dev/null +++ b/imperative/python/megengine/data/dataset/vision/objects365.py @@ -0,0 +1,498 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# --------------------------------------------------------------------- +# Part of the following code in this file refs to maskrcnn-benchmark +# MIT License +# +# Copyright (c) 2018 Facebook +# --------------------------------------------------------------------- +import json +import os +from collections import defaultdict + +import cv2 +import numpy as np + +from .meta_vision import VisionDataset + + +class Objects365(VisionDataset): + r"""`Objects365 `_ Dataset. + """ + + supported_order = ( + "image", + "boxes", + "boxes_category", + "info", + ) + + def __init__( + self, root, ann_file, remove_images_without_annotations=False, *, order=None + ): + super().__init__(root, order=order, supported_order=self.supported_order) + + with open(ann_file, "r") as f: + dataset = json.load(f) + + self.imgs = dict() + for img in dataset["images"]: + self.imgs[img["id"]] = img + + self.img_to_anns = defaultdict(list) + for ann in dataset["annotations"]: + # for saving memory + if ( + "boxes" not in self.order + and "boxes_category" not in self.order + and "bbox" in ann + ): + del ann["bbox"] + self.img_to_anns[ann["image_id"]].append(ann) + + self.cats = dict() + for cat in dataset["categories"]: + self.cats[cat["id"]] = cat + + self.ids = list(sorted(self.imgs.keys())) + + # filter images without detection annotations + if remove_images_without_annotations: + ids = [] + for img_id in self.ids: + anno = self.img_to_anns[img_id] + # filter crowd annotations + anno = [obj for obj in anno if obj["iscrowd"] == 0] + anno = [ + obj for obj in anno if obj["bbox"][2] > 0 and obj["bbox"][3] > 0 + ] + if len(anno) > 0: + ids.append(img_id) + self.img_to_anns[img_id] = anno + else: + del self.imgs[img_id] + del self.img_to_anns[img_id] + self.ids = ids + + self.json_category_id_to_contiguous_id = { + v: i + 1 for i, v in enumerate(self.cats.keys()) + } + + self.contiguous_category_id_to_json_id = { + v: k for k, v in self.json_category_id_to_contiguous_id.items() + } + + def __getitem__(self, index): + img_id = self.ids[index] + anno = self.img_to_anns[img_id] + + target = [] + for k in self.order: + if k == "image": + file_name = self.imgs[img_id]["file_name"] + path = os.path.join(self.root, file_name) + image = cv2.imread(path, cv2.IMREAD_COLOR) + target.append(image) + elif k == "boxes": + boxes = [obj["bbox"] for obj in anno] + boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4) + # transfer boxes from xywh to xyxy + boxes[:, 2:] += boxes[:, :2] + target.append(boxes) + elif k == "boxes_category": + boxes_category = [obj["category_id"] for obj in anno] + boxes_category = [ + self.json_category_id_to_contiguous_id[c] for c in boxes_category + ] + boxes_category = np.array(boxes_category, dtype=np.int32) + target.append(boxes_category) + elif k == "info": + info = self.imgs[img_id] + info = [info["height"], info["width"], info["file_name"]] + target.append(info) + else: + raise NotImplementedError + + return tuple(target) + + def __len__(self): + return len(self.ids) + + def get_img_info(self, index): + img_id = self.ids[index] + img_info = self.imgs[img_id] + return img_info + + class_names = ( + "person", + "sneakers", + "chair", + "hat", + "lamp", + "bottle", + "cabinet/shelf", + "cup", + "car", + "glasses", + "picture/frame", + "desk", + "handbag", + "street 
lights", + "book", + "plate", + "helmet", + "leather shoes", + "pillow", + "glove", + "potted plant", + "bracelet", + "flower", + "tv", + "storage box", + "vase", + "bench", + "wine glass", + "boots", + "bowl", + "dining table", + "umbrella", + "boat", + "flag", + "speaker", + "trash bin/can", + "stool", + "backpack", + "couch", + "belt", + "carpet", + "basket", + "towel/napkin", + "slippers", + "barrel/bucket", + "coffee table", + "suv", + "toy", + "tie", + "bed", + "traffic light", + "pen/pencil", + "microphone", + "sandals", + "canned", + "necklace", + "mirror", + "faucet", + "bicycle", + "bread", + "high heels", + "ring", + "van", + "watch", + "sink", + "horse", + "fish", + "apple", + "camera", + "candle", + "teddy bear", + "cake", + "motorcycle", + "wild bird", + "laptop", + "knife", + "traffic sign", + "cell phone", + "paddle", + "truck", + "cow", + "power outlet", + "clock", + "drum", + "fork", + "bus", + "hanger", + "nightstand", + "pot/pan", + "sheep", + "guitar", + "traffic cone", + "tea pot", + "keyboard", + "tripod", + "hockey", + "fan", + "dog", + "spoon", + "blackboard/whiteboard", + "balloon", + "air conditioner", + "cymbal", + "mouse", + "telephone", + "pickup truck", + "orange", + "banana", + "airplane", + "luggage", + "skis", + "soccer", + "trolley", + "oven", + "remote", + "baseball glove", + "paper towel", + "refrigerator", + "train", + "tomato", + "machinery vehicle", + "tent", + "shampoo/shower gel", + "head phone", + "lantern", + "donut", + "cleaning products", + "sailboat", + "tangerine", + "pizza", + "kite", + "computer box", + "elephant", + "toiletries", + "gas stove", + "broccoli", + "toilet", + "stroller", + "shovel", + "baseball bat", + "microwave", + "skateboard", + "surfboard", + "surveillance camera", + "gun", + "life saver", + "cat", + "lemon", + "liquid soap", + "zebra", + "duck", + "sports car", + "giraffe", + "pumpkin", + "piano", + "stop sign", + "radiator", + "converter", + "tissue ", + "carrot", + "washing machine", + "vent", + "cookies", + "cutting/chopping board", + "tennis racket", + "candy", + "skating and skiing shoes", + "scissors", + "folder", + "baseball", + "strawberry", + "bow tie", + "pigeon", + "pepper", + "coffee machine", + "bathtub", + "snowboard", + "suitcase", + "grapes", + "ladder", + "pear", + "american football", + "basketball", + "potato", + "paint brush", + "printer", + "billiards", + "fire hydrant", + "goose", + "projector", + "sausage", + "fire extinguisher", + "extension cord", + "facial mask", + "tennis ball", + "chopsticks", + "electronic stove and gas stove", + "pie", + "frisbee", + "kettle", + "hamburger", + "golf club", + "cucumber", + "clutch", + "blender", + "tong", + "slide", + "hot dog", + "toothbrush", + "facial cleanser", + "mango", + "deer", + "egg", + "violin", + "marker", + "ship", + "chicken", + "onion", + "ice cream", + "tape", + "wheelchair", + "plum", + "bar soap", + "scale", + "watermelon", + "cabbage", + "router/modem", + "golf ball", + "pine apple", + "crane", + "fire truck", + "peach", + "cello", + "notepaper", + "tricycle", + "toaster", + "helicopter", + "green beans", + "brush", + "carriage", + "cigar", + "earphone", + "penguin", + "hurdle", + "swing", + "radio", + "CD", + "parking meter", + "swan", + "garlic", + "french fries", + "horn", + "avocado", + "saxophone", + "trumpet", + "sandwich", + "cue", + "kiwi fruit", + "bear", + "fishing rod", + "cherry", + "tablet", + "green vegetables", + "nuts", + "corn", + "key", + "screwdriver", + "globe", + "broom", + "pliers", + "volleyball", + "hammer", + 
"eggplant", + "trophy", + "dates", + "board eraser", + "rice", + "tape measure/ruler", + "dumbbell", + "hamimelon", + "stapler", + "camel", + "lettuce", + "goldfish", + "meat balls", + "medal", + "toothpaste", + "antelope", + "shrimp", + "rickshaw", + "trombone", + "pomegranate", + "coconut", + "jellyfish", + "mushroom", + "calculator", + "treadmill", + "butterfly", + "egg tart", + "cheese", + "pig", + "pomelo", + "race car", + "rice cooker", + "tuba", + "crosswalk sign", + "papaya", + "hair drier", + "green onion", + "chips", + "dolphin", + "sushi", + "urinal", + "donkey", + "electric drill", + "spring rolls", + "tortoise/turtle", + "parrot", + "flute", + "measuring cup", + "shark", + "steak", + "poker card", + "binoculars", + "llama", + "radish", + "noodles", + "yak", + "mop", + "crab", + "microscope", + "barbell", + "bread/bun", + "baozi", + "lion", + "red cabbage", + "polar bear", + "lighter", + "seal", + "mangosteen", + "comb", + "eraser", + "pitaya", + "scallop", + "pencil case", + "saw", + "table tennis paddle", + "okra", + "starfish", + "eagle", + "monkey", + "durian", + "game board", + "rabbit", + "french horn", + "ambulance", + "asparagus", + "hoverboard", + "pasta", + "target", + "hotair balloon", + "chainsaw", + "lobster", + "iron", + "flashlight", + ) diff --git a/imperative/python/megengine/data/dataset/vision/utils.py b/imperative/python/megengine/data/dataset/vision/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9a028d9ce4120d8ce3cb468db1ba8760e5b7e1c9 --- /dev/null +++ b/imperative/python/megengine/data/dataset/vision/utils.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import hashlib +import os +import tarfile + +from ....distributed.group import is_distributed +from ....logger import get_logger +from ....utils.http_download import download_from_url + +IMG_EXT = (".jpg", ".png", ".jpeg", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp") + +logger = get_logger(__name__) + + +def _default_dataset_root(): + default_dataset_root = os.path.expanduser( + os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "megengine") + ) + + return default_dataset_root + + +def load_raw_data_from_url( + url: str, filename: str, target_md5: str, raw_data_dir: str, timeout: int +): + cached_file = os.path.join(raw_data_dir, filename) + logger.debug( + "load_raw_data_from_url: downloading to or using cached %s ...", cached_file + ) + if not os.path.exists(cached_file): + if is_distributed(): + logger.warning( + "Downloading raw data in DISTRIBUTED mode\n" + " File may be downloaded multiple times. We recommend\n" + " users to download in single process first." 
+        )
+        md5 = download_from_url(url, cached_file, http_read_timeout=timeout)
+    else:
+        md5 = calculate_md5(cached_file)
+    if target_md5 == md5:
+        logger.debug("%s exists with correct md5: %s", filename, target_md5)
+    else:
+        os.remove(cached_file)
+        raise RuntimeError("{} exists but fails to match md5".format(filename))
+
+
+def calculate_md5(filename):
+    m = hashlib.md5()
+    with open(filename, "rb") as f:
+        while True:
+            data = f.read(4096)
+            if not data:
+                break
+            m.update(data)
+    return m.hexdigest()
+
+
+def is_img(filename):
+    return filename.lower().endswith(IMG_EXT)
+
+
+def untar(path, to=None, remove=False):
+    if to is None:
+        to = os.path.dirname(path)
+    with tarfile.open(path, "r") as tar:
+        tar.extractall(path=to)
+
+    if remove:
+        os.remove(path)
+
+
+def untargz(path, to=None, remove=False):
+    if path.endswith(".tar.gz"):
+        if to is None:
+            to = os.path.dirname(path)
+        with tarfile.open(path, "r:gz") as tar:
+            tar.extractall(path=to)
+    else:
+        raise ValueError("path %s does not end with .tar.gz" % path)
+
+    if remove:
+        os.remove(path)
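These helpers compose into the verify-then-extract pattern used by the datasets above; a small sketch with placeholder path and checksum:

```python
from megengine.data.dataset.vision.utils import calculate_md5, untar

archive = "/data/raw/some_archive.tar"         # placeholder path
expected = "d41d8cd98f00b204e9800998ecf8427e"  # placeholder md5
if calculate_md5(archive) != expected:
    raise RuntimeError("checksum mismatch, %s may be damaged" % archive)
untar(archive, to="/data/extracted", remove=False)
```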
diff --git a/imperative/python/megengine/data/dataset/vision/voc.py b/imperative/python/megengine/data/dataset/vision/voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..42bf712dc172176ee040881a84ee8b2ed79a383b
--- /dev/null
+++ b/imperative/python/megengine/data/dataset/vision/voc.py
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# ---------------------------------------------------------------------
+# Part of the following code in this file refers to torchvision
+# BSD 3-Clause License
+#
+# Copyright (c) Soumith Chintala 2016,
+# All rights reserved.
+# ---------------------------------------------------------------------
+import collections.abc
+import os
+import xml.etree.ElementTree as ET
+
+import cv2
+import numpy as np
+
+from .meta_vision import VisionDataset
+
+
+class PascalVOC(VisionDataset):
+    r"""`Pascal VOC `_ Dataset.
+    """
+
+    supported_order = (
+        "image",
+        "boxes",
+        "boxes_category",
+        "mask",
+        "info",
+    )
+
+    def __init__(self, root, image_set, *, order=None):
+        if order is None:
+            order = ("image",)
+        if ("boxes" in order or "boxes_category" in order) and "mask" in order:
+            raise ValueError(
+                "PascalVOC only supports boxes & boxes_category or mask, not both."
+            )
+
+        super().__init__(root, order=order, supported_order=self.supported_order)
+
+        if not os.path.isdir(self.root):
+            raise RuntimeError("Dataset not found or corrupted.")
+
+        self.image_set = image_set
+        image_dir = os.path.join(self.root, "JPEGImages")
+
+        if "boxes" in order or "boxes_category" in order:
+            annotation_dir = os.path.join(self.root, "Annotations")
+            splitdet_dir = os.path.join(self.root, "ImageSets/Main")
+            split_f = os.path.join(splitdet_dir, image_set.rstrip("\n") + ".txt")
+            with open(os.path.join(split_f), "r") as f:
+                self.file_names = [x.strip() for x in f.readlines()]
+            self.images = [os.path.join(image_dir, x + ".jpg") for x in self.file_names]
+            self.annotations = [
+                os.path.join(annotation_dir, x + ".xml") for x in self.file_names
+            ]
+            assert len(self.images) == len(self.annotations)
+        elif "mask" in order:
+            if "aug" in image_set:
+                mask_dir = os.path.join(self.root, "SegmentationClass_aug")
+            else:
+                mask_dir = os.path.join(self.root, "SegmentationClass")
+            splitmask_dir = os.path.join(self.root, "ImageSets/Segmentation")
+            split_f = os.path.join(splitmask_dir, image_set.rstrip("\n") + ".txt")
+            with open(os.path.join(split_f), "r") as f:
+                self.file_names = [x.strip() for x in f.readlines()]
+            self.images = [os.path.join(image_dir, x + ".jpg") for x in self.file_names]
+            self.masks = [os.path.join(mask_dir, x + ".png") for x in self.file_names]
+            assert len(self.images) == len(self.masks)
+        else:
+            raise NotImplementedError
+
+    def __getitem__(self, index):
+        image = None
+        target = []
+        for k in self.order:
+            if k == "image":
+                image = cv2.imread(self.images[index], cv2.IMREAD_COLOR)
+                target.append(image)
+            elif k == "boxes":
+                anno = self.parse_voc_xml(ET.parse(self.annotations[index]).getroot())
+                boxes = [obj["bndbox"] for obj in anno["annotation"]["object"]]
+                # boxes type xyxy
+                boxes = [
+                    (bb["xmin"], bb["ymin"], bb["xmax"], bb["ymax"]) for bb in boxes
+                ]
+                boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4)
+                target.append(boxes)
+            elif k == "boxes_category":
+                anno = self.parse_voc_xml(ET.parse(self.annotations[index]).getroot())
+                boxes_category = [obj["name"] for obj in anno["annotation"]["object"]]
+                boxes_category = [
+                    self.class_names.index(bc) + 1 for bc in boxes_category
+                ]
+                boxes_category = np.array(boxes_category, dtype=np.int32)
+                target.append(boxes_category)
+            elif k == "mask":
+                if "aug" in self.image_set:
+                    mask = cv2.imread(self.masks[index], cv2.IMREAD_GRAYSCALE)
+                else:
+                    mask = cv2.imread(self.masks[index], cv2.IMREAD_COLOR)
+                    mask = self._trans_mask(mask)
+                mask = mask[:, :, np.newaxis]
+                target.append(mask)
+            elif k == "info":
+                if image is None:
+                    image = cv2.imread(self.images[index], cv2.IMREAD_COLOR)
+                info = [image.shape[0], image.shape[1], self.file_names[index]]
+                target.append(info)
+            else:
+                raise NotImplementedError
+
+        return tuple(target)
+
+    def __len__(self):
+        return len(self.images)
+
+    def _trans_mask(self, mask):
+        label = np.ones(mask.shape[:2]) * 255
+        for i in range(len(self.class_colors)):
+            b, g, r = self.class_colors[i]
+            label[
+                (mask[:, :, 0] == b) & (mask[:, :, 1] == g) & (mask[:, :, 2] == r)
+            ] = i
+        return label.astype(np.uint8)
+
+    def parse_voc_xml(self, node):
+        voc_dict = {}
+        children = list(node)
+        if children:
+            def_dic = collections.defaultdict(list)
+            for dc in map(self.parse_voc_xml, children):
+                for ind, v in dc.items():
+                    def_dic[ind].append(v)
+            if node.tag == "annotation":
+                def_dic["object"] = [def_dic["object"]]
+            voc_dict = {
+                node.tag: {
+                    ind: v[0] if len(v) == 1 else v for ind, v
in def_dic.items() + } + } + if node.text: + text = node.text.strip() + if not children: + voc_dict[node.tag] = text + return voc_dict + + class_names = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", + ) + class_colors = [ + [0, 0, 128], + [0, 128, 0], + [0, 128, 128], + [128, 0, 0], + [128, 0, 128], + [128, 128, 0], + [128, 128, 128], + [0, 0, 64], + [0, 0, 192], + [0, 128, 64], + [0, 128, 192], + [128, 0, 64], + [128, 0, 192], + [128, 128, 64], + [128, 128, 192], + [0, 64, 0], + [0, 64, 128], + [0, 192, 0], + [0, 192, 128], + [128, 64, 0], + ] diff --git a/imperative/python/megengine/data/sampler.py b/imperative/python/megengine/data/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..dbd5d3a395756a42eef606e16823f771146c9a87 --- /dev/null +++ b/imperative/python/megengine/data/sampler.py @@ -0,0 +1,274 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import collections.abc +import math +from abc import ABC +from typing import Any, Generator, Iterator, List, Union + +import numpy as np + +import megengine.distributed as dist + + +class Sampler(ABC): + def __init__( + self, + dataset, + batch_size=1, + drop_last=False, + num_samples=None, + world_size=None, + rank=None, + seed=None, + ): + r""" + An abstract class for all sampler + + :type dataset: `dataset` + :param dataset: dataset to sample from + :type batch_size: positive integer + :param batch_size: batch size for batch method + :type drop_last: bool + :param drop_last: set ``True`` to drop the last incomplete batch, + if the dataset size is not divisible by the batch size. If ``False`` and + the size of dataset is not divisible by the batch_size, then the last batch will + be smaller. 
(default: ``False``)
+        :type num_samples: positive integer
+        :param num_samples: number of samples assigned to one rank
+        :type world_size: positive integer
+        :param world_size: number of ranks
+        :type rank: non-negative integer within 0 and world_size
+        :param rank: rank id, a non-negative integer within 0 and ``world_size``
+        :type seed: non-negative integer
+        :param seed: seed for random operators
+        """
+        if (
+            not isinstance(batch_size, int)
+            or isinstance(batch_size, bool)
+            or batch_size <= 0
+        ):
+            raise ValueError(
+                "batch_size should be a positive integer value, "
+                "but got batch_size={}".format(batch_size)
+            )
+        if not isinstance(drop_last, bool):
+            raise ValueError(
+                "drop_last should be a boolean value, but got "
+                "drop_last={}".format(drop_last)
+            )
+        if num_samples is not None and (
+            not isinstance(num_samples, int)
+            or isinstance(num_samples, bool)
+            or num_samples <= 0
+        ):
+            raise ValueError(
+                "num_samples should be a positive integer "
+                "value, but got num_samples={}".format(num_samples)
+            )
+
+        self.batch_size = batch_size
+        self.dataset = dataset
+        self.drop_last = drop_last
+
+        if world_size is None:
+            world_size = dist.get_world_size() if dist.is_distributed() else 1
+        self.world_size = world_size
+        if rank is None:
+            rank = dist.get_rank() if dist.is_distributed() else 0
+        self.rank = rank
+
+        if num_samples is None:
+            num_samples = len(self.dataset)
+        self.num_samples = int(math.ceil(num_samples / self.world_size))
+
+        # Make sure seeds are the same at each rank
+        if seed is None and self.world_size > 1:
+            seed = 0
+        self.rng = np.random.RandomState(seed)
+
+    def __iter__(self) -> Union[Generator, Iterator]:
+        return self.batch()
+
+    def __len__(self) -> int:
+        if self.drop_last:
+            return self.num_samples // self.batch_size
+        else:
+            return int(math.ceil(self.num_samples / self.batch_size))
+
+    def sample(self):
+        """
+        return a list that contains all sample indices
+        """
+        raise NotImplementedError
+
+    def scatter(self, indices) -> List:
+        r"""
+        scatter method is used for splitting indices into subsets, each subset
+        will be assigned to a rank. Indices are evenly split by default.
+        If a customized indices assignment method is needed, override this method.
+        """
+        total_size = self.num_samples * self.world_size
+
+        # add extra indices to make it evenly divisible
+        indices += indices[: (total_size - len(indices))]
+        assert len(indices) == total_size
+
+        # subsample
+        indices = indices[self.rank : total_size : self.world_size]
+        assert len(indices) == self.num_samples
+
+        return indices
+
+    def batch(self) -> Iterator[List[Any]]:
+        r"""
+        batch method provides a batch indices generator
+        """
+        indices = list(self.sample())
+
+        # user might pass the world_size parameter without dist,
+        # so dist.is_distributed() should not be used
+        if self.world_size > 1:
+            indices = self.scatter(indices)
+
+        step, length = self.batch_size, len(indices)
+        batch_index = [indices[i : i + step] for i in range(0, length, step)]
+
+        if self.drop_last and len(batch_index[-1]) < self.batch_size:
+            batch_index.pop()
+
+        return iter(batch_index)
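Concretely, ``scatter`` pads the index list so each rank receives ``num_samples`` indices and then takes a strided slice. A toy walk-through of that arithmetic for two ranks:

```python
# Toy sketch of the scatter logic for world_size=2, num_samples=ceil(5/2)=3.
indices = [0, 1, 2, 3, 4]
world_size, num_samples = 2, 3
total = num_samples * world_size             # 6
indices += indices[: total - len(indices)]   # pad -> [0, 1, 2, 3, 4, 0]
rank0 = indices[0:total:world_size]          # [0, 2, 4]
rank1 = indices[1:total:world_size]          # [1, 3, 0]
```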
+
+
+class SequentialSampler(Sampler):
+    def __init__(
+        self,
+        dataset,
+        batch_size=1,
+        drop_last=False,
+        indices=None,
+        world_size=None,
+        rank=None,
+    ):
+        r"""
+        Sample elements sequentially
+        """
+        super().__init__(dataset, batch_size, drop_last, None, world_size, rank)
+        if indices is not None and not isinstance(indices, collections.abc.Sequence):
+            raise ValueError(
+                "indices should be None or a sequence, "
+                "but got indices={}".format(indices)
+            )
+        self.indices = indices
+
+    def sample(self) -> Iterator[Any]:
+        r"""
+        return a generator
+        """
+        if self.indices is None:
+            return iter(range(len(self.dataset)))
+        else:
+            return self.indices
+
+
+class RandomSampler(Sampler):
+    def __init__(
+        self,
+        dataset,
+        batch_size=1,
+        drop_last=False,
+        indices=None,
+        world_size=None,
+        rank=None,
+        seed=None,
+    ):
+        r"""
+        Sample elements randomly without replacement
+        """
+        super().__init__(dataset, batch_size, drop_last, None, world_size, rank, seed)
+        if indices is not None and not isinstance(indices, collections.abc.Sequence):
+            raise ValueError(
+                "indices should be None or a sequence, "
+                "but got indices={}".format(indices)
+            )
+        self.indices = indices
+
+    def sample(self) -> List:
+        if self.indices is None:
+            return self.rng.permutation(len(self.dataset)).tolist()
+        else:
+            return self.rng.permutation(self.indices).tolist()
+
+
+class ReplacementSampler(Sampler):
+    def __init__(
+        self,
+        dataset,
+        batch_size=1,
+        drop_last=False,
+        num_samples=None,
+        weights=None,
+        world_size=None,
+        rank=None,
+        seed=None,
+    ):
+        r"""
+        Sample elements randomly with replacement
+
+        :type weights: List
+        :param weights: weights for sampling indices, it could be unnormalized weights
+        """
+        super().__init__(
+            dataset, batch_size, drop_last, num_samples, world_size, rank, seed
+        )
+        if weights is not None:
+            if not isinstance(weights, collections.abc.Sequence):
+                raise ValueError(
+                    "weights should be None or a sequence, "
+                    "but got weights={}".format(weights)
+                )
+            if len(weights) != len(dataset):
+                raise ValueError(
+                    "len(dataset)={} should be equal to "
+                    "len(weights)={}".format(len(dataset), len(weights))
+                )
+        self.weights = weights
+        if self.weights is not None:
+            self.weights = np.array(weights) / sum(weights)
+
+    def sample(self) -> List:
+        n = len(self.dataset)
+        if self.weights is None:
+            return self.rng.randint(n, size=self.num_samples).tolist()
+        else:
+            # draw index samples according to the normalized weights
+            return self.rng.choice(
+                n, size=self.num_samples, replace=True, p=self.weights
+            ).tolist()
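For orientation, a weighted-sampling sketch built on ``ReplacementSampler`` (the dataset and weights are placeholders; weights are normalized in ``__init__``, and the weighted draw shown relies on the index-based sampling above):

```python
from megengine.data.sampler import ReplacementSampler

data = list(range(10))            # stand-in "dataset"; only __len__ is needed here
weights = [1] * 9 + [10]          # oversample the last element
sampler = ReplacementSampler(data, batch_size=4, weights=weights, seed=0)
for batch in sampler:             # each batch is a list of sampled indices
    print(batch)
    break
```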
+
+
+class Infinite(Sampler):
+    r"""Infinite Sampler wrapper for a basic sampler"""
+
+    def sample(self):
+        raise NotImplementedError("sample method not supported in Infinite")
+
+    def __init__(self, sampler):
+        self.sampler = sampler
+        self.sampler_iter = iter(self.sampler)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        try:
+            index = next(self.sampler_iter)
+        except StopIteration:
+            self.sampler_iter = iter(self.sampler)
+            index = next(self.sampler_iter)
+        return index
+
+    def __len__(self):
+        return np.iinfo(np.int64).max
diff --git a/imperative/python/megengine/data/transform/__init__.py b/imperative/python/megengine/data/transform/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..30424cbc6c31e8c4b9dde9ed10a65b3317b74294
--- /dev/null
+++ b/imperative/python/megengine/data/transform/__init__.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .meta_transform import PseudoTransform, Transform
+from .vision import *
diff --git a/imperative/python/megengine/data/transform/meta_transform.py b/imperative/python/megengine/data/transform/meta_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7fd4f47a457e16e457ada5f9bc2ac92bb732cdc
--- /dev/null
+++ b/imperative/python/megengine/data/transform/meta_transform.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from abc import ABC, abstractmethod
+from typing import Sequence, Tuple
+
+
+class Transform(ABC):
+    """
+    Rewrite the ``apply`` method in a subclass.
+    """
+
+    def apply_batch(self, inputs: Sequence[Tuple]):
+        return tuple(self.apply(input) for input in inputs)
+
+    @abstractmethod
+    def apply(self, input: Tuple):
+        pass
+
+    def __repr__(self):
+        return self.__class__.__name__
+
+
+class PseudoTransform(Transform):
+    def apply(self, input: Tuple):
+        return input
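As a quick illustration of the ``Transform`` contract defined above, a subclass only needs to implement ``apply``; the transform below is hypothetical and not part of the patch:

```python
from typing import Tuple
from megengine.data.transform.meta_transform import Transform

class AddLabelOffset(Transform):
    """Hypothetical transform: shifts the label in an (image, label) tuple."""
    def __init__(self, offset: int = 1):
        self.offset = offset

    def apply(self, input: Tuple):
        image, label = input
        return image, label + self.offset

print(AddLabelOffset().apply(("img-placeholder", 3)))  # ('img-placeholder', 4)
```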
diff --git a/imperative/python/megengine/data/transform/vision/__init__.py b/imperative/python/megengine/data/transform/vision/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d90c9e989d1bd7515e1419564d58b87c9cce028e
--- /dev/null
+++ b/imperative/python/megengine/data/transform/vision/__init__.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .transform import *
diff --git a/imperative/python/megengine/data/transform/vision/functional.py b/imperative/python/megengine/data/transform/vision/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2f4e512d624529f1c4c473632e3495913e8ee74
--- /dev/null
+++ b/imperative/python/megengine/data/transform/vision/functional.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import collections.abc
+import functools
+import random
+
+import cv2
+import numpy as np
+
+
+def wrap_keepdims(func):
+    """Wrapper to keep the dimension of input images unchanged"""
+
+    @functools.wraps(func)
+    def wrapper(image, *args, **kwargs):
+        if len(image.shape) != 3:
+            raise ValueError(
+                "image must have 3 dims, but got {} dims".format(len(image.shape))
+            )
+        ret = func(image, *args, **kwargs)
+        if len(ret.shape) == 2:
+            ret = ret[:, :, np.newaxis]
+        return ret
+
+    return wrapper
+
+
+@wrap_keepdims
+def to_gray(image):
+    r"""
+    Change BGR format image's color space to gray
+
+    :param image: Input BGR format image, with (H, W, C) shape
+    :return: Gray format image, with (H, W, C) shape
+    """
+    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+
+@wrap_keepdims
+def to_bgr(image):
+    r"""
+    Change gray format image's color space to BGR
+
+    :param image: input Gray format image, with (H, W, C) shape
+    :return: BGR format image, with (H, W, C) shape
+    """
+    return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
+
+
+@wrap_keepdims
+def pad(input, size, value):
+    r"""
+    Pad input data with *value* and given *size*
+
+    :param input: Input data, with (H, W, C) shape
+    :param size: Padding size of input data, it could be an integer or a sequence.
+        If it's an integer, the input data will be padded in four directions.
+        If it's a sequence of two integers, the bottom and right sides
+        of the input data will be padded.
+        If it's a sequence of four integers, the top, bottom, left, and right
+        sides of the input data will be padded with the given sizes.
+    :param value: Padding value of data, could be a sequence of int or float.
+        If it is a float value, the dtype of the image will be cast to float32 as well.
+    :return: Padded image
+    """
+    if isinstance(size, int):
+        size = (size, size, size, size)
+    elif isinstance(size, collections.abc.Sequence) and len(size) == 2:
+        size = (0, size[0], 0, size[1])
+    if np.array(value).dtype == float:
+        input = input.astype(np.float32)
+    return cv2.copyMakeBorder(input, *size, cv2.BORDER_CONSTANT, value=value)
+
+
+@wrap_keepdims
+def flip(image, flipCode):
+    r"""
+    According to flipCode (the type of flip), flip the input image
+
+    :param image: Input image, with (H, W, C) shape
+    :param flipCode: code that indicates the type of flip.
+        1 : Flip horizontally
+        0 : Flip vertically
+        -1 : Flip horizontally and vertically
+    :return: BGR format image, with (H, W, C) shape
+    """
+    return cv2.flip(image, flipCode=flipCode)
+
+
+@wrap_keepdims
+def resize(input, size, interpolation=cv2.INTER_LINEAR):
+    r"""
+    Resize the input data to the given size
+
+    :param input: Input data, could be image or masks, with (H, W, C) shape
+    :param size: Target size of input data, with (height, width) shape.
+    :param interpolation: Interpolation method.
+    :return: Resized data, with (H, W, C) shape
+    """
+    if len(size) != 2:
+        raise ValueError("resize needs (h, w), but got {}".format(size))
+
+    if isinstance(interpolation, collections.abc.Sequence):
+        interpolation = random.choice(interpolation)
+    return cv2.resize(input, size[::-1], interpolation=interpolation)
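A short sketch exercising these helpers together (array contents are arbitrary; shapes follow the (H, W, C) convention documented above):

```python
import numpy as np
from megengine.data.transform.vision import functional as F

img = np.zeros((4, 6, 3), dtype=np.uint8)   # (H, W, C)
padded = F.pad(img, (1, 2), value=0)        # pads bottom by 1 and right by 2
flipped = F.flip(img, flipCode=1)           # horizontal flip
resized = F.resize(img, (8, 12))            # (height, width) target
print(padded.shape, flipped.shape, resized.shape)  # (5, 8, 3) (4, 6, 3) (8, 12, 3)
```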
diff --git a/imperative/python/megengine/data/transform/vision/transform.py b/imperative/python/megengine/data/transform/vision/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf3834a95507df99388908b5f26c11fb171e180c
--- /dev/null
+++ b/imperative/python/megengine/data/transform/vision/transform.py
@@ -0,0 +1,1025 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import collections.abc
+import math
+from typing import Sequence, Tuple
+
+import cv2
+import numpy as np
+
+from megengine.data.transform import Transform
+from megengine.data.transform.vision import functional as F
+
+__all__ = [
+    "VisionTransform",
+    "ToMode",
+    "Compose",
+    "TorchTransformCompose",
+    "Pad",
+    "Resize",
+    "ShortestEdgeResize",
+    "RandomResize",
+    "RandomCrop",
+    "RandomResizedCrop",
+    "CenterCrop",
+    "RandomHorizontalFlip",
+    "RandomVerticalFlip",
+    "Normalize",
+    "GaussianNoise",
+    "BrightnessTransform",
+    "SaturationTransform",
+    "ContrastTransform",
+    "HueTransform",
+    "ColorJitter",
+    "Lighting",
+]
+
+
+class VisionTransform(Transform):
+    r"""
+    Base class of all transforms used in computer vision.
+    Calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*()
+    methods. If you want to implement a custom transform method for images,
+    override the _apply_image method in a subclass.
+
+    :param order: Input type order. Input is a tuple that contains different structures,
+        order is used to specify the order of the structures. For example, if your input
+        is (image, boxes) type, then the order should be ("image", "boxes").
+        Currently available strings & data types are described below:
+
+        * "image": input image, with shape of (H, W, C)
+        * "coords": coordinates, with shape of (N, 2)
+        * "boxes": bounding boxes, with shape of (N, 4), "xyxy" format,
+          the 1st "xy" represents the top left point of a box,
+          the 2nd "xy" represents the right bottom point.
+        * "mask": map used for segmentation, with shape of (H, W, 1)
+        * "keypoints": keypoints with shape of (N, K, 3), N for number of instances,
+          and K for number of keypoints in one instance. The first two elements
+          of the last axis are the coordinates of the keypoints and the 3rd element is
+          the label of the keypoints.
+        * "polygons": A sequence that contains numpy arrays; its length is the number
+          of instances. Each numpy array represents the polygon coordinates of one instance.
+        * "category": categories for some data type. For example, "image_category"
+          means the category of the input image and "boxes_category" means the categories of
+          bounding boxes.
+        * "info": information for images such as image shapes and image path.
+
+    You can also customize your own data types, as long as you implement the
+    corresponding _apply_*() methods; otherwise ``NotImplementedError`` will be raised.
+    """
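To make the dispatch concrete, a sketch of a custom ``VisionTransform`` that inverts the image and passes coordinates through unchanged (hypothetical, not part of the patch):

```python
import numpy as np
from megengine.data.transform.vision.transform import VisionTransform

class InvertColor(VisionTransform):
    """Hypothetical transform: inverts image intensities, leaves boxes alone."""
    def _apply_image(self, image):
        return 255 - image

    def _apply_coords(self, coords):  # boxes are routed through _apply_coords
        return coords

sample = (np.zeros((2, 2, 3), dtype=np.uint8), np.array([[0, 0, 1, 1]], np.float32))
out_img, out_boxes = InvertColor(order=("image", "boxes")).apply(sample)
```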
+ """ + + def __init__(self, order=None): + super().__init__() + if order is None: + order = ("image",) + elif not isinstance(order, collections.abc.Sequence): + raise ValueError( + "order should be a sequence, but got order={}".format(order) + ) + for k in order: + if k in ("batch",): + raise ValueError("{} is invalid data type".format(k)) + elif k.endswith("category") or k.endswith("info"): + # when the key is *category or info, we should do nothing + # if the corresponding apply methods are not implemented. + continue + elif self._get_apply(k) is None: + raise NotImplementedError("{} is unsupported data type".format(k)) + self.order = order + + def apply_batch(self, inputs: Sequence[Tuple]): + r"""Apply transform on batch input data""" + return tuple(self.apply(input) for input in inputs) + + def apply(self, input: Tuple): + r"""Apply transform on single input data""" + if not isinstance(input, tuple): + input = (input,) + + output = [] + for i in range(min(len(input), len(self.order))): + apply_func = self._get_apply(self.order[i]) + if apply_func is None: + output.append(input[i]) + else: + output.append(apply_func(input[i])) + if len(input) > len(self.order): + output.extend(input[len(self.order) :]) + + if len(output) == 1: + output = output[0] + else: + output = tuple(output) + return output + + def _get_apply(self, key): + return getattr(self, "_apply_{}".format(key), None) + + def _get_image(self, input: Tuple): + if not isinstance(input, tuple): + input = (input,) + return input[self.order.index("image")] + + def _apply_image(self, image): + raise NotImplementedError + + def _apply_coords(self, coords): + raise NotImplementedError + + def _apply_boxes(self, boxes): + idxs = np.array([(0, 1), (2, 1), (0, 3), (2, 3)]).flatten() + coords = np.asarray(boxes).reshape(-1, 4)[:, idxs].reshape(-1, 2) + coords = self._apply_coords(coords).reshape((-1, 4, 2)) + minxy = coords.min(axis=1) + maxxy = coords.max(axis=1) + trans_boxes = np.concatenate((minxy, maxxy), axis=1) + return trans_boxes + + def _apply_mask(self, mask): + raise NotImplementedError + + def _apply_keypoints(self, keypoints): + coords, visibility = keypoints[..., :2], keypoints[..., 2:] + trans_coords = [self._apply_coords(p) for p in coords] + return np.concatenate((trans_coords, visibility), axis=-1) + + def _apply_polygons(self, polygons): + return [[self._apply_coords(p) for p in instance] for instance in polygons] + + +class ToMode(VisionTransform): + r"""Change input data to a target mode. + For example, most transforms use HWC mode image, + while the Neural Network might use CHW mode input tensor + + :param mode: Output mode of input. Use "CHW" mode by default. + :param order: The same with :class:`VisionTransform` + """ + + def __init__(self, mode="CHW", *, order=None): + super().__init__(order) + assert mode in ["CHW"], "unsupported mode: {}".format(mode) + self.mode = mode + + def _apply_image(self, image): + if self.mode == "CHW": + return np.ascontiguousarray(np.rollaxis(image, 2)) + return image + + def _apply_coords(self, coords): + return coords + + def _apply_mask(self, mask): + if self.mode == "CHW": + return np.ascontiguousarray(np.rollaxis(mask, 2)) + return mask + + +class Compose(VisionTransform): + r""" + Composes several transforms together. + + :param transforms: List of :class:`VisionTransform` to compose. + :param batch_compose: Whether use shuffle_indices for batch data or not. + If True, use original input sequence. + Otherwise, the shuffle_indices will be used for transforms. 
+
+
+class Compose(VisionTransform):
+    r"""
+    Composes several transforms together.
+
+    :param transforms: List of :class:`VisionTransform` to compose.
+    :param batch_compose: Whether to apply the transforms along the whole batch.
+        If ``True``, each transform receives the whole batch via ``apply_batch``,
+        in the original transform order; shuffle_indices is not supported in this mode.
+        Otherwise the transforms are applied per sample, and shuffle_indices may be used.
+    :param shuffle_indices: Indices used for random shuffle, starting at 1.
+        For example, if shuffle_indices is [(1, 3), (2, 4)], then the 1st and 3rd transforms
+        will be randomly shuffled, and the 2nd and 4th transforms will also be shuffled.
+    :param order: The same with :class:`VisionTransform`
+
+    Example:
+
+    .. testcode::
+
+        from megengine.data.transform import RandomHorizontalFlip, RandomVerticalFlip, CenterCrop, ToMode, Compose
+
+        transform_func = Compose([
+            RandomHorizontalFlip(),
+            RandomVerticalFlip(),
+            CenterCrop(100),
+            ToMode("CHW"),
+            ],
+            shuffle_indices=[(1, 2, 3)]
+        )
+    """
+
+    def __init__(
+        self, transforms=[], batch_compose=False, shuffle_indices=None, *, order=None
+    ):
+        super().__init__(order)
+        self.transforms = transforms
+        self._set_order()
+
+        if batch_compose and shuffle_indices is not None:
+            raise ValueError(
+                "Do not support shuffle when applying transforms along the whole batch"
+            )
+        self.batch_compose = batch_compose
+
+        if shuffle_indices is not None:
+            shuffle_indices = [tuple(x - 1 for x in idx) for idx in shuffle_indices]
+        self.shuffle_indices = shuffle_indices
+
+    def _set_order(self):
+        for t in self.transforms:
+            t.order = self.order
+            if isinstance(t, Compose):
+                t._set_order()
+
+    def apply_batch(self, inputs: Sequence[Tuple]):
+        if self.batch_compose:
+            for t in self.transforms:
+                inputs = t.apply_batch(inputs)
+            return inputs
+        else:
+            return super().apply_batch(inputs)
+
+    def apply(self, input: Tuple):
+        for t in self._shuffle():
+            input = t.apply(input)
+        return input
+
+    def _shuffle(self):
+        if self.shuffle_indices is not None:
+            source_idx = list(range(len(self.transforms)))
+            for idx in self.shuffle_indices:
+                shuffled = np.random.permutation(idx).tolist()
+                for src, dst in zip(idx, shuffled):
+                    source_idx[src] = dst
+            return [self.transforms[i] for i in source_idx]
+        else:
+            return self.transforms
+
+
+class TorchTransformCompose(VisionTransform):
+    r"""
+    Compose class used for transforms in torchvision; it only supports PIL images.
+    Transforms that operate on tensors, such as Normalize and ToTensor in torchvision,
+    are not supported.
+
+    :param transforms: The same with ``Compose``
+    :param order: The same with :class:`VisionTransform`
+    """
+
+    def __init__(self, transforms, *, order=None):
+        super().__init__(order)
+        self.transforms = transforms
+
+    def _apply_image(self, image):
+        from PIL import Image
+
+        try:
+            import accimage
+        except ImportError:
+            accimage = None
+
+        if image.shape[0] == 3:  # CHW
+            image = np.ascontiguousarray(image[[2, 1, 0]])
+        elif image.shape[2] == 3:  # HWC
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        image = Image.fromarray(image.astype(np.uint8))
+
+        for t in self.transforms:
+            image = t(image)
+
+        if isinstance(image, Image.Image) or (
+            accimage is not None and isinstance(image, accimage.Image)
+        ):
+            image = np.array(image, dtype=np.uint8)
+        if image.shape[0] == 3:  # CHW
+            image = np.ascontiguousarray(image[[2, 1, 0]])
+        elif image.shape[2] == 3:  # HWC
+            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+        return image
+
+
+class Pad(VisionTransform):
+    r"""Pad the input data.
+
+    :param size: Padding size of the input image, it could be an integer or a sequence.
+        If it's an integer, the input image will be padded in four directions.
+        If it's a sequence of two integers, the bottom and right sides
+        of the image will be padded.
+        If it's a sequence of four integers, the top, bottom, left, and right
+        sides of the image will be padded with the given size.
+ :param value: Padding value of image, could be a sequence of int or float. + if it's float value, the dtype of image will be casted to float32 also. + :param mask_value: Padding value of segmentation map. + :param order: The same with :class:`VisionTransform` + """ + + def __init__(self, size=0, value=0, mask_value=0, *, order=None): + super().__init__(order) + if isinstance(size, int): + size = (size, size, size, size) + elif isinstance(size, collections.abc.Sequence) and len(size) == 2: + size = (0, size[0], 0, size[1]) + elif not (isinstance(size, collections.abc.Sequence) and len(size) == 4): + raise ValueError( + "size should be a list/tuple which contains " + "(top, down, left, right) four pad sizes." + ) + self.size = size + self.value = value + if not isinstance(mask_value, int): + raise ValueError( + "mask_value should be a positive integer, " + "but got mask_value={}".format(mask_value) + ) + self.mask_value = mask_value + + def _apply_image(self, image): + return F.pad(image, self.size, self.value) + + def _apply_coords(self, coords): + coords[:, 0] += self.size[2] + coords[:, 1] += self.size[0] + return coords + + def _apply_mask(self, mask): + return F.pad(mask, self.size, self.mask_value) + + +class Resize(VisionTransform): + r"""Resize the input data. + + :param output_size: Target size of image, with (height, width) shape. + :param interpolation: Interpolation method. All methods are listed below: + + * cv2.INTER_NEAREST – a nearest-neighbor interpolation. + * cv2.INTER_LINEAR – a bilinear interpolation (used by default). + * cv2.INTER_AREA – resampling using pixel area relation. + * cv2.INTER_CUBIC – a bicubic interpolation over 4×4 pixel neighborhood. + * cv2.INTER_LANCZOS4 – a Lanczos interpolation over 8×8 pixel neighborhood. + :param order: The same with :class:`VisionTransform` + """ + + def __init__(self, output_size, interpolation=cv2.INTER_LINEAR, *, order=None): + super().__init__(order) + self.output_size = output_size + self.interpolation = interpolation + + def apply(self, input: Tuple): + self._shape_info = self._get_shape(self._get_image(input)) + return super().apply(input) + + def _apply_image(self, image): + h, w, th, tw = self._shape_info + if h == th and w == tw: + return image + return F.resize(image, (th, tw), self.interpolation) + + def _apply_coords(self, coords): + h, w, th, tw = self._shape_info + if h == th and w == tw: + return coords + coords[:, 0] = coords[:, 0] * (tw / w) + coords[:, 1] = coords[:, 1] * (th / h) + return coords + + def _apply_mask(self, mask): + h, w, th, tw = self._shape_info + if h == th and w == tw: + return mask + return F.resize(mask, (th, tw), cv2.INTER_NEAREST) + + def _get_shape(self, image): + h, w, _ = image.shape + if isinstance(self.output_size, int): + if min(h, w) == self.output_size: + return h, w, h, w + if h < w: + th = self.output_size + tw = int(self.output_size * w / h) + else: + tw = self.output_size + th = int(self.output_size * h / w) + return h, w, th, tw + else: + return (h, w, *self.output_size) + + +class ShortestEdgeResize(VisionTransform): + def __init__( + self, + min_size, + max_size, + sample_style="range", + interpolation=cv2.INTER_LINEAR, + *, + order=None + ): + super().__init__(order) + if sample_style not in ("range", "choice"): + raise NotImplementedError( + "{} is unsupported sample style".format(sample_style) + ) + self.sample_style = sample_style + if isinstance(min_size, int): + min_size = (min_size, min_size) + self.min_size = min_size + self.max_size = max_size + self.interpolation 
= interpolation
+
+    def apply(self, input: Tuple):
+        self._shape_info = self._get_shape(self._get_image(input))
+        return super().apply(input)
+
+    def _apply_image(self, image):
+        h, w, th, tw = self._shape_info
+        if h == th and w == tw:
+            return image
+        return F.resize(image, (th, tw), self.interpolation)
+
+    def _apply_coords(self, coords):
+        h, w, th, tw = self._shape_info
+        if h == th and w == tw:
+            return coords
+        coords[:, 0] = coords[:, 0] * (tw / w)
+        coords[:, 1] = coords[:, 1] * (th / h)
+        return coords
+
+    def _apply_mask(self, mask):
+        h, w, th, tw = self._shape_info
+        if h == th and w == tw:
+            return mask
+        return F.resize(mask, (th, tw), cv2.INTER_NEAREST)
+
+    def _get_shape(self, image):
+        h, w, _ = image.shape
+        if self.sample_style == "range":
+            size = np.random.randint(self.min_size[0], self.min_size[1] + 1)
+        else:
+            size = np.random.choice(self.min_size)
+
+        scale = size / min(h, w)
+        if h < w:
+            th, tw = size, scale * w
+        else:
+            th, tw = scale * h, size
+        if max(th, tw) > self.max_size:
+            scale = self.max_size / max(th, tw)
+            th = th * scale
+            tw = tw * scale
+        th = int(round(th))
+        tw = int(round(tw))
+        return h, w, th, tw
+
+
+class RandomResize(VisionTransform):
+    r"""Resize the input data randomly.
+
+    :param scale_range: Range (min, max) from which the resize scale factor is
+        sampled uniformly; the sampled factor is applied to both height and width.
+    :param interpolation: Interpolation method, the same with :class:`Resize`.
+    :param order: The same with :class:`VisionTransform`
+    """
+
+    def __init__(self, scale_range, interpolation=cv2.INTER_LINEAR, *, order=None):
+        super().__init__(order)
+        self.scale_range = scale_range
+        self.interpolation = interpolation
+
+    def apply(self, input: Tuple):
+        self._shape_info = self._get_shape(self._get_image(input))
+        return super().apply(input)
+
+    def _apply_image(self, image):
+        h, w, th, tw = self._shape_info
+        if h == th and w == tw:
+            return image
+        return F.resize(image, (th, tw), self.interpolation)
+
+    def _apply_coords(self, coords):
+        h, w, th, tw = self._shape_info
+        if h == th and w == tw:
+            return coords
+        coords[:, 0] = coords[:, 0] * (tw / w)
+        coords[:, 1] = coords[:, 1] * (th / h)
+        return coords
+
+    def _apply_mask(self, mask):
+        h, w, th, tw = self._shape_info
+        if h == th and w == tw:
+            return mask
+        return F.resize(mask, (th, tw), cv2.INTER_NEAREST)
+
+    def _get_shape(self, image):
+        h, w, _ = image.shape
+        scale = np.random.uniform(*self.scale_range)
+        th = int(round(h * scale))
+        tw = int(round(w * scale))
+        return h, w, th, tw
+
+
+class RandomCrop(VisionTransform):
+    r"""Crop the input data randomly. Before applying the crop transform,
+    the image is padded first. If the target size is still bigger than the
+    padded image, the image is further padded to the target size.
+
+    :param output_size: Target size of output image, with (height, width) shape.
+ :param padding_size: The same with `size` in ``Pad`` + :param padding_value: The same with `value` in ``Pad`` + :param order: The same with :class:`VisionTransform` + """ + + def __init__( + self, + output_size, + padding_size=0, + padding_value=[0, 0, 0], + padding_maskvalue=0, + *, + order=None + ): + super().__init__(order) + if isinstance(output_size, int): + self.output_size = (output_size, output_size) + else: + self.output_size = output_size + self.pad = Pad(padding_size, padding_value, order=self.order) + self.padding_value = padding_value + self.padding_maskvalue = padding_maskvalue + + def apply(self, input): + input = self.pad.apply(input) + self._h, self._w, _ = self._get_image(input).shape + self._th, self._tw = self.output_size + self._x = np.random.randint(0, max(0, self._w - self._tw) + 1) + self._y = np.random.randint(0, max(0, self._h - self._th) + 1) + return super().apply(input) + + def _apply_image(self, image): + if self._th > self._h: + image = F.pad(image, (self._th - self._h, 0), self.padding_value) + if self._tw > self._w: + image = F.pad(image, (0, self._tw - self._w), self.padding_value) + return image[self._y : self._y + self._th, self._x : self._x + self._tw] + + def _apply_coords(self, coords): + coords[:, 0] -= self._x + coords[:, 1] -= self._y + return coords + + def _apply_mask(self, mask): + if self._th > self._h: + mask = F.pad(mask, (self._th - self._h, 0), self.padding_maskvalue) + if self._tw > self._w: + mask = F.pad(mask, (0, self._tw - self._w), self.padding_maskvalue) + return mask[self._y : self._y + self._th, self._x : self._x + self._tw] + + +class RandomResizedCrop(VisionTransform): + r"""Crop the input data to random size and aspect ratio. + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made. + After applying crop transfrom, the input data will be resized to given size. + + :param output_size: Target size of output image, with (height, width) shape. + :param scale_range: Range of size of the origin size cropped. Default: (0.08, 1.0) + :param ratio_range: Range of aspect ratio of the origin aspect ratio cropped. 
Default: (0.75, 1.33) + :param order: The same with :class:`VisionTransform` + """ + + def __init__( + self, + output_size, + scale_range=(0.08, 1.0), + ratio_range=(3.0 / 4, 4.0 / 3), + interpolation=cv2.INTER_LINEAR, + *, + order=None + ): + super().__init__(order) + if isinstance(output_size, int): + self.output_size = (output_size, output_size) + else: + self.output_size = output_size + assert ( + scale_range[0] <= scale_range[1] + ), "scale_range should be of kind (min, max)" + assert ( + ratio_range[0] <= ratio_range[1] + ), "ratio_range should be of kind (min, max)" + self.scale_range = scale_range + self.ratio_range = ratio_range + self.interpolation = interpolation + + def apply(self, input: Tuple): + self._coord_info = self._get_coord(self._get_image(input)) + return super().apply(input) + + def _apply_image(self, image): + x, y, w, h = self._coord_info + cropped_img = image[y : y + h, x : x + w] + return F.resize(cropped_img, self.output_size, self.interpolation) + + def _apply_coords(self, coords): + x, y, w, h = self._coord_info + coords[:, 0] = (coords[:, 0] - x) * self.output_size[1] / w + coords[:, 1] = (coords[:, 1] - y) * self.output_size[0] / h + return coords + + def _apply_mask(self, mask): + x, y, w, h = self._coord_info + cropped_mask = mask[y : y + h, x : x + w] + return F.resize(cropped_mask, self.output_size, cv2.INTER_NEAREST) + + def _get_coord(self, image, attempts=10): + height, width, _ = image.shape + area = height * width + + for _ in range(attempts): + target_area = np.random.uniform(*self.scale_range) * area + log_ratio = tuple(math.log(x) for x in self.ratio_range) + aspect_ratio = math.exp(np.random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if 0 < w <= width and 0 < h <= height: + x = np.random.randint(0, width - w + 1) + y = np.random.randint(0, height - h + 1) + return x, y, w, h + + # Fallback to central crop + in_ratio = float(width) / float(height) + if in_ratio < min(self.ratio_range): + w = width + h = int(round(w / min(self.ratio_range))) + elif in_ratio > max(self.ratio_range): + h = height + w = int(round(h * max(self.ratio_range))) + else: # whole image + w = width + h = height + x = (width - w) // 2 + y = (height - h) // 2 + return x, y, w, h + + +class CenterCrop(VisionTransform): + r"""Crops the given the input data at the center. + + :param output_size: Target size of output image, with (height, width) shape. 
+
+    :param order: The same with :class:`VisionTransform`
+    """
+
+    def __init__(self, output_size, *, order=None):
+        super().__init__(order)
+        if isinstance(output_size, int):
+            self.output_size = (output_size, output_size)
+        else:
+            self.output_size = output_size
+
+    def apply(self, input: Tuple):
+        self._coord_info = self._get_coord(self._get_image(input))
+        return super().apply(input)
+
+    def _apply_image(self, image):
+        x, y = self._coord_info
+        th, tw = self.output_size
+        return image[y : y + th, x : x + tw]
+
+    def _apply_coords(self, coords):
+        x, y = self._coord_info
+        coords[:, 0] -= x
+        coords[:, 1] -= y
+        return coords
+
+    def _apply_mask(self, mask):
+        x, y = self._coord_info
+        th, tw = self.output_size
+        return mask[y : y + th, x : x + tw]
+
+    def _get_coord(self, image):
+        th, tw = self.output_size
+        h, w, _ = image.shape
+        assert th <= h and tw <= w, "output size is bigger than image size"
+        x = int(round((w - tw) / 2.0))
+        y = int(round((h - th) / 2.0))
+        return x, y
+
+
+class RandomHorizontalFlip(VisionTransform):
+    r"""Horizontally flip the input data randomly with a given probability.
+
+    :param prob: probability of the input data being flipped. Default: 0.5
+    :param order: The same with :class:`VisionTransform`
+    """
+
+    def __init__(self, prob: float = 0.5, *, order=None):
+        super().__init__(order)
+        self.prob = prob
+
+    def apply(self, input: Tuple):
+        self._flipped = np.random.random() < self.prob
+        self._w = self._get_image(input).shape[1]
+        return super().apply(input)
+
+    def _apply_image(self, image):
+        if self._flipped:
+            return F.flip(image, flipCode=1)
+        return image
+
+    def _apply_coords(self, coords):
+        if self._flipped:
+            coords[:, 0] = self._w - coords[:, 0]
+        return coords
+
+    def _apply_mask(self, mask):
+        if self._flipped:
+            return F.flip(mask, flipCode=1)
+        return mask
+
+
+class RandomVerticalFlip(VisionTransform):
+    r"""Vertically flip the input data randomly with a given probability.
+
+    :param prob: probability of the input data being flipped. Default: 0.5
+    :param order: The same with :class:`VisionTransform`
+    """
+
+    def __init__(self, prob: float = 0.5, *, order=None):
+        super().__init__(order)
+        self.prob = prob
+
+    def apply(self, input: Tuple):
+        self._flipped = np.random.random() < self.prob
+        self._h = self._get_image(input).shape[0]
+        return super().apply(input)
+
+    def _apply_image(self, image):
+        if self._flipped:
+            return F.flip(image, flipCode=0)
+        return image
+
+    def _apply_coords(self, coords):
+        if self._flipped:
+            coords[:, 1] = self._h - coords[:, 1]
+        return coords
+
+    def _apply_mask(self, mask):
+        if self._flipped:
+            return F.flip(mask, flipCode=0)
+        return mask
+
+
+class Normalize(VisionTransform):
+    r"""Normalize the input data with mean and standard deviation.
+    Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels,
+    this transform will normalize each channel of the input data.
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    :param mean: Sequence of means for each channel.
+    :param std: Sequence of standard deviations for each channel.
+ :param order: The same with :class:`VisionTransform` + """ + + def __init__(self, mean=0.0, std=1.0, *, order=None): + super().__init__(order) + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + + def _apply_image(self, image): + return (image - self.mean) / self.std + + def _apply_coords(self, coords): + return coords + + def _apply_mask(self, mask): + return mask + + +class GaussianNoise(VisionTransform): + r"""Add random gaussian noise to the input data. + Gaussian noise is generated with given mean and std. + + :param mean: Gaussian mean used to generate noise. + :param std: Gaussian standard deviation used to generate noise. + :param order: The same with :class:`VisionTransform` + """ + + def __init__(self, mean=0.0, std=1.0, *, order=None): + super().__init__(order) + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + + def _apply_image(self, image): + dtype = image.dtype + noise = np.random.normal(self.mean, self.std, image.shape) * 255 + image = image + noise.astype(np.float32) + return np.clip(image, 0, 255).astype(dtype) + + def _apply_coords(self, coords): + return coords + + def _apply_mask(self, mask): + return mask + + +class BrightnessTransform(VisionTransform): + r"""Adjust brightness of the input data. + + :param value: How much to adjust the brightness. Can be any + non negative number. 0 gives the original image + :param order: The same with :class:`VisionTransform` + """ + + def __init__(self, value, *, order=None): + super().__init__(order) + if value < 0: + raise ValueError("brightness value should be non-negative") + self.value = value + + def _apply_image(self, image): + if self.value == 0: + return image + + dtype = image.dtype + image = image.astype(np.float32) + alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) + image = image * alpha + return image.clip(0, 255).astype(dtype) + + def _apply_coords(self, coords): + return coords + + def _apply_mask(self, mask): + return mask + + +class ContrastTransform(VisionTransform): + r"""Adjust contrast of the input data. + + :param value: How much to adjust the contrast. Can be any + non negative number. 0 gives the original image + :param order: The same with :class:`VisionTransform` + """ + + def __init__(self, value, *, order=None): + super().__init__(order) + if value < 0: + raise ValueError("contrast value should be non-negative") + self.value = value + + def _apply_image(self, image): + if self.value == 0: + return image + + dtype = image.dtype + image = image.astype(np.float32) + alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) + image = image * alpha + F.to_gray(image).mean() * (1 - alpha) + return image.clip(0, 255).astype(dtype) + + def _apply_coords(self, coords): + return coords + + def _apply_mask(self, mask): + return mask + + +class SaturationTransform(VisionTransform): + r"""Adjust saturation of the input data. + + :param value: How much to adjust the saturation. Can be any + non negative number. 
0 gives the original image + :param order: The same with :class:`VisionTransform` + """ + + def __init__(self, value, *, order=None): + super().__init__(order) + if value < 0: + raise ValueError("saturation value should be non-negative") + self.value = value + + def _apply_image(self, image): + if self.value == 0: + return image + + dtype = image.dtype + image = image.astype(np.float32) + alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) + image = image * alpha + F.to_gray(image) * (1 - alpha) + return image.clip(0, 255).astype(dtype) + + def _apply_coords(self, coords): + return coords + + def _apply_mask(self, mask): + return mask + + +class HueTransform(VisionTransform): + r"""Adjust hue of the input data. + + :param value: How much to adjust the hue. Can be any number + between 0 and 0.5, 0 gives the original image + :param order: The same with :class:`VisionTransform` + """ + + def __init__(self, value, *, order=None): + super().__init__(order) + if value < 0 or value > 0.5: + raise ValueError("hue value should be in [0.0, 0.5]") + self.value = value + + def _apply_image(self, image): + if self.value == 0: + return image + + dtype = image.dtype + image = image.astype(np.uint8) + hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV_FULL) + h, s, v = cv2.split(hsv_image) + + alpha = np.random.uniform(-self.value, self.value) + h = h.astype(np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over="ignore"): + h += np.uint8(alpha * 255) + hsv_image = cv2.merge([h, s, v]) + return cv2.cvtColor(hsv_image, cv2.COLOR_HSV2BGR_FULL).astype(dtype) + + def _apply_coords(self, coords): + return coords + + def _apply_mask(self, mask): + return mask + + +class ColorJitter(VisionTransform): + r"""Randomly change the brightness, contrast, saturation and hue of an image. + + :param brightness: How much to jitter brightness. + Chosen uniformly from [max(0, 1 - brightness), 1 + brightness] + or the given [min, max]. Should be non negative numbers. + :param contrast: How much to jitter contrast. + Chosen uniformly from [max(0, 1 - contrast), 1 + contrast] + or the given [min, max]. Should be non negative numbers. + :param saturation: How much to jitter saturation. + Chosen uniformly from [max(0, 1 - saturation), 1 + saturation] + or the given [min, max]. Should be non negative numbers. + :param hue: How much to jitter hue. + Chosen uniformly from [-hue, hue] or the given [min, max]. + Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. 
+
+    :param order: The same with :class:`VisionTransform`
+    """
+
+    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, *, order=None):
+        super().__init__(order)
+        transforms = []
+        if brightness != 0:
+            transforms.append(BrightnessTransform(brightness))
+        if contrast != 0:
+            transforms.append(ContrastTransform(contrast))
+        if saturation != 0:
+            transforms.append(SaturationTransform(saturation))
+        if hue != 0:
+            transforms.append(HueTransform(hue))
+        self.transforms = Compose(
+            transforms,
+            shuffle_indices=[tuple(range(1, len(transforms) + 1))],
+            order=order,
+        )
+
+    def apply(self, input):
+        return self.transforms.apply(input)
+
+
+class Lighting(VisionTransform):
+    r"""Apply AlexNet-style PCA lighting noise to the input image, using
+    fixed ImageNet eigenvalues and eigenvectors (in BGR channel order).
+
+    :param scale: Standard deviation of the normal distribution used to
+        sample the noise magnitude along each principal component.
+    :param order: The same with :class:`VisionTransform`
+    """
+
+    def __init__(self, scale, *, order=None):
+        super().__init__(order)
+        if scale < 0:
+            raise ValueError("lighting scale should be non-negative")
+        self.scale = scale
+        self.eigvec = np.array(
+            [
+                [-0.5836, -0.6948, 0.4203],
+                [-0.5808, -0.0045, -0.8140],
+                [-0.5675, 0.7192, 0.4009],
+            ]
+        )  # reverse the first dimension for BGR
+        self.eigval = np.array([0.2175, 0.0188, 0.0045])
+
+    def _apply_image(self, image):
+        if self.scale == 0:
+            return image
+
+        dtype = image.dtype
+        image = image.astype(np.float32)
+        alpha = np.random.normal(scale=self.scale, size=3)
+        image = image + self.eigvec.dot(alpha * self.eigval)
+        return image.clip(0, 255).astype(dtype)
+
+    def _apply_coords(self, coords):
+        return coords
+
+    def _apply_mask(self, mask):
+        return mask
diff --git a/imperative/python/megengine/device.py b/imperative/python/megengine/device.py
new file mode 100644
index 0000000000000000000000000000000000000000..008920febb4efd4477dae6cf9773317b12048c69
--- /dev/null
+++ b/imperative/python/megengine/device.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import os
+
+from .core._imperative_rt.common import CompNode, DeviceType
+
+__all__ = [
+    "is_cuda_available",
+    "get_device_count",
+    "get_default_device",
+    "set_default_device",
+]
+
+_default_device = os.getenv("MGE_DEFAULT_DEVICE", "xpux")
+
+
+def _valid_device(inp):
+    if isinstance(inp, str) and len(inp) == 4:
+        if inp[0] in {"x", "c", "g"} and inp[1:3] == "pu":
+            if inp[3] == "x" or inp[3].isdigit():
+                return True
+    return False
+
+
+def _str2device_type(type_str: str, allow_unspec: bool = True):
+    type_str = type_str.upper()
+    if type_str == "CPU":
+        return DeviceType.CPU
+    elif type_str == "GPU" or type_str == "CUDA":
+        return DeviceType.CUDA
+    else:
+        assert allow_unspec and type_str == "XPU", "bad device type"
+        return DeviceType.UNSPEC
+
+
+def get_device_count(device_type: str) -> int:
+    """Gets number of devices installed on this system.
+
+    :param device_type: device type, one of 'gpu' or 'cpu'
+    """
+
+    device_type_set = ("cpu", "gpu")
+    assert device_type in device_type_set, "device must be one of {}".format(
+        device_type_set
+    )
+    device_type = _str2device_type(device_type)
+    return CompNode._get_device_count(device_type, False)
+
+
+def is_cuda_available() -> bool:
+    """Returns whether cuda device is available on this system."""
+    t = _str2device_type("gpu")
+    return CompNode._get_device_count(t, False) > 0
+
+
+def set_default_device(device: str = "xpux"):
+    r"""Sets default computing node.
+
+    :param device: default device type. The type can be 'cpu0', 'cpu1', etc.,
+        or 'gpu0', 'gpu1', etc., to specify the particular cpu or gpu to use.
+        'cpux' and 'gpux' can also be used to specify any number of cpu or gpu devices.
+
+        The 'multithread' device type is available for inference, and implements
+        multi-threading parallelism at the operator level. For example,
+        'multithread4' will compute with 4 threads.
+
+        The default value is 'xpux' to specify any device available.
+
+        It can also be set by the environment variable `MGE_DEFAULT_DEVICE`.
+    """
+    global _default_device  # pylint: disable=global-statement
+    assert _valid_device(device), "Invalid device name {}".format(device)
+    _default_device = device
+
+
+def get_default_device() -> str:
+    r"""Gets default computing node.
+
+    It returns the value set by :func:`~.set_default_device`.
+    """
+    return _default_device
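The device helpers above are typically used together at program start. Below is a minimal usage sketch, assuming this module is importable as megengine.device per the file path in this diff; the concrete device names are illustrative:

    from megengine.device import (
        get_default_device,
        is_cuda_available,
        set_default_device,
    )

    # Prefer an explicit GPU when CUDA is present; otherwise pin to CPU.
    if is_cuda_available():
        set_default_device("gpu0")
    else:
        set_default_device("cpu0")
    print(get_default_device())  # e.g. "gpu0"

Note that _valid_device only accepts 4-character names such as "gpu0" or "cpux", so a name like "gpu10" would be rejected by this validation.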
diff --git a/imperative/python/megengine/distributed/__init__.py b/imperative/python/megengine/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..30e0766f3dd17b959894148f334e8a9bba7a0a68
--- /dev/null
+++ b/imperative/python/megengine/distributed/__init__.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .group import (
+    WORLD,
+    get_backend,
+    get_client,
+    get_mm_server_addr,
+    get_py_server_addr,
+    get_rank,
+    get_world_size,
+    group_barrier,
+    init_process_group,
+    is_distributed,
+    new_group,
+)
+from .helper import synchronized
+from .launcher import launcher
+from .server import Client, Server
+from .util import get_free_ports
diff --git a/imperative/python/megengine/distributed/group.py b/imperative/python/megengine/distributed/group.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e60a4d51125999a256a2c5dd21f357c4632eed0
--- /dev/null
+++ b/imperative/python/megengine/distributed/group.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from typing import List, Optional, Tuple + +from ..device import set_default_device +from .server import Client, Server + + +class StaticData: + server = None + client = None + master_ip = None + py_server_port = None + mm_server_port = None + world_size = None + proc_rank = None + device = None + backend = None + next_stream = None + + +_sd = None + + +class Group: + def __init__(self, proc_ranks): + if len(proc_ranks) == 0: # empty group + self.proc_ranks = None + self.stream = None + else: + self.reset(proc_ranks) + + def reset(self, proc_ranks): + self.check(proc_ranks) + self.proc_ranks = proc_ranks + self.stream = _sd.next_stream + _sd.next_stream += 1 + + def check(self, proc_ranks): + assert _sd is not None, "please call init_process_group first" + for rank in proc_ranks: + assert isinstance(rank, int) + assert rank >= 0 and rank < _sd.world_size + assert _sd.proc_rank in proc_ranks + + @property + def size(self): + assert len(self.proc_ranks) > 0, "invalid group" + return len(self.proc_ranks) + + @property + def key(self): + assert len(self.proc_ranks) > 0, "invalid group" + return ",".join(map(str, self.proc_ranks)) + + @property + def rank(self): + assert len(self.proc_ranks) > 0, "invalid group" + return self.proc_ranks.index(_sd.proc_rank) + + @property + def comp_node(self): + assert len(self.proc_ranks) > 0, "invalid group" + return "gpu{}:{}".format(_sd.device, self.stream) + + +WORLD = Group([]) + + +def init_process_group( + master_ip: str, + port: int, + world_size: int, + rank: int, + device: int, + backend: Optional[str] = "nccl", +) -> None: + """Initialize the distributed process group and specify the device used in the current process + + :param master_ip: IP address of the master node + :param port: Port available for all processes to communicate + :param world_size: Total number of processes participating in the job + :param rank: Rank of the current process + :param device: The GPU device id to bind this process to + :param backend: Communicator backend, currently support 'nccl' and 'ucx' + """ + if not isinstance(master_ip, str): + raise TypeError("Expect type str but got {}".format(type(master_ip))) + if not isinstance(port, int): + raise TypeError("Expect type int but got {}".format(type(port))) + if not isinstance(world_size, int): + raise TypeError("Expect type int but got {}".format(type(world_size))) + if not isinstance(rank, int): + raise TypeError("Expect type int but got {}".format(type(rank))) + if not isinstance(device, int): + raise TypeError("Expect type int but got {}".format(type(backend))) + if not isinstance(backend, str): + raise TypeError("Expect type str but got {}".format(type(backend))) + + global _sd + assert _sd is None, "init_process_group should be called only once" + _sd = StaticData() + + assert world_size > 1 + assert rank >= 0 and rank < world_size + assert port > 0 + + _sd.client = Client(master_ip, port) + _sd.master_ip = master_ip + _sd.py_server_port = port + _sd.mm_server_port = _sd.client.get_mm_server_port() + _sd.world_size = world_size + _sd.proc_rank = rank + _sd.device = device + _sd.backend = backend + _sd.next_stream = 1 + + WORLD.reset(list(range(world_size))) + + set_default_device("gpu{}".format(device)) + + +def is_distributed() -> bool: + """Return True if the distributed process group has been initialized""" + return _sd is not None + + +def get_rank() -> int: + """Get the rank of the current process""" + return _sd.proc_rank if _sd is not None else 0 + + +def get_world_size() -> int: + """Get the total number of 
processes participating in the job""" + return _sd.world_size if _sd is not None else 1 + + +def get_backend() -> str: + """Get the backend str""" + assert _sd is not None, "please call init_process_group first" + return _sd.backend if _sd is not None else None + + +def get_py_server_addr() -> Tuple[str, int]: + """Get master_ip and port of python XML RPC server""" + assert _sd is not None, "please call init_process_group first" + return _sd.master_ip, _sd.py_server_port + + +def get_mm_server_addr() -> Tuple[str, int]: + """Get master_ip and port of C++ mm_server""" + assert _sd is not None, "please call init_process_group first" + return _sd.master_ip, _sd.mm_server_port + + +def get_client() -> Client: + """Get client of python XML RPC server""" + assert _sd is not None, "please call init_process_group first" + return _sd.client + + +def new_group(proc_ranks: List[int]) -> Group: + """Build a subgroup containing certain ranks""" + return Group(proc_ranks) + + +def group_barrier(group: Optional[Group] = WORLD) -> None: + """Block until all ranks in the group reach this barrier""" + assert isinstance(group, Group) + _sd.client.group_barrier(group.key, group.size) diff --git a/imperative/python/megengine/distributed/helper.py b/imperative/python/megengine/distributed/helper.py new file mode 100644 index 0000000000000000000000000000000000000000..f56cddc0272c1623d00ca837caa36e877ce38c3b --- /dev/null +++ b/imperative/python/megengine/distributed/helper.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import functools +from typing import Callable + +from .group import group_barrier, is_distributed + + +def synchronized(func: Callable): + """Decorator. Decorated function will synchronize when finished. + Specifically, we use this to prevent data race during hub.load""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not is_distributed(): + return func(*args, **kwargs) + + ret = func(*args, **kwargs) + group_barrier() + return ret + + return wrapper diff --git a/imperative/python/megengine/distributed/launcher.py b/imperative/python/megengine/distributed/launcher.py new file mode 100644 index 0000000000000000000000000000000000000000..152180abb47e212c9d89b77a94610c62a0623ee3 --- /dev/null +++ b/imperative/python/megengine/distributed/launcher.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
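The synchronized decorator above is a no-op outside distributed mode and appends a group_barrier() after the wrapped call otherwise. A minimal sketch of the intended use; the function name and body here are hypothetical:

    from megengine.distributed import synchronized

    @synchronized
    def write_shared_cache(path, payload):
        # Every rank may call this; the trailing barrier guarantees that no
        # rank returns before all ranks have finished, so readers never
        # observe a half-written cache file.
        with open(path, "wb") as f:
            f.write(payload)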
+import multiprocessing as mp + +from ..device import get_device_count +from .group import init_process_group +from .server import Server +from .util import get_free_ports + + +def _get_device_count(): + """use subprocess to avoid cuda environment initialization in the main process""" + + def run(q): + count = get_device_count("gpu") + q.put(count) + + q = mp.Queue() + p = mp.Process(target=run, args=(q,)) + p.start() + p.join() + return q.get() + + +def _run_wrapped(func, master_ip, port, world_size, rank, dev, args, kwargs): + """init distributed process group and run wrapped function""" + init_process_group( + master_ip=master_ip, port=port, world_size=world_size, rank=rank, device=dev + ) + func(*args, **kwargs) + + +def launcher(n_gpus): + """decorator for launching multiple processes in single-machine multi-gpu training""" + + count = _get_device_count() + assert isinstance(n_gpus, int) and n_gpus > 1, "invalid n_gpus" + assert n_gpus <= count, "{} gpus required, {} gpus provided".format(n_gpus, count) + + def decorator(func): + def wrapper(*args, **kwargs): + master_ip = "localhost" + port = get_free_ports(1)[0] + server = Server(port) + + procs = [] + for rank in range(n_gpus): + p = mp.Process( + target=_run_wrapped, + args=(func, master_ip, port, n_gpus, rank, rank, args, kwargs), + ) + p.start() + procs.append(p) + + for rank in range(n_gpus): + procs[rank].join() + code = procs[rank].exitcode + assert code == 0, "subprocess {} exit with code {}".format(rank, code) + + return wrapper + + return decorator diff --git a/imperative/python/megengine/distributed/server.py b/imperative/python/megengine/distributed/server.py new file mode 100644 index 0000000000000000000000000000000000000000..d3d811209a692f927e29aea47ce874fa61cf918c --- /dev/null +++ b/imperative/python/megengine/distributed/server.py @@ -0,0 +1,170 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
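Before moving on to the server implementation, a usage sketch for the launcher decorator defined above (assuming at least 2 GPUs are visible; the training body is hypothetical):

    import megengine.distributed as dist

    @dist.launcher(2)
    def main():
        # Each forked worker arrives here with the process group already
        # initialized and bound to its own GPU (rank doubles as device id).
        print("rank", dist.get_rank(), "of", dist.get_world_size())

    main()  # forks 2 workers, joins them, and asserts each exits with code 0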
+import multiprocessing as mp +import threading +import time +from collections import defaultdict +from functools import partial +from socketserver import ThreadingMixIn +from xmlrpc.client import ServerProxy +from xmlrpc.server import SimpleXMLRPCServer + +from ..core._imperative_rt.utils import create_mm_server +from .util import get_free_ports + + +class Future: + def __init__(self, ack=True): + self.ready = threading.Event() + self.ack = threading.Event() if ack else None + + def set(self, value): + self.value = value + self.ready.set() + if self.ack: + self.ack.wait() + + def get(self): + self.ready.wait() + if self.ack: + self.ack.set() + return self.value + + +class Methods: + def __init__(self, mm_server_port): + self.lock = threading.Lock() + self.mm_server_port = mm_server_port + self.dict_is_grad = defaultdict(partial(Future, True)) + self.dict_remote_tracer = defaultdict(partial(Future, True)) + self.dict_pack_list = defaultdict(partial(Future, False)) + self.dict_barrier_counter = defaultdict(int) + self.dict_barrier_event = defaultdict(threading.Event) + + def connect(self): + return True + + def get_mm_server_port(self): + return self.mm_server_port + + def set_is_grad(self, rank_peer, is_grad): + with self.lock: + future = self.dict_is_grad[rank_peer] + future.set(is_grad) + return True + + def check_is_grad(self, rank_peer): + with self.lock: + future = self.dict_is_grad[rank_peer] + ret = future.get() + with self.lock: + del self.dict_is_grad[rank_peer] + return ret + + def set_remote_tracer(self, rank_peer, tracer_set): + with self.lock: + future = self.dict_remote_tracer[rank_peer] + future.set(tracer_set) + return True + + def check_remote_tracer(self, rank_peer): + with self.lock: + future = self.dict_remote_tracer[rank_peer] + ret = future.get() + with self.lock: + del self.dict_remote_tracer[rank_peer] + return ret + + def set_pack_list(self, key, pack_list): + with self.lock: + future = self.dict_pack_list[key] + future.set(pack_list) + return True + + def get_pack_list(self, key): + with self.lock: + future = self.dict_pack_list[key] + return future.get() + + def group_barrier(self, key, size): + with self.lock: + self.dict_barrier_counter[key] += 1 + counter = self.dict_barrier_counter[key] + event = self.dict_barrier_event[key] + if counter == size: + del self.dict_barrier_counter[key] + del self.dict_barrier_event[key] + event.set() + else: + event.wait() + return True + + +class ThreadXMLRPCServer(ThreadingMixIn, SimpleXMLRPCServer): + pass + + +def start_server(py_server_port, mm_server_port): + server = ThreadXMLRPCServer(("0.0.0.0", py_server_port), logRequests=False) + server.register_instance(Methods(mm_server_port)) + server.serve_forever() + + +class Server: + def __init__(self, port): + self.py_server_port = get_free_ports(1)[0] if port == 0 else port + self.mm_server_port = create_mm_server("0.0.0.0", 0) + self.proc = mp.Process( + target=start_server, + args=(self.py_server_port, self.mm_server_port), + daemon=True, + ) + self.proc.start() + + +class Client: + def __init__(self, master_ip, port): + self.master_ip = master_ip + self.port = port + self.connect() + + def connect(self): + while True: + try: + self.proxy = ServerProxy( + "http://{}:{}".format(self.master_ip, self.port) + ) + if self.proxy.connect(): + break + except: + time.sleep(1) + + def get_mm_server_port(self): + return self.proxy.get_mm_server_port() + + def set_is_grad(self, rank_peer, is_grad): + self.proxy.set_is_grad(rank_peer, is_grad) + + def check_is_grad(self, rank_peer): + 
return self.proxy.check_is_grad(rank_peer) + + def set_remote_tracer(self, rank_peer, tracer_set): + self.proxy.set_remote_tracer(rank_peer, tracer_set) + + def check_remote_tracer(self, rank_peer): + return self.proxy.check_remote_tracer(rank_peer) + + def set_pack_list(self, key, pack_list): + self.proxy.set_pack_list(key, pack_list) + + def get_pack_list(self, key): + return self.proxy.get_pack_list(key) + + def group_barrier(self, key, size): + self.proxy.group_barrier(key, size) diff --git a/imperative/python/megengine/distributed/util.py b/imperative/python/megengine/distributed/util.py new file mode 100644 index 0000000000000000000000000000000000000000..b3a0a2aa18a2bfde09d7cd48d223e85867b60369 --- /dev/null +++ b/imperative/python/megengine/distributed/util.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import functools +import socket +from typing import List + + +def get_free_ports(num: int) -> List[int]: + """Get one or more free ports. + """ + socks, ports = [], [] + for i in range(num): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.bind(("", 0)) + socks.append(sock) + ports.append(sock.getsockname()[1]) + for sock in socks: + sock.close() + return ports diff --git a/imperative/python/megengine/functional/__init__.py b/imperative/python/megengine/functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..14fef9b30fa74bd7f2b6043909f1d25fadda28be --- /dev/null +++ b/imperative/python/megengine/functional/__init__.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# pylint: disable=redefined-builtin +from . import distributed +from .elemwise import * +from .graph import add_update +from .loss import ( + binary_cross_entropy, + cross_entropy, + cross_entropy_with_softmax, + hinge_loss, + l1_loss, + nll_loss, + smooth_l1_loss, + square_loss, + triplet_margin_loss, +) +from .math import * +from .nn import * +from .quantized import conv_bias_activation +from .tensor import * +from .utils import accuracy, zero_grad + +# delete namespace +# pylint: disable=undefined-variable +# del elemwise, graph, loss, math, nn, tensor # type: ignore[name-defined] diff --git a/imperative/python/megengine/functional/debug_param.py b/imperative/python/megengine/functional/debug_param.py new file mode 100644 index 0000000000000000000000000000000000000000..b27f4b4b205acc2ebbc4c947c5be3f4a955be048 --- /dev/null +++ b/imperative/python/megengine/functional/debug_param.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import os
+
+_conv_execution_strategy = os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY", "HEURISTIC")
+
+
+def get_conv_execution_strategy() -> str:
+    """Returns the execution strategy of :class:`~.Conv2d`.
+
+    See :func:`~.set_conv_execution_strategy` for possible return values.
+    """
+    return _conv_execution_strategy
+
+
+def set_conv_execution_strategy(option: str):
+    """Sets the execution strategy of :class:`~.Conv2d`.
+
+    :param option: Decides how :class:`~.Conv2d` algorithm is chosen.
+        Available values:
+
+        * 'HEURISTIC' uses heuristic to choose the fastest algorithm.
+        * 'PROFILE' runs possible algorithms on real device to find the best.
+        * 'PROFILE_HEURISTIC' uses profile result and heuristic to choose the fastest algorithm.
+        * 'PROFILE_REPRODUCIBLE' uses the fastest of profile result that is also reproducible.
+        * 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible.
+
+        The default strategy is 'HEURISTIC'.
+
+        It can also be set through the environment variable 'MEGENGINE_CONV_EXECUTION_STRATEGY'.
+    """
+    valid_option = (
+        "HEURISTIC",
+        "PROFILE",
+        "PROFILE_HEURISTIC",
+        "PROFILE_REPRODUCIBLE",
+        "HEURISTIC_REPRODUCIBLE",
+    )
+    if option not in valid_option:
+        raise ValueError("Valid option can only be one of {}".format(valid_option))
+
+    global _conv_execution_strategy  # pylint: disable=global-statement
+    _conv_execution_strategy = option
diff --git a/imperative/python/megengine/functional/distributed.py b/imperative/python/megengine/functional/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..92e93f84601a833e864fb34c6d3ddf24c7dc5434
--- /dev/null
+++ b/imperative/python/megengine/functional/distributed.py
@@ -0,0 +1,299 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
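A short sketch of toggling the strategy setters above around a profiling run; the module path follows this diff, and it is an assumption here that subsequent convolutions pick up the setting lazily:

    from megengine.functional.debug_param import (
        get_conv_execution_strategy,
        set_conv_execution_strategy,
    )

    set_conv_execution_strategy("PROFILE")  # benchmark algorithms on the real device
    assert get_conv_execution_strategy() == "PROFILE"
    # ... build and run the network; Conv2d chooses algorithms per this strategy ...
    set_conv_execution_strategy("HEURISTIC")  # restore the default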
+from typing import Optional, Tuple + +from ..core._imperative_rt.ops import CollectiveCommDefModeEnum +from ..core.autodiff.builtin_op_utils import builtin_op_get_backward_fn +from ..core.autodiff.grad import ( + Tracer, + check_backward_allow_noinput, + get_grad_managers, + get_op_has_grad_fn, + tracer_apply, +) +from ..core.ops.builtin import CollectiveComm, Copy, RemoteRecv, RemoteSend +from ..core.tensor.core import apply +from ..core.tensor.tensor import Tensor, tensor_apply +from ..distributed.group import ( + WORLD, + Group, + get_backend, + get_client, + get_mm_server_addr, + get_rank, +) +from ..tensor import tensor + +__all__ = [ + "reduce_sum", + "broadcast", + "all_gather", + "reduce_scatter_sum", + "all_reduce_sum", + "all_reduce_max", + "all_reduce_min", + "gather", + "scatter", + "all_to_all", + "remote_send", + "remote_recv", +] + + +@apply.add +def _(op: RemoteSend, *args: Tensor): + ret = tensor_apply(op, *args) + + # set extra information + tracer_set = dict() + for k in set().union(*(i._extra_data for i in args if isinstance(i, Tensor))): + tracer_set[k.name] = True + + # check tracer_set in remote_recv + get_client().set_remote_tracer(op.key, tracer_set) + return ret + + +@builtin_op_get_backward_fn.register(RemoteSend) +def _(op: RemoteSend, inputs, outputs, input_requires_grad): + def backward(*args): + return [ + remote_recv( + op.rank_to, inputs[0].shape, inputs[0].dtype, str(inputs[0].device) + ) + ] + + return backward, [True] + + +@get_op_has_grad_fn.register(RemoteSend) +def _(op: RemoteSend): + def has_grad(opnode, reached): + return get_client().check_is_grad(op.key) + + return has_grad + + +@check_backward_allow_noinput.register(RemoteSend) +def _(op: RemoteSend): + return True + + +@builtin_op_get_backward_fn.register(RemoteRecv) +def _(op: RemoteRecv, inputs, outputs, input_requires_grad): + def backward(*output_grads): + return [remote_send(output_grads[0], op.rank_from)] + + return backward, [True] + + +@get_op_has_grad_fn.register(RemoteRecv) +def _(op: RemoteRecv): + def has_grad(opnode, reached): + ret = False + for v in opnode.outputs: + if v() in reached: + ret = True + break + get_client().set_is_grad(op.key, ret) + return ret + + return has_grad + + +def collective_comm(inp, mode, group, device): + """Helper function for applying collective communication functions""" + assert isinstance(group, Group) + if group is None: + return inp + op = CollectiveComm() + op.key = group.key + op.nr_devices = group.size + op.rank = group.rank + op.is_root = op.rank == 0 + op.local_grad = False + op.addr, op.port = get_mm_server_addr() + op.mode = mode + op.dtype = inp.dtype + op.backend = get_backend() + op.comp_node = device + return apply(op, inp)[0] + + +def reduce_sum( + inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" +) -> Tensor: + """Create reduce_sum operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.REDUCE_SUM + return collective_comm(inp, mode, group, device) + + +def broadcast( + inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" +) -> Tensor: + """Create broadcast operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.BROADCAST + return collective_comm(inp, mode, group, device) + + +def all_gather( + inp: Tensor, group: Optional[Group] = WORLD, device: 
Optional[str] = "" +) -> Tensor: + """Create all_gather operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.ALL_GATHER + return collective_comm(inp, mode, group, device) + + +def reduce_scatter_sum( + inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" +) -> Tensor: + """Create reduce_scatter_sum operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.REDUCE_SCATTER_SUM + return collective_comm(inp, mode, group, device) + + +def all_reduce_sum( + inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" +) -> Tensor: + """Create all_reduce_sum operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.ALL_REDUCE_SUM + return collective_comm(inp, mode, group, device) + + +def all_reduce_max( + inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" +) -> Tensor: + """Create all_reduce_max operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.ALL_REDUCE_MAX + return collective_comm(inp, mode, group, device) + + +def all_reduce_min( + inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" +) -> Tensor: + """Create all_reduce_min operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.ALL_REDUCE_MIN + return collective_comm(inp, mode, group, device) + + +def gather( + inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" +) -> Tensor: + """Create gather operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.GATHER + return collective_comm(inp, mode, group, device) + + +def scatter( + inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" +) -> Tensor: + """Create scatter operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.SCATTER + return collective_comm(inp, mode, group, device) + + +def all_to_all( + inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = "" +) -> Tensor: + """Create all_to_all operator for collective communication + + :param inp: input tensor + :param group: communication group + :param device: execute placement + """ + mode = CollectiveCommDefModeEnum.ALL_TO_ALL + return collective_comm(inp, mode, group, device) + + +def remote_send(inp: Tensor, dest_rank: int) -> Tensor: + """Send a Tensor to a remote process + + :param inp: tensor to send + :param dest_rank: destination process rank + """ + op = RemoteSend() + op.key = "{}->{}".format(get_rank(), dest_rank) + op.addr, op.port = get_mm_server_addr() + op.rank_to = dest_rank + return apply(op, inp)[0] + + +def remote_recv( + src_rank: int, shape: Tuple[int], dtype: type, cn: Optional[str] = "gpu0" +) -> Tensor: + """Receive a Tensor from a remote process + + :param src_rank: source process rank + :param shape: the shape of the tensor to receive + :param 
dtype: the data type of the tensor to receive
+    :param cn: the comp node to place the received tensor
+    """
+    key = "{}->{}".format(src_rank, get_rank())
+
+    # dummy input
+    inp = tensor([0])
+    tracer_set = get_client().check_remote_tracer(key)
+    for grad_manager in get_grad_managers():
+        if grad_manager.name in tracer_set:
+            grad_manager.wrt(inp)
+
+    op = RemoteRecv()
+    op.key = key
+    op.cn = cn
+    op.shape = shape
+    op.dtype = dtype
+    op.addr, op.port = get_mm_server_addr()
+    op.rank_from = src_rank
+
+    return apply(op, inp)[0]
diff --git a/imperative/python/megengine/functional/elemwise.py b/imperative/python/megengine/functional/elemwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b8ac1f08f1230151d07be35bc1b14e270964386
--- /dev/null
+++ b/imperative/python/megengine/functional/elemwise.py
@@ -0,0 +1,481 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# pylint: disable=unused-argument,invalid-name,redefined-builtin,arguments-out-of-order
+import functools
+
+from ..core.ops import builtin
+from ..core.tensor import utils
+from ..core.tensor.core import apply
+from ..tensor import Tensor
+
+__all__ = [
+    "abs",
+    "add",
+    "acos",
+    "asin",
+    "atan",
+    "atan2",
+    "asinh",
+    "acosh",
+    "atanh",
+    "bitwise_and",  # TODO
+    "bitwise_not",  # TODO
+    "bitwise_or",  # TODO
+    "bitwise_xor",  # TODO
+    "ceil",
+    "clamp",
+    "cos",
+    "cosh",
+    "div",
+    "eq",
+    "exp",
+    "expm1",
+    "floor",
+    "floor_div",
+    "gt",
+    "ge",
+    "hswish",
+    "hsigmoid",
+    "left_shift",
+    "lt",
+    "le",
+    "log",
+    "log1p",
+    "logical_and",
+    "logical_not",
+    "logical_or",
+    "logical_xor",
+    "maximum",
+    "minimum",
+    "mod",
+    "mul",
+    "neg",
+    "ne",
+    "pow",
+    "relu",
+    "relu6",
+    "right_shift",
+    "round",
+    "sigmoid",
+    "sin",
+    "sinh",
+    "sqrt",
+    "square",
+    "sub",
+    "tan",
+    "tanh",
+    "fast_tanh",
+]
+
+
+def _elwise(*args, mode):
+    op = builtin.Elemwise(mode=mode)
+    args = utils.convert_inputs(*args)
+    (result,) = apply(op, *args)
+    return result
+
+
+def _logical(*args, mode):
+    op = builtin.CondExecPredLogical(mode=mode)
+    args = utils.convert_inputs(*args)
+    (result,) = apply(op, *args)
+    return result
+
+
+def _elemwise_multi_type(*args, mode, **kwargs):
+    op = builtin.ElemwiseMultiType(mode=mode, **kwargs)
+    args = utils.convert_inputs(*args)
+    (result,) = apply(op, *args)
+    return result
+
+
+# math operations
+
+
+def add(x, y):
+    """Element-wise addition.
+    At least one operand should be tensor.
+    The same applies to sub/mul/div/floor_div/pow/mod/atan2/eq/ne/lt/le/gt/ge/maximum/minimum.
+ """ + return _elwise(x, y, mode="add") + + +def sub(x, y): + """Element-wise subtract.""" + return _elwise(x, y, mode="sub") + + +def mul(x, y): + """Element-wise multiplication.""" + return _elwise(x, y, mode="mul") + + +def div(x, y): + """Element-wise (x / y).""" + return _elwise(x, y, mode="true_div") + + +def floor_div(x, y): + """Element-wise floor(x / y).""" + return _elwise(x, y, mode="floor_divide") + + +def neg(x): + """Element-wise negation.""" + return _elwise(x, mode="negate") + + +def pow(x, y): + """Element-wise power.""" + return _elwise(x, y, mode="pow") + + +def mod(x, y): + """Element-wise remainder of division.""" + return _elwise(x, y, mode="mod") + + +def abs(x): + """Element-wise absolute value.""" + return _elwise(x, mode="abs") + + +def exp(x): + """Element-wise exponential.""" + return _elwise(x, mode="exp") + + +def expm1(x): + """Element-wise exp(x)-1.""" + return _elwise(x, mode="expm1") + + +def log(x): + """Element-wise logarithm (base `e`).""" + return _elwise(x, mode="log") + + +def log1p(x): + """Element-wise log(x+1) (base `e`).""" + return _elwise(x, mode="log1p") + + +def sqrt(inp: Tensor) -> Tensor: + """ + Return a new tensor with the square-root of the elements of ``inp``. + For negative value, return nan. + + :param inp: The input tensor + :return: The computed tensor + + Examples: + + .. testcode:: + + import numpy as np + import megengine as mge + import megengine.functional as F + + data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + out = F.sqrt(data) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [[0. 1. 1.4142] + [1.7321 2. 2.2361 ]] + + """ + return inp ** 0.5 + + +def square(inp: Tensor) -> Tensor: + """ + Return a new tensor with the square of the elements of ``inp`` + + :param inp: The input tensor + :return: The computed tensor + + Examples: + + .. testcode:: + + import numpy as np + import megengine as mge + import megengine.functional as F + + data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + out = F.square(data) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [[0. 1. 4.] + [9. 16. 
25.]]
+
+    """
+    return inp ** 2
+
+
+def round(x):
+    """Round tensor to int element-wise."""
+    return _elwise(x, mode="round")
+
+
+def ceil(x):
+    """Return the ceil of the input, element-wise."""
+    return _elwise(x, mode="ceil")
+
+
+def floor(x):
+    """Calculate the floor element-wise"""
+    return _elwise(x, mode="floor")
+
+
+# trigonometric functions
+
+
+def cos(x):
+    """Cosine, element-wise."""
+    return _elwise(x, mode="cos")
+
+
+def sin(x):
+    """Sine, element-wise."""
+    return _elwise(x, mode="sin")
+
+
+def tan(x):
+    """Tangent, element-wise."""
+    return sin(x) / cos(x)
+
+
+def acos(x):
+    """Inverse cosine, element-wise."""
+    return _elwise(x, mode="acos")
+
+
+def asin(x):
+    """Inverse sine, element-wise."""
+    return _elwise(x, mode="asin")
+
+
+def atan(x):
+    """Inverse tangent, element-wise."""
+    return _elwise(x, 1, mode="atan2")
+
+
+def atan2(y, x):
+    """Element-wise arc tangent of ``y / x``, choosing the quadrant correctly."""
+    return _elwise(y, x, mode="atan2")
+
+
+def cosh(x):
+    r"""Compute element-wise hyperbolic cosine."""
+    return 0.5 * (exp(x) + exp(-x))
+
+
+def sinh(x):
+    r"""Compute element-wise hyperbolic sine."""
+    u = expm1(x)
+    return 0.5 * u / (u + 1) * (u + 2)
+
+
+def tanh(x):
+    r"""Compute element-wise hyperbolic tangent."""
+    return _elwise(x, mode="tanh")
+
+
+def asinh(x):
+    r"""Compute element-wise inverse hyperbolic sine."""
+    return log(x + (x ** 2 + 1) ** 0.5)
+
+
+def acosh(x):
+    r"""Compute element-wise inverse hyperbolic cosine."""
+    return log(x + (x ** 2 - 1) ** 0.5)
+
+
+def atanh(x):
+    r"""Compute element-wise inverse hyperbolic tangent."""
+    return log1p(2 * x / (1 - x)) / 2
+
+
+def fast_tanh(x):
+    r"""Compute element-wise fast tanh; this is an approximation:
+
+    .. math::
+        \text{fast_tanh}(x) = x * (27. + x * x) / (27. + 9. * x * x)
+    """
+    return _elwise(x, mode="fast_tanh")
+
+
+# bit-twiddling functions
+
+
+def left_shift(x, y):
+    """Element-wise bitwise left shift."""
+    return _elwise(x, y, mode="shl")
+
+
+def right_shift(x, y):
+    """Element-wise bitwise right shift."""
+    return _elwise(x, y, mode="shr")
+
+
+def bitwise_and(x, y):
+    raise NotImplementedError
+
+
+def bitwise_not(x):
+    raise NotImplementedError
+
+
+def bitwise_or(x, y):
+    raise NotImplementedError
+
+
+def bitwise_xor(x, y):
+    raise NotImplementedError
+
+
+# logical functions
+
+
+def logical_and(x, y):
+    return _elwise(x, y, mode="AND")
+
+
+def logical_not(x):
+    return _elwise(x, mode="NOT")
+
+
+def logical_or(x, y):
+    return _elwise(x, y, mode="OR")
+
+
+def logical_xor(x, y):
+    return _elwise(x, y, mode="XOR")
+
+
+# comparison functions
+
+
+def eq(x, y):
+    """Return (x == y) element-wise."""
+    return _elwise(x, y, mode="eq")
+
+
+def ne(x, y):
+    """Return (x != y) element-wise."""
+    return x != y
+
+
+def lt(x, y):
+    """Return (x < y) element-wise."""
+    return _elwise(x, y, mode="lt")
+
+
+def le(x, y):
+    """Return (x <= y) element-wise."""
+    return _elwise(x, y, mode="leq")
+
+
+def gt(x, y):
+    """Return (x > y) element-wise."""
+    return _elwise(y, x, mode="lt")
+
+
+def ge(x, y):
+    """Return (x >= y) element-wise"""
+    return _elwise(y, x, mode="leq")
+
+
+def hswish(x):
+    """Return x * relu6(x + 3) / 6 element-wise"""
+    return _elwise(x, mode="h_swish")
+
+
+def hsigmoid(x):
+    """Return relu6(x + 3) / 6 element-wise"""
+    return relu6(x + 3) / 6
+
+
+def relu(x):
+    """Return `max(x, 0)` element-wise."""
+    return _elwise(x, mode="relu")
+
+
+def relu6(x):
+    """Return min(max(x, 0), 6) element-wise."""
+    return minimum(maximum(x, 0), 6)
+
+
+def sigmoid(x):
+    """Return 1 / ( 1 + exp( -x ) ) element-wise."""
+    return _elwise(x, mode="sigmoid")
+
+
+def maximum(x, y):
+    """Element-wise maximum of array elements."""
+    return _elwise(x, y, mode="max")
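Several of the helpers above are defined in terms of each other: gt/ge simply swap the operands of the lt/leq modes, and hsigmoid/relu6 are composed from minimum/maximum. A tiny numeric check in the style of the .. testcode:: snippets used elsewhere in this file (output values verified by hand):

    import numpy as np
    import megengine as mge
    import megengine.functional as F

    x = mge.tensor(np.array([-4.0, -1.0, 0.0, 2.0, 7.0], dtype=np.float32))
    print(F.relu6(x).numpy())       # [0. 0. 0. 2. 6.]
    print(F.hsigmoid(x).numpy())    # relu6(x + 3) / 6 -> [0. 0.3333 0.5 0.8333 1.]
    print(F.maximum(x, 0).numpy())  # identical to F.relu(x) for float inputs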
+    """
+    return _elwise(x, y, mode="min")
+
+
+def clamp(inp: Tensor, lower=None, upper=None) -> Tensor:
+    r"""
+    Clamp all elements in :attr:`inp` into the range `[` :attr:`lower`, :attr:`upper` `]` and return
+    a resulting tensor:
+
+    .. math::
+        y_i = \begin{cases}
+            \text{lower} & \text{if } x_i < \text{lower} \\
+            x_i & \text{if } \text{lower} \leq x_i \leq \text{upper} \\
+            \text{upper} & \text{if } x_i > \text{upper}
+        \end{cases}
+
+    :param inp: the input tensor.
+    :param lower: lower-bound of the range to be clamped to
+    :param upper: upper-bound of the range to be clamped to
+
+    Example:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+        a = tensor(np.arange(5).astype(np.int32))
+
+        print(F.clamp(a, 2, 4).numpy())
+
+        print(F.clamp(a, lower=3).numpy())
+
+        print(F.clamp(a, upper=3).numpy())
+
+    .. testoutput::
+
+        [2 2 2 3 4]
+        [3 3 3 3 4]
+        [0 1 2 3 3]
+
+    """
+    assert (
+        lower is not None or upper is not None
+    ), "At least one of 'lower' or 'upper' must not be None"
+    if lower is not None:
+        if upper is not None:
+            assert lower <= upper, "clamp lower bound is bigger than upper bound"
+            return minimum(maximum(inp, lower), upper)
+        else:
+            return maximum(inp, lower)
+    else:
+        return minimum(inp, upper)
diff --git a/imperative/python/megengine/functional/external.py b/imperative/python/megengine/functional/external.py
new file mode 100644
index 0000000000000000000000000000000000000000..6411be0a0602a91b6926425d7e61dfbf04b96111
--- /dev/null
+++ b/imperative/python/megengine/functional/external.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# pylint: disable=too-many-lines
+from typing import List
+
+from ..core import Tensor
+
+
+def cambricon_subgraph(
+    inputs: List[Tensor], data: bytes, symbol: str, tensor_dim_mutable: bool,
+) -> List[Tensor]:
+    """Load a serialized Cambricon subgraph (i.e. cnrtModel_t) and
+    execute the operations defined in the subgraph.
+
+    :param inputs: List of input tensors of the subgraph.
+    :param data: The serialized subgraph.
+    :param symbol: The name of the function in the subgraph.
+        The function corresponds to a cnmlFusionOp
+        which is added to the cnmlModel_t/cnrtModel_t.
+    :param tensor_dim_mutable: Whether the input tensors' shapes are mutable
+        in cnrtModel_t.
+    """
+    raise NotImplementedError
+
+
+def extern_opr_subgraph(
+    inputs, output_shapes: List[tuple], dump_name: str, dump_data: bytes,
+) -> List[Tensor]:
+    """Load a serialized extern opr subgraph and fake-execute the operator.
+
+    :param inputs: Tensor or list of input tensors.
+    :param output_shapes: The output shapes.
+    :param dump_name: The serialized subgraph name.
+    :param dump_data: The serialized subgraph.
+
+    :return: List of tensors.
+    """
+    raise NotImplementedError
diff --git a/imperative/python/megengine/functional/graph.py b/imperative/python/megengine/functional/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..54009172de1b763fa5b4502d53f3a569e614ecc7
--- /dev/null
+++ b/imperative/python/megengine/functional/graph.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import collections
+from typing import Iterable, Optional, Union
+
+from ..core.tensor import Tensor
+
+
+def add_update(
+    dest: Tensor,
+    delta: Tensor,
+    *,
+    alpha: Union[Tensor, float, int] = 1.0,
+    beta: Union[Tensor, float, int] = 1.0,
+    bias: Union[Tensor, float, int] = 0.0
+):
+    r"""Inplace modify ``dest`` as follows:
+
+    .. math::
+        dest = alpha * dest + beta * delta + bias
+
+    :param dest: input data that will be inplace modified.
+    :param delta: update value that will be added to ``dest``.
+    :param alpha: weight ratio of ``dest``. Default: 1.0
+    :param beta: weight ratio of ``delta``. Default: 1.0
+    :param bias: bias value appended to the result. Default: 0.0
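+
+    Example (an illustrative SGD-style update; assumes this helper is
+    re-exported as ``megengine.functional.add_update``):
+
+    .. code-block:: python
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        param = tensor(np.ones(3, dtype=np.float32))
+        grad = tensor(np.full(3, 0.5, dtype=np.float32))
+        F.add_update(param, grad, beta=-0.01)  # param <- param - 0.01 * grad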
+    """
+    if beta is not None and beta != 1.0:
+        delta = delta * beta
+    if bias is not None and bias != 0.0:
+        delta = delta + bias
+    if alpha is not None and alpha != 1.0:
+        dest *= alpha
+    dest += delta
+    return dest
diff --git a/imperative/python/megengine/functional/loss.py b/imperative/python/megengine/functional/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..400065d4b2c84958890e9fabb9e3491214008f25
--- /dev/null
+++ b/imperative/python/megengine/functional/loss.py
@@ -0,0 +1,388 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import numpy as np
+
+from ..tensor import Tensor
+from .elemwise import abs, eq, exp, log, maximum, pow, relu
+from .nn import assert_equal, indexing_one_hot
+from .tensor import where
+from .utils import zero_grad
+
+
+def l1_loss(pred: Tensor, label: Tensor) -> Tensor:
+    r"""
+    Calculates the mean absolute error (MAE) between
+    each element in the pred :math:`x` and label :math:`y`.
+
+    The mean absolute error can be described as:
+
+    .. math:: \ell(x, y) = mean\left(L\right)
+
+    where
+
+    .. math::
+
+        L = \{l_1,\dots,l_N\}, \quad
+        l_n = \left| x_n - y_n \right|,
+
+    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
+    of :math:`N` elements each. :math:`N` is the batch size.
+
+    :param pred: The predicted result from model.
+    :param label: The ground truth to compare.
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        import megengine as mge
+        import megengine.functional as F
+        ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
+        tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
+        loss = F.l1_loss(ipt, tgt)
+        print(loss.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [2.75]
+
+    """
+    diff = pred - label
+    return abs(diff).mean()
+
+
+def square_loss(pred: Tensor, label: Tensor) -> Tensor:
+    r"""
+    Calculates the mean squared error (squared L2 norm) between
+    each element in the pred :math:`x` and label :math:`y`.
+
+    The mean squared error can be described as:
+
+    .. math:: \ell(x, y) = mean\left(L\right)
+
+    where
+
+    .. math::
+
+        L = \{l_1,\dots,l_N\}, \quad
+        l_n = \left( x_n - y_n \right)^2,
+
+    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
+    of :math:`N` elements each. :math:`N` is the batch size.
+
+    :param pred: The predicted result from model.
+    :param label: The ground truth to compare.
+
+    Shape:
+        - pred: :math:`(N, *)` where :math:`*` means any number of additional
+          dimensions
+        - label: :math:`(N, *)`. Same shape as ``pred``
+
+    """
+    diff = pred - label
+    return (diff ** 2).mean()
+
+
+def cross_entropy(
+    inp: Tensor, target: Tensor, axis: int = 1, ignore_index: int = -1
+) -> Tensor:
+    r"""
+    Returns the cross entropy loss in a classification problem.
+
+    .. math:: \textrm{CrossEntropy}(x, y) = - \sum_{i} y_i\log(x_i)
+
+    :param inp: The input tensor representing the predicted probability.
+    :param target: The input tensor representing the classification label.
+    :param axis: An axis along which cross_entropy will be applied. Default: 1
+    :param ignore_index: Specifies a target value that is ignored and does not contribute to the input gradient. Default: -1
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data_shape = (1, 2)
+        label_shape = (1, )
+
+        pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(data_shape))
+        label = tensor(np.ones(label_shape, dtype=np.int32))
+        loss = F.cross_entropy(pred, label)
+        print(loss.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [0.69]
+
+    """
+    raise NotImplementedError
+    # n0 = inp.ndim
+    # n1 = target.ndim
+    # assert n0 == n1 + 1, (
+    #     "target ndim must be one less than input ndim; input_ndim={} "
+    #     "target_ndim={}".format(n0, n1)
+    # )
+
+    # if ignore_index != -1:
+    #     mask = 1 - equal(target, ignore_index)
+    #     target = target * mask
+    #     loss = -log(indexing_one_hot(inp, target, axis)) * mask
+    #     return loss.sum() / maximum(mask.sum(), 1.0)
+    # else:
+    #     return -log(indexing_one_hot(inp, target, axis)).mean()
+
+
+def cross_entropy_with_softmax(
+    pred: Tensor, label: Tensor, axis: int = 1, label_smooth: float = 0
+) -> Tensor:
+    r"""
+    Returns loss after applying :func:`~.softmax` + :func:`~.cross_entropy`.
+
+    It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`.
+
+    When using label smoothing, the label distribution is as follows:
+
+    .. math:: y^{LS}_{k}=y_{k}\left(1-\alpha\right)+\alpha/K
+
+    where :math:`y^{LS}` and :math:`y` are the new label distribution and the origin label distribution respectively.
+    :math:`k` is the index of the label distribution. :math:`\alpha` is ``label_smooth`` and :math:`K` is the number of classes.
+
+    :param pred: The input tensor representing the predicted probability.
+    :param label: The input tensor representing the classification label.
+    :param axis: An axis along which softmax will be applied. Default: 1.
+    :param label_smooth: The label smoothing parameter that re-distributes the target distribution. Default: 0.
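+
+    Example (an illustrative sketch; the printed value is approximate):
+
+    .. code-block:: python
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        pred = tensor(np.array([[0.0, 0.0]], dtype=np.float32))
+        label = tensor(np.array([0], dtype=np.int32))
+        loss = F.cross_entropy_with_softmax(pred, label)
+        print(loss.numpy())  # ~[0.6931], i.e. log(2)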
+ """ + n0 = pred.ndim + n1 = label.ndim + assert n0 == n1 + 1, ( + "target ndim must be one less than input ndim; input_ndim={} " + "target_ndim={}".format(n0, n1) + ) + + num_classes = pred.shape[axis] + + # Denominator of the softmax + offset = pred.max(axis=axis).detach() + pred = pred - offset + down = exp(pred).sum(axis=axis) + + up = pred[np.arange(pred.shape[0]), label] + + if label_smooth != 0: + factor = label_smooth / num_classes + up = up * (1 - label_smooth) + pred.sum(axis=axis) * factor + + return (log(down) - up).mean() + + +def triplet_margin_loss( + anchor: Tensor, positive: Tensor, negative: Tensor, margin: float = 1.0, p: int = 2 +) -> Tensor: + r""" + Creates a criterion that measures the triplet loss given an input tensors. + + .. math:: + + L(a, p, n) = max\left\{d\left(a_{i},p_{i}\right)-d\left(a_{i}, n_{i}\right)+margin, 0\right\},\ + d\left(x_{i},y_{i}\right)=\left\|x_{i}-y_{i}\right\|_{p} + + :param anchor: The input tensor representing the anchor samples. + :param positive: The input tensor representing the positive samples. + :param negative: The input tensor representing the negative samples. + :param margin: Default: 1.0 + :param p: The norm degree for pairwise distance. Default: 2.0 + """ + s0 = anchor.shapeof() + s1 = positive.shapeof() + s2 = negative.shapeof() + assert_equal(s0, s1) + assert_equal(s1, s2) + + n0 = anchor.ndim + n1 = positive.ndim + n2 = negative.ndim + assert n0 == 2 and n1 == 2 and n2 == 2, ( + "anchor ndim, positive ndim, and negative ndim must be 2; " + "anchor_ndim={} positive_ndim={} negative_ndim={}".format(n0, n1, n2) + ) + assert p > 0, "a margin with a value greater than 0; p={}".format(p) + + diff0 = abs(anchor - positive) + diff1 = abs(anchor - negative) + + d1 = power(power(diff0, p).sum(axis=1, keepdims=True), 1 / p) + d2 = power(power(diff1, p).sum(axis=1, keepdims=True), 1 / p) + + loss = maximum(d1 - d2 + margin, 0) + + return loss.mean() + + +def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor: + r"""Function that measures the Binary Cross Entropy between the target and the prediction. + + :param pred: (N,*) where * means, any number of additional dimensions. + :param label: (N,*), same shape as the input. + + """ + assert pred.shape == label.shape + + return -1.0 * (label * log(pred) + (1.0 - label) * log(1 - pred)).mean() + + +def nll_loss( + pred: Tensor, label: Tensor, axis: int = 1, ignore_index: int = -1 +) -> Tensor: + r""" + The negative log likelihood loss. + + :param pred: The predicted result from model. + :param label: The ground truth to compare. + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + data_shape = (2, 2) + label_shape = (2, ) + + data = tensor( + np.array([[1, 0.5], [0.3, 1.2]], dtype=np.float32).reshape(data_shape), + ) + label = tensor( + np.ones(label_shape, dtype=np.int32) + ) + pred = F.log(F.softmax(data)) + loss1 = F.nll_loss(pred, label) + loss2 = F.cross_entropy_with_softmax(data, label) + print(loss1.numpy(), loss2.numpy()) + + Outputs: + + .. 
+
+    """
+    assert pred.shape == label.shape
+
+    return -1.0 * (label * log(pred) + (1.0 - label) * log(1 - pred)).mean()
+
+
+def nll_loss(
+    pred: Tensor, label: Tensor, axis: int = 1, ignore_index: int = -1
+) -> Tensor:
+    r"""
+    The negative log likelihood loss.
+
+    :param pred: The predicted result from model.
+    :param label: The ground truth to compare.
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+        data_shape = (2, 2)
+        label_shape = (2, )
+
+        data = tensor(
+            np.array([[1, 0.5], [0.3, 1.2]], dtype=np.float32).reshape(data_shape),
+        )
+        label = tensor(
+            np.ones(label_shape, dtype=np.int32)
+        )
+        pred = F.log(F.softmax(data))
+        loss1 = F.nll_loss(pred, label)
+        loss2 = F.cross_entropy_with_softmax(data, label)
+        print(loss1.numpy(), loss2.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [0.6576154] [0.6576154]
+
+    """
+    raise NotImplementedError
+    # n0 = pred.ndim
+    # n1 = label.ndim
+    # assert n0 == n1 + 1, (
+    #     "target ndim must be one less than input ndim; input_ndim={} "
+    #     "target_ndim={}".format(n0, n1)
+    # )
+
+    # mask = 1.0 - equal(label, ignore_index)
+    # label = label * mask
+
+    # loss = indexing_one_hot(pred, label, axis) * mask
+
+    # return -1.0 * loss.sum() / maximum(mask.sum(), 1.0)
+
+
+def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor:
+    r"""
+    Calculate the hinge loss which is often used in SVMs.
+
+    The hinge loss can be described as:
+
+    .. math:: loss(x, y) = \frac{1}{N}\sum_i\sum_j \max(0, 1 - x_{ij} y_{ij})
+
+    :param pred: The input tensor representing the predicted probability, shape is (N, C).
+    :param label: The input tensor representing the binary classification label, shape is (N, C).
+    :param norm: Specify the norm to calculate the loss, should be "L1" or "L2".
+
+    Examples:
+
+    .. testcode::
+
+        from megengine import tensor
+        import megengine.functional as F
+
+        pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32")
+        label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32")
+
+        loss = F.hinge_loss(pred, label)
+
+        print(loss.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [1.5]
+
+    """
+    assert norm in ["L1", "L2"], "norm must be L1 or L2"
+    # labels are expected to be -1/1; the hinge term is max(0, 1 - pred * label)
+    loss = relu(1.0 - pred * label)
+    if norm == "L1":
+        return loss.sum(axis=1).mean()
+    else:
+        return (loss ** 2).sum(axis=1).mean()
+
+
+def smooth_l1_loss(pred: Tensor, label: Tensor) -> Tensor:
+    r"""
+    Calculate the smooth l1 loss proposed in `Fast R-CNN paper by Ross Girshick`.
+
+    The smooth l1 loss can be described as:
+
+    .. math::
+        \text{loss}(x, y) = \frac{1}{n} \sum_{i} l_{i}
+
+    where :math:`l_{i}` is given by:
+
+    .. math::
+        l_{i} =
+        \begin{cases}
+        0.5 (x_i - y_i)^2, & \text{if } |x_i - y_i| < 1 \\
+        |x_i - y_i| - 0.5, & \text{otherwise }
+        \end{cases}
+
+    :param pred: The predicted result from model.
+    :param label: The ground truth to compare.
+
+    Examples:
+
+    .. testcode::
+
+        from megengine import tensor
+        import megengine.functional as F
+
+        pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]])
+        label = tensor([[0.4, 1.5, 1.2], [0., 0.1, 2.2]])
+
+        loss = F.smooth_l1_loss(pred, label)
+
+        print(loss.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [0.5608334]
+    """
+    raise NotImplementedError
+    # diff = abs(pred - label)
+    # l2_loss = 0.5 * (diff ** 2)
+    # l1_loss = diff - 0.5
+    # mask = diff < 1
+    # loss = where(mask, l2_loss, l1_loss)
+    # return loss.mean()
diff --git a/imperative/python/megengine/functional/math.py b/imperative/python/megengine/functional/math.py
new file mode 100644
index 0000000000000000000000000000000000000000..3483ad816dc9e1193d4202e809172e3c3a2b494b
--- /dev/null
+++ b/imperative/python/megengine/functional/math.py
@@ -0,0 +1,696 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import collections.abc
+import functools
+import math
+import numbers
+from typing import Optional, Sequence, Tuple, Union
+
+from ..core.ops import builtin
+from ..core.ops._internal import param_defs as P
+from ..core.tensor import utils
+from ..core.tensor.core import apply
+from ..tensor import Tensor
+from .elemwise import clamp, exp, log, log1p
+from .tensor import remove_axis, reshape
+
+__all__ = [
+    "all",  # TODO
+    "all_close",  # TODO
+    "any",  # TODO
+    "argmax",
+    "argmin",
+    "argsort",
+    "isinf",
+    "isnan",  # TODO
+    "max",
+    "mean",
+    "median",  # TODO
+    "min",
+    "norm",
+    "normalize",
+    "prod",
+    "sign",  # TODO
+    "sort",
+    "std",
+    "sum",
+    "topk",
+    "unique",  # TODO
+    "var",
+]
+
+
+def all(inp):
+    raise NotImplementedError
+
+
+def all_close(inp):
+    raise NotImplementedError
+
+
+def any(inp):
+    raise NotImplementedError
+
+
+def unique(inp):
+    raise NotImplementedError
+
+
+def isnan(inp: Tensor) -> Tensor:
+    r"""Returns a new tensor representing if each element is NaN or not.
+
+    :param inp: The input tensor.
+    :return: a new tensor representing if each element in :attr:`inp` is NaN or not.
+
+    Examples:
+
+    .. testcode::
+
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor([1, float("nan"), 0])
+
+        print(F.isnan(x))
+
+    .. testoutput::
+
+        Tensor([0 1 0], dtype=uint8)
+
+    """
+    raise NotImplementedError
+    # return (inp != inp).astype("uint8")
+
+
+def isinf(inp: Tensor) -> Tensor:
+    r"""Returns a new tensor representing if each element is Inf or not.
+
+    :param inp: The input tensor.
+    :return: a new tensor representing if each element in :attr:`inp` is Inf or not.
+
+    Examples:
+
+    .. testcode::
+
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor([1, float("inf"), 0])
+
+        print(F.isinf(x))
+
+    .. testoutput::
+
+        Tensor([0 1 0], dtype=uint8)
+
+    """
+    return (abs(inp).astype("float32") == float("inf")).astype("uint8")
+
+
+def sign(inp: Tensor):
+    raise NotImplementedError
+
+
+def _reduce(
+    data,
+    *,
+    mode,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False
+):
+    (data,) = utils.convert_inputs(data)
+    if axis is None:
+        data = data.reshape(-1)
+        assert not keepdims, "cannot set axis=None and keepdims=True"
+
+        op = builtin.Reduce(mode=mode, axis=0)
+        (result,) = apply(op, data)
+    elif isinstance(axis, collections.abc.Iterable):
+        axis = list(axis)
+        axis.sort(reverse=True)
+
+        for ai in axis:
+            op = builtin.Reduce(mode=mode, axis=ai)
+            (data,) = apply(op, data)
+            if not keepdims:
+                data = remove_axis(data, ai)
+        result = data
+    else:
+        op = builtin.Reduce(mode=mode, axis=axis)
+        (result,) = apply(op, data)
+
+        if not keepdims:
+            result = remove_axis(result, axis)
+
+    return result
+
+
+def sum(
+    inp: Tensor,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+) -> Tensor:
+    r"""Returns the sum of each row of the ``inp`` tensor in the given ``axis``.
+
+    :param inp: The input tensor.
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced.
+        Default: None
+    :param keepdims: Whether the output tensor has ``axis`` retained or not.
+        Default: False
+    :return: The output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3))
+        out = F.sum(data)
+        print(out.numpy())
+
+    .. testoutput::
+
+        [21]
+
+    """
+    return _reduce(inp, mode="SUM", axis=axis, keepdims=keepdims)
+
+
+def prod(
+    inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None, keepdims=False
+) -> Tensor:
+    r"""
+    Returns the product of the elements of the input tensor along the given *axis*.
+
+    :param inp: The input tensor
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: ``None``
+    :param keepdims: Whether the output tensor has *axis* retained or not. Default: ``False``
+    :return: The output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3))
+        out = F.prod(data)
+        print(out.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [720]
+
+    """
+    return _reduce(inp, mode="PRODUCT", axis=axis, keepdims=keepdims)
+
+
+def mean(
+    inp: Tensor,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+) -> Tensor:
+    """Returns the mean value of each row of the ``inp`` tensor in
+    the given ``axis``. If axis is a list of dimensions,
+    reduce over all of them.
+
+    :param inp: The input tensor
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
+    :param keepdims: Whether the output tensor has ``axis`` retained or not. Default: False
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data = tensor(np.arange(1, 7, dtype=np.int32).reshape(2, 3))
+        out = F.mean(data)
+        print(out.numpy())
+
+    .. testoutput::
+
+        [3.5]
+
+    """
+    return _reduce(inp, mode="MEAN", axis=axis, keepdims=keepdims)
+
+
+def median(
+    inp: Tensor,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+) -> Tensor:
+    raise NotImplementedError
+
+
+def var(
+    inp: Tensor,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+) -> Tensor:
+    """Returns the variance value of input tensor along
+    given ``axis``. If axis is a list of dimensions,
+    reduce over all of them.
+
+    :param inp: The input tensor.
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: ``None``.
+    :param keepdims: Whether the output tensor has ``axis`` retained or not. Default: ``False``.
+    :return: The output tensor.
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data = tensor(np.arange(1, 7, dtype=np.float32).reshape(2, 3))
+        out = F.var(data)
+        print(out.numpy())
+
+    .. testoutput::
+
+        [2.9166667]
+    """
+    if axis is None:
+        m = mean(inp, axis=axis, keepdims=False)
+    else:
+        m = mean(inp, axis=axis, keepdims=True)
+    v = inp - m
+    return mean(v ** 2, axis=axis, keepdims=keepdims)
+
+
+def std(
+    inp: Tensor,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+) -> Tensor:
+    """Returns the standard deviation of input tensor along
+    given ``axis``. If axis is a list of dimensions,
+    reduce over all of them.
+
+    :param inp: The input tensor.
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: ``None``.
+    :param keepdims: Whether the output tensor has ``axis`` retained or not. Default: ``False``.
+    :return: The output tensor.
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data = tensor(np.arange(1, 7, dtype=np.float32).reshape(2, 3))
+        out = F.std(data, axis=1)
+        print(out.numpy())
+
+    .. testoutput::
+
+        [0.8164966 0.8164966]
+    """
+    return var(inp, axis=axis, keepdims=keepdims) ** 0.5
+
+
+def min(
+    inp: Tensor,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+) -> Tensor:
+    r"""
+    Returns the min value of input tensor along given *axis*.
+
+    :param inp: The input tensor
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
+    :param keepdims: Whether the output tensor has *axis* retained or not. Default: False
+    :return: The output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
+        y = F.min(x)
+        print(y.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [1]
+
+    """
+    return _reduce(inp, mode="MIN", axis=axis, keepdims=keepdims)
+
+
+def max(
+    inp: Tensor,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+) -> Tensor:
+    r"""Returns the max value of the input tensor along given *axis*.
+
+    :param inp: The input tensor
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
+    :param keepdims: Whether the output tensor has *axis* retained or not. Default: False
+    :return: The output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
+        y = F.max(x)
+        print(y.numpy())
+
+    .. testoutput::
+
+        [6]
+
+    """
+    return _reduce(inp, mode="MAX", axis=axis, keepdims=keepdims)
+
+
+def norm(
+    inp: Tensor,
+    p: int = 2,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims=False,
+):
+    """Calculate the ``p``-norm of the input tensor along certain axis.
+
+    :param inp: The input tensor
+    :param p: power of value ``p`` applied to ``inp``. Default: 2
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
+    :param keepdims: Whether the output tensor has ``axis`` retained or not. Default: False
+    :return: The output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(-3, 3, dtype=np.float32).reshape(2,3))
+        y = F.norm(x)
+        print(y.numpy())
+
+    .. testoutput::
+
+        [4.358899]
+
+    """
+    if p == 0:
+        return sum(inp != 0, axis=axis, keepdims=keepdims)
+    if p == math.inf:
+        return max(abs(inp), axis=axis, keepdims=keepdims)
+    if p == -math.inf:
+        return min(abs(inp), axis=axis, keepdims=keepdims)
+    return sum(abs(inp) ** p, axis=axis, keepdims=keepdims) ** (1.0 / p)
+
+
+def argmin(
+    inp: Tensor,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+) -> Tensor:
+    r"""Returns the indices of the minimum values along an axis.
+
+    :param inp: The input tensor
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
+    :param keepdims: Whether the output tensor has *axis* retained or not. Default: False
+    :return: The output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
+        y = F.argmin(x)
+        print(y.numpy())
+
+    .. testoutput::
+
+        [0]
+
+    """
+    if isinstance(axis, collections.abc.Iterable):
+        axis = list(axis)
+        axis.sort(reverse=True)
+
+        for ai in axis:
+            op = builtin.Argmin(axis=ai)
+            (inp,) = apply(op, inp)
+
+            if not keepdims:
+                inp = remove_axis(inp, ai)
+
+        return inp
+
+    if axis is None:
+        assert not keepdims, "cannot set axis=None and keepdims=True"
+        inp = inp.flatten()
+        axis = 0
+
+    op = builtin.Argmin(axis=axis)
+    (result,) = apply(op, inp)
+    if not keepdims:
+        result = remove_axis(result, axis)
+    return result
+
+
+def argmax(
+    inp: Tensor,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    keepdims: bool = False,
+) -> Tensor:
+    r"""Returns the indices of the maximum values along an axis.
+
+    :param inp: The input tensor
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced. Default: None
+    :param keepdims: Whether the output tensor has *axis* retained or not. Default: False
+    :return: The output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3))
+        y = F.argmax(x)
+        print(y.numpy())
+
+    .. testoutput::
+
+        [5]
+
+    """
+    if isinstance(axis, collections.abc.Iterable):
+        axis = list(axis)
+        axis.sort(reverse=True)
+
+        for ai in axis:
+            op = builtin.Argmax(axis=ai)
+            (inp,) = apply(op, inp)
+
+            if not keepdims:
+                inp = remove_axis(inp, ai)
+
+        return inp
+
+    if axis is None:
+        assert not keepdims, "cannot set axis=None and keepdims=True"
+        inp = inp.flatten()
+        axis = 0
+
+    op = builtin.Argmax(axis=axis)
+    (result,) = apply(op, inp)
+    if not keepdims:
+        result = remove_axis(result, axis)
+    return result
+
+
+def normalize(
+    inp: Tensor,
+    p: int = 2,
+    axis: Optional[Union[int, Sequence[int]]] = None,
+    eps: float = 1e-12,
+) -> Tensor:
+    r"""Perform :math:`L_p` normalization of input tensor along certain axis.
+
+    For a tensor :attr:`inp` of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each
+    :math:`n_{dim}` -element vector :math:`v` along dimension :attr:`axis` is transformed as:
+
+    .. math::
+        v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}.
+
+    :param inp: the input tensor
+    :param p: power of value ``p`` applied to ``inp``. Default: 2
+    :param axis: The dimension to reduce. If None, all the dimensions will be reduced
+        to calculate the norm. Default: None
+    :param eps: a small value to avoid division by zero. Default: 1e-12
+    :return: the normalized output tensor
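+
+    Example (an illustrative sketch; values are assumed):
+
+    .. code-block:: python
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.array([3.0, 4.0], dtype=np.float32))
+        print(F.normalize(x).numpy())  # [0.6 0.8], i.e. x / ||x||_2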
+
+    """
+    if axis is None:
+        return inp / clamp(norm(inp, p, axis), lower=eps)
+    else:
+        return inp / clamp(norm(inp, p, axis, keepdims=True), lower=eps)
+
+
+def argsort(inp: Tensor, descending: bool = False) -> Tensor:
+    r"""
+    Sort the target 2d matrix by row and return the indices that would sort it.
+
+    :param inp: The input tensor; if 2d, each row will be sorted.
+    :param descending: Sort in descending order, where the largest comes first. Default: ``False``
+    :return: The indices tensor of int32; for a 1d input, a 1d tensor of indices.
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+        data = tensor(np.array([1,2], dtype=np.float32))
+        indices = F.argsort(data)
+        print(indices.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [0 1]
+
+    """
+    assert len(inp.shape) <= 2, "Input should be 1d or 2d"
+    if descending:
+        order = P.Argsort.Order.DESCENDING
+    else:
+        order = P.Argsort.Order.ASCENDING
+
+    op = builtin.Argsort(order=order)
+    if len(inp.shape) == 1:
+        inp = inp.reshape(1, -1)
+        _, result = apply(op, inp)
+        return result[0]
+    _, result = apply(op, inp)
+    return result
+
+
+def sort(inp: Tensor, descending: bool = False) -> Tuple[Tensor, Tensor]:
+    r"""
+    Sort the target 1d/2d tensor by row, returning both the sorted tensor
+    and the sorting indices.
+
+    :param inp: The input tensor; if 2d, each row will be sorted.
+    :param descending: Sort in descending order, where the largest comes first. Default: ``False``
+    :return: Tuple of two tensors (sorted_tensor, indices_of_int32)
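+
+    Example (a minimal sketch; exact printed formatting may differ):
+
+    .. code-block:: python
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data = tensor(np.array([3, 1, 2], dtype=np.float32))
+        vals, idx = F.sort(data)
+        print(vals.numpy(), idx.numpy())  # [1. 2. 3.] [1 2 0]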
+    """
+    assert len(inp.shape) <= 2, "Input should be 1d or 2d"
+    if descending:
+        order = P.Argsort.Order.DESCENDING
+    else:
+        order = P.Argsort.Order.ASCENDING
+
+    op = builtin.Argsort(order=order)
+    if len(inp.shape) == 1:
+        inp = inp.reshape(1, -1)
+        tns, ind = apply(op, inp)
+        return tns[0], ind[0]
+    tns, ind = apply(op, inp)
+    return tns, ind
+
+
+def topk(
+    inp: Tensor,
+    k: int,
+    descending: bool = False,
+    kth_only: bool = False,
+    no_sort: bool = False,
+) -> Tuple[Tensor, Tensor]:
+    r"""
+    Select the Top-K (by default, the smallest) elements of a 2d matrix by row.
+
+    :param inp: The input tensor; if 2d, each row will be sorted.
+    :param k: The number of elements needed.
+    :param descending: If true, return the largest elements instead. Default: ``False``
+    :param kth_only: If true, only the k-th element will be returned. Default: ``False``
+    :param no_sort: If true, the returned elements can be unordered. Default: ``False``
+    :return: Tuple of two tensors (topk_tensor, indices_of_int32)
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+        data = tensor(np.array([2, 4, 6, 8, 7, 5, 3, 1], dtype=np.float32))
+        top, indices = F.topk(data, 5)
+        print(top.numpy(), indices.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [1. 2. 3. 4. 5.] [7 0 6 1 5]
+
+    """
+    if descending:
+        inp = -inp
+
+    Mode = P.TopK.Mode
+    if kth_only:
+        mode = Mode.KTH_ONLY
+    elif no_sort:
+        mode = Mode.VALUE_IDX_NOSORT
+    else:
+        mode = Mode.VALUE_IDX_SORTED
+    op = builtin.TopK(mode=mode)
+
+    # in KTH_ONLY mode only the values are computed, so ``ind`` stays None
+    ind = None
+    if len(inp.shape) == 1:
+        inp = inp.reshape(1, -1)
+        res = apply(op, inp, Tensor(k, dtype="int32"))
+        if kth_only:
+            tns = res[0]
+        else:
+            tns, ind = res[0][0], res[1][0]
+    else:
+        res = apply(op, inp, Tensor(k, dtype="int32"))
+        if kth_only:
+            tns = res
+        else:
+            tns, ind = res[0], res[1]
+
+    if descending:
+        tns = -tns
+    return tns, ind
diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..5596058dd6db47d4ecd7589f2f24407b929eba24
--- /dev/null
+++ b/imperative/python/megengine/functional/nn.py
@@ -0,0 +1,1556 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# pylint: disable=too-many-lines
+from typing import Optional, Sequence, Tuple, Union
+
+from ..core._imperative_rt import CompNode
+from ..core.ops import builtin
+from ..core.ops._internal import param_defs as P
+from ..core.ops.special import Const
+from ..core.tensor import utils
+from ..core.tensor.core import apply
+from ..distributed import WORLD, is_distributed
+from ..random import uniform
+from ..tensor import Tensor
+from .debug_param import get_conv_execution_strategy
+from .distributed import all_reduce_sum
+from .elemwise import exp, floor, log, log1p, maximum, minimum, relu
+from .math import argsort, max, sum
+from .tensor import add_axis, broadcast, concat, full, remove_axis, reshape
+from .types import _pair, _pair_nonzero
+
+__all__ = [
+    "linear",
+    "conv2d",
+    "conv_transpose2d",
+    "local_conv2d",
+    "max_pool2d",
+    "avg_pool2d",
+    "prelu",
+    "leaky_relu",
+    "softplus",
+    "log_softmax",
+    "logsigmoid",
+    "logsumexp",
+    "flatten",
+    "softmax",
+    "batch_norm2d",
+    "sync_batch_norm",
+    "one_hot",
+    "warp_perspective",
+    "matmul",
+    "interpolate",
+    "dropout",
+    "identity",
+    "embedding",
+    "roi_pooling",
+    "roi_align",
+    "assert_equal",
+    "indexing_one_hot",
+    "dot",
+    "svd",
+    "nms",
+    "batched_nms",
+]
+
+
+def expand_hw(x):
+    # NOTE: >1d array is accepted, as long as 1 <= size <= 2
+    try:
+        x = int(x)
+        return [x, x]
+    except (TypeError, ValueError):
+        pass
+    h, w = x
+    return int(h), int(w)
+
+
+def linear(inp: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor:
+    """Applies a linear transformation to the input.
+
+    Refer to :class:`~.module.linear.Linear` for more information.
+
+    :param inp: the input tensor with shape `(N, in_features)`.
+    :param weight: the weight with shape `(out_features, in_features)`.
+    :param bias: the bias with shape `(out_features,)`.
+        Default: ``None``
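+
+    Example (an illustrative sketch with assumed shapes):
+
+    .. code-block:: python
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.ones((2, 3), dtype=np.float32))
+        w = tensor(np.ones((4, 3), dtype=np.float32))
+        print(F.linear(x, w).numpy().shape)  # (2, 4); every entry equals 3.0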
+    """
+    ret = matmul(inp, weight, transpose_b=True)
+    if bias is not None:
+        ret += bias
+    return ret
+
+
+def conv2d(
+    inp: Tensor,
+    weight: Tensor,
+    bias: Optional[Tensor] = None,
+    stride: Union[int, Tuple[int, int]] = 1,
+    padding: Union[int, Tuple[int, int]] = 0,
+    dilation: Union[int, Tuple[int, int]] = 1,
+    groups: int = 1,
+    conv_mode="CROSS_CORRELATION",
+    compute_mode="DEFAULT",
+) -> Tensor:
+    """2D convolution operation.
+
+    Refer to :class:`~.Conv2d` for more information.
+
+    :param inp: The feature map of the convolution operation
+    :param weight: The convolution kernel
+    :param bias: The bias added to the result of convolution (if given)
+    :param stride: Stride of the 2D convolution operation. Default: 1
+    :param padding: Size of the paddings added to the input on both sides of its
+        spatial dimensions. Only zero-padding is supported. Default: 0
+    :param dilation: Dilation of the 2D convolution operation. Default: 1
+    :param groups: number of groups to divide input and output channels into,
+        so as to perform a "grouped convolution". When ``groups`` is not 1,
+        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
+        and the shape of weight should be ``(groups, out_channel // groups,
+        in_channels // groups, height, width)``.
+    :type conv_mode: string or :class:`P.Convolution.Mode`
+    :param conv_mode: Supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default:
+        'CROSS_CORRELATION'.
+    :type compute_mode: string or
+        :class:`P.Convolution.ComputeMode`
+    :param compute_mode: When set to 'DEFAULT', no special requirements will be
+        placed on the precision of intermediate results. When set to 'FLOAT32',
+        Float32 would be used for accumulator and intermediate result, but only
+        effective when input and output are of Float16 dtype.
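+
+    Example (an illustrative sketch with assumed shapes):
+
+    .. code-block:: python
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        inp = tensor(np.ones((1, 3, 8, 8), dtype=np.float32))      # NCHW feature map
+        weight = tensor(np.ones((16, 3, 3, 3), dtype=np.float32))  # OIHW kernel
+        out = F.conv2d(inp, weight, stride=1, padding=1)
+        print(out.numpy().shape)  # (1, 16, 8, 8)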
+
+    """
+    assert conv_mode == "CROSS_CORRELATION" or conv_mode.name == "CROSS_CORRELATION"
+    assert compute_mode == "DEFAULT" or compute_mode.name == "DEFAULT"
+
+    stride_h, stride_w = expand_hw(stride)
+    pad_h, pad_w = expand_hw(padding)
+    dilate_h, dilate_w = expand_hw(dilation)
+
+    Sparse = P.Convolution.Sparse
+    sparse_type = Sparse.DENSE if groups == 1 else Sparse.GROUP
+    op = builtin.Convolution(
+        stride_h=stride_h,
+        stride_w=stride_w,
+        pad_h=pad_h,
+        pad_w=pad_w,
+        dilate_h=dilate_h,
+        dilate_w=dilate_w,
+        strategy=get_conv_execution_strategy(),
+        mode=conv_mode,
+        compute_mode=compute_mode,
+        sparse=sparse_type,
+    )
+    (output,) = apply(op, inp, weight)
+    if bias is not None:
+        output += bias
+    return output
+
+
+def conv_transpose2d(
+    inp: Tensor,
+    weight: Tensor,
+    bias: Optional[Tensor] = None,
+    stride: Union[int, Tuple[int, int]] = 1,
+    padding: Union[int, Tuple[int, int]] = 0,
+    dilation: Union[int, Tuple[int, int]] = 1,
+    groups: int = 1,
+    conv_mode="CROSS_CORRELATION",
+    compute_mode="DEFAULT",
+) -> Tensor:
+    """2D transposed convolution operation.
+
+    Refer to :class:`~.ConvTranspose2d` for more information.
+
+    :param inp: The feature map of the convolution operation
+    :param weight: The convolution kernel
+    :param bias: The bias added to the result of convolution (if given)
+    :param stride: Stride of the 2D convolution operation. Default: 1
+    :param padding: Size of the paddings added to the input on both sides of its
+        spatial dimensions. Only zero-padding is supported. Default: 0
+    :param dilation: Dilation of the 2D convolution operation. Default: 1
+    :param groups: number of groups to divide input and output channels into,
+        so as to perform a "grouped convolution". When ``groups`` is not 1,
+        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
+        and the shape of weight should be ``(groups, out_channel // groups,
+        in_channels // groups, height, width)``. Default: 1
+    :type conv_mode: string or :class:`P.Convolution.Mode`
+    :param conv_mode: Supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default:
+        'CROSS_CORRELATION'.
+    :type compute_mode: string or
+        :class:`P.Convolution.ComputeMode`
+    :param compute_mode: When set to 'DEFAULT', no special requirements will be
+        placed on the precision of intermediate results. When set to 'FLOAT32',
+        Float32 would be used for accumulator and intermediate result, but only
+        effective when input and output are of Float16 dtype.
+
+    """
+    assert conv_mode == "CROSS_CORRELATION" or conv_mode.name == "CROSS_CORRELATION"
+    assert compute_mode == "DEFAULT" or compute_mode.name == "DEFAULT"
+
+    if groups != 1:
+        raise NotImplementedError("TODO")
+
+    stride_h, stride_w = expand_hw(stride)
+    pad_h, pad_w = expand_hw(padding)
+    dilate_h, dilate_w = expand_hw(dilation)
+
+    op = builtin.ConvolutionBackwardData(
+        stride_h=stride_h,
+        stride_w=stride_w,
+        pad_h=pad_h,
+        pad_w=pad_w,
+        dilate_h=dilate_h,
+        dilate_w=dilate_w,
+        strategy=get_conv_execution_strategy(),
+    )
+    (output,) = apply(op, inp, weight)
+    if bias is not None:
+        output += bias
+    return output
+
+
+def local_conv2d(
+    inp: Tensor,
+    weight: Tensor,
+    bias: Optional[Tensor] = None,
+    stride: Union[int, Tuple[int, int]] = 1,
+    padding: Union[int, Tuple[int, int]] = 0,
+    dilation: Union[int, Tuple[int, int]] = 1,
+    conv_mode="CROSS_CORRELATION",
+) -> Tensor:
+    """Applies spatial 2D convolution over an image with untied kernels.
+
+    Refer to :class:`~.LocalConv2d` for more information.
+    """
+    assert conv_mode == "CROSS_CORRELATION" or conv_mode.name == "CROSS_CORRELATION"
+
+    stride_h, stride_w = expand_hw(stride)
+    pad_h, pad_w = expand_hw(padding)
+    dilate_h, dilate_w = expand_hw(dilation)
+
+    op = builtin.GroupLocal(
+        stride_h=stride_h,
+        stride_w=stride_w,
+        pad_h=pad_h,
+        pad_w=pad_w,
+        dilate_h=dilate_h,
+        dilate_w=dilate_w,
+        strategy=get_conv_execution_strategy(),
+    )
+    (output,) = apply(op, inp, weight)
+    if bias is not None:
+        output += bias
+    return output
+
+
+def max_pool2d(
+    inp: Tensor,
+    kernel_size: Union[int, Tuple[int, int]],
+    stride: Optional[Union[int, Tuple[int, int]]] = None,
+    padding: Union[int, Tuple[int, int]] = 0,
+) -> Tensor:
+    """Applies a 2D max pooling over an input.
+
+    Refer to :class:`~.MaxPool2d` for more information.
+
+    :param inp: The input tensor.
+    :param kernel_size: The size of the window.
+    :param stride: The stride of the window. If not provided, its value is set to ``kernel_size``.
+        Default: None
+    :param padding: Implicit zero padding to be added on both sides. Default: 0
+
+    """
+    if stride is None:
+        stride = kernel_size
+    window_h, window_w = _pair_nonzero(kernel_size)
+    stride_h, stride_w = _pair_nonzero(stride)
+    padding_h, padding_w = _pair(padding)
+
+    op = builtin.Pooling(
+        window_h=window_h,
+        window_w=window_w,
+        stride_h=stride_h,
+        stride_w=stride_w,
+        pad_h=padding_h,
+        pad_w=padding_w,
+        mode="MAX",
+    )
+    (output,) = apply(op, inp)
+    return output
+
+
+def avg_pool2d(
+    inp: Tensor,
+    kernel_size: Union[int, Tuple[int, int]],
+    stride: Optional[Union[int, Tuple[int, int]]] = None,
+    padding: Union[int, Tuple[int, int]] = 0,
+    mode: str = "AVERAGE_COUNT_EXCLUDE_PADDING",
+) -> Tensor:
+    """Applies a 2D average pooling over an input.
+
+    Refer to :class:`~.AvgPool2d` for more information.
+
+    :param inp: The input tensor.
+    :param kernel_size: The size of the window.
+    :param stride: The stride of the window. If not provided, its value is set to ``kernel_size``.
+        Default: None
+    :param padding: Implicit zero padding to be added on both sides. Default: 0
+    :param mode: Whether to count padding values. Default: "AVERAGE_COUNT_EXCLUDE_PADDING"
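+
+    Example (an illustrative sketch with assumed shapes):
+
+    .. code-block:: python
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        inp = tensor(np.arange(16, dtype=np.float32).reshape(1, 1, 4, 4))
+        out = F.avg_pool2d(inp, kernel_size=2, stride=2)
+        print(out.numpy())  # [[[[ 2.5  4.5] [10.5 12.5]]]]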
+
+    """
+    if stride is None:
+        stride = kernel_size
+    window_h, window_w = _pair_nonzero(kernel_size)
+    stride_h, stride_w = _pair_nonzero(stride)
+    padding_h, padding_w = _pair(padding)
+
+    op = builtin.Pooling(
+        window_h=window_h,
+        window_w=window_w,
+        stride_h=stride_h,
+        stride_w=stride_w,
+        pad_h=padding_h,
+        pad_w=padding_w,
+        mode=mode,
+    )
+    (output,) = apply(op, inp)
+    return output
+
+
+def prelu(inp: Tensor, weight: Tensor) -> Tensor:
+    r"""
+    Applies the element-wise PReLU function.
+
+    Refer to :class:`~.PReLU` for more information.
+    """
+    return maximum(inp, 0) + weight * minimum(inp, 0)
+
+
+def leaky_relu(inp: Tensor, negative_slope: float = 0.01) -> Tensor:
+    r"""
+    Applies the element-wise LeakyReLU function.
+
+    Refer to :class:`~.LeakyReLU` for more information.
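+
+    Example (a minimal sketch):
+
+    .. code-block:: python
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.array([-2.0, 0.0, 2.0], dtype=np.float32))
+        print(F.leaky_relu(x, 0.01).numpy())  # [-0.02  0.    2.  ]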
+    """
+    return maximum(inp, 0) + negative_slope * minimum(inp, 0)
+
+
+def softplus(inp: Tensor) -> Tensor:
+    r"""Applies the element-wise function:
+
+    .. math::
+        \text{softplus}(x) = \log(1 + \exp(x))
+
+    softplus is a smooth approximation to the ReLU function and can be used
+    to constrain the output of a machine to always be positive.
+    For numerical stability the implementation follows this transformation:
+
+    .. math::
+        \text{softplus}(x) = \log(1 + \exp(x))
+                           = \log(1 + \exp(-\text{abs}(x))) + \max(x, 0)
+                           = \log1p(\exp(-\text{abs}(x))) + \text{relu}(x)
+
+    :param inp: The input tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(-3, 3, dtype=np.float32))
+        y = F.softplus(x)
+        print(y.numpy())
+
+    .. testoutput::
+
+        [0.04858735 0.126928   0.3132617  0.6931472  1.3132617  2.126928  ]
+
+    """
+    return log1p(exp(-abs(inp))) + relu(inp)
+
+
+def log_softmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
+    r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional
+    input Tensor. The LogSoftmax formulation can be simplified as:
+
+    .. math::
+        \text{LogSoftmax}(x_{i}) = \log(\frac{\exp(x_i) }{ \sum_j \exp(x_j)} )
+
+    For numerical stability the implementation follows this transformation:
+
+    .. math::
+        \operatorname{logsoftmax}(x)
+        = \log (\frac{\exp (x)}{\sum_{i}(\exp (x_{i}))})
+        = x - \log (\sum_{i}(\exp (x_{i})))
+        = x - \operatorname{logsumexp}(x)
+
+    :param inp: The input tensor
+    :param axis: An axis along which log_softmax will be applied.
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5)
+        y = F.log_softmax(x, axis=1)
+        print(y.numpy())
+
+    .. testoutput::
+
+        [[-4.4519143 -3.4519143 -2.4519143 -1.4519144 -0.4519144]
+         [-4.4519143 -3.4519143 -2.4519143 -1.4519144 -0.4519144]]
+
+    """
+    return inp - logsumexp(inp, axis, keepdims=True)
+
+
+def logsigmoid(inp: Tensor) -> Tensor:
+    r"""Applies the element-wise function:
+
+    .. math::
+        \text{logsigmoid}(x) = \log(\frac{ 1 }{ 1 + \exp(-x)})
+        = \log(1/(1 + \exp(-x)))
+        = - \log(1 + \exp(-x))
+        = - \text{softplus}(-x)
+
+    :param inp: The input tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(-5, 5, dtype=np.float32))
+        y = F.logsigmoid(x)
+        print(y.numpy())
+
+    .. testoutput::
+
+        [-5.0067153  -4.01815    -3.0485873  -2.126928   -1.3132617  -0.6931472
+         -0.3132617  -0.126928   -0.04858735 -0.01814993]
+
+    """
+    return -softplus(-inp)
+
+
+def logsumexp(
+    inp: Tensor, axis: Union[int, Sequence[int]], keepdims: bool = False
+) -> Tensor:
+    r"""
+    Compute the log of the sum of exponentials of inputs along the given :attr:`axis`.
+    The computation is numerically stabilized.
+
+    .. math::
+
+        \operatorname{logsumexp}(\boldsymbol{x}) = \log \sum_{j=1}^{n} \exp \left(x_{j}\right)
+
+    For numerical stability, the implementation follows this transformation:
+
+    .. math::
+
+        \operatorname{logsumexp}(\boldsymbol{x}) = b + \log \sum_{j=1}^{n} \exp \left(x_{j}-b\right)
+
+    where
+
+    .. math::
+        b = \max(x_j)
+
+    :param inp: The input tensor.
+    :param axis: Axis over which the sum is taken. It can be a single axis or a list of axes.
+    :param keepdims: whether to retain :attr:`axis` or not for the output tensor.
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5)
+        y = F.logsumexp(x, axis=1, keepdims=False)
+        print(y.numpy())
+
+    .. testoutput::
+
+        [-0.5480856  4.4519143]
+
+    """
+    max_value = max(inp, axis, keepdims=True)
+    if keepdims:
+        return max_value + log(sum(exp(inp - max_value), axis, keepdims))
+    else:
+        return remove_axis(max_value, axis=None) + log(
+            sum(exp(inp - max_value), axis, keepdims)
+        )
+
+
+def flatten(inp: Tensor, start_axis: int = 0, end_axis: int = -1) -> Tensor:
+    r"""
+    Reshapes the tensor by flattening the sub-tensor from dimension ``start_axis`` to dimension ``end_axis``.
+
+    :param inp: The input tensor.
+    :param start_axis: The start dimension of the sub-tensor to be flattened. Default: 0
+    :param end_axis: The end dimension of the sub-tensor to be flattened. Default: -1
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        inp_shape = (2, 2, 3, 3)
+        inp = tensor(
+            np.arange(36, dtype=np.int32).reshape(inp_shape),
+        )
+        oup = F.flatten(inp, 2)
+        print(inp.numpy().shape)
+        print(oup.numpy().shape)
+
+    Outputs:
+
+    .. testoutput::
+
+        (2, 2, 3, 3)
+        (2, 2, 9)
+
+    """
+    target_shape = tuple(inp.shape[i] for i in range(start_axis)) + (-1,)
+    if end_axis != -1:
+        target_shape += (*inp.shape[end_axis + 1 :],)
+    return inp.reshape(*target_shape)
+
+
+def _get_softmax_axis(ndim: int) -> int:
+    if ndim in (0, 1, 3):
+        return 0
+    return 1
+
+
+def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor:
+    r"""
+    Applies a softmax function. Softmax is defined as:
+
+    .. math::
+        \text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
+
+    It is applied to all elements along axis, and will re-scale them so that
+    the elements lie in the range `[0, 1]` and sum to 1.
+
+    See :class:`~megengine.module.activation.Softmax` for more details.
+
+    :param inp: The input tensor.
+    :param axis: An axis along which softmax will be applied. By default,
+        softmax will apply along the highest ranked axis.
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5)
+        out = F.softmax(x)
+        print(out.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [[0.01165623 0.03168492 0.08612854 0.23412167 0.6364086 ]
+         [0.01165623 0.03168492 0.08612854 0.23412167 0.6364086 ]]
+
+    """
+    if axis is None:
+        axis = _get_softmax_axis(len(inp.shape))
+    offset = inp.max(axis=axis).detach()
+    cached = exp(inp - offset)
+    down = sum(cached, axis=axis, keepdims=True)
+    return cached / down
+
+
+def batch_norm2d(
+    data: Tensor,
+    running_mean: Tensor = None,
+    running_var: Tensor = None,
+    weight: Optional[Tensor] = None,
+    bias: Optional[Tensor] = None,
+    *,
+    training: bool = False,
+    momentum: float = 0.9,
+    eps: float = 1e-5,
+    inplace: bool = True
+):
+    r"""Applies batch normalization to the input.
+
+    Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information.
+
+    :param data: input tensor.
+    :param running_mean: tensor to store running mean.
+    :param running_var: tensor to store running variance.
+    :param weight: scaling tensor in the learnable affine parameters.
+        See :math:`\gamma` in :class:`~.BatchNorm2d`
+    :param bias: bias tensor in the learnable affine parameters.
+        See :math:`\beta` in :class:`~.BatchNorm2d`
+    :param training: a boolean value to indicate whether batch norm is performed
+        in training mode. Default: ``False``
+    :param momentum: the value used for the ``running_mean`` and ``running_var``
+        computation.
+        Default: 0.9
+    :param eps: a value added to the denominator for numerical stability.
+        Default: 1e-5.
+    :param inplace: whether to update ``running_mean`` and ``running_var`` inplace or return new tensors.
+        Default: True
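+
+    Example (an illustrative inference-mode sketch; shapes and values are assumed):
+
+    .. code-block:: python
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data = tensor(np.random.normal(size=(1, 3, 4, 4)).astype(np.float32))
+        mean = tensor(np.zeros((1, 3, 1, 1), dtype=np.float32))
+        var = tensor(np.ones((1, 3, 1, 1), dtype=np.float32))
+        out = F.batch_norm2d(data, mean, var, training=False)
+        # with zero mean and unit variance this is close to the identity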
+
+    """
+    from .tensor import expand_dims, squeeze, broadcast
+
+    def full(value):
+        N, C, H, W = data.shape
+        (x,) = Const(value, dtype=data.dtype, device=data.device)(data)
+        return broadcast(x, [1, C, 1, 1])
+
+    def expand_or_full(x, value):
+        if x is None:
+            return full(value)
+        return expand_dims(x, [0, 2, 3])
+
+    def make_full_if_none(x, value):
+        if x is None:
+            return full(value)
+        return x
+
+    has_mean = running_mean is not None
+    has_var = running_var is not None
+
+    if not training:
+        assert has_mean, "running_mean must be provided in inference mode"
+        assert has_var, "running_var must be provided in inference mode"
+
+    if has_mean and running_mean.ndim != 4:
+        raise ValueError
+    if has_var and running_var.ndim != 4:
+        raise ValueError
+
+    data, weight, bias, running_mean, running_var = utils.convert_inputs(
+        data, weight, bias, running_mean, running_var
+    )
+
+    weight = expand_or_full(weight, 1)
+    bias = expand_or_full(bias, 0)
+
+    if not training:
+        op = builtin.BatchNorm(fwd_mode="INFERENCE", epsilon=eps, param_dim="DIM_1C11")
+        ret = apply(op, data, weight, bias, running_mean, running_var)[-1]
+        return ret
+
+    else:
+        op = builtin.BatchNorm(
+            avg_factor=1 - momentum, epsilon=eps, param_dim="DIM_1C11"
+        )
+
+        if has_mean or has_var:
+            running_mean = make_full_if_none(running_mean, 0)
+            running_var = make_full_if_none(running_var, 1)
+            new_mean, new_var, _, _, data = apply(
+                op, data, weight, bias, running_mean, running_var
+            )
+            if not has_mean:
+                new_mean = None
+            if not has_var:
+                new_var = None
+
+            if inplace:
+                if has_mean:
+                    running_mean[...] = new_mean
+                if has_var:
+                    running_var[...] = new_var
+
+                return data
+            else:
+                return data, new_mean, new_var
+        else:
+            _, _, data, = apply(op, data, weight, bias)
+            return data
+
+
+def sync_batch_norm(
+    input: Tensor,
+    running_mean: Tensor,
+    running_var: Tensor,
+    weight: Optional[Tensor] = None,
+    bias: Optional[Tensor] = None,
+    training: bool = False,
+    momentum: Union[float, Tensor] = 0.9,
+    eps: float = 1e-5,
+    eps_mode="ADDITIVE",
+    group=WORLD,
+) -> Tensor:
+    r"""Applies synchronized batch normalization to the input.
+
+    Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information.
+
+    :param input: input tensor.
+    :param running_mean: tensor to store running mean.
+    :param running_var: tensor to store running variance.
+    :param weight: scaling tensor in the learnable affine parameters.
+        See :math:`\gamma` in :class:`~.BatchNorm2d`
+    :param bias: bias tensor in the learnable affine parameters.
+        See :math:`\beta` in :class:`~.BatchNorm2d`
+    :param training: a boolean value to indicate whether batch norm is performed
+        in training mode. Default: ``False``
+    :param momentum: the value used for the ``running_mean`` and ``running_var``
+        computation.
+        Default: 0.9
+    :param eps: a value added to the denominator for numerical stability.
+        Default: 1e-5.
+    """
+    assert eps_mode in {"MAX", "ADDITIVE"}, "unknown eps_mode: {}".format(eps_mode)
+    _channels = input.shape[1]
+    _ndim = len(input.shape)
+    _param_shape = (1, _channels) + (1,) * (_ndim - 2)
+
+    if training:
+
+        def _sum_on_channel(input):
+            return apply(builtin.Reduce(mode="SUM"), input, Tensor(_param_shape))[0]
+
+        reduce_size = input.shape[0]
+        for i in range(2, _ndim):
+            reduce_size = reduce_size * input.shape[i]
+        channel_x1s = _sum_on_channel(input)
+        channel_x2s = _sum_on_channel(input ** 2)
+
+        if is_distributed():
+            # reduce all nodes' data to calculate mean and variance
+            reduce_size = full([1 for _ in range(_ndim)], reduce_size)
+            stat = concat([reduce_size, channel_x1s, channel_x2s], axis=1)
+            stat = all_reduce_sum(stat, group)
+            reduce_size = stat[:, :1].reshape(1)
+            channel_x1s = stat[:, 1 : 1 + _channels]
+            channel_x2s = stat[:, 1 + _channels :]
+
+        channel_mean = channel_x1s / reduce_size
+        channel_variance = (
+            channel_x1s ** 2 / (-reduce_size * reduce_size) + channel_x2s / reduce_size
+        )
+    else:
+        assert running_var is not None and running_mean is not None
+        channel_variance = running_var.reshape(*_param_shape)
+        channel_mean = running_mean.reshape(*_param_shape)
+
+    invsqrt_channel_variance = (
+        maximum(channel_variance, eps) if eps_mode == "MAX" else channel_variance + eps
+    ) ** -0.5
+
+    if weight is not None:
+        weight = weight.reshape(*_param_shape)
+    if bias is not None:
+        bias = bias.reshape(*_param_shape)
+
+    # outvar = output * weight + bias
+    # where output = input * invsqrt_channel_variance + (
+    #     -channel_mean * invsqrt_channel_variance
+    # )
+    # Manually expand output for gopt
+
+    if weight is not None:
+        inv_var_wt = invsqrt_channel_variance * weight
+        neg_channel_mean = -channel_mean
+        if bias is not None:
+            outvar = input * inv_var_wt + (neg_channel_mean * inv_var_wt + bias)
+        else:
+            outvar = input * inv_var_wt + neg_channel_mean * inv_var_wt
+    else:
+        outvar = input * invsqrt_channel_variance + (
+            -channel_mean * invsqrt_channel_variance
+        )
+        if bias is not None:
+            outvar = outvar + bias
+
+    if training and running_var is not None and running_mean is not None:
+        running_mean *= momentum
+        running_mean += (1 - momentum) * channel_mean
+        channel_variance_unbiased = channel_x1s ** 2 / (
+            -reduce_size * (reduce_size - 1)
+        ) + channel_x2s / (reduce_size - 1)
+        running_var *= momentum
+        running_var += (1 - momentum) * channel_variance_unbiased
+
+    return outvar
+
+
+def one_hot(inp: Tensor, num_classes: int) -> Tensor:
+    r"""
+    Perform one-hot encoding for the input tensor.
+
+    :param inp: input tensor
+    :param num_classes: number of classes denotes the last dimension of the output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        inp = tensor(np.arange(1, 4, dtype=np.int32))
+        out = F.one_hot(inp, num_classes=4)
+        print(out.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [[0 1 0 0]
+         [0 0 1 0]
+         [0 0 0 1]]
+
+    """
+    raise NotImplementedError
+    # comp_node, comp_graph = _decide_comp_node_and_comp_graph(inp)
+
+    # zeros = mgb.make_immutable(value=0, comp_node=comp_node, comp_graph=comp_graph)
+    # zeros_symvar = zeros.broadcast(inp.shapeof(), num_classes)
+
+    # ones = mgb.make_immutable(value=1, comp_node=comp_node, comp_graph=comp_graph)
+    # ones_symvar = ones.broadcast(inp.shapeof(), 1)
+
+    # return Tensor(
+    #     mgb.opr.indexing_set_one_hot(
+    #         zeros_symvar, axis=len(inp.shapeof()), index=inp, value=ones_symvar
+    #     )
+    # )
+
+
+def warp_perspective(
+    inp: Tensor,
+    M: Tensor,
+    dsize: Union[Tuple[int, int], int, Tensor],
+    border_mode: str = "REPLICATE",
+    border_val: float = 0.0,
+    interp_mode: str = "LINEAR",
+):
+    r"""
+    Applies perspective transformation to batched 2D images.
+
+    The input images are transformed to the output images by the transformation matrix:
+
+    .. math::
+        \text{output}(n, c, h, w) = \text{input} \left( n, c,
+            \frac{M_{00}h + M_{01}w + M_{02}}{M_{20}h + M_{21}w + M_{22}},
+            \frac{M_{10}h + M_{11}w + M_{12}}{M_{20}h + M_{21}w + M_{22}}
+        \right)
+
+    :param inp: input image
+    :param M: (batch, 3, 3) transformation matrix
+    :param dsize: (h, w) size of the output image
+    :param border_mode: pixel extrapolation method. Default: ``"REPLICATE"``
+    :param border_val: value used in case of a constant border. Default: ``0``
+    :param interp_mode: interpolation methods. Default: ``"LINEAR"``
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+        inp_shape = (1, 1, 4, 4)
+        inp = tensor(np.arange(16, dtype=np.float32).reshape(inp_shape))
+        M_shape = (1, 3, 3)
+        # M defines a translation: dst(1, 1, h, w) = src(1, 1, h+1, w+1)
+        M = tensor(np.array([[1., 0., 1.],
+                             [0., 1., 1.],
+                             [0., 0., 1.]], dtype=np.float32).reshape(M_shape))
+        out = F.warp_perspective(inp, M, (2, 2))
+        print(out.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [[[[ 5.  6.]
+           [ 9. 10.]]]]
+
+    """
+    op = builtin.WarpPerspective(
+        imode=interp_mode, bmode=border_mode, format="NCHW", border_val=border_val
+    )
+    (result,) = apply(op, inp, M, Tensor(dsize))
+    return result
+
+
+def matmul(
+    inp1: Tensor,
+    inp2: Tensor,
+    transpose_a=False,
+    transpose_b=False,
+    compute_mode="DEFAULT",
+    format="DEFAULT",
+) -> Tensor:
+    """
+    Performs a matrix multiplication of the matrices ``inp1`` and ``inp2``.
+
+    Depending on the input dimensions, this function behaves differently:
+
+    - Both 1-D tensors: simply forward to dot.
+    - Both 2-D tensors: normal matrix multiplication.
+    - If one input tensor is 1-D: matrix-vector multiplication.
+    - If at least one tensor is 3-dimensional or higher, a batched matrix-matrix
+      product is returned, and the tensor with fewer dimensions will
+      be broadcast accordingly.
For example:
+
+        - inp1: `(k, m)`, inp2: `(m, p)`, return: `(k, p)`
+        - inp1: `(n, k, m)`, inp2: `(n, m, p)`, return: `(n, k, p)`
+        - inp1: `(n, k, m)`, inp2: `(m, p)`, return: `(n, k, p)`
+        - inp1: `(n, j, k, m)`, inp2: `(n, j, m, p)`, return: `(n, j, k, p)`
+
+    :param inp1: The first matrix to be multiplied
+    :param inp2: The second matrix to be multiplied
+    :return: The output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data1 = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
+        data2 = tensor(np.arange(0, 6, dtype=np.float32).reshape(3, 2))
+        out = F.matmul(data1, data2)
+        print(out.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [[10. 13.]
+         [28. 40.]]
+
+    """
+    inp1, inp2 = utils.convert_inputs(inp1, inp2)
+    dim1, dim2 = inp1.ndim, inp2.ndim
+    if dim1 == 1 and dim2 == 1:
+        return dot(inp1, inp2)
+
+    shp = None
+    if dim1 > 3 or dim2 > 3:
+        shape1, shape2 = list(inp1.shape), list(inp2.shape)
+        if dim1 != dim2:
+            if dim1 < dim2:
+                shape1 = shape2[: dim2 - dim1] + shape1
+                inp1 = inp1.broadcast(*shape1)
+            else:
+                shape2 = shape1[: dim1 - dim2] + shape2
+                inp2 = inp2.broadcast(*shape2)
+        reshaped_batch_size = 1
+        for i in shape1[:-2]:
+            reshaped_batch_size *= i
+        inp1 = inp1.reshape(*([reshaped_batch_size] + shape1[-2:]))
+        inp2 = inp2.reshape(*([reshaped_batch_size] + shape2[-2:]))
+        op = builtin.BatchedMatrixMul(
+            transposeA=transpose_a,
+            transposeB=transpose_b,
+            compute_mode=compute_mode,
+            format=format,
+        )
+        shp = shape1[:-1] + shape2[-1:]
+    elif dim1 == 3 or dim2 == 3:
+        if dim2 < 3:
+            inp2 = inp2.broadcast(*(inp1.shape[:1] + inp2.shape))
+        elif dim1 < 3:
+            inp1 = inp1.broadcast(*(inp2.shape[:1] + inp1.shape))
+        op = builtin.BatchedMatrixMul(
+            transposeA=transpose_a,
+            transposeB=transpose_b,
+            compute_mode=compute_mode,
+            format=format,
+        )
+    else:
+        if dim1 == 1:
+            shp = (inp2.shape[1],)
+            inp1 = add_axis(inp1, 0)
+        if dim2 == 1:
+            shp = (inp1.shape[0],)
+            inp2 = add_axis(inp2, 1)
+        op = builtin.MatrixMul(
+            transposeA=transpose_a,
+            transposeB=transpose_b,
+            compute_mode=compute_mode,
+            format=format,
+        )
+
+    (result,) = apply(op, inp1, inp2)
+    if shp is not None:
+        result = result.reshape(shp)
+    return result
+
+
+def dot(inp1: Tensor, inp2: Tensor) -> Tensor:
+    """
+    Computes the dot product of two vectors ``inp1`` and ``inp2``.
+    Inputs must be 1-dimensional; a scalar input is automatically broadcast.
+
+    :param inp1: The first vector
+    :param inp2: The second vector
+    :return: The output value
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        data1 = tensor(np.arange(0, 6, dtype=np.float32))
+        data2 = tensor(np.arange(0, 6, dtype=np.float32))
+        out = F.dot(data1, data2)
+        print(out.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [55.]
+
+    """
+    op = builtin.Dot()
+    inp1, inp2 = utils.convert_inputs(inp1, inp2)
+    (result,) = apply(op, inp1, inp2)
+    return result
+
+
+def svd(inp: Tensor, full_matrices=False, compute_uv=True) -> Tensor:
+    """
+    Computes the singular value decomposition of input matrix ``inp``.
+
+    :param inp: The input matrix, must have shape ``[..., M, N]``
+    :return: The output matrices, U, sigma, V
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+
+        x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2,3))
+        _, y, _ = F.svd(x)
+        print(y.numpy())
+
+    Outputs:
+
+        [7.348, 1.]
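The bracketed values above can be cross-checked against NumPy; a minimal sanity-check sketch, assuming ``F.svd`` agrees with ``np.linalg.svd`` on this input (as the docstring example implies):

```python
import numpy as np
from megengine import tensor
import megengine.functional as F

x_np = np.arange(0, 6, dtype=np.float32).reshape(2, 3)
_, sigma, _ = F.svd(tensor(x_np))
# np.linalg.svd also returns singular values in descending order
np.testing.assert_allclose(
    sigma.numpy(), np.linalg.svd(x_np, compute_uv=False), rtol=1e-3
)
```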
+ + """ + op = builtin.SVD(full_matrices=full_matrices, compute_uv=compute_uv) + U, sigma, V = apply(op, inp) + return U, sigma, V + + +def interpolate( + inp: Tensor, + size: Optional[Union[int, Tuple[int, int]]] = None, + scale_factor: Optional[Union[float, Tuple[float, float]]] = None, + mode: str = "BILINEAR", + align_corners: bool = None, +) -> Tensor: + r""" + Down/up samples the input tensor to either the given :attr:`size` or the given + :attr:`scale_factor` + + :param inp: input tensor + :param size: size of the output tensor. Default: ``None`` + :param scale_factor: scaling factor of the output tensor. Default: ``None`` + :param mode: interpolation methods, acceptable values are: + 'BILINEAR', 'LINEAR'. Default: ``BILINEAR`` + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + from megengine.test import assertTensorClose + + inp = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) + out = F.interpolate(inp, [4, 4], align_corners=False) + print(out.numpy()) + + out2 = F.interpolate(inp, scale_factor=2.) + assertTensorClose(out.numpy(), out2.numpy()) + + Outputs: + + .. testoutput:: + + [[[[1. 1.25 1.75 2. ] + [1.5 1.75 2.25 2.5 ] + [2.5 2.75 3.25 3.5 ] + [3. 3.25 3.75 4. ]]]] + + """ + mode = mode.upper() + if mode not in ["BILINEAR", "LINEAR"]: + raise ValueError("interpolate only support linear or bilinear mode") + if mode not in ["BILINEAR", "LINEAR"]: + if align_corners is not None: + raise ValueError( + "align_corners option can only be set in the bilinear/linear interpolating mode" + ) + else: + if align_corners is None: + align_corners = False + + if mode == "LINEAR": + inp = add_axis(inp, 3) + + if len(inp.shape) != 4: + raise ValueError("shape of input tensor must correspond to the operartion mode") + + if size is None: + if scale_factor is None: + raise ValueError("scale_factor must not be None when size is None") + + if isinstance(scale_factor, (float, int)): + scale_factor = float(scale_factor) + if mode == "LINEAR": + scale_factor = (scale_factor, float(1)) + else: + scale_factor = (scale_factor, scale_factor) + else: + if mode == "LINEAR": + raise ValueError( + "under LINEAR mode, scale_factor can only be single value" + ) + + assert len(scale_factor) == 2, "shape of scale_factor must be equal to (2, )" + assert isinstance(scale_factor[0], float) and isinstance( + scale_factor[1], float + ), "scale_factor must be float type" + dsize = tuple( + floor( + Tensor( + inp.shape[i + 2] * scale_factor[i], + dtype="float32", + device=inp.device, + ) + ) + for i in range(2) + ) + dsize = concat([dsize[0], dsize[1]], axis=0) + else: + if scale_factor is not None: + raise ValueError("scale_factor must be None when size is provided") + + if isinstance(size, int): + size = (size, 1) + else: + if mode == "LINEAR": + raise ValueError("under LINEAR mode, size can only be single value") + dsize = size + + oh, ow = dsize[0], dsize[1] + ih, iw = inp.shape[2], inp.shape[3] + + if align_corners: + hscale = (ih - 1.0) / (oh - 1.0) + wscale = 1.0 * iw / ow + if mode != "LINEAR": + wscale = (iw - 1.0) / (ow - 1.0) + row0 = concat( + [wscale, Tensor([0, 0], dtype="float32", device=inp.device)], axis=0 + ).reshape(1, 3) + row1 = concat( + [ + Tensor(0, dtype="float32", device=inp.device), + hscale, + Tensor(0, dtype="float32", device=inp.device), + ], + axis=0, + ).reshape(1, 3) + weight = concat( + [row0, row1, Tensor([[0, 0, 1]], dtype="float32", device=inp.device)], + axis=0, + ).reshape(1, 3, 3) + weight = 
broadcast(weight, (inp.shape[0], 3, 3)) + else: + hscale = 1.0 * ih / oh + wscale = 1.0 * iw / ow + row0 = concat( + [wscale, Tensor(0, dtype="float32", device=inp.device), 0.5 * wscale - 0.5], + axis=0, + ).reshape(1, 3) + row1 = concat( + [Tensor(0, dtype="float32", device=inp.device), hscale, 0.5 * hscale - 0.5], + axis=0, + ).reshape(1, 3) + weight = concat( + [row0, row1, Tensor([[0, 0, 1]], dtype="float32", device=inp.device)], + axis=0, + ).reshape(1, 3, 3) + weight = broadcast(weight, (inp.shape[0], 3, 3)) + + weight = weight.astype("float32") + ret = warp_perspective(inp, weight, dsize, interp_mode="LINEAR") + if mode == "LINEAR": + ret = reshape(ret, ret.shape[0:3]) + return ret + + +def dropout(inp: Tensor, drop_prob: float, rescale: bool = True) -> Tensor: + """ + Returns a new tensor where each of the elements are randomly set to zero + with probability P = ``drop_prob``. Optionally rescale the output tensor. + + :param inp: The input tensor + :param drop_prob: The probability to drop (set to zero) a single element + :param rescale: The default behavior of ``dropout`` during training is to rescale the output, + then it can be replaced by an :class:`~.Identity` during inference, default to True. + :return: The output tensor + + Examples: + + .. testcode:: + + import numpy as np + import megengine as mge + + import megengine.functional as F + from megengine import tensor + + data = tensor(np.ones(10, dtype=np.float32)) + out = F.dropout(data, 1./3.) + print(out.numpy()) + + Outputs: + + .. testoutput:: + :options: +SKIP + + [1.5 1.5 0. 1.5 1.5 1.5 1.5 1.5 1.5 1.5] + + """ + assert 0 <= drop_prob < 1 + rv = uniform(inp.shape) + mask = rv > drop_prob + inp *= mask.astype(inp.dtype) + if rescale: + inp *= 1 / (1 - drop_prob) + return inp + + +def identity(inp: Tensor) -> Tensor: + """applies an identity transform to the input tensor. + + :param inp: The input tensor + """ + op = builtin.Identity() + (data,) = utils.convert_inputs(inp) + (output,) = apply(op, data) + return output + + +def embedding( + input: Tensor, + weight: Tensor, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: Optional[float] = None, +): + """ + Applies lookup table for embedding. + + :param input: the tensor with indices. + :param weight: the learnable weights which embedding from. + :param padding_idx: should be set to None, not support now. + :param max_norm: should be set to None, not support now. + :param norm_type: should be set to None, not support now. + + + Refer to :class:`~.Embedding` for more information. + """ + if padding_idx is not None: + raise ValueError("Not support padding_idx Now!") + if max_norm is not None or norm_type is not None: + raise ValueError("Not support weight normlization Now!") + + dest_shp = list(input.shape) + [weight.shape[-1]] + return weight[input.reshape(-1)].reshape(dest_shp) + + +def roi_pooling( + inp: Tensor, + rois: Tensor, + output_shape: Union[int, tuple, list], + mode: str = "max", + scale: float = 1.0, +) -> Tensor: + """ + Apply roi pooling on input feature + + :param inp: tensor that represents the input feature, (N, C, H, W) images + :param rois: (K, 5) boxes. First column is the index into N. The other 4 columns are xyxy + :param output_shape: (height, width) of output rois feature + :param mode: "max" or "average", use max/average align just like max/average pooling. Default: ``"max"`` + :param scale: scale the input boxes by this number. 
Default: 1.0
+    :return: (K, C, output_shape[0], output_shape[1]) feature of rois
+    """
+    assert mode in ["max", "average"], "only max/average mode is supported"
+    if isinstance(output_shape, int):
+        output_shape = (output_shape, output_shape)
+
+    op = builtin.ROIPooling(mode=mode, scale=scale)
+    result, _ = apply(
+        op, inp, rois, Tensor(output_shape, dtype="int32", device=inp.device)
+    )
+    return result
+
+
+def roi_align(
+    input: Tensor,
+    rois: Tensor,
+    output_shape: Union[int, tuple, list],
+    mode: str = "average",
+    spatial_scale: float = 1.0,
+    sample_points: Union[int, tuple, list] = 2,
+    aligned: bool = True,
+) -> Tensor:
+    """
+    Applies roi align on input feature
+
+    :param input: tensor that represents the input feature, (N, C, H, W) images
+    :param rois: (N, 5) boxes. First column is the index into N. The other 4 columns are xyxy
+    :param output_shape: (height, width) shape of output rois feature.
+    :param mode: "max" or "average", use max/average align just like max/average pooling. Default: ``"average"``
+    :param spatial_scale: scale the input boxes by this number. Default: 1.0
+    :param sample_points: number of input samples to take for each output sample.
+        0 to take samples densely. Default: 2
+    :param aligned: whether to align the input feature; with ``aligned=True``,
+        we first appropriately scale the ROI and then shift it by -0.5. Default: True
+    """
+    assert mode in ["max", "average"], "only max/average mode is supported"
+    if isinstance(output_shape, int):
+        output_shape = (output_shape, output_shape)
+    pooled_height, pooled_width = output_shape
+    if isinstance(sample_points, int):
+        sample_points = (sample_points, sample_points)
+    sample_height, sample_width = sample_points
+    offset = 0.5 if aligned else 0.0
+
+    op = builtin.ROIAlign(
+        mode=mode,
+        format="NCHW",
+        spatial_scale=spatial_scale,
+        offset=offset,
+        pooled_height=pooled_height,
+        pooled_width=pooled_width,
+        sample_height=sample_height,
+        sample_width=sample_width,
+    )
+    result, *_ = apply(op, input, rois)
+    return result
+
+
+def assert_equal(
+    get: Tensor, expect: Tensor, max_err: float = 1e-4, verbose: bool = False
+) -> Tensor:
+    r"""
+    Asserts that ``get`` equals ``expect``, and returns the value of ``expect``.
+
+    :param get: tensor to be checked.
+    :param expect: tensor with expected values.
+    :param max_err: tolerance that two float values are asserted equal. Default: 1e-4
+    :param verbose: whether to print details if two tensors are not equal. Default: False
+
+    Examples:
+
+    .. testcode::
+
+        import megengine.functional as F
+        from megengine import tensor
+
+        get = tensor([1.0, 2.0])
+        max_err = 0.1
+        expect = get + max_err / 2.0
+        val = F.assert_equal(get, expect, max_err=max_err)
+        print(val.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [1.05 2.05]
+
+    """
+    raise NotImplementedError
+    # op = builtin.AssertEqual(maxerr=max_err, verbose=verbose)
+    # result, = apply(op, get, expect)
+    # return result
+
+
+def indexing_one_hot(
+    src: Tensor, index: Tensor, axis: int = 1, keepdims=False
+) -> Tensor:
+    r"""
+    One-hot indexing for some axis.
+
+    :param src: input data tensor.
+    :param index: index tensor.
+    :param axis: the axis on ``src`` that ``index`` indexes along. Default: 1
+    :param keepdims: whether to keep the indexed axis in the result. Default: ``False``
+
+    Examples:
+
+    .. testcode::
+
+        import megengine.functional as F
+        from megengine import tensor
+
+        src = tensor([[1.0, 2.0]])
+        index = tensor([0])
+        val = F.indexing_one_hot(src, index)
+        print(val.numpy())
+
+    .. testoutput::
+
+        [1.]
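For a 2-D ``src`` with ``axis=1``, the op gathers one value per row, ``src[i, index[i]]``; a NumPy sketch of the same semantics (illustrative only, not the actual kernel):

```python
import numpy as np

src = np.array([[1.0, 2.0]])
index = np.array([0])
# pick src[i, index[i]] along axis 1, then drop that axis (keepdims=False)
out = np.take_along_axis(src, index[:, None], axis=1).squeeze(1)
print(out)  # [1.]
```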
+ + """ + op = builtin.IndexingOneHot(axis=axis) + (result,) = apply(op, src, index) + if not keepdims: + result = remove_axis(result, axis) + return result + + +def nms(boxes: Tensor, iou_thresh: float, scores: Optional[Tensor] = None) -> Tensor: + r""" + Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union (IoU). + + :param boxes: tensor of shape ``(N, 4)``; the boxes to perform nms on; each box is expected to be in (x1, y1, x2, y2) format. + :param iou_thresh: iou threshold for overlapping. + :param scores: tensor of shape ``(N,)``, the score of boxes. + :return: indices of the elements that have been kept by NMS. + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + + x = np.zeros((100,4)) + np.random.seed(42) + x[:,:2] = np.random.rand(100,2)*20 + x[:,2:] = np.random.rand(100,2)*20 + 100 + scores = tensor(np.random.rand(100)) + inp = tensor(x) + result = F.nms(inp, iou_thresh=0.7, scores=scores) + print(result.numpy()) + + Outputs: + + .. testoutput:: + + [75 69] + + """ + assert ( + boxes.ndim == 2 and boxes.shape[1] == 4 + ), "the expected shape of boxes is (N, 4)" + + sorted_idx = None + if not scores is None: + assert scores.ndim == 1, "the expected shape of scores is (N,)" + sorted_idx = argsort(scores, descending=True) + boxes = boxes[sorted_idx] + max_output = boxes.shape[0] + + op = builtin.NMSKeep(iou_thresh, max_output) + inp = utils.convert_inputs(boxes.reshape(1, -1, 4)) + indices, count = apply(op, *inp) + indices = indices[0][: count.item()] + ret = sorted_idx[indices] if sorted_idx is not None else indices + return ret + + +def batched_nms( + boxes: Tensor, iou_thresh: float, idxs: Tensor, scores: Optional[Tensor] = None +) -> Tensor: + r""" + Performs non-maximum suppression (NMS) on the boxes according to their intersection-over-union (IoU). + + :param boxes: tensor of shape ``(N, 4)``; the boxes to perform nms on; each box is expected to be in (x1, y1, x2, y2) format + :param iou_thresh: iou threshold for overlapping + :param idxs: tensor of shape ``(N,)``, the class indexs of boxes in the batch. + :param scores: tensor of shape ``(N,)``, the score of boxes. + :return: indices and the number of the elements that have been kept by NMS + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + + x = np.zeros((100,4)) + np.random.seed(42) + x[:,:2] = np.random.rand(100,2)*20 + x[:,2:] = np.random.rand(100,2)*20 + 100 + scores = tensor(np.random.rand(100)) + idxs = tensor(np.random.randint(0, 10, 100)) + inp = tensor(x) + result = F.batched_nms(inp, iou_thresh=0.6, idxs=idxs, scores=scores) + print(result.numpy()) + + Outputs: + + .. 
testoutput:: + + [75 41 99 98 69 64 11 27 35 18] + + """ + assert ( + boxes.ndim == 2 and boxes.shape[1] == 4 + ), "the expected shape of boxes is (N, 4)" + max_coordinate = boxes.max() + offsets = idxs.astype("float32") * (max_coordinate + 1) + boxes = boxes + offsets.reshape(-1, 1).broadcast(boxes.shape[0], 4) + + sorted_idx = None + if not scores is None: + assert scores.ndim == 1, "the expected shape of scores is (N,)" + sorted_idx = argsort(scores, descending=True) + boxes = boxes[sorted_idx] + max_output = boxes.shape[0] + + op = builtin.NMSKeep(iou_thresh, max_output) + inp = utils.convert_inputs(boxes.reshape(1, -1, 4)) + indices, count = apply(op, *inp) + indices = indices[0][: count.item()] + ret = sorted_idx[indices] if sorted_idx is not None else indices + return ret diff --git a/imperative/python/megengine/functional/quantized.py b/imperative/python/megengine/functional/quantized.py new file mode 100644 index 0000000000000000000000000000000000000000..9bee9311c05126b196de3c60eae55db604aa8d75 --- /dev/null +++ b/imperative/python/megengine/functional/quantized.py @@ -0,0 +1,83 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# pylint: disable=too-many-lines +from typing import Tuple, Union + +from ..core.ops import builtin +from ..core.tensor.core import apply +from ..tensor import Tensor +from .debug_param import get_conv_execution_strategy +from .types import _pair, _pair_nonzero + + +def conv_bias_activation( + inp: Tensor, + weight: Tensor, + bias: Tensor, + dtype=None, + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + format="NCHW", + nonlinear_mode="IDENTITY", + conv_mode="CROSS_CORRELATION", + compute_mode="DEFAULT", +) -> Tensor: + """ convolution bias with activation operation, only for inference. + + :param inp: The feature map of the convolution operation + :param weight: The convolution kernel + :param bias: The bias added to the result of convolution + :param stride: Stride of the 2D convolution operation. Default: 1 + :param padding: Size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + :param dilation: Dilation of the 2D convolution operation. Default: 1 + :param groups: number of groups to divide input and output channels into, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and the shape of weight should be ``(groups, out_channel // groups, + in_channels // groups, height, width)``. + :type conv_mode: string or :class:`P.Convolution.Mode` + :param conv_mode: Supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default: + 'CROSS_CORRELATION'. + :param dtype: Support for np.dtype, Default: + np.int8. + :param scale: scale if use quantization, Default: + 0.0. + :param zero_point: scale if use quantization quint8, Default: + 0.0. + :type compute_mode: string or + :class:`P.Convolution.ComputeMode` + :param compute_mode: When set to 'DEFAULT', no special requirements will be + placed on the precision of intermediate results. 
When set to 'FLOAT32', + Float32 would be used for accumulator and intermediate result, but only + effective when input and output are of Float16 dtype. + + """ + ph, pw = _pair(padding) + sh, sw = _pair_nonzero(stride) + dh, dw = _pair_nonzero(dilation) + sparse_type = "DENSE" if groups == 1 else "GROUP" + op = builtin.ConvBiasForward( + stride_h=sh, + stride_w=sw, + pad_h=ph, + pad_w=pw, + dilate_h=dh, + dilate_w=dw, + dtype=dtype, + format=format, + strategy=get_conv_execution_strategy(), + nonlineMode=nonlinear_mode, + mode=conv_mode, + compute_mode=compute_mode, + sparse=sparse_type, + ) + (outputs,) = apply(op, inp, weight, bias) + return outputs diff --git a/imperative/python/megengine/functional/tensor.py b/imperative/python/megengine/functional/tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..15c26bf0aed389aad07d065d8c66e48c98b4524f --- /dev/null +++ b/imperative/python/megengine/functional/tensor.py @@ -0,0 +1,934 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import functools +import math +from itertools import accumulate +from typing import Iterable, List, Optional, Sequence, Tuple, Union + +import numpy as np + +from ..core._imperative_rt import CompNode +from ..core.ops import builtin +from ..core.ops._internal import param_defs as P +from ..core.ops.special import Const +from ..core.tensor.core import TensorBase, TensorWrapperBase, apply +from ..core.tensor.utils import ( + astensor1d, + convert_inputs, + convert_single_value, + dtype_promotion, + get_device, +) +from ..device import get_default_device +from ..tensor import Tensor +from .elemwise import ceil + +__all__ = [ + "add_axis", # expand_dims + "arange", + "broadcast", + "concat", + "cond_take", + "dimshuffle", # transpose, permute + "expand_dims", + "full", + "full_like", + "gather", + "eye", + "linspace", + "ones", + "ones_like", + "remove_axis", # squeeze + "split", + "squeeze", + "stack", + "reshape", + "scatter", + "where", + "zeros", + "zeros_like", + "param_pack_split", + "param_pack_concat", +] + + +def eye(n: int, *, dtype=None, device: Optional[CompNode] = None) -> Tensor: + """ + Returns a 2D tensor with ones on the diagonal and zeros elsewhere. + + :param n: The number of rows + :param m: The number of columns. Default: None + :param dtype: The data type. Default: None + :param device: Compute node of the matrix. Default: None + :param comp_graph: Compute graph of the matrix. Default: None + :return: The eye matrix + + Examples: + + .. testcode:: + + import numpy as np + import megengine.functional as F + + data_shape = (4, 6) + n, m = data_shape + out = F.eye(n, m, dtype=np.float32) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [[1. 0. 0. 0. 0. 0.] + [0. 1. 0. 0. 0. 0.] + [0. 0. 1. 0. 0. 0.] + [0. 0. 0. 1. 0. 
0.]] + + """ + op = builtin.Eye(k=0, dtype=dtype, comp_node=device) + (result,) = apply(op, Tensor(n, dtype="int32", device=device)) + return result + + +def full(shape, value, dtype="float32", device=None): + if device is None: + device = get_default_device() + (x,) = Const(value, dtype=dtype, device=device)( + Tensor(value, dtype=dtype, device=device) + ) + return broadcast(x, shape) + + +def ones(shape, dtype="float32", device=None): + return full(shape, 1.0, dtype=dtype, device=device) + + +def zeros(shape, dtype="float32", device=None): + return full(shape, 0.0, dtype=dtype, device=device) + + +def zeros_like(inp: Tensor) -> Tensor: + r""" + Returns a zero tensor with the same shape as input tensor + + :param inp: input tensor + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + + inp = tensor(np.arange(1, 7, dtype=np.int32).reshape(2,3)) + out = F.zeros_like(inp) + print(out.numpy()) + + .. testoutput:: + + [[0 0 0] + [0 0 0]] + + """ + return zeros(inp.shape, dtype=inp.dtype, device=inp.device) + + +def ones_like(inp: Tensor) -> Tensor: + r""" + Returns a identity tensor with the same shape as input tensor + """ + return ones(inp.shape, dtype=inp.dtype, device=inp.device) + + +def full_like(inp: Tensor, value: Union[int, float]) -> Tensor: + r""" + Returns a tensor filled with value val with the same shape as input tensor + """ + return full(inp.shape, value, dtype=inp.dtype, device=inp.device) + + +def broadcast(inp: Tensor, shape: Union[int, Iterable[int]]) -> Tensor: + """ + Broadcast a tensor to ``shape`` + + :param inp: The input tensor + :param shape: The target shape + :return: The output tensor + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + + data = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3)) + out = F.broadcast(data, (4, 2, 3)) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [[[0. 1. 2.] + [3. 4. 5.]] + + [[0. 1. 2.] + [3. 4. 5.]] + + [[0. 1. 2.] + [3. 4. 5.]] + + [[0. 1. 2.] + [3. 4. 5.]]] + + """ + shape = astensor1d(shape, inp, dtype="int32", device=inp.device) + (result,) = apply(builtin.Broadcast(), inp, shape) + return result + + +def concat( + inps: Iterable[Tensor], axis: int = 0, device: Optional[CompNode] = None, +) -> Tensor: + r""" + Concat some tensors + + :param inps: Input tensors to concat + :param axis: the dimension over which the tensors are concatenated. Default: 0 + :param device: The comp node output on. Default: None + :param comp_graph: The graph in which output is. Default: None + :return: The output tensor + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + + data1 = tensor(np.arange(0, 6, dtype=np.float32).reshape((2, 3))) + data2 = tensor(np.arange(6, 12, dtype=np.float32).reshape((2, 3))) + out = F.concat([data1, data2]) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [[ 0. 1. 2.] + [ 3. 4. 5.] + [ 6. 7. 8.] + [ 9. 10. 11.]] + + """ + dtype = dtype_promotion(inps) + device = get_device(inps) + + def convert(x): + return convert_single_value(x, inps, dtype=dtype) + + inps = tuple(map(convert, inps)) + (result,) = apply(builtin.Concat(axis=axis, comp_node=device.to_c()), *inps) + return result + + +def stack(inps, axis=0): + """Concats a sequence of tensors along a new axis. + The input tensors must have the same shape. + + :param inps: The input tensors. + :param axis: Which axis will be concatenated. 
+ :return: The output concatenated tensor. + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + + x1 = tensor(np.arange(0, 6, dtype=np.float32).reshape((2, 3))) + x2 = tensor(np.arange(6, 12, dtype=np.float32).reshape((2, 3))) + out = F.stack([x1, x2], axis=0) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [[[ 0. 1. 2.] + [ 3. 4. 5.]] + + [[ 6. 7. 8.] + [ 9. 10. 11.]]] + + """ + shapes = {arr.shape for arr in inps} + if len(shapes) != 1: + raise ValueError("All input tensors must have the same shape") + + inps = [add_axis(inp, axis=axis) for inp in inps] + return concat(inps, axis=axis) + + +def split(inp, nsplits_or_sections, axis=0): + """Splits the input tensor into several smaller tensors. + When nsplits_or_sections is int, the last tensor may be smaller than others. + + :param inp: The input tensor. + :param nsplits_or_sections: Number of sub tensors or section information list. + :param axis: Which axis will be splited. + :return: The output tensor list. + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + + x = tensor(np.random.random((2,3,4,5)), dtype=np.float32) + out = F.split(x, 2, axis=3) + print(out[0].shape, out[1].shape) + + Outputs: + + .. testoutput:: + + (2, 3, 4, 3) (2, 3, 4, 2) + + """ + sub_tensors = [] + sections = [] + + def swapaxis(inp, src, dst): + if src == dst: + return inp + shape = [i for i in range(len(inp.shape))] + shape[src] = dst + shape[dst] = src + return inp.transpose(shape) + + inp = swapaxis(inp, 0, axis) + + if isinstance(nsplits_or_sections, int): + incr_step = math.ceil(inp.shape[0] / nsplits_or_sections) + while incr_step < inp.shape[0]: + sections.append(incr_step) + incr_step += nsplits_or_sections + else: + sections = nsplits_or_sections + + st = 0 + for se in sections: + sub_tensors.append(swapaxis(inp[st:se], axis, 0)) + st = se + + if st < inp.shape[0]: + sub_tensors.append(swapaxis(inp[st:], axis, 0)) + + return sub_tensors + + +def _get_idx(index, axis): + index_dims = len(index.shape) + idx = [] + for i in range(index_dims): + if i != axis: + shape = [1] * index_dims + shape[i] = index.shape[i] + arange = linspace( + 0, index.shape[i] - 1, index.shape[i], device=index.device, + ) + arange = ( + arange.reshape(*shape) + .broadcast(index.shape) + .reshape(-1) + .astype(np.int32) + ) + idx.append(arange) + else: + idx.append(index.reshape(-1)) + return tuple(idx) + + +def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor: + r""" + Gather data from :attr:`inp` on :attr:`axis` using :attr:`index`. + + For a 3-D tensor, the output is specified by:: + + out[i][j][k] = inp[index[i][j][k]][j][k] # if axis == 0 + out[i][j][k] = inp[i][index[i][j][k]][k] # if axis == 1 + out[i][j][k] = inp[i][j][index[i][j][k]] # if axis == 2 + + if :attr:`inp` is an n-dimensional tensor with size + :math:`(x_0,x_1,...,x_{i-1},x_i,x_{i+1},...,x_{n-1})` and axis=i, + then :attr:`index` must be an n-dimensional tensor with size + :math:`(x_0,x_1,...,x_{i-1},y,x_{i+1},...,x_{n-1})` where :math:`y\ge 1` and + output will have the same size as :attr:`index`. + + + :param inp: the source tensor + :param axis: the axis along which to index + :param index: the indices of elements to gather + + Examples: + + .. 
testcode:: + + import megengine.functional as F + from megengine import tensor + + inp = tensor([ + [1,2], [3,4], [5,6], + ]) + index = tensor([[0,2], [1,0]]) + oup = F.gather(inp, 0, index) + print(oup.numpy()) + + Outputs: + + .. testoutput:: + + [[1 6] + [3 2]] + + """ + input_shape = inp.shape + index_shape = index.shape + input_dims = len(input_shape) + index_dims = len(index_shape) + if input_dims != index_dims: + raise ValueError( + "The index tensor must have same dimensions as input tensor, " + "But the input dims:{}, the index dims:{}".format(input_dims, index_dims) + ) + + if axis < 0 or axis >= input_dims: + raise ValueError( + "Index axis {} is output of bounds, should in range [0 {})".format( + axis, input_dims + ) + ) + + for i in range(input_dims): + if i != axis and input_shape[i] != index_shape[i]: + raise ValueError( + "The input {} and index {} must have the same size apart from axis {}".format( + input_shape, index_shape, axis + ) + ) + + idx = _get_idx(index, axis) + return inp[idx].reshape(index.shape) # pylint: disable=no-member + + +def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor: + r""" + Writes all values from the tensor :attr:`source` into :attr:`inp` at the indices specified in the :attr:`index` tensor. + + For each value in :attr:`source`, its output index is specified by its index + in :attr:`source` for ``axis != dimension`` and by the corresponding value in + :attr:`index` for ``axis = dimension``. + + For a 3-D tensor, :attr:`inp` is updated as:: + + inp[index[i][j][k]][j][k] = source[i][j][k] # if axis == 0 + inp[i][index[i][j][k]][k] = source[i][j][k] # if axis == 1 + inp[i][j][index[i][j][k]] = source[i][j][k] # if axis == 2 + + :attr:`inp`, :attr:`index` and :attr:`source` should have same number of dimensions. + + It is also required that ``source.shape(d) <= inp.shape(d)`` and ``index.shape(d) == source.shape(d)`` + for all dimensions ``d``. + + Moreover, the values of :attr:`index` must be between ``0`` and ``inp.shape(axis) - 1`` inclusive. + + .. note:: + Please notice that, due to performance issues, the result is uncertain on the GPU device + if scatter difference positions from source to the same destination position + regard to index tensor. + + Show the case using the following examples, the oup[0][2] is maybe + from source[0][2] which value is 0.2256 or source[1][2] which value is 0.5339 + if set the index[1][2] from 1 to 0. + + :param inp: the inp tensor which to be scattered + :param axis: the axis along which to index + :param index: the indices of elements to scatter + :param source: the source element(s) to scatter + + Examples: + + .. testcode:: + + import numpy as np + import megengine.functional as F + from megengine import tensor + + inp = tensor(np.zeros(shape=(3,5),dtype=np.float32)) + source = tensor([[0.9935,0.9465,0.2256,0.8926,0.4396],[0.7723,0.0718,0.5939,0.357,0.4576]]) + index = tensor([[0,2,0,2,1],[2,0,1,1,2]]) + oup = F.scatter(inp, 0, index,source) + print(oup.numpy()) + + Outputs: + + .. testoutput:: + + [[0.9935 0.0718 0.2256 0. 0. ] + [0. 0. 0.5939 0.357 0.4396] + [0.7723 0.9465 0. 
0.8926 0.4576]] + + """ + input_shape = inp.shape + index_shape = index.shape + source_shape = source.shape + input_dims = len(input_shape) + index_dims = len(index_shape) + source_dims = len(source_shape) + + if input_dims != index_dims or input_dims != source_dims: + raise ValueError("The input, source and index tensor must have same dimensions") + + if axis < 0 or axis >= input_dims: + raise ValueError( + "Index axis {} is output of bounds, should in range [0 {})".format( + axis, input_dims + ) + ) + + for i in range(source_dims): + if source_shape[i] > input_shape[i]: + raise ValueError( + "The each shape size for source {} must be less than or equal to input {} ".format( + source_shape, input_shape + ) + ) + + for i in range(index_dims): + if index_shape[i] != source_shape[i]: + raise ValueError( + "The each shape size for index {} must be equal to source {} ".format( + index_shape, source_shape + ) + ) + + for i in range(index_dims): + if i != axis and index_shape[i] > input_shape[i]: + raise ValueError( + "The index {} must be less than or equal to input {} size apart from axis {}".format( + index_shape, input_shape, axis + ) + ) + + idx = _get_idx(index, axis) + inp[idx] = source.flatten() + return inp + + +def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor: + r""" + Select elements either from Tensor x or Tensor y, according to mask. + + .. math:: + + \textrm{out}_i = x_i \textrm{ if } \textrm{mask}_i \textrm{ is True else } y_i + + :param mask: a mask used for choosing x or y + :param x: the first choice + :param y: the second choice + + Examples: + + .. testcode:: + + from megengine import tensor + import megengine.functional as F + mask = tensor(np.array([[1, 0], [0, 1]], dtype=np.int32)) + x = tensor(np.array([[1, np.inf], [np.nan, 4]], + dtype=np.float32)) + y = tensor(np.array([[5, 6], [7, 8]], dtype=np.float32)) + out = F.where(mask, x, y) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [[1. 6.] + [7. 4.]] + """ + raise NotImplementedError + # v0, index0 = mgb.opr.cond_take( + # x, mask, mode=P.CondTake.Mode.EQ, val=1 + # ) + # v1, index1 = mgb.opr.cond_take( + # y, mask, mode=P.CondTake.Mode.EQ, val=0 + # ) + # out = x.flatten() + # index = mgb.opr.concat(index0, index1, axis=0) + # v = mgb.opr.concat(v0, v1, axis=0) + # out = mgb.opr.set_advanced_indexing(out, v)[index] + # out = out.reshape(x.shape) + # return out + + +def cond_take(mask: Tensor, x: Tensor) -> Tensor: + r""" + Take elements from data if specific condition is satisfied on mask. This operator has two outputs: the first is the elements taken, and the second is the indices corresponding to those elements; they are both 1-dimensional. High-dimension input would first be flattened. + + :param mask: condition param; must be the same shape with data + :param x: input tensor from which to take elements + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + mask = tensor(np.array([[True, False], [False, True]], dtype=np.bool_)) + x = tensor(np.array([[1, np.inf], [np.nan, 4]], + dtype=np.float32)) + v, index = F.cond_take(mask, x) + print(v.numpy(), index.numpy()) + + Outputs: + + .. testoutput:: + + Tensor([1. 
4.]) Tensor([0 3], dtype=int32) + + """ + if not isinstance(x, (TensorWrapperBase, TensorBase)): + raise TypeError("input must be a tensor") + if not isinstance(mask, (TensorWrapperBase, TensorBase)): + raise TypeError("mask must be a tensor") + if mask.dtype != np.bool_: + raise ValueError("mask must be bool") + if x.device != mask.device: + raise ValueError("ambiguous device: {} vs {}".format(x.device, mask.device)) + + op = builtin.CondTake() + v, index = apply(op, x, mask) + return v, index + + +def dimshuffle(inp: Tensor, pattern: Iterable[int]) -> Tensor: + r""" + Swap shapes and strides according to given pattern + + :param inp: Input tensor + :param pattern: a list of integers including 0, 1, ... , ``ndim``-1, and any number of ``'x'`` char in dimensions where this tensor should be broadcasted. For examples: + + * (``'x'``) -> make a 0d (scalar) into a 1d vector + * (0, 1) -> identity for 2d vectors + * (1, 0) -> inverts the first and second dimensions + * (``'x'``, 0) -> make a row out of a 1d vector (N to 1xN) + * (0, ``'x'``) -> make a column out of a 1d vector (N to Nx1) + * (2, 0, 1) -> AxBxC to CxAxB + * (0, ``'x'``, 1) -> AxB to Ax1xB + * (1, ``'x'``, 0) -> AxB to Bx1xA + * (1,) -> This remove dimensions 0. It must be a broadcastable dimension (1xA to A) + + :return: The output tensor + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + x = tensor(np.array([[1, 1], [0, 0]], dtype=np.int32)) + out = F.dimshuffle(x, (1, 0)) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [[1 0] + [1 0]] + + """ + op = builtin.Dimshuffle(pattern) + (inp,) = convert_inputs(inp) + (result,) = apply(op, inp) + return result + + +def reshape(inp: Tensor, target_shape: Iterable[int]) -> Tensor: + r""" + Reshape a tensor to given target shape; total number of logical elements must + remain unchanged + + :param inp: Input tensor + :param target_shape: target shape, the components would be concatenated to form the + target shape, and it can contain an element of -1 representing unspec_axis. + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + x = tensor(np.arange(12, dtype=np.int32)) + out = F.reshape(x, (3, 2, 2)) + print(out.numpy()) + + Outputs: + + .. testoutput:: + + [[[ 0 1] + [ 2 3]] + + [[ 4 5] + [ 6 7]] + + [[ 8 9] + [10 11]]] + + """ + if isinstance(target_shape, (TensorBase, TensorWrapperBase)): + target_shape = target_shape.numpy() + target_shape = tuple(map(int, target_shape)) + unspec_axis = None + for i, s in enumerate(target_shape): + if s < 0: + if s != -1: + raise ValueError("expect shape[{}] >= -1, got {}".format(i, s)) + if unspec_axis is not None: + raise ValueError("multiple -1 in shape: {} & {}".format(unspec_axis, i)) + unspec_axis = i + + # TODO: device should be None (cpu) + (target_shape,) = Const(target_shape, dtype="int32", device=inp.device)(inp) + if unspec_axis is None: + op = builtin.Reshape() + else: + op = builtin.Reshape(unspec_axis=unspec_axis) + (x,) = apply(op, inp, target_shape) + return x + + +transpose = dimshuffle + + +AxisAddRemove = builtin.AxisAddRemove +AxisDesc = AxisAddRemove.AxisDesc + + +def add_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor: + r""" + Add dimension before given axis. + + :param inp: Input tensor + :param axis: Place of new axes + :return: The output tensor + + Examples: + + .. 
testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+        x = tensor([1, 2])
+        out = F.add_axis(x, 0)
+        print(out.shape)
+
+    Outputs:
+
+    .. testoutput::
+
+        (1, 2)
+
+    """
+    Param = AxisAddRemove.Param
+
+    def get_axes():
+        try:
+            return [int(axis)]
+        except (TypeError, ValueError):
+            pass
+        return list(map(int, axis))
+
+    axis = get_axes()
+    ndim = inp.ndim + len(axis)
+    axis = sorted(i + ndim if i < 0 else i for i in axis)
+
+    param = Param(*map(AxisDesc.make_add, axis))
+    op = AxisAddRemove(param=param)
+    (result,) = apply(op, inp)
+    return result
+
+
+expand_dims = add_axis
+
+
+def remove_axis(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
+    r"""
+    Removes dimensions of shape 1.
+
+    :param inp: Input tensor
+    :param axis: Place of axis to be removed
+    :return: The output tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        from megengine import tensor
+        import megengine.functional as F
+        x = tensor(np.array([1, 2], dtype=np.int32).reshape(1, 1, 2, 1))
+        out = F.remove_axis(x, 3)
+        print(out.shape)
+
+    Outputs:
+
+    .. testoutput::
+
+        (1, 1, 2)
+
+    """
+    Param = AxisAddRemove.Param
+
+    def get_axes():
+        if axis is None:
+            return [i for i, s in enumerate(inp.shape) if s == 1]
+        try:
+            return [int(axis)]
+        except (TypeError, ValueError):
+            pass
+        return list(map(int, axis))
+
+    axis = get_axes()
+    axis = sorted(i + inp.ndim if i < 0 else i for i in axis)
+    axis = [a - i for i, a in enumerate(axis)]
+
+    param = Param(*map(AxisDesc.make_remove, axis))
+    op = AxisAddRemove(param=param)
+    (result,) = apply(op, inp)
+    return result
+
+
+squeeze = remove_axis
+
+
+def linspace(
+    start: Union[int, float, Tensor],
+    stop: Union[int, float, Tensor],
+    num: Union[int, Tensor],
+    dtype="float32",
+    device: Optional[CompNode] = None,
+) -> Tensor:
+    r"""
+    Returns equally spaced numbers over a specified interval
+
+    :param start: starting value of the sequence, should be scalar
+    :param stop: the last value of the sequence, should be scalar
+    :param num: number of values to generate
+    :param dtype: result data type
+    :return: The generated tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        import megengine.functional as F
+
+        a = F.linspace(3,10,5)
+        print(a.numpy())
+
+    .. testoutput::
+
+        [ 3.    4.75  6.5   8.25 10.  ]
+
+    """
+    start = Tensor(start, device=device)
+    stop = Tensor(stop, device=device)
+    num = Tensor(num, device=device)
+
+    device = device if device is None else device.to_c()
+    op = builtin.Linspace(comp_node=device)
+    (result,) = apply(op, start, stop, num)
+    if np.dtype(dtype) == np.int32:
+        return result.astype(dtype)
+    return result
+
+
+def arange(
+    start: Union[int, float, Tensor],
+    end: Union[int, float, Tensor],
+    step: Union[int, float, Tensor] = 1,
+    dtype="float32",
+    device: Optional[CompNode] = None,
+) -> Tensor:
+    r"""
+    Returns a Tensor with values from `start` to `end` with adjacent interval `step`
+
+    :param start: starting value of the sequence, should be scalar
+    :param end: ending value of the sequence, should be scalar
+    :param step: the gap between each pair of adjacent values. Default 1
+    :param dtype: result data type
+    :return: The generated tensor
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        import megengine.functional as F
+
+        a = F.arange(1, 5, 1)
+        print(a.numpy())
+
+    .. testoutput::
+
+        [1. 2. 3. 4.]
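The body below reduces ``arange`` to ``linspace``: it derives the element count with ``ceil((end - start) / step)`` and the matching stop value, then samples evenly. A plain-Python rendering of that arithmetic (a sketch of the reduction, not the tensor code itself):

```python
import math

def arange_via_linspace(start, end, step=1):
    # element count and final sample, as computed in arange() below
    num = math.ceil((end - start) / step)
    stop = start + step * (num - 1)
    if num <= 1:
        return [float(start)][:num]
    # linspace(start, stop, num) then yields start, start + step, ..., stop
    return [start + (stop - start) * i / (num - 1) for i in range(num)]

print(arange_via_linspace(1, 5))  # [1.0, 2.0, 3.0, 4.0]
```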
+ + """ + if isinstance(start, Tensor): + start = start.astype("float32") + if isinstance(end, Tensor): + end = end.astype("float32") + if isinstance(step, Tensor): + step = step.astype("float32") + num = ceil(Tensor((end - start) / step, device=device)) + stop = start + step * (num - 1) + result = linspace(start, stop, num, device=device) + if np.dtype(dtype) == np.int32: + return result.astype(dtype) + return result + + +def param_pack_split(inp: Tensor, offsets: List, shapes: List) -> Tensor: + op = builtin.ParamPackSplit() + op.offsets = offsets + op.shapes = shapes + return apply(op, inp) + + +def param_pack_concat(inps: List, offsets: Tensor, offsets_val: List) -> Tensor: + op = builtin.ParamPackConcat() + op.offsets = offsets_val + return apply(op, *inps, offsets)[0] diff --git a/imperative/python/megengine/functional/types.py b/imperative/python/megengine/functional/types.py new file mode 100644 index 0000000000000000000000000000000000000000..465ca03ce68f02d3944ddb87f5b0d4abde5ef9f9 --- /dev/null +++ b/imperative/python/megengine/functional/types.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import collections +import functools + + +def get_ndtuple(value, *, n, allow_zero=True): + r"""Converts possibly 1D tuple to nd tuple + + :type allow_zero: bool + :param allow_zero: whether to allow zero tuple value""" + if not isinstance(value, collections.Iterable): + value = int(value) + value = tuple([value for i in range(n)]) + else: + assert len(value) == n, "tuple len is not equal to n: {}".format(value) + spatial_axis = map(int, value) + value = tuple(spatial_axis) + if allow_zero: + minv = 0 + else: + minv = 1 + assert min(value) >= minv, "invalid value: {}".format(value) + return value + + +_single = functools.partial(get_ndtuple, n=1, allow_zero=True) +_pair = functools.partial(get_ndtuple, n=2, allow_zero=True) +_pair_nonzero = functools.partial(get_ndtuple, n=2, allow_zero=False) +_triple = functools.partial(get_ndtuple, n=3, allow_zero=True) +_quadruple = functools.partial(get_ndtuple, n=4, allow_zero=True) diff --git a/imperative/python/megengine/functional/utils.py b/imperative/python/megengine/functional/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..85c85d43a84c9cb6caf4e7009abac9414b42c8dc --- /dev/null +++ b/imperative/python/megengine/functional/utils.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import collections +from typing import Iterable, Union + +import numpy as np + +from ..core.ops.builtin import Copy +from ..core.tensor import Tensor +from ..core.tensor.core import apply +from .math import topk as _topk +from .tensor import dimshuffle as _dimshuffle + + +def accuracy( + logits: Tensor, target: Tensor, topk: Union[int, Iterable[int]] = 1 +) -> Union[Tensor, Iterable[Tensor]]: + r""" + Calculate the classification accuracy given predicted logits and ground-truth labels. + + :param logits: Model predictions of shape [batch_size, num_classes], + representing the probability (likelyhood) of each class. + :param target: Ground-truth labels, 1d tensor of int32 + :param topk: Specifies the topk values, could be an int or tuple of ints. Default: 1 + :return: Tensor(s) of classification accuracy between 0.0 and 1.0 + + Examples: + + .. testcode:: + + import numpy as np + from megengine import tensor + import megengine.functional as F + + logits = tensor(np.arange(80, dtype=np.int32).reshape(8,10)) + target = tensor(np.arange(8, dtype=np.int32)) + top1, top5 = F.accuracy(logits, target, (1, 5)) + print(top1.numpy(), top5.numpy()) + + Outputs: + + .. testoutput:: + + [0.] [0.375] + """ + if isinstance(topk, int): + topk = (topk,) + _, pred = _topk(logits, k=max(topk), descending=True) + accs = [] + for k in topk: + correct = pred[:, :k].detach() == _dimshuffle(target, (0, "x")).broadcast( + target.shape[0], k + ) + accs.append(correct.astype(np.float32).sum() / target.shape[0]) + if len(topk) == 1: # type: ignore[arg-type] + accs = accs[0] + return accs + + +def zero_grad(inp: Tensor) -> Tensor: + r""" + Returns a tensor which is treated as constant during backward gradient calcuation, + i.e. its gradient is zero. + + :param inp: Input tensor. + + See implementation of :func:`~.softmax` for example. + """ + print("zero_grad is obsoleted, please use detach instead") + raise NotImplementedError + + +def copy(inp, cn): + return apply(Copy(comp_node=cn), inp)[0] diff --git a/imperative/python/megengine/hub/__init__.py b/imperative/python/megengine/hub/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f07c3979919e4f2bf2dd6efbbdb19dec7e3bb294 --- /dev/null +++ b/imperative/python/megengine/hub/__init__.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .hub import ( + help, + import_module, + list, + load, + load_serialized_obj_from_url, + pretrained, +) diff --git a/imperative/python/megengine/hub/const.py b/imperative/python/megengine/hub/const.py new file mode 100644 index 0000000000000000000000000000000000000000..5f53420bed8aa290f29ea958e3129c80e779a388 --- /dev/null +++ b/imperative/python/megengine/hub/const.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
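The constants below parameterize the hub cache location; ``_get_megengine_home`` in ``hub.py`` resolves it as ``$MGE_HOME`` if set, otherwise ``$XDG_CACHE_HOME/megengine``, otherwise ``~/.cache/megengine``. A standalone sketch of that lookup:

```python
import os

def resolve_cache_root():
    # MGE_HOME wins outright; otherwise fall back to the XDG cache dir,
    # defaulting to ~/.cache, with "megengine" appended
    xdg = os.getenv("XDG_CACHE_HOME", "~/.cache")
    return os.path.expanduser(os.getenv("MGE_HOME", os.path.join(xdg, "megengine")))

print(resolve_cache_root())  # e.g. /home/alice/.cache/megengine
```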
+DEFAULT_BRANCH_NAME = "master" +HUBCONF = "hubconf.py" +HUBDEPENDENCY = "dependencies" +DEFAULT_GIT_HOST = "github.com" +ENV_MGE_HOME = "MGE_HOME" +ENV_XDG_CACHE_HOME = "XDG_CACHE_HOME" +DEFAULT_CACHE_DIR = "~/.cache" +DEFAULT_PROTOCOL = "HTTPS" +HTTP_READ_TIMEOUT = 120 diff --git a/imperative/python/megengine/hub/exceptions.py b/imperative/python/megengine/hub/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..aab0a13452736d4d14d6e36141bedee68b5b16b4 --- /dev/null +++ b/imperative/python/megengine/hub/exceptions.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +class FetcherError(Exception): + """Base class for fetch related error.""" + + +class InvalidRepo(FetcherError): + """The repo provided was somehow invalid.""" + + +class InvalidGitHost(FetcherError): + """The git host provided was somehow invalid.""" + + +class GitPullError(FetcherError): + """A git pull error occurred""" + + +class GitCheckoutError(FetcherError): + """A git checkout error occurred""" + + +class InvalidProtocol(FetcherError): + """The protocol provided was somehow invalid""" diff --git a/imperative/python/megengine/hub/fetcher.py b/imperative/python/megengine/hub/fetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..4f60b3ceeed1409a8b7f04ac31436bf3654749c6 --- /dev/null +++ b/imperative/python/megengine/hub/fetcher.py @@ -0,0 +1,300 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
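The ``FetcherError`` hierarchy above gives callers a single base class to catch; a minimal defensive-use sketch (the repo name and entry point are hypothetical):

```python
from megengine import hub
from megengine.hub.exceptions import FetcherError, GitPullError

try:
    # hypothetical repo and entry point, fetched over the default protocol
    net = hub.load("some_owner/some_models", "resnet18_entry")
except GitPullError:
    # clone failed: check network access / SSH credentials, or retry
    raise
except FetcherError as exc:
    print("hub fetch failed:", exc)
```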
+import hashlib +import os +import re +import shutil +import subprocess +from tempfile import NamedTemporaryFile +from typing import Tuple +from zipfile import ZipFile + +import requests +from tqdm import tqdm + +from megengine.utils.http_download import ( + CHUNK_SIZE, + HTTP_CONNECTION_TIMEOUT, + HTTPDownloadError, +) + +from ..distributed import is_distributed, synchronized +from ..logger import get_logger +from .const import DEFAULT_BRANCH_NAME, HTTP_READ_TIMEOUT +from .exceptions import GitCheckoutError, GitPullError, InvalidGitHost, InvalidRepo +from .tools import cd + +logger = get_logger(__name__) + +HTTP_TIMEOUT = (HTTP_CONNECTION_TIMEOUT, HTTP_READ_TIMEOUT) + +pattern = re.compile( + r"^(?:[a-z0-9]" # First character of the domain + r"(?:[a-z0-9-_]{0,61}[a-z0-9])?\.)" # Sub domain + hostname + r"+[a-z0-9][a-z0-9-_]{0,61}" # First 61 characters of the gTLD + r"[a-z]$" # Last character of the gTLD +) + + +class RepoFetcherBase: + @classmethod + def fetch( + cls, + git_host: str, + repo_info: str, + use_cache: bool = False, + commit: str = None, + silent: bool = True, + ) -> str: + raise NotImplementedError() + + @classmethod + def _parse_repo_info(cls, repo_info: str) -> Tuple[str, str, str]: + try: + branch_info = DEFAULT_BRANCH_NAME + if ":" in repo_info: + prefix_info, branch_info = repo_info.split(":") + else: + prefix_info = repo_info + repo_owner, repo_name = prefix_info.split("/") + return repo_owner, repo_name, branch_info + except ValueError: + raise InvalidRepo("repo_info: '{}' is invalid.".format(repo_info)) + + @classmethod + def _check_git_host(cls, git_host): + return cls._is_valid_domain(git_host) or cls._is_valid_host(git_host) + + @classmethod + def _is_valid_domain(cls, s): + try: + return pattern.match(s.encode("idna").decode("ascii")) + except UnicodeError: + return False + + @classmethod + def _is_valid_host(cls, s): + nums = s.split(".") + if len(nums) != 4 or any(not _.isdigit() for _ in nums): + return False + return all(0 <= int(_) < 256 for _ in nums) + + @classmethod + def _gen_repo_dir(cls, repo_dir: str) -> str: + return hashlib.sha1(repo_dir.encode()).hexdigest()[:16] + + +class GitSSHFetcher(RepoFetcherBase): + @classmethod + @synchronized + def fetch( + cls, + git_host: str, + repo_info: str, + use_cache: bool = False, + commit: str = None, + silent: bool = True, + ) -> str: + """ + Fetches git repo by SSH protocol + + :param git_host: + host address of git repo. + example: github.com + :param repo_info: + a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. 
+ example: ``"brain_sdk/MegBrain[:hub]"`` + :param use_cache: + whether to use locally fetched code or completely re-fetch + :param commit: + commit id on github or gitlab + :param silent: + whether to accept the stdout and stderr of the subprocess with PIPE, instead of + displaying on the screen + :return: + directory where the repo code is stored + """ + if not cls._check_git_host(git_host): + raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host)) + + repo_owner, repo_name, branch_info = cls._parse_repo_info(repo_info) + normalized_branch_info = branch_info.replace("/", "_") + repo_dir_raw = "{}_{}_{}".format( + repo_owner, repo_name, normalized_branch_info + ) + ("_{}".format(commit) if commit else "") + repo_dir = cls._gen_repo_dir(repo_dir_raw) + git_url = "git@{}:{}/{}.git".format(git_host, repo_owner, repo_name) + + if use_cache and os.path.exists(repo_dir): # use cache + logger.debug("Cache Found in %s", repo_dir) + return repo_dir + + if is_distributed(): + logger.warning( + "When using `hub.load` or `hub.list` to fetch git repositories\n" + " in DISTRIBUTED mode for the first time, processes are synchronized to\n" + " ensure that target repository is ready to use for each process.\n" + " Users are expected to see this warning no more than ONCE, otherwise\n" + " (very little chance) you may need to remove corrupt cache\n" + " `%s` and fetch again.", + repo_dir, + ) + + shutil.rmtree(repo_dir, ignore_errors=True) # ignore and clear cache + + logger.debug( + "Git Clone from Repo:%s Branch: %s to %s", + git_url, + normalized_branch_info, + repo_dir, + ) + + kwargs = ( + {"stderr": subprocess.PIPE, "stdout": subprocess.PIPE} if silent else {} + ) + if commit is None: + # shallow clone repo by branch/tag + p = subprocess.Popen( + [ + "git", + "clone", + "-b", + normalized_branch_info, + git_url, + repo_dir, + "--depth=1", + ], + **kwargs, + ) + cls._check_clone_pipe(p) + else: + # clone repo and checkout to commit_id + p = subprocess.Popen(["git", "clone", git_url, repo_dir], **kwargs) + cls._check_clone_pipe(p) + + with cd(repo_dir): + logger.debug("git checkout to %s", commit) + p = subprocess.Popen(["git", "checkout", commit], **kwargs) + _, err = p.communicate() + if p.returncode: + shutil.rmtree(repo_dir, ignore_errors=True) + raise GitCheckoutError( + "Git checkout error, please check the commit id.\n" + + err.decode() + ) + with cd(repo_dir): + shutil.rmtree(".git") + + return repo_dir + + @classmethod + def _check_clone_pipe(cls, p): + _, err = p.communicate() + if p.returncode: + raise GitPullError( + "Repo pull error, please check repo info.\n" + err.decode() + ) + + +class GitHTTPSFetcher(RepoFetcherBase): + @classmethod + @synchronized + def fetch( + cls, + git_host: str, + repo_info: str, + use_cache: bool = False, + commit: str = None, + silent: bool = True, + ) -> str: + """ + Fetches git repo by HTTPS protocol + + :param git_host: + host address of git repo + example: github.com + :param repo_info: + a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. 
+            example: ``"brain_sdk/MegBrain[:hub]"``
+        :param use_cache:
+            whether to use locally cached code or completely re-fetch
+        :param commit:
+            commit id on github or gitlab
+        :param silent:
+            whether to accept the stdout and stderr of the subprocess with PIPE, instead of
+            displaying on the screen
+        :return:
+            directory where the repo code is stored
+        """
+        if not cls._check_git_host(git_host):
+            raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host))
+
+        repo_owner, repo_name, branch_info = cls._parse_repo_info(repo_info)
+        normalized_branch_info = branch_info.replace("/", "_")
+        repo_dir_raw = "{}_{}_{}".format(
+            repo_owner, repo_name, normalized_branch_info
+        ) + ("_{}".format(commit) if commit else "")
+        repo_dir = cls._gen_repo_dir(repo_dir_raw)
+        archive_url = cls._git_archive_link(
+            git_host, repo_owner, repo_name, branch_info, commit
+        )
+
+        if use_cache and os.path.exists(repo_dir):  # use cache
+            logger.debug("Cache Found in %s", repo_dir)
+            return repo_dir
+
+        if is_distributed():
+            logger.warning(
+                "When using `hub.load` or `hub.list` to fetch git repositories "
+                "in DISTRIBUTED mode for the first time, processes are synchronized to "
+                "ensure that target repository is ready to use for each process.\n"
+                "Users are expected to see this warning no more than ONCE, otherwise "
+                "(very little chance) you may need to remove corrupt hub cache %s and fetch again.",
+                repo_dir,
+            )
+
+        shutil.rmtree(repo_dir, ignore_errors=True)  # ignore and clear cache
+
+        logger.debug("Downloading from %s to %s", archive_url, repo_dir)
+        cls._download_zip_and_extract(archive_url, repo_dir)
+
+        return repo_dir
+
+    @classmethod
+    def _download_zip_and_extract(cls, url, target_dir):
+        resp = requests.get(url, timeout=HTTP_TIMEOUT, stream=True)
+        if resp.status_code != 200:
+            raise HTTPDownloadError(
+                "An error occurred when downloading from {}".format(url)
+            )
+
+        total_size = int(resp.headers.get("Content-Length", 0))
+        _bar = tqdm(total=total_size, unit="iB", unit_scale=True)
+
+        with NamedTemporaryFile("w+b") as f:
+            for chunk in resp.iter_content(CHUNK_SIZE):
+                if not chunk:
+                    break
+                _bar.update(len(chunk))
+                f.write(chunk)
+            _bar.close()
+            f.seek(0)
+            with ZipFile(f) as temp_zip_f:
+                zip_dir_name = temp_zip_f.namelist()[0].split("/")[0]
+                temp_zip_f.extractall(".")
+            shutil.move(zip_dir_name, target_dir)
+
+    @classmethod
+    def _git_archive_link(cls, git_host, repo_owner, repo_name, branch_info, commit):
+        archive_link = "https://{}/{}/{}/archive/{}.zip".format(
+            git_host, repo_owner, repo_name, commit or branch_info
+        )
+
+        return archive_link
diff --git a/imperative/python/megengine/hub/hub.py b/imperative/python/megengine/hub/hub.py
new file mode 100644
index 0000000000000000000000000000000000000000..139256e9f111184ca798bb136527e841620142bf
--- /dev/null
+++ b/imperative/python/megengine/hub/hub.py
@@ -0,0 +1,333 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
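A quick aside before hub.py: the cache-directory and archive-URL derivation in the fetchers above is easy to get wrong when debugging cache misses. The following standalone sketch (a hypothetical helper, not part of this diff) mirrors `_parse_repo_info`, `_gen_repo_dir`, and `_git_archive_link`, assuming the default branch is ``master`` as the docstrings state:

```python
import hashlib

def archive_url_and_cache_dir(git_host, repo_info, commit=None):
    # Mirrors RepoFetcherBase._parse_repo_info: "owner/name[:branch]",
    # defaulting to the "master" branch when no tag/branch is given.
    prefix, _, branch = repo_info.partition(":")
    branch = branch or "master"
    owner, name = prefix.split("/")
    # Mirrors the repo_dir derivation used by both fetchers:
    # sha1("{owner}_{name}_{branch}[_{commit}]") truncated to 16 hex chars.
    raw = "{}_{}_{}".format(owner, name, branch.replace("/", "_"))
    if commit:
        raw += "_{}".format(commit)
    cache_dir = hashlib.sha1(raw.encode()).hexdigest()[:16]
    # Mirrors GitHTTPSFetcher._git_archive_link: an explicit commit wins over the branch.
    url = "https://{}/{}/{}/archive/{}.zip".format(git_host, owner, name, commit or branch)
    return url, cache_dir

print(archive_url_and_cache_dir("github.com", "brain_sdk/MegBrain:hub"))
# ('https://github.com/brain_sdk/MegBrain/archive/hub.zip', '<16-char sha1 prefix>')
```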
+import functools +import hashlib +import os +import sys +import types +from typing import Any, List +from urllib.parse import urlparse + +from megengine.utils.http_download import download_from_url + +from ..distributed import is_distributed +from ..logger import get_logger +from ..serialization import load as _mge_load_serialized +from .const import ( + DEFAULT_CACHE_DIR, + DEFAULT_GIT_HOST, + DEFAULT_PROTOCOL, + ENV_MGE_HOME, + ENV_XDG_CACHE_HOME, + HTTP_READ_TIMEOUT, + HUBCONF, + HUBDEPENDENCY, +) +from .exceptions import InvalidProtocol +from .fetcher import GitHTTPSFetcher, GitSSHFetcher +from .tools import cd, check_module_exists, load_module + +logger = get_logger(__name__) + + +PROTOCOLS = { + "HTTPS": GitHTTPSFetcher, + "SSH": GitSSHFetcher, +} + + +def _get_megengine_home() -> str: + """MGE_HOME setting complies with the XDG Base Directory Specification + """ + megengine_home = os.path.expanduser( + os.getenv( + ENV_MGE_HOME, + os.path.join(os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), "megengine"), + ) + ) + return megengine_home + + +def _get_repo( + git_host: str, + repo_info: str, + use_cache: bool = False, + commit: str = None, + protocol: str = DEFAULT_PROTOCOL, +) -> str: + if protocol not in PROTOCOLS: + raise InvalidProtocol( + "Invalid protocol, the value should be one of {}.".format( + ", ".join(PROTOCOLS.keys()) + ) + ) + cache_dir = os.path.expanduser(os.path.join(_get_megengine_home(), "hub")) + with cd(cache_dir): + fetcher = PROTOCOLS[protocol] + repo_dir = fetcher.fetch(git_host, repo_info, use_cache, commit) + return os.path.join(cache_dir, repo_dir) + + +def _check_dependencies(module: types.ModuleType) -> None: + if not hasattr(module, HUBDEPENDENCY): + return + + dependencies = getattr(module, HUBDEPENDENCY) + if not dependencies: + return + + missing_deps = [m for m in dependencies if not check_module_exists(m)] + if len(missing_deps): + raise RuntimeError("Missing dependencies: {}".format(", ".join(missing_deps))) + + +def _init_hub( + repo_info: str, + git_host: str, + use_cache: bool = True, + commit: str = None, + protocol: str = DEFAULT_PROTOCOL, +): + """Imports hubmodule like python import + + :param repo_info: + a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. + Example: ``"brain_sdk/MegBrain[:hub]"`` + :param git_host: + host address of git repo + Example: github.com + :param use_cache: + whether to use locally cached code or completely re-fetch + :param commit: + commit id on github or gitlab + :param protocol: + which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. + The value should be one of HTTPS, SSH. 
+ :return: + hubconf.py as a python module + """ + cache_dir = os.path.expanduser(os.path.join(_get_megengine_home(), "hub")) + os.makedirs(cache_dir, exist_ok=True) + absolute_repo_dir = _get_repo( + git_host, repo_info, use_cache=use_cache, commit=commit, protocol=protocol + ) + sys.path.insert(0, absolute_repo_dir) + hubmodule = load_module(HUBCONF, os.path.join(absolute_repo_dir, HUBCONF)) + sys.path.remove(absolute_repo_dir) + + return hubmodule + + +@functools.wraps(_init_hub) +def import_module(*args, **kwargs): + return _init_hub(*args, **kwargs) + + +def list( + repo_info: str, + git_host: str = DEFAULT_GIT_HOST, + use_cache: bool = True, + commit: str = None, + protocol: str = DEFAULT_PROTOCOL, +) -> List[str]: + """Lists all entrypoints available in repo hubconf + + :param repo_info: + a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. + Example: ``"brain_sdk/MegBrain[:hub]"`` + :param git_host: + host address of git repo + Example: github.com + :param use_cache: + whether to use locally cached code or completely re-fetch + :param commit: + commit id on github or gitlab + :param protocol: + which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. + The value should be one of HTTPS, SSH. + :return: + all entrypoint names of the model + """ + hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) + + return [ + _ + for _ in dir(hubmodule) + if not _.startswith("__") and callable(getattr(hubmodule, _)) + ] + + +def load( + repo_info: str, + entry: str, + *args, + git_host: str = DEFAULT_GIT_HOST, + use_cache: bool = True, + commit: str = None, + protocol: str = DEFAULT_PROTOCOL, + **kwargs +) -> Any: + """Loads model from github or gitlab repo, with pretrained weights. + + :param repo_info: + a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. + Example: ``"brain_sdk/MegBrain[:hub]"`` + :param entry: + an entrypoint defined in hubconf + :param git_host: + host address of git repo + Example: github.com + :param use_cache: + whether to use locally cached code or completely re-fetch + :param commit: + commit id on github or gitlab + :param protocol: + which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. + The value should be one of HTTPS, SSH. + :return: + a single model with corresponding pretrained weights. + """ + hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) + + if not hasattr(hubmodule, entry) or not callable(getattr(hubmodule, entry)): + raise RuntimeError("Cannot find callable {} in hubconf.py".format(entry)) + + _check_dependencies(hubmodule) + + module = getattr(hubmodule, entry)(*args, **kwargs) + return module + + +def help( + repo_info: str, + entry: str, + git_host: str = DEFAULT_GIT_HOST, + use_cache: bool = True, + commit: str = None, + protocol: str = DEFAULT_PROTOCOL, +) -> str: + """This function returns docstring of entrypoint ``entry`` by following steps: + + 1. Pull the repo code specified by git and repo_info + 2. Load the entry defined in repo's hubconf.py + 3. Return docstring of function entry + + :param repo_info: + a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional + tag/branch. The default branch is ``master`` if not specified. 
+ Example: ``"brain_sdk/MegBrain[:hub]"`` + :param entry: + an entrypoint defined in hubconf.py + :param git_host: + host address of git repo + Example: github.com + :param use_cache: + whether to use locally cached code or completely re-fetch + :param commit: + commit id on github or gitlab + :param protocol: + which protocol to use to get the repo, and HTTPS protocol only supports public repo on github. + The value should be one of HTTPS, SSH. + :return: + docstring of entrypoint ``entry`` + """ + hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol) + + if not hasattr(hubmodule, entry) or not callable(getattr(hubmodule, entry)): + raise RuntimeError("Cannot find callable {} in hubconf.py".format(entry)) + + doc = getattr(hubmodule, entry).__doc__ + return doc + + +def load_serialized_obj_from_url(url: str, model_dir=None) -> Any: + """Loads MegEngine serialized object from the given URL. + + If the object is already present in ``model_dir``, it's deserialized and + returned. If no ``model_dir`` is specified, it will be ``MGE_HOME/serialized``. + + :param url: url to serialized object + :param model_dir: dir to cache target serialized file + + :return: loaded object + """ + if model_dir is None: + model_dir = os.path.join(_get_megengine_home(), "serialized") + os.makedirs(model_dir, exist_ok=True) + + parts = urlparse(url) + filename = os.path.basename(parts.path) + + # use hash as prefix to avoid filename conflict from different urls + sha256 = hashlib.sha256() + sha256.update(url.encode()) + digest = sha256.hexdigest()[:6] + filename = digest + "_" + filename + + cached_file = os.path.join(model_dir, filename) + logger.info( + "load_serialized_obj_from_url: download to or using cached %s", cached_file + ) + if not os.path.exists(cached_file): + if is_distributed(): + logger.warning( + "Downloading serialized object in DISTRIBUTED mode\n" + " File may be downloaded multiple times. We recommend\n" + " users to download in single process first." + ) + download_from_url(url, cached_file, HTTP_READ_TIMEOUT) + + state_dict = _mge_load_serialized(cached_file) + return state_dict + + +class pretrained: + r""" + Decorator which helps to download pretrained weights from the given url. + + For example, we can decorate a resnet18 function as follows + + .. code-block:: + + @hub.pretrained("https://url/to/pretrained_resnet18.pkl") + def resnet18(**kwargs): + return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) + + When decorated function is called with ``pretrained=True``, MegEngine will automatically + download and fill the returned model with pretrained weights. + """ + + def __init__(self, url): + self.url = url + + def __call__(self, func): + @functools.wraps(func) + def pretrained_model_func( + pretrained=False, **kwargs + ): # pylint: disable=redefined-outer-name + model = func(**kwargs) + if pretrained: + weights = load_serialized_obj_from_url(self.url) + model.load_state_dict(weights) + return model + + return pretrained_model_func + + +__all__ = [ + "list", + "load", + "help", + "load_serialized_obj_from_url", + "pretrained", + "import_module", +] diff --git a/imperative/python/megengine/hub/tools.py b/imperative/python/megengine/hub/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..0bf9c98c7b3cdc8958cd85a149df5a9158f83471 --- /dev/null +++ b/imperative/python/megengine/hub/tools.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. 
All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import importlib.util
+import os
+import types
+from contextlib import contextmanager
+from typing import Iterator
+
+
+def load_module(name: str, path: str) -> types.ModuleType:
+    """
+    Loads module specified by name and path
+
+    :param name: module name
+    :param path: module path
+    """
+    spec = importlib.util.spec_from_file_location(name, path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def check_module_exists(module: str) -> bool:
+    """Checks whether python module exists or not
+
+    :param module: name of module
+    """
+    return importlib.util.find_spec(module) is not None
+
+
+@contextmanager
+def cd(target: str) -> Iterator[None]:
+    """Changes current directory to target
+
+    :param target: target directory
+    """
+    prev = os.getcwd()
+    os.chdir(os.path.expanduser(target))
+    try:
+        yield
+    finally:
+        os.chdir(prev)
diff --git a/imperative/python/megengine/logger.py b/imperative/python/megengine/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e926ca3be8f9feb33fe93724e13d6432b0d2cda
--- /dev/null
+++ b/imperative/python/megengine/logger.py
@@ -0,0 +1,237 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import contextlib
+import logging
+import os
+import sys
+
+_all_loggers = []
+_default_level_name = os.getenv("MEGENGINE_LOGGING_LEVEL", "ERROR")
+_default_level = logging.getLevelName(_default_level_name.upper())
+
+
+def set_log_file(fout, mode="a"):
+    r"""Sets log output file.
+
+    :type fout: str or file-like
+    :param fout: file-like object that supports write and flush, or string for
+        the filename
+    :type mode: str
+    :param mode: specify the mode to open log file if *fout* is a string
+    """
+    if isinstance(fout, str):
+        fout = open(fout, mode)
+    MegEngineLogFormatter.log_fout = fout
+
+
+class MegEngineLogFormatter(logging.Formatter):
+    log_fout = None
+    date_full = "[%(asctime)s %(lineno)d@%(filename)s:%(name)s] "
+    date = "%(asctime)s "
+    msg = "%(message)s"
+    max_lines = 256
+
+    def _color_exc(self, msg):
+        r"""Sets the color of message as the exception type.
+        """
+        return "\x1b[34m{}\x1b[0m".format(msg)
+
+    def _color_dbg(self, msg):
+        r"""Sets the color of message as the debugging type.
+        """
+        return "\x1b[36m{}\x1b[0m".format(msg)
+
+    def _color_warn(self, msg):
+        r"""Sets the color of message as the warning type.
+        """
+        return "\x1b[1;31m{}\x1b[0m".format(msg)
+
+    def _color_err(self, msg):
+        r"""Sets the color of message as the error type.
+        """
+        return "\x1b[1;4;31m{}\x1b[0m".format(msg)
+
+    def _color_omitted(self, msg):
+        r"""Sets the color of message as the omitted type.
+        """
+        return "\x1b[35m{}\x1b[0m".format(msg)
+
+    def _color_normal(self, msg):
+        r"""Sets the color of message as the normal type.
+        """
+        return msg
+
+    def _color_date(self, msg):
+        r"""Sets the color of message the same as the date.
+ """ + return "\x1b[32m{}\x1b[0m".format(msg) + + def format(self, record): + if record.levelno == logging.DEBUG: + mcl, mtxt = self._color_dbg, "DBG" + elif record.levelno == logging.WARNING: + mcl, mtxt = self._color_warn, "WRN" + elif record.levelno == logging.ERROR: + mcl, mtxt = self._color_err, "ERR" + else: + mcl, mtxt = self._color_normal, "" + + if mtxt: + mtxt += " " + + if self.log_fout: + self.__set_fmt(self.date_full + mtxt + self.msg) + formatted = super(MegEngineLogFormatter, self).format(record) + nr_line = formatted.count("\n") + 1 + if nr_line >= self.max_lines: + head, body = formatted.split("\n", 1) + formatted = "\n".join( + [ + head, + "BEGIN_LONG_LOG_{}_LINES{{".format(nr_line - 1), + body, + "}}END_LONG_LOG_{}_LINES".format(nr_line - 1), + ] + ) + self.log_fout.write(formatted) + self.log_fout.write("\n") + self.log_fout.flush() + + self.__set_fmt(self._color_date(self.date) + mcl(mtxt + self.msg)) + formatted = super(MegEngineLogFormatter, self).format(record) + + if record.exc_text or record.exc_info: + # handle exception format + b = formatted.find("Traceback ") + if b != -1: + s = formatted[b:] + s = self._color_exc(" " + s.replace("\n", "\n ")) + formatted = formatted[:b] + s + + nr_line = formatted.count("\n") + 1 + if nr_line >= self.max_lines: + lines = formatted.split("\n") + remain = self.max_lines // 2 + removed = len(lines) - remain * 2 + if removed > 0: + mid_msg = self._color_omitted( + "[{} log lines omitted (would be written to output file " + "if set_log_file() has been called;\n" + " the threshold can be set at " + "MegEngineLogFormatter.max_lines)]".format(removed) + ) + formatted = "\n".join(lines[:remain] + [mid_msg] + lines[-remain:]) + + return formatted + + if sys.version_info.major < 3: + + def __set_fmt(self, fmt): + self._fmt = fmt + + else: + + def __set_fmt(self, fmt): + self._style._fmt = fmt + + +def get_logger(name=None, formatter=MegEngineLogFormatter): + r"""Gets megengine logger with given name. + """ + + logger = logging.getLogger(name) + if getattr(logger, "_init_done__", None): + return logger + logger._init_done__ = True + logger.propagate = False + logger.setLevel(_default_level) + handler = logging.StreamHandler() + handler.setFormatter(formatter(datefmt="%d %H:%M:%S")) + handler.setLevel(0) + del logger.handlers[:] + logger.addHandler(handler) + _all_loggers.append(logger) + return logger + + +def set_log_level(level, update_existing=True): + """Sets default logging level. + + :type level: int e.g. 
logging.INFO
+    :param level: logging level given by python :mod:`logging` module
+    :param update_existing: whether to update existing loggers
+    """
+    global _default_level  # pylint: disable=global-statement
+    _default_level = level
+    if update_existing:
+        for i in _all_loggers:
+            i.setLevel(level)
+
+
+_logger = get_logger(__name__)
+
+try:
+    if sys.version_info.major < 3:
+        raise ImportError()
+
+    from .core._imperative_rt.utils import Logger as _imperative_rt_logger
+
+    class MegBrainLogFormatter(MegEngineLogFormatter):
+        date = "%(asctime)s[mgb] "
+
+        def _color_date(self, msg):
+            return "\x1b[33m{}\x1b[0m".format(msg)
+
+    _megbrain_logger = get_logger("megbrain", MegBrainLogFormatter)
+    _imperative_rt_logger.set_log_handler(_megbrain_logger)
+    if _default_level == logging.getLevelName("ERROR"):
+        _imperative_rt_logger.set_log_level(_imperative_rt_logger.LogLevel.Error)
+    elif _default_level == logging.getLevelName("INFO"):
+        _imperative_rt_logger.set_log_level(_imperative_rt_logger.LogLevel.Info)
+    else:
+        _imperative_rt_logger.set_log_level(_imperative_rt_logger.LogLevel.Debug)
+
+    def set_mgb_log_level(level):
+        r"""Sets megbrain log level.
+
+        :type level: int e.g. logging.INFO
+        :param level: new log level
+        :return: original log level
+        """
+        logger = _megbrain_logger
+        rst = logger.getEffectiveLevel()
+        logger.setLevel(level)
+        return rst
+
+
+except ImportError as exc:
+
+    def set_mgb_log_level(level):
+        raise NotImplementedError("imperative_rt has not been imported")
+
+
+@contextlib.contextmanager
+def replace_mgb_log_level(level):
+    r"""Replaces megbrain log level in a block and restores it after exiting.
+
+    :type level: int e.g. logging.INFO
+    :param level: new log level
+    """
+    old = set_mgb_log_level(level)
+    try:
+        yield
+    finally:
+        set_mgb_log_level(old)
+
+
+def enable_debug_log():
+    r"""Sets logging level to debug for all components.
+    """
+    set_log_level(logging.DEBUG)
+    set_mgb_log_level(logging.DEBUG)
diff --git a/imperative/python/megengine/module/__init__.py b/imperative/python/megengine/module/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a10228f20eae774f3a09ab0288d39787963d581f
--- /dev/null
+++ b/imperative/python/megengine/module/__init__.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
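A short usage sketch of the logger module above (not part of the diff; it assumes `megengine` is importable, and the last block further assumes the `_imperative_rt` extension built earlier in this diff is present, since `set_mgb_log_level` raises `NotImplementedError` otherwise):

```python
import logging

from megengine.logger import (
    get_logger,
    replace_mgb_log_level,
    set_log_file,
    set_log_level,
)

logger = get_logger("my_experiment")  # colored stderr handler; level defaults to
                                      # MEGENGINE_LOGGING_LEVEL (ERROR if unset)
set_log_file("train.log")             # records are mirrored to this file; entries
                                      # longer than max_lines get BEGIN/END markers
set_log_level(logging.INFO)           # also updates loggers created earlier

logger.info("training started")

with replace_mgb_log_level(logging.DEBUG):
    pass  # the megbrain (C++) side logs at DEBUG only inside this block
```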
+ +from .activation import LeakyReLU, PReLU, ReLU, Sigmoid, Softmax +from .batchnorm import BatchNorm1d, BatchNorm2d, SyncBatchNorm +from .concat import Concat +from .conv import Conv2d, ConvRelu2d, ConvTranspose2d, LocalConv2d +from .conv_bn import ConvBn2d, ConvBnRelu2d +from .dropout import Dropout +from .elemwise import Elemwise +from .embedding import Embedding +from .identity import Identity +from .linear import Linear +from .module import Module +from .parampack import ParamPack +from .pooling import AvgPool2d, MaxPool2d +from .quant_dequant import DequantStub, QuantStub +from .sequential import Sequential diff --git a/imperative/python/megengine/module/activation.py b/imperative/python/megengine/module/activation.py new file mode 100644 index 0000000000000000000000000000000000000000..025844ed03ac6803d438f3686537e5f88e98641e --- /dev/null +++ b/imperative/python/megengine/module/activation.py @@ -0,0 +1,231 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np + +from ..functional import leaky_relu, prelu, relu, sigmoid, softmax +from ..tensor_nn import Parameter +from .module import Module + + +class Softmax(Module): + r""" + Applies a softmax function. Softmax is defined as: + + .. math:: + \text{Softmax}(x_{i}) = \frac{exp(x_i)}{\sum_j exp(x_j)} + + It is applied to an n-dimensional input Tensor and rescaling them so that the elements of the + n-dimensional output Tensor lie in the range of `[0, 1]` and sum to 1. + + :param axis: An axis along which softmax will be applied. By default, + softmax will apply along the highest ranked axis. + + Examples: + + .. testcode:: + + import numpy as np + import megengine as mge + import megengine.module as M + + data = mge.tensor(np.array([-2,-1,0,1,2]).astype(np.float32)) + softmax = M.Softmax() + output = softmax(data) + with np.printoptions(precision=6): + print(output.numpy()) + + Outputs: + + .. testoutput:: + + [0.011656 0.031685 0.086129 0.234122 0.636409] + + """ + + def __init__(self, axis=None): + super().__init__() + self.axis = axis + + def forward(self, inputs): + return softmax(inputs, self.axis) + + +class Sigmoid(Module): + r""" + Applies the element-wise function: + + .. math:: + \text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)} + + Examples: + + .. testcode:: + + import numpy as np + import megengine as mge + import megengine.module as M + + data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) + sigmoid = M.Sigmoid() + output = sigmoid(data) + with np.printoptions(precision=6): + print(output.numpy()) + + Outputs: + + .. testoutput:: + + [0.119203 0.268941 0.5 0.731059 0.880797] + + """ + + def forward(self, inputs): + return sigmoid(inputs) + + +class ReLU(Module): + r""" + Applies the element-wise function: + + .. math:: + \text{ReLU}(x) = \max(x, 0) + + Examples: + + .. testcode:: + + import numpy as np + import megengine as mge + import megengine.module as M + data = mge.tensor(np.array([-2,-1,0,1,2,]).astype(np.float32)) + relu = M.ReLU() + output = relu(data) + with np.printoptions(precision=6): + print(output.numpy()) + + Outputs: + + .. testoutput:: + + [0. 0. 0. 1. 2.] 
+
+    """
+
+    def forward(self, x):
+        return relu(x)
+
+
+class PReLU(Module):
+    r"""
+    Applies the element-wise function:
+
+    .. math::
+        \text{PReLU}(x) = \max(0,x) + a * \min(0,x)
+
+    or
+
+    .. math::
+        \text{PReLU}(x) =
+        \begin{cases}
+        x, & \text{ if } x \geq 0 \\
+        ax, & \text{ otherwise }
+        \end{cases}
+
+    Here :math:`a` is a learnable parameter. When called without arguments, `PReLU()` uses
+    a single parameter :math:`a` across all input channels. If called with `PReLU(num_of_channels)`,
+    a separate :math:`a` is used for each input channel.
+
+    :param num_parameters: number of :math:`a` to learn; only two
+        values are legitimate: 1, or the number of channels at input. Default: 1
+    :param init: the initial value of :math:`a`. Default: 0.25
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        import megengine as mge
+        import megengine.module as M
+        data = mge.tensor(np.array([-1.2, -3.7, 2.7]).astype(np.float32))
+        prelu = M.PReLU()
+        output = prelu(data)
+        print(output.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [-0.3 -0.925 2.7 ]
+
+    """
+
+    def __init__(self, num_parameters: int = 1, init: float = 0.25):
+        super().__init__()
+        self.num_parameters = num_parameters
+        if num_parameters > 1:
+            # Assume format is NCHW
+            self.weight = Parameter(
+                data=np.full((1, num_parameters, 1, 1), init, dtype=np.float32)
+            )
+        else:
+            self.weight = Parameter(data=[init])
+
+    def forward(self, inputs):
+        assert self.weight.shape == (1,) or self.weight.shape == (
+            1,
+            int(inputs.shape[1]),
+            1,
+            1,
+        ), "invalid weight's shape"
+        return prelu(inputs, self.weight)
+
+
+class LeakyReLU(Module):
+    r"""
+    Applies the element-wise function:
+
+    .. math::
+        \text{LeakyReLU}(x) = \max(0,x) + negative\_slope \times \min(0,x)
+
+    or
+
+    .. math::
+        \text{LeakyReLU}(x) =
+        \begin{cases}
+        x, & \text{ if } x \geq 0 \\
+        negative\_slope \times x, & \text{ otherwise }
+        \end{cases}
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        import megengine as mge
+        import megengine.module as M
+        data = mge.tensor(np.array([-8, -12, 6, 10]).astype(np.float32))
+
+        leakyrelu = M.LeakyReLU(0.01)
+        output = leakyrelu(data)
+        print(output.numpy())
+
+    Outputs:
+
+    .. testoutput::
+
+        [-0.08 -0.12 6. 10. ]
+
+    """
+
+    def __init__(self, negative_slope: float = 0.01):
+        super().__init__()
+        self.negative_slope = negative_slope
+
+    def forward(self, inputs):
+        return leaky_relu(inputs, self.negative_slope)
diff --git a/imperative/python/megengine/module/batchnorm.py b/imperative/python/megengine/module/batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac154e1cfdf1adfdcc68023a77f984e08b1bfcd5
--- /dev/null
+++ b/imperative/python/megengine/module/batchnorm.py
@@ -0,0 +1,281 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from typing import Optional
+
+import numpy as np
+
+from ..distributed.group import WORLD, Group
+from ..functional import batch_norm2d, sync_batch_norm
+from ..tensor_nn import Buffer, Parameter
+from .
import init +from .module import Module + + +class _BatchNorm(Module): + def __init__( + self, + num_features, + eps=1e-5, + momentum=0.9, + affine=True, + track_running_stats=True, + freeze=False, + ): + super(_BatchNorm, self).__init__() + self.num_features = num_features + self.eps = eps + self.momentum = momentum + self.affine = affine + self.track_running_stats = track_running_stats + self._track_running_stats_saved = track_running_stats + self.freeze = freeze + if self.affine: + self.weight = Parameter(np.ones(num_features, dtype=np.float32)) + self.bias = Parameter(np.zeros(num_features, dtype=np.float32)) + else: + self.weight = None + self.bias = None + + tshape = (1, self.num_features, 1, 1) + + if self.track_running_stats: + self.running_mean = Buffer(np.zeros(tshape, dtype=np.float32)) + self.running_var = Buffer(np.ones(tshape, dtype=np.float32)) + else: + self.running_mean = None + self.running_var = None + + def reset_running_stats(self) -> None: + if self.track_running_stats: + init.zeros_(self.running_mean) + init.ones_(self.running_var) + + def reset_parameters(self) -> None: + self.reset_running_stats() + if self.affine: + init.ones_(self.weight) + init.zeros_(self.bias) + + def _check_input_ndim(self, inp): + raise NotImplementedError + + def forward(self, inp): + self._check_input_ndim(inp) + if self._track_running_stats_saved == False: + assert ( + self.track_running_stats == False + ), "track_running_stats can not be initilized to False and changed to True later" + + _ndims = len(inp.shape) + if _ndims != 4: + origin_shape = inp.shapeof() + if _ndims == 2: + n, c = inp.shapeof(0), inp.shapeof(1) + new_shape = (n, c, 1, 1) + elif _ndims == 3: + n, c, h = inp.shapeof(0), inp.shapeof(1), inp.shapeof(2) + new_shape = (n, c, h, 1) + + inp = inp.reshape(new_shape) + + if self.freeze and self.training and self._track_running_stats_saved: + scale = self.weight.reshape(1, -1, 1, 1) * ( + self.running_var + self.eps + ) ** (-0.5) + bias = self.bias.reshape(1, -1, 1, 1) - self.running_mean * scale + return inp * scale.detach() + bias.detach() + + if self.training and self.track_running_stats: + exponential_average_factor = self.momentum + else: + exponential_average_factor = 0.0 # useless + + output = batch_norm2d( + inp, + self.running_mean if self.track_running_stats else None, + self.running_var if self.track_running_stats else None, + self.weight, + self.bias, + training=self.training + or ((self.running_mean is None) and (self.running_var is None)), + momentum=exponential_average_factor, + eps=self.eps, + ) + + if _ndims != 4: + output = output.reshape(origin_shape) + + return output + + +class SyncBatchNorm(_BatchNorm): + r""" + Applies Synchronization Batch Normalization. 
+ """ + + def __init__( + self, + num_features, + eps=1e-5, + momentum=0.9, + affine=True, + track_running_stats=True, + freeze=False, + group: Optional[Group] = None, + ) -> None: + super().__init__( + num_features, eps, momentum, affine, track_running_stats, freeze + ) + self.group = group + + def _check_input_ndim(self, inp): + if len(inp.shape) not in {2, 3, 4}: + raise ValueError( + "expected 2D, 3D or 4D input (got {}D input)".format(len(inp.shape)) + ) + + def forward(self, inp): + self._check_input_ndim(inp) + + _ndims = len(inp.shape) + if _ndims != 4: + origin_shape = inp.shapeof() + if _ndims == 2: + n, c = inp.shapeof(0), inp.shapeof(1) + new_shape = (n, c, 1, 1) + elif _ndims == 3: + n, c, h = inp.shapeof(0), inp.shapeof(1), inp.shapeof(2) + new_shape = (n, c, h, 1) + + inp = inp.reshape(new_shape) + + if self.training and self.track_running_stats: + exponential_average_factor = self.momentum + else: + exponential_average_factor = 0.0 # useless + + output = sync_batch_norm( + inp, + self.running_mean, + self.running_var, + self.weight, + self.bias, + self.training or not self.track_running_stats, + exponential_average_factor, + self.eps, + group=self.group, + ) + + if _ndims != 4: + output = output.reshape(origin_shape) + + return output + + +class BatchNorm1d(_BatchNorm): + r""" + Applies Batch Normalization over a 2D/3D tensor. + + Refer to :class:`~.BatchNorm2d` for more information. + """ + + def _check_input_ndim(self, inp): + if len(inp.shape) not in {2, 3}: + raise ValueError( + "expected 2D or 3D input (got {}D input)".format(len(inp.shape)) + ) + + +class BatchNorm2d(_BatchNorm): + r""" + Applies Batch Normalization over a 4D tensor. + + .. math:: + + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + + The mean and standard-deviation are calculated per-dimension over + the mini-batches and :math:`\gamma` and :math:`\beta` are learnable + parameter vectors. + + By default, during training this layer keeps running estimates of its + computed mean and variance, which are then used for normalization during + evaluation. The running estimates are kept with a default :attr:`momentum` + of 0.9. + + If :attr:`track_running_stats` is set to ``False``, this layer will not + keep running estimates, and batch statistics are instead used during + evaluation time. + + .. note:: + This :attr:`momentum` argument is different from one used in optimizer + classes and the conventional notion of momentum. Mathematically, the + update rule for running statistics here is + :math:`\hat{x}_\text{new} = \text{momentum} \times \hat{x} + (1 - \text{momentum}) \times x_t`, + where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the + new observed value. + + Because the Batch Normalization is done over the `C` dimension, computing + statistics on `(N, H, W)` slices, it's common terminology to call this + Spatial Batch Normalization. + + :type num_features: int + :param num_features: usually the :math:`C` from an input of size + :math:`(N, C, H, W)` or the highest ranked dimension of an input with + less than 4D. + :type eps: float + :param eps: a value added to the denominator for numerical stability. + Default: 1e-5. + :type momentum: float + :param momentum: the value used for the `running_mean` and `running_var` + computation. + Default: 0.9 + :type affine: bool + :param affine: a boolean value that when set to ``True``, this module has + learnable affine parameters. 
Default: ``True``
+    :type track_running_stats: bool
+    :param track_running_stats: when set to ``True``, this module tracks the
+        running mean and variance. When set to ``False``, this module does not
+        track such statistics and always uses batch statistics in both training
+        and eval modes. Default: ``True``.
+
+    :type freeze: bool
+    :param freeze: when set to ``True``, this module does not update the
+        running mean and variance, and uses the running mean and variance instead of
+        the batch mean and batch variance to normalize the input. The parameter takes effect
+        only when the module is initialized with ``track_running_stats`` as ``True`` and
+        the module is in training mode.
+        Default: ``False``.
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        import megengine as mge
+        import megengine.module as M
+
+        # With Learnable Parameters
+        m = M.BatchNorm2d(4)
+        inp = mge.tensor(np.random.rand(1, 4, 3, 3).astype("float32"))
+        oup = m(inp)
+        print(m.weight, m.bias)
+        # Without Learnable Parameters
+        m = M.BatchNorm2d(4, affine=False)
+        oup = m(inp)
+        print(m.weight, m.bias)
+
+    .. testoutput::
+
+        Tensor([1. 1. 1. 1.]) Tensor([0. 0. 0. 0.])
+        None None
+    """
+
+    def _check_input_ndim(self, inp):
+        if len(inp.shape) != 4:
+            raise ValueError("expected 4D input (got {}D input)".format(len(inp.shape)))
diff --git a/imperative/python/megengine/module/concat.py b/imperative/python/megengine/module/concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eca519c549636265e542d6d97fae0c8c98cfc66
--- /dev/null
+++ b/imperative/python/megengine/module/concat.py
@@ -0,0 +1,22 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from typing import Iterable
+
+from ..functional import concat
+from ..tensor import Tensor
+from .module import Module
+
+
+class Concat(Module):
+    r"""
+    A :class:`~.Module` to do functional concat. Could be replaced with :class:`~.QATModule`
+    version :class:`~.qat.concat.Concat` using :func:`~.quantize.quantize_qat`.
+    """
+
+    def forward(self, inps: Iterable[Tensor], axis: int = 0):
+        return concat(inps, axis)
diff --git a/imperative/python/megengine/module/conv.py b/imperative/python/megengine/module/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..87bdc244e3b564bfd6c784a3456c4bc6379b808b
--- /dev/null
+++ b/imperative/python/megengine/module/conv.py
@@ -0,0 +1,391 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from abc import abstractmethod
+from typing import Tuple, Union
+
+import numpy as np
+
+from ..core.ops._internal import param_defs as P
+from ..functional import conv2d, conv_transpose2d, local_conv2d, relu
+from ..functional.types import _pair, _pair_nonzero
+from ..tensor_nn import Parameter
+from .
import init +from .module import Module + + +class _ConvNd(Module): + """base class for convolution modules, including transposed conv""" + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]], + padding: Union[int, Tuple[int, int]], + dilation: Union[int, Tuple[int, int]], + groups: int, + bias: bool = True, + ): + super().__init__() + if in_channels % groups != 0: + raise ValueError("in_channels must be divisible by groups") + if out_channels % groups != 0: + raise ValueError("out_channels must be divisible by groups") + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + + self.weight = Parameter(np.zeros(self._infer_weight_shape(), dtype=np.float32)) + self.bias = None + if bias: + self.bias = Parameter(np.zeros(self._infer_bias_shape(), dtype=np.float32)) + self.reset_parameters() + + @abstractmethod + def _get_fanin(self): + pass + + def reset_parameters(self) -> None: + fanin = self._get_fanin() + std = np.sqrt(1 / fanin) + init.normal_(self.weight, 0.0, std) + if self.bias is not None: + init.zeros_(self.bias) + + @abstractmethod + def _infer_weight_shape(self): + pass + + @abstractmethod + def _infer_bias_shape(self): + pass + + +class Conv2d(_ConvNd): + r"""Applies a 2D convolution over an input tensor. + + For instance, given an input of the size :math:`(N, C_{\text{in}}, H, W)`, + this layer generates an output of the size + :math:`(N, C_{\text{out}}, H_{\text{out}}, W_{\text{out}})` through the + process described as below: + + .. math:: + \text{out}(N_i, C_{\text{out}_j}) = \text{bias}(C_{\text{out}_j}) + + \sum_{k = 0}^{C_{\text{in}} - 1} \text{weight}(C_{\text{out}_j}, k) \star \text{input}(N_i, k) + + where :math:`\star` is the valid 2D cross-correlation operator, + :math:`N` is a batch size, :math:`C` denotes a number of channels, + :math:`H` is a height of input planes in pixels, and :math:`W` is + width in pixels. + + When ``groups == in_channels`` and ``out_channels == K * in_channels``, + where `K` is a positive integer, this operation is also known as depthwise + convolution. + + In other words, for an input of size :math:`(N, C_{in}, H_{in}, W_{in})`, + a depthwise convolution with a depthwise multiplier `K`, can be constructed + by arguments :math:`(in\_channels=C_{in}, out\_channels=C_{in} \times K, ..., groups=C_{in})`. + + :param in_channels: number of input channels. + :param out_channels: number of output channels. + :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is + an :class:`int`, the actual kernel size would be + ``(kernel_size, kernel_size)``. Default: 1 + :param stride: stride of the 2D convolution operation. Default: 1 + :param padding: size of the paddings added to the input on both sides of its + spatial dimensions. Only zero-padding is supported. Default: 0 + :param dilation: dilation of the 2D convolution operation. Default: 1 + :param groups: number of groups to divide input and output channels into, + so as to perform a "grouped convolution". When ``groups`` is not 1, + ``in_channels`` and ``out_channels`` must be divisible by ``groups``, + and there would be an extra dimension at the beginning of the weight's + shape. Specifically, the shape of weight would be ``(groups, + out_channel // groups, in_channels // groups, *kernel_size)``. 
+    :param bias: whether to add a bias onto the result of convolution. Default:
+        True
+    :param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default:
+        `CROSS_CORRELATION`.
+    :param compute_mode: When set to `DEFAULT`, no special requirements will be
+        placed on the precision of intermediate results. When set to `FLOAT32`,
+        float32 would be used for accumulator and intermediate result, but only
+        effective when input and output are of float16 dtype.
+    """
+
+    _conv_mode_type = P.Convolution.Mode
+    _compute_mode_type = P.Convolution.ComputeMode
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: Union[int, Tuple[int, int]] = 1,
+        padding: Union[int, Tuple[int, int]] = 0,
+        dilation: Union[int, Tuple[int, int]] = 1,
+        groups: int = 1,
+        bias: bool = True,
+        conv_mode: str = "CROSS_CORRELATION",
+        compute_mode: str = "DEFAULT",
+    ):
+        kernel_size = _pair_nonzero(kernel_size)
+        stride = _pair_nonzero(stride)
+        padding = _pair(padding)
+        dilation = _pair_nonzero(dilation)
+        self.conv_mode = self._conv_mode_type.convert(conv_mode)
+        self.compute_mode = self._compute_mode_type.convert(compute_mode)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+        )
+
+    def _get_fanin(self):
+        kh, kw = self.kernel_size
+        ic = self.in_channels
+        return kh * kw * ic
+
+    def _infer_weight_shape(self):
+        group = self.groups
+        ichl = self.in_channels
+        ochl = self.out_channels
+        kh, kw = self.kernel_size
+        if group == 1:
+            # Assume format is NCHW
+            return (ochl, ichl, kh, kw)
+
+        assert (
+            ichl % group == 0 and ochl % group == 0
+        ), "invalid config: input_channels={} output_channels={} group={}".format(
+            ichl, ochl, group
+        )
+        # Assume format is NCHW
+        return (group, ochl // group, ichl // group, kh, kw)
+
+    def _infer_bias_shape(self):
+        # Assume format is NCHW
+        return (1, self.out_channels, 1, 1)
+
+    def calc_conv(self, inp, weight, bias):
+        return conv2d(
+            inp,
+            weight,
+            bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.conv_mode,
+            self.compute_mode,
+        )
+
+    def forward(self, inp):
+        return self.calc_conv(inp, self.weight, self.bias)
+
+
+class ConvTranspose2d(_ConvNd):
+    r"""Applies a 2D transposed convolution over an input tensor.
+
+    This module is also known as a deconvolution or a fractionally-strided convolution.
+    :class:`ConvTranspose2d` can be seen as the gradient of :class:`Conv2d` operation
+    with respect to its input.
+
+    Convolution usually reduces the size of input, while transposed convolution works
+    the opposite way, transforming a smaller input to a larger output while preserving the
+    connectivity pattern.
+
+    :param in_channels: number of input channels.
+    :param out_channels: number of output channels.
+    :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
+        an :class:`int`, the actual kernel size would be
+        ``(kernel_size, kernel_size)``. Default: 1
+    :param stride: stride of the 2D convolution operation. Default: 1
+    :param padding: size of the paddings added to the input on both sides of its
+        spatial dimensions. Only zero-padding is supported. Default: 0
+    :param dilation: dilation of the 2D convolution operation. Default: 1
+    :param groups: number of groups to divide input and output channels into,
+        so as to perform a "grouped convolution". When ``groups`` is not 1,
+        ``in_channels`` and ``out_channels`` must be divisible by ``groups``,
+        and there would be an extra dimension at the beginning of the weight's
+        shape. Specifically, the shape of weight would be ``(groups,
+        out_channels // groups, in_channels // groups, *kernel_size)``. Default: 1
+    :param bias: whether to add a bias onto the result of convolution. Default:
+        True
+    :param conv_mode: Supports `CROSS_CORRELATION` or `CONVOLUTION`. Default:
+        `CROSS_CORRELATION`.
+    :param compute_mode: When set to `DEFAULT`, no special requirements will be
+        placed on the precision of intermediate results. When set to `FLOAT32`,
+        float32 would be used for accumulator and intermediate result, but only
+        effective when input and output are of float16 dtype.
+    """
+
+    _conv_mode_type = P.Convolution.Mode
+    _compute_mode_type = P.Convolution.ComputeMode
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: Union[int, Tuple[int, int]] = 1,
+        padding: Union[int, Tuple[int, int]] = 0,
+        dilation: Union[int, Tuple[int, int]] = 1,
+        groups: int = 1,
+        bias: bool = True,
+        conv_mode: str = "CROSS_CORRELATION",
+        compute_mode: str = "DEFAULT",
+    ):
+        kernel_size = _pair_nonzero(kernel_size)
+        stride = _pair_nonzero(stride)
+        padding = _pair(padding)
+        dilation = _pair_nonzero(dilation)
+        self.conv_mode = self._conv_mode_type.convert(conv_mode)
+        self.compute_mode = self._compute_mode_type.convert(compute_mode)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+        )
+
+    def _get_fanin(self):
+        kh, kw = self.kernel_size
+        oc = self.out_channels
+        return kh * kw * oc
+
+    def _infer_weight_shape(self):
+        group = self.groups
+        ichl = self.in_channels
+        ochl = self.out_channels
+        kh, kw = self.kernel_size
+        if group == 1:
+            # Assume format is NCHW
+            return (ichl, ochl, kh, kw)
+
+        assert (
+            ichl % group == 0 and ochl % group == 0
+        ), "invalid config: input_channels={} output_channels={} group={}".format(
+            ichl, ochl, group
+        )
+        # Assume format is NCHW
+        return (group, ichl // group, ochl // group, kh, kw)
+
+    def _infer_bias_shape(self):
+        # Assume format is NCHW
+        return (1, self.out_channels, 1, 1)
+
+    def forward(self, inp):
+        return conv_transpose2d(
+            inp,
+            self.weight,
+            self.bias,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.conv_mode,
+            self.compute_mode,
+        )
+
+
+class LocalConv2d(Conv2d):
+    r"""Applies a spatial convolution with untied kernels over an input 4D tensor.
+    It is also known as the locally connected layer.
+
+    :param in_channels: number of input channels.
+    :param out_channels: number of output channels.
+    :param input_height: the height of the input images.
+    :param input_width: the width of the input images.
+    :param kernel_size: size of weight on spatial dimensions. If ``kernel_size`` is
+        an :class:`int`, the actual kernel size would be
+        ``(kernel_size, kernel_size)``. Default: 1
+    :param stride: stride of the 2D convolution operation. Default: 1
+    :param padding: size of the paddings added to the input on both sides of its
+        spatial dimensions. Only zero-padding is supported. Default: 0
+    :param groups: number of groups to divide input and output channels into,
+        so as to perform a "grouped convolution". When ``groups`` is not 1,
+        ``in_channels`` and ``out_channels`` must be divisible by ``groups``.
+ The shape of weight is ``(groups, output_height, output_width, + in_channels // groups, *kernel_size, out_channels // groups)``. + """ + + _conv_mode_type = P.Convolution.Mode + + def __init__( + self, + in_channels: int, + out_channels: int, + input_height: int, + input_width: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + conv_mode: str = "CROSS_CORRELATION", + ): + self.input_height = input_height + self.input_width = input_width + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=False, + ) + + def _infer_weight_shape(self): + group = self.groups + output_height = ( + self.input_height + self.padding[0] * 2 - self.kernel_size[0] + ) // self.stride[0] + 1 + output_width = ( + self.input_width + self.padding[1] * 2 - self.kernel_size[1] + ) // self.stride[1] + 1 + # Assume format is NCHW + return ( + group, + output_height, + output_width, + self.in_channels // group, + self.kernel_size[0], + self.kernel_size[1], + self.out_channels // group, + ) + + def forward(self, inp): + return local_conv2d( + inp, self.weight, self.stride, self.padding, self.dilation, self.conv_mode + ) + + +class ConvRelu2d(Conv2d): + r""" + A fused :class:`~.Module` including Conv2d and relu. Could be replaced + with :class:`~.QATModule` version :class:`~.qat.conv.ConvRelu2d` using + :func:`~.quantize.quantize_qat`. + """ + + def forward(self, inp): + return relu(self.calc_conv(inp, self.weight, self.bias)) diff --git a/imperative/python/megengine/module/conv_bn.py b/imperative/python/megengine/module/conv_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..76713b0f81e502900de5ce34b2faa96ddda595a2 --- /dev/null +++ b/imperative/python/megengine/module/conv_bn.py @@ -0,0 +1,69 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Tuple, Union + +from ..functional import relu +from .batchnorm import BatchNorm2d +from .conv import Conv2d +from .module import Module + + +class _ConvBnActivation2d(Module): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True, + conv_mode: str = "CROSS_CORRELATION", + compute_mode: str = "DEFAULT", + eps=1e-5, + momentum=0.9, + affine=True, + track_running_stats=True, + ): + super().__init__() + self.conv = Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + conv_mode, + compute_mode, + ) + self.bn = BatchNorm2d(out_channels, eps, momentum, affine, track_running_stats) + + +class ConvBn2d(_ConvBnActivation2d): + r""" + A fused :class:`~.Module` including Conv2d, BatchNorm2d. Could be replaced + with :class:`~.QATModule` version :class:`~.qat.conv_bn.ConvBn2d` using + :func:`~.quantize.quantize_qat`. 
+ """ + + def forward(self, inp): + return self.bn(self.conv(inp)) + + +class ConvBnRelu2d(_ConvBnActivation2d): + r""" + A fused :class:`~.Module` including Conv2d, BatchNorm2d and relu. Could be replaced + with :class:`~.QATModule` version :class:`~.qat.conv_bn.ConvBnRelu2d` using + :func:`~.quantize.quantize_qat`. + """ + + def forward(self, inp): + return relu(self.bn(self.conv(inp))) diff --git a/imperative/python/megengine/module/dropout.py b/imperative/python/megengine/module/dropout.py new file mode 100644 index 0000000000000000000000000000000000000000..146eba24544bd713e3c2210a78e1466317012ba6 --- /dev/null +++ b/imperative/python/megengine/module/dropout.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from ..functional import dropout +from .module import Module + + +class Dropout(Module): + r"""Randomly set input elements to zeros with the probability :math:`drop\_prob` during training. Commonly used in large networks to prevent overfitting. + Note that we perform dropout only during training, we also rescale(multiply) the output tensor + by :math:`\frac{1}{1 - drop\_prob}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`. + + :param drop_prob: The probability to drop (set to zero) each single element + """ + + def __init__(self, drop_prob=0.0): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, inputs): + if self.training: + return dropout(inputs, self.drop_prob, rescale=True) + else: + return inputs diff --git a/imperative/python/megengine/module/elemwise.py b/imperative/python/megengine/module/elemwise.py new file mode 100644 index 0000000000000000000000000000000000000000..041f56a5de80873e2a85ec92dd1b32e6c7936227 --- /dev/null +++ b/imperative/python/megengine/module/elemwise.py @@ -0,0 +1,79 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from ..core.ops._internal import param_defs as P +from ..functional.elemwise import _elwise +from ..tensor import Tensor +from .module import Module + + +class Elemwise(Module): + r""" + A :class:`~.Module` to do elemwise operator. Could be replaced with :class:`~.QATModule` + version :class:`~.qat.elemwise.Elemwise` using :func:`~.quantize.quantize_qat`. + + :param method: the elemwise method, support the following string. + It will do the normal elemwise operator for float. + + * "ADD": a + b + * "FUSE_ADD_RELU": max(x+y, 0) + * "MUL": x * y + * "MIN": min(x, y) + * "MAX": max(x, y) + * "SUB": x - y + * "TRUE_DIV": x / y + * "FUSE_ADD_SIGMOID": sigmoid(x + y) + * "FUSE_ADD_TANH": tanh(x + y) + * "RELU": x > 0 ? x : 0 + * "ABS": x > 0 ? 
x : -x
+        * "SIGMOID": sigmoid(x)
+        * "EXP": exp(x)
+        * "TANH": tanh(x)
+        * "FUSE_MUL_ADD3": x * y + z
+        * "FAST_TANH": fast_tanh(x)
+        * "NEGATE": -x
+        * "ACOS": acos(x)
+        * "ASIN": asin(x)
+        * "CEIL": ceil(x)
+        * "COS": cos(x)
+        * "EXPM1": expm1(x)
+        * "FLOOR": floor(x)
+        * "LOG": log(x)
+        * "LOG1P": log1p(x)
+        * "SIN": sin(x)
+        * "ROUND": round(x)
+        * "ERF": erf(x)
+        * "ERFINV": erfinv(x)
+        * "ERFC": erfc(x)
+        * "ERFCINV": erfcinv(x)
+        * "ABS_GRAD": abs_grad
+        * "FLOOR_DIV": floor_div
+        * "MOD": mod
+        * "SIGMOID_GRAD": sigmoid_grad
+        * "SWITCH_GT0": switch_gt0
+        * "TANH_GRAD": tanh_grad
+        * "LT": lt
+        * "LEQ": leq
+        * "EQ": eq
+        * "POW": pow
+        * "LOG_SUM_EXP": log_sum_exp
+        * "FAST_TANH_GRAD": fast_tanh_grad
+        * "ATAN2": atan2
+        * "COND_LEQ_MOV": cond_leq_mov
+        * "H_SWISH": h_swish
+        * "FUSE_ADD_H_SWISH": h_swish(x+y)
+        * "H_SWISH_GRAD": h_swish_grad
+    """
+
+    _elemwise_mode_type = P.Elemwise.Mode
+
+    def __init__(self, method):
+        super().__init__()
+        self.method = self._elemwise_mode_type.convert(method)
+
+    def forward(self, *inps):
+        return _elwise(*inps, mode=self.method)
diff --git a/imperative/python/megengine/module/embedding.py b/imperative/python/megengine/module/embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..15c196517cefaecb0e146707affde9affdccf098
--- /dev/null
+++ b/imperative/python/megengine/module/embedding.py
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from typing import Optional
+
+import numpy as np
+
+from ..functional import embedding as embedding_func
+from ..tensor_nn import Parameter
+from . import init
+from .module import Module
+
+
+class Embedding(Module):
+    r"""
+    A simple lookup table that stores embeddings of a fixed dictionary and size.
+
+    This module is often used to store word embeddings and retrieve them using indices.
+    The input to the module is a list of indices, and the output is the corresponding word embeddings.
+    The indices should be less than num_embeddings.
+
+    :param num_embeddings: size of embedding dictionary.
+    :param embedding_dim: size of each embedding vector.
+    :param padding_idx: should be set to None, not supported now.
+    :param max_norm: should be set to None, not supported now.
+    :param norm_type: should be set to None, not supported now.
+    :param initial_weight: the learnable weights of the module of shape (num_embeddings, embedding_dim).
+
+    Examples:
+
+    .. testcode::
+
+        import numpy as np
+        import megengine as mge
+        import megengine.module as M
+        weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6),(0.1,1.1,2.1,3.1,4.1)], dtype=np.float32))
+        data = mge.tensor(np.array([(0,1,1),(1,0,1),(0,0,1)], dtype=np.int32))
+
+        embedding = M.Embedding(2, 5, initial_weight=weight)
+        output = embedding(data)
+        with np.printoptions(precision=6):
+            print(output.numpy())
+
+    Outputs:
+
+    .. testoutput::
testoutput:: + + [[[1.2 2.3 3.4 4.5 5.6] + [0.1 1.1 2.1 3.1 4.1] + [0.1 1.1 2.1 3.1 4.1]] + + [[0.1 1.1 2.1 3.1 4.1] + [1.2 2.3 3.4 4.5 5.6] + [0.1 1.1 2.1 3.1 4.1]] + + [[1.2 2.3 3.4 4.5 5.6] + [1.2 2.3 3.4 4.5 5.6] + [0.1 1.1 2.1 3.1 4.1]]] + + """ + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: Optional[float] = None, + initial_weight: Parameter = None, + ): + super().__init__() + if padding_idx is not None: + raise ValueError("Not support padding index now.") + if max_norm is not None or norm_type is not None: + raise ValueError("Not support weight normalize now.") + self.padding_idx = padding_idx + self.max_norm = max_norm + self.norm_type = norm_type + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + if initial_weight is None: + self.weight = Parameter( + np.random.uniform( + size=(self.num_embeddings, self.embedding_dim) + ).astype(np.float32) + ) + self.reset_parameters() + else: + if initial_weight.shape != (num_embeddings, embedding_dim): + raise ValueError( + "The weight shape should match num_embeddings and embedding_dim" + ) + self.weight = Parameter(initial_weight.numpy()) + + def reset_parameters(self) -> None: + init.normal_(self.weight) + + def forward(self, inputs): + return embedding_func(inputs, self.weight) + + @classmethod + def from_pretrained( + cls, + embeddings: Parameter, + freeze: Optional[bool] = True, + padding_idx: Optional[int] = None, + max_norm: Optional[float] = None, + norm_type: Optional[float] = None, + ): + r""" + Creates Embedding instance from given 2-dimensional FloatTensor. + + :param embeddings: Tensor contained weight for the embedding. + :param freeze: If ``True``, the weight does not get updated during the learning process. Default: ``True``. + :param padding_idx: should be set to None, not support Now. + :param max_norm: should be set to None, not support Now. + :param norm_type: should be set to None, not support Now. + + Examples: + + .. testcode:: + + import numpy as np + import megengine as mge + import megengine.module as M + weight = mge.tensor(np.array([(1.2,2.3,3.4,4.5,5.6),(0.1,1.1,2.1,3.1,4.1)], dtype=np.float32)) + data = mge.tensor(np.array([(0,1,1),(1,0,1),(0,0,1)], dtype=np.int32)) + + embedding = M.Embedding.from_pretrained(weight, freeze=False) + output = embedding(data) + print(output.numpy()) + + Outputs: + + .. 
testoutput:: + + [[[1.2 2.3 3.4 4.5 5.6] + [0.1 1.1 2.1 3.1 4.1] + [0.1 1.1 2.1 3.1 4.1]] + + [[0.1 1.1 2.1 3.1 4.1] + [1.2 2.3 3.4 4.5 5.6] + [0.1 1.1 2.1 3.1 4.1]] + + [[1.2 2.3 3.4 4.5 5.6] + [1.2 2.3 3.4 4.5 5.6] + [0.1 1.1 2.1 3.1 4.1]]] + + + """ + embeddings_shape = embeddings.shape + embeddings_dim = len(embeddings_shape) + if embeddings_dim != 2: + raise ValueError("Embeddings parameter is expected to be 2-dimensional") + rows = embeddings_shape[0] + cols = embeddings_shape[1] + embedding = cls( + num_embeddings=rows, + embedding_dim=cols, + initial_weight=embeddings, + padding_idx=padding_idx, + max_norm=max_norm, + norm_type=norm_type, + ) + embedding.weight.requires_grad = not freeze + return embedding diff --git a/imperative/python/megengine/module/external.py b/imperative/python/megengine/module/external.py new file mode 100644 index 0000000000000000000000000000000000000000..387125c412642f4a69e178009b3538e8e5e47543 --- /dev/null +++ b/imperative/python/megengine/module/external.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np + +from ..functional import cambricon_subgraph, extern_opr_subgraph +from .module import Module + + +class CambriconSubgraph(Module): + r"""Load a serialized Cambricon subgraph. + + See :func:`~.cambricon_subgraph` for more details. + """ + + def __init__( + self, data, symbol, tensor_dim_mutable, + ): + super(CambriconSubgraph, self).__init__() + self._data = data + self.symbol = symbol + self.tensor_dim_mutable = tensor_dim_mutable + + @property + def data(self): + return self._data.tobytes() + + @data.setter + def data(self, val): + self._data = np.frombuffer(val, dtype=np.uint8) + + def forward(self, inputs): + outputs = cambricon_subgraph( + inputs, self._data, self.symbol, self.tensor_dim_mutable, + ) + return outputs + + +class ExternOprSubgraph(Module): + r"""Load a serialized extern opr subgraph. + """ + + def __init__(self, data, name, output_shapes): + super(ExternOprSubgraph, self).__init__() + self.data = data + self.name = name + self.output_shapes = output_shapes + + def forward(self, inputs): + outputs = extern_opr_subgraph(inputs, self.output_shapes, self.name, self.data,) + return outputs diff --git a/imperative/python/megengine/module/identity.py b/imperative/python/megengine/module/identity.py new file mode 100644 index 0000000000000000000000000000000000000000..51b31e505370020a14744e39054979da5c197027 --- /dev/null +++ b/imperative/python/megengine/module/identity.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+from ..functional import identity +from .module import Module + + +class Identity(Module): + r"""A placeholder identity operator that will ignore any argument.""" + + def forward(self, x): + return identity(x) diff --git a/imperative/python/megengine/module/init.py b/imperative/python/megengine/module/init.py new file mode 100644 index 0000000000000000000000000000000000000000..c2cb50755a5f6fe0e0819d0fa3e87c57e6a73e80 --- /dev/null +++ b/imperative/python/megengine/module/init.py @@ -0,0 +1,261 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import math +from functools import reduce +from typing import Optional, Tuple, Union + +import numpy as np + +from ..tensor import Tensor + + +def fill_(tensor: Tensor, val: Union[float, int]) -> None: + """Fill the given ``tensor`` with value ``val``. + + :param tensor: An n-dimentional tensor to be initialized + :param val: The value to be filled throughout the tensor + """ + tensor.set_value(np.full(tensor.shape, val, tensor.dtype)) + + +def zeros_(tensor: Tensor) -> None: + """Fill the given ``tensor`` with scalar value `0`. + + :param tensor: An n-dimentional tensor to be initialized + """ + fill_(tensor, 0) + + +def ones_(tensor: Tensor) -> None: + """Fill the given ``tensor`` with the scalar value `1`. + + :param tensor: An n-dimentional tensor to be initialized + """ + fill_(tensor, 1) + + +def uniform_(tensor: Tensor, a: float = 0.0, b: float = 1.0) -> None: + r"""Fill the given ``tensor`` with random value sampled from uniform distribution + :math:`\mathcal{U}(\text{a}, \text{b})`. + + :param tensor: An n-dimentional tensor to be initialized + :param a: Lower bound of the sampling interval + :param b: Upper bound of the sampling interval + """ + tensor.set_value(np.random.uniform(a, b, tensor.shape).astype(tensor.dtype)) + + +def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None: + r"""Fill the given ``tensor`` with random value sampled from normal distribution + :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. + + :param tensor: An n-dimentional tensor to be initialized + :param mean: The mean of the normal distribution + :param std: The standard deviation of the normal distribution + """ + tensor.set_value(np.random.normal(mean, std, tensor.shape).astype(np.float32)) + + +def calculate_gain( + nonlinearity: str, param: Optional[Union[int, float]] = None +) -> float: + r"""Return a recommended gain value (see the table below) for the given nonlinearity + function. + + ================= ==================================================== + nonlinearity gain + ================= ==================================================== + Linear / Identity :math:`1` + Conv{1,2,3}D :math:`1` + Sigmoid :math:`1` + Tanh :math:`\frac{5}{3}` + ReLU :math:`\sqrt{2}` + Leaky Relu :math:`\sqrt{\frac{2}{1 + \text{negative_{slope}}^2}}` + ================= ==================================================== + + :param nonlinearity: Name of the non-linear function + :param param: Optional parameter for leaky_relu. Only effective when + ``nonlinearity`` is "leaky_relu". 
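+
+    A small illustrative sketch (the values follow the table above; the
+    ``megengine.module`` alias is an assumption of the example):
+
+    .. code-block::
+
+        import megengine.module as M
+
+        M.init.calculate_gain("tanh")             # 5 / 3
+        M.init.calculate_gain("leaky_relu", 0.2)  # sqrt(2 / (1 + 0.2 ** 2))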
+
+    """
+    linear_fns = [
+        "linear",
+        "conv1d",
+        "conv2d",
+        "conv3d",
+        "conv_transpose1d",
+        "conv_transpose2d",
+        "conv_transpose3d",
+    ]
+    if nonlinearity in linear_fns or nonlinearity == "sigmoid":
+        return 1
+    if nonlinearity == "tanh":
+        return 5.0 / 3
+    if nonlinearity == "relu":
+        return math.sqrt(2.0)
+    if nonlinearity == "leaky_relu":
+        if param is None:
+            negative_slope = 0.01
+        elif (
+            not isinstance(param, bool)
+            and isinstance(param, int)
+            or isinstance(param, float)
+        ):
+            # True/False are instances of int, hence the check above
+            negative_slope = param
+        else:
+            raise ValueError("negative_slope {} not a valid number".format(param))
+        return math.sqrt(2.0 / (1 + negative_slope ** 2))
+    raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+
+def calculate_fan_in_and_fan_out(tensor: Tensor) -> Tuple[float, float]:
+    """
+    Calculate fan_in / fan_out value for given weight tensor. This function assumes
+    input tensor is stored in NCHW format.
+
+    :param tensor: Weight tensor in NCHW format
+    """
+    shape = tensor.shape
+    ndim = len(shape)
+    if ndim < 2:
+        raise ValueError(
+            "fan_in and fan_out cannot be computed for tensor with fewer than 2 "
+            "dimensions"
+        )
+
+    if ndim == 2:  # Linear
+        fan_in = shape[1]
+        fan_out = shape[0]
+    else:
+        num_input_fmaps = shape[1]
+        num_output_fmaps = shape[0]
+        receptive_field_size = 1
+        if ndim > 2:
+            receptive_field_size = reduce(lambda x, y: x * y, shape[2:], 1)
+        fan_in = num_input_fmaps * receptive_field_size
+        fan_out = num_output_fmaps * receptive_field_size
+    return fan_in, fan_out
+
+
+def calculate_correct_fan(tensor: Tensor, mode: str) -> float:
+    """
+    Calculate fan_in or fan_out value for given weight tensor, depending on given
+    ``mode``.
+
+    See :func:`calculate_fan_in_and_fan_out` for details.
+
+    :param tensor: Weight tensor in NCHW format
+    :param mode: ``'fan_in'`` or ``'fan_out'``
+    """
+    mode = mode.lower()
+    valid_modes = ["fan_in", "fan_out"]
+    if mode not in valid_modes:
+        raise ValueError(
+            "Mode {} not supported, please use one of {}".format(mode, valid_modes)
+        )
+
+    fan_in, fan_out = calculate_fan_in_and_fan_out(tensor)
+    return fan_in if mode == "fan_in" else fan_out
+
+
+def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None:
+    r"""Fill ``tensor`` with random values sampled from :math:`\mathcal{U}(-a, a)`
+    where
+
+    .. math::
+        a = \text{gain} \times \sqrt{\frac{6}{\text{fan\_in} + \text{fan\_out}}}
+
+    Also known as Glorot initialization. Detailed information can be retrieved from
+    `Understanding the difficulty of training deep feedforward neural networks` -
+    Glorot, X. & Bengio, Y. (2010).
+
+    :param tensor: An n-dimensional tensor to be initialized
+    :param gain: Scaling factor for :math:`a`.
+    """
+    fan_in, fan_out = calculate_fan_in_and_fan_out(tensor)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    a = math.sqrt(3.0) * std
+    uniform_(tensor, -a, a)
+
+
+def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None:
+    r"""Fill ``tensor`` with random values sampled from
+    :math:`\mathcal{N}(0, \text{std}^2)` where
+
+    .. math::
+        \text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan\_in} + \text{fan\_out}}}
+
+    Also known as Glorot initialization. Detailed information can be retrieved from
+    `Understanding the difficulty of training deep feedforward neural networks` -
+    Glorot, X. & Bengio, Y. (2010).
+
+    :param tensor: An n-dimensional tensor to be initialized
+    :param gain: Scaling factor for :math:`std`.
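+
+    A minimal usage sketch (the shape is illustrative; like the other
+    initializers here, it mutates the tensor in place via ``set_value``):
+
+    .. code-block::
+
+        import numpy as np
+        import megengine as mge
+        import megengine.module as M
+
+        w = mge.tensor(np.zeros((128, 64), dtype=np.float32))
+        M.init.xavier_normal_(w, gain=1.0)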
+    """
+    fan_in, fan_out = calculate_fan_in_and_fan_out(tensor)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    normal_(tensor, 0.0, std)
+
+
+def msra_uniform_(
+    tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
+) -> None:
+    r"""Fill ``tensor`` with random values sampled from
+    :math:`\mathcal{U}(-\text{bound}, \text{bound})` where
+
+    .. math::
+        \text{bound} = \sqrt{\frac{6}{(1 + a^2) \times \text{fan\_in}}}
+
+    Detailed information can be retrieved from
+    `Delving deep into rectifiers: Surpassing human-level performance on ImageNet
+    classification`
+
+    :param tensor: An n-dimensional tensor to be initialized
+    :param a: Optional parameter for calculating gain for leaky_relu. See
+        :func:`calculate_gain` for details.
+    :param mode: ``'fan_in'`` or ``'fan_out'``, used to calculate :math:`gain`, the
+        scaling factor for :math:`bound`. See :func:`calculate_fan_in_and_fan_out` for
+        details.
+    :param nonlinearity: Name of the non-linear function used to calculate :math:`gain`.
+        See :func:`calculate_gain` for details.
+    """
+    fan = calculate_correct_fan(tensor, mode)
+    gain = calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    bound = math.sqrt(3.0) * std
+    uniform_(tensor, -bound, bound)
+
+
+def msra_normal_(
+    tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
+) -> None:
+    r"""Fill ``tensor`` with random values sampled from
+    :math:`\mathcal{N}(0, \text{std}^2)` where
+
+    .. math::
+        \text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan\_in}}}
+
+    Detailed information can be retrieved from
+    `Delving deep into rectifiers: Surpassing human-level performance on ImageNet
+    classification`
+
+    :param tensor: An n-dimensional tensor to be initialized
+    :param a: Optional parameter for calculating gain for leaky_relu. See
+        :func:`calculate_gain` for details.
+    :param mode: ``'fan_in'`` or ``'fan_out'``, used to calculate :math:`gain`, the
+        scaling factor for :math:`std`. See :func:`calculate_fan_in_and_fan_out` for
+        details.
+    :param nonlinearity: Name of the non-linear function used to calculate :math:`gain`.
+        See :func:`calculate_gain` for details.
+    """
+    fan = calculate_correct_fan(tensor, mode)
+    gain = calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    normal_(tensor, 0, std)
diff --git a/imperative/python/megengine/module/linear.py b/imperative/python/megengine/module/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..34900a2e43b09ce7dcf0ab426d85942342cf72a4
--- /dev/null
+++ b/imperative/python/megengine/module/linear.py
@@ -0,0 +1,61 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import numpy as np
+
+from ..functional import linear
+from ..tensor_nn import Parameter
+from . import init
+from .module import Module
+
+
+class Linear(Module):
+    r"""Applies a linear transformation to the input. For instance, if input
+    is x, then output y is:
+
+    .. math::
+
+        y = xW^T + b
+
+    where :math:`y_i = \sum_j W_{ij} x_j + b_i`
+
+    :param in_features: size of each input sample.
+    :param out_features: size of each output sample.
+    :param bias: If set to ``False``, the layer will not learn an additive bias.
+        Default: ``True``
+
+    """
+
+    def __init__(
+        self, in_features: int, out_features: int, bias: bool = True, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.out_features = out_features
+        self.in_features = in_features
+        w_shape = (out_features, in_features)
+        self.weight = Parameter(np.zeros(w_shape, dtype=np.float32))
+        self.bias = None
+        if bias:
+            b_shape = (out_features,)
+            self.bias = Parameter(np.zeros(b_shape, dtype=np.float32))
+        self.reset_parameters()
+
+    def _get_fanin(self):
+        return self.in_features
+
+    def reset_parameters(self) -> None:
+        fanin = self._get_fanin()
+        std = np.sqrt(1 / fanin)
+        init.normal_(self.weight, 0.0, std)
+        if self.bias is not None:
+            init.zeros_(self.bias)
+
+    def _calc_linear(self, x, weight, bias):
+        return linear(x, weight, bias)
+
+    def forward(self, x):
+        return self._calc_linear(x, self.weight, self.bias)
diff --git a/imperative/python/megengine/module/module.py b/imperative/python/megengine/module/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..723a9fbbb22d444d857eebfead206741295241a6
--- /dev/null
+++ b/imperative/python/megengine/module/module.py
@@ -0,0 +1,508 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
+from typing import Any, Callable, Iterable, Optional, Set, Tuple, Union
+
+import numpy as np
+
+from ..core.tensor.dtype import is_quantize
+from ..logger import get_logger
+from ..tensor import Tensor
+from ..tensor_nn import Buffer, Parameter
+from ..utils.hook import HookHandler
+
+logger = get_logger(__name__)
+
+
+def _expand_structure(key, obj):
+    if isinstance(obj, (Tensor, Module)):
+        return [(key, obj)]
+    elif isinstance(obj, (list, tuple, dict)):
+        ret = []
+        if isinstance(obj, dict):
+            targets = ((k, obj[k]) for k in sorted(obj))
+        else:
+            targets = ((str(k), v) for k, v in enumerate(obj))
+        for k, o in targets:
+            sub_ret = _expand_structure(k, o)
+            if sub_ret and not isinstance(k, str):
+                raise AssertionError(
+                    "keys for Tensor and Module must be str, error key: {}".format(k)
+                )
+            for kt, vt in sub_ret:
+                ret.extend([(key + "." + kt, vt)])
+        return ret
+    else:
+        return []
+
+
+def _is_parameter(obj):
+    return isinstance(obj, Parameter)
+
+
+def _is_buffer(obj):
+    return isinstance(obj, Buffer)
+
+
+def _is_module(obj):
+    return isinstance(obj, Module)
+
+
+class Module(metaclass=ABCMeta):
+    """Base Module class.
+    """
+
+    def __init__(self):
+        # runtime attributes
+        self.training = True
+        self.quantize_disabled = False
+
+        # hooks
+        self._forward_pre_hooks = OrderedDict()
+        self._forward_hooks = OrderedDict()
+
+    @abstractmethod
+    def forward(self, inputs):
+        pass
+
+    def register_forward_pre_hook(self, hook: Callable) -> HookHandler:
+        """Register a hook to handle forward inputs. `hook` should be a function.
+
+        Note that the hook only receives the positional `inputs`; keyword arguments
+        passed to :meth:`__call__` are not forwarded to it.
+
+        :param hook: a function that receives `module` and `inputs`, then returns
+            a modified `inputs` or `None`.
+        :return: a handler with :meth:`~.HookHandler.remove` interface to delete the hook.
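+
+        A usage sketch (``net`` stands for any :class:`Module` instance):
+
+        .. code-block::
+
+            def print_inputs(module, inputs):
+                print("pre-forward inputs:", len(inputs))
+                return None  # returning None keeps the inputs unchanged
+
+            handler = net.register_forward_pre_hook(print_inputs)
+            # ... run forward passes ...
+            handler.remove()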
+        """
+        return HookHandler(self._forward_pre_hooks, hook)
+
+    def register_forward_hook(self, hook: Callable) -> HookHandler:
+        """Register a hook to handle forward results. `hook` should be a function that
+        receives `module`, `inputs` and `outputs`, then returns a modified `outputs` or `None`.
+
+        This method returns a handler with :meth:`~.HookHandler.remove` interface to delete the hook.
+        """
+        return HookHandler(self._forward_hooks, hook)
+
+    def __call__(self, *inputs, **kwargs):
+        for hook in self._forward_pre_hooks.values():
+            modified_inputs = hook(self, inputs)
+            if modified_inputs is not None:
+                if not isinstance(modified_inputs, tuple):
+                    modified_inputs = (modified_inputs,)
+                inputs = modified_inputs
+
+        outputs = self.forward(*inputs, **kwargs)
+
+        for hook in self._forward_hooks.values():
+            modified_outputs = hook(self, inputs, outputs)
+            if modified_outputs is not None:
+                outputs = modified_outputs
+        return outputs
+
+    def _flatten(
+        self,
+        *,
+        recursive: bool = True,
+        with_key: bool = False,
+        with_parent: bool = False,
+        prefix: Optional[str] = None,
+        predicate: Callable[[Any], bool] = lambda _: True,
+        seen: Optional[Set[int]] = None
+    ) -> Union[Iterable[Any], Iterable[Tuple[str, Any]]]:
+        """Scans the module object and returns an iterable for the :class:`~.Tensor`
+        and :class:`~.Module` attributes that agree with the ``predicate``. For multiple
+        calls of this function with same arguments, the order of objects within the
+        returned iterable is guaranteed to be identical, as long as all the involved
+        module objects' ``__dict__`` does not change throughout those calls.
+
+        :param recursive: Whether to recursively scan all the submodules.
+        :param with_key: Whether to yield keys along with yielded objects.
+        :param with_parent: Whether to yield ``self`` along with yielded objects.
+        :param prefix: The prefix appended to the yielded keys.
+        :param predicate: The predicate function applied to scanned objects.
+        :param seen: A set of ids of already-visited objects, used to skip duplicates.
+        """
+        if seen is None:
+            seen = set([id(self)])
+
+        module_dict = vars(self)
+        _prefix = "" if prefix is None else prefix + "."
+
+        for key in sorted(module_dict):
+            for expanded_key, leaf in _expand_structure(key, module_dict[key]):
+                leaf_id = id(leaf)
+                if leaf_id in seen:
+                    continue
+                seen.add(leaf_id)
+
+                if predicate(leaf):
+                    if with_key and with_parent:
+                        yield _prefix + expanded_key, leaf, self
+                    elif with_key:
+                        yield _prefix + expanded_key, leaf
+                    elif with_parent:
+                        yield leaf, self
+                    else:
+                        yield leaf
+
+                if recursive and isinstance(leaf, Module):
+                    yield from leaf._flatten(
+                        recursive=recursive,
+                        with_key=with_key,
+                        with_parent=with_parent,
+                        prefix=_prefix + expanded_key if with_key else None,
+                        predicate=predicate,
+                        seen=seen,
+                    )
+
+    def parameters(
+        self, requires_grad: Optional[bool] = None, recursive: bool = True, **kwargs
+    ) -> Iterable[Parameter]:
+        r"""Returns an iterable for the :class:`~.Parameter` of the module.
+
+        :param requires_grad: Limitation over the :attr:`~.Parameter.requires_grad`
+            attribute of returned :class:`.Parameter`. ``None`` for no limitation.
+        :param recursive: If ``True``, returns all :class:`~.Parameter` within this
+            module, else only returns :class:`~.Parameter` that are direct attributes
+            of this module.
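+
+        For example, to collect only the trainable parameters (``net`` is a
+        placeholder for any module):
+
+        .. code-block::
+
+            trainable = list(net.parameters(requires_grad=True))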
+ """ + + def predicate(obj) -> bool: + return _is_parameter(obj) and ( + requires_grad is None or obj.requires_grad == requires_grad + ) + + yield from self._flatten( + with_key=False, predicate=predicate, recursive=recursive, **kwargs + ) + + def named_parameters( + self, + requires_grad: Optional[bool] = None, + prefix: Optional[str] = None, + recursive: bool = True, + **kwargs + ) -> Iterable[Tuple[str, Parameter]]: + """Returns an iterable for key :class:`~.Parameter` pairs of the module, where + ``key`` is the dotted path from this module to the :class:`~.Parameter` . + + :param requires_grad: Limitation over the :attr:`~.Parameter.requires_grad` + attribute of returned :class:`~.Parameter` . ``None`` for no limitation. + :param prefix: The prefix prepended to the keys. + :param recursive: If ``True``, returns all :class:`~.Parameter` within this + module, else only returns :class:`~.Parameter` that are direct attributes + of this module. + """ + + def predicate(obj) -> bool: + return _is_parameter(obj) and ( + requires_grad is None or obj.requires_grad == requires_grad + ) + + yield from self._flatten( + with_key=True, + prefix=prefix, + predicate=predicate, + recursive=recursive, + **kwargs, + ) + + def buffers(self, recursive: bool = True, **kwargs) -> Iterable[Buffer]: + """Returns an iterable for the :class:`~.Buffer` of the module. + + :param recursive: If ``True``, returns all :class:`~.Buffer` within this + module, else only returns :class:`~.Buffer` that are direct attributes + of this module. + """ + yield from self._flatten( + with_key=False, predicate=_is_buffer, recursive=recursive, **kwargs + ) + + def named_buffers( + self, prefix: Optional[str] = None, recursive: bool = True, **kwargs + ) -> Iterable[Tuple[str, Buffer]]: + """Returns an iterable for key :class:`~.Buffer` pairs of the module, where + ``key`` is the dotted path from this module to the :class:`~.Buffer` . + + :param prefix: The prefix prepended to the keys. + :param recursive: If ``True``, returns all :class:`~.Buffer` within this + module, else only returns :class:`~.Buffer` that are direct attributes + of this module. + """ + yield from self._flatten( + with_key=True, + prefix=prefix, + predicate=_is_buffer, + recursive=recursive, + **kwargs, + ) + + def children(self, **kwargs) -> "Iterable[Module]": + """Returns an iterable for all the submodules that are direct attributes of this + module. + """ + yield from self._flatten( + with_key=False, predicate=_is_module, recursive=False, **kwargs + ) + + def named_children(self, **kwargs) -> "Iterable[Tuple[str, Module]]": + """Returns an iterable of key-submodule pairs for all the submodules that are + direct attributes of this module, where 'key' is the attribute name of + submodules. + """ + yield from self._flatten( + with_key=True, predicate=_is_module, recursive=False, **kwargs + ) + + def modules(self, **kwargs) -> "Iterable[Module]": + """Returns an iterable for all the modules within this module, including itself. + """ + if "with_parent" in kwargs and kwargs["with_parent"]: + yield self, None + else: + yield self + yield from self._flatten(with_key=False, predicate=_is_module, **kwargs) + + def named_modules( + self, prefix: Optional[str] = None, **kwargs + ) -> "Iterable[Tuple[str, Module]]": + """Returns an iterable of key-module pairs for all the modules within this + module, including itself, where 'key' is the dotted path from this module to the + submodules. + + :param prefix: The prefix prepended to the path. 
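+
+        A small sketch of walking the module tree (``net`` is a placeholder):
+
+        .. code-block::
+
+            for name, m in net.named_modules():
+                print(name, type(m).__name__)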
+        """
+        if "with_parent" in kwargs and kwargs["with_parent"]:
+            yield ("" if prefix is None else prefix), self, None
+        else:
+            yield ("" if prefix is None else prefix), self
+        yield from self._flatten(
+            with_key=True, prefix=prefix, predicate=_is_module, **kwargs
+        )
+
+    def apply(self, fn: "Callable[[Module], Any]") -> None:
+        """Apply function ``fn`` to all the modules within this module, including
+        itself.
+
+        :param fn: The function to be applied on modules.
+        """
+        for it in self.modules():
+            fn(it)
+
+    def zero_grad(self) -> None:
+        """Set all parameters' grads to zero.
+        """
+        for param in self.parameters():
+            if param.grad is not None:
+                param.grad.reset_zero()
+
+    def train(self, mode: bool = True, recursive: bool = True) -> None:
+        """Set training mode of all the modules within this module (including itself) to
+        ``mode``. This effectively sets the ``training`` attributes of those modules
+        to ``mode``, but only has effect on certain modules (e.g.
+        :class:`~.BatchNorm2d`, :class:`~.Dropout`, :class:`~.Observer`)
+
+        :param mode: the training mode to be set on modules.
+        :param recursive: whether to recursively call submodules' ``train()``.
+        """
+        if not recursive:
+            self.training = mode
+            return
+
+        def fn(module: Module) -> None:
+            module.train(mode, recursive=False)
+
+        self.apply(fn)
+
+    def eval(self) -> None:
+        """Set training mode of all the modules within this module (including itself) to
+        ``False``. See :meth:`~.Module.train` for details.
+        """
+        self.train(False)
+
+    def disable_quantize(self, value=True):
+        r"""
+        Set the ``quantize_disabled`` attribute to ``value`` for this module and
+        all its submodules.
+        """
+
+        def fn(module: Module) -> None:
+            module.quantize_disabled = value
+
+        self.apply(fn)
+
+    def replace_param(
+        self, params: dict, start_pos: int, seen: Optional[Set[int]] = None
+    ):
+        """Replace module's parameters with `params`, used by :class:`~.ParamPack` to
+        speed up multi-machine training.
+        """
+        offset = 0
+        if seen is None:
+            seen = set([id(self)])
+        module_dict = vars(self)
+        for key in sorted(module_dict):
+            hash_id = id(module_dict[key])
+            if hash_id in seen:
+                continue
+            seen.add(hash_id)
+            if isinstance(module_dict[key], Parameter):
+                if start_pos + offset in params:
+                    assert module_dict[key].shape == params[start_pos + offset].shape
+                    module_dict[key] = params[start_pos + offset]
+                offset += 1
+            if isinstance(module_dict[key], Module):
+                offset += module_dict[key].replace_param(
+                    params, start_pos + offset, seen
+                )
+        return offset
+
+    def state_dict(self, rst=None, prefix="", keep_var=False):
+        r"""Returns a dictionary containing the whole state of the module.
+        """
+
+        def is_state(obj):
+            return _is_parameter(obj) or _is_buffer(obj)
+
+        if rst is None:
+            rst = OrderedDict()
+
+        for k, v in self._flatten(recursive=False, with_key=True, predicate=is_state):
+            assert prefix + k not in rst, "duplicated state: {}".format(k)
+            if keep_var:
+                rst[prefix + k] = v
+            else:
+                rst[prefix + k] = v.numpy()
+
+        for k, submodule in self._flatten(
+            recursive=False,
+            with_key=True,
+            predicate=lambda obj: isinstance(obj, Module),
+        ):
+            submodule.state_dict(rst, prefix + k + ".", keep_var)
+
+        return rst
+
+    def load_state_dict(
+        self,
+        state_dict: Union[dict, Callable[[str, Tensor], Optional[np.ndarray]]],
+        strict=True,
+    ):
+        r"""Load a given dictionary created by :func:`state_dict` into this module.
+        If ``strict`` is ``True``, the keys in the given ``state_dict`` must exactly
+        match the keys returned by this module's :func:`state_dict`.
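+
+        A typical save/load round trip looks like this (a sketch; the checkpoint
+        path is a placeholder):
+
+        .. code-block::
+
+            import megengine as mge
+
+            mge.save(net.state_dict(), "checkpoint.pkl")
+            net.load_state_dict(mge.load("checkpoint.pkl"))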
+
+        Users can also pass a closure: `Function[key: str, var: Tensor] -> Optional[np.ndarray]`
+        as a `state_dict`, in order to handle complex situations. For example, load everything
+        except for the final linear classifier:
+
+        .. code-block::
+
+            state_dict = {...}  # Dict[str, np.ndarray]
+            model.load_state_dict({
+                k: None if k.startswith('fc') else v
+                for k, v in state_dict.items()
+            }, strict=False)
+
+        Here returning `None` means skipping parameter `k`.
+
+        To prevent shape mismatch (e.g. when loading PyTorch weights), we can reshape before loading:
+
+        .. code-block::
+
+            state_dict = {...}
+            def reshape_accordingly(k, v):
+                return state_dict[k].reshape(v.shape)
+            model.load_state_dict(reshape_accordingly)
+
+        We can also perform inplace re-initialization or pruning:
+
+        .. code-block::
+
+            def reinit_and_pruning(k, v):
+                if 'bias' in k:
+                    M.init.zeros_(v)
+                if 'conv' in k:
+                    return v.numpy() * (np.abs(v.numpy()) > 1e-3).astype("float32")
+            model.load_state_dict(reinit_and_pruning, strict=False)
+        """
+        unused = []
+        if isinstance(state_dict, dict):
+            unused = state_dict.keys()
+
+            def closure(k, _):  # var unused
+                return state_dict[k] if k in state_dict else None
+
+        elif callable(state_dict):
+            closure = state_dict
+        else:
+            raise ValueError(
+                "`state_dict` must be a dict or callable, got {}".format(
+                    type(state_dict)
+                )
+            )
+
+        loaded, skipped = self._load_state_dict_with_closure(closure)
+        unused = set(unused) - loaded
+
+        if len(unused) != 0:
+            if strict:
+                raise KeyError(
+                    "Unused params violate `strict=True`, unused={}".format(unused)
+                )
+            else:
+                logger.warning(
+                    "Unused params in `strict=False` mode, unused={}".format(unused)
+                )
+
+        if len(skipped) != 0:
+            if strict:
+                raise KeyError(
+                    "Missing params violate `strict=True`, missing={}".format(skipped)
+                )
+            else:
+                logger.warning(
+                    "Missing params in `strict=False` mode, missing={}".format(skipped)
+                )
+
+    def _load_state_dict_with_closure(self, closure):
+        """Advanced state_dict loading through a callable `closure` whose signature is
+
+        `closure(key: str, var: Tensor) -> Union[np.ndarray, None]`
+        """
+        assert callable(closure), "closure must be a function"
+
+        loaded = []
+        skipped = []
+
+        local_state_dict = self.state_dict(keep_var=True)
+        for k, var in local_state_dict.items():
+            to_be_load = closure(k, var)
+            if to_be_load is None:
+                skipped.append(k)
+                continue
+            assert isinstance(
+                to_be_load, np.ndarray
+            ), "closure should return a `np.ndarray`, but for `{}` got {}".format(
+                k, to_be_load
+            )
+            assert (
+                var.shape == to_be_load.shape
+            ), "param `{}` shape mismatch, should be {}, got {}".format(
+                k, var.shape, to_be_load.shape
+            )
+            # For quantized dtypes, the initialized dtype's scale/zero_points
+            # may be invalid; use the pretrained dtype instead.
+            if is_quantize(to_be_load.dtype) and is_quantize(var.dtype):
+                var = var.astype(to_be_load.dtype)
+            var.set_value(to_be_load)
+            loaded.append(k)
+
+        return set(loaded), set(skipped)
diff --git a/imperative/python/megengine/module/parampack.py b/imperative/python/megengine/module/parampack.py
new file mode 100644
index 0000000000000000000000000000000000000000..feb14c613aa5198c1f1d9bc627302fb3b868f914
--- /dev/null
+++ b/imperative/python/megengine/module/parampack.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import collections
+from typing import Callable, Iterable, Optional, Tuple
+
+import numpy as np
+
+# NOTE: `param_pack_split` is used by ParamPack.forward below and was previously
+# undefined in this file; it is assumed to be exported by the functional package.
+from ..functional import param_pack_split
+from ..tensor_nn import Parameter, Tensor
+from .module import Module
+
+
+class ParamPack(Module):
+    r"""Pack a module's parameters by gathering their memory into contiguous addresses.
+    Parameters sharing the same (device, dtype, requires_grad) key, for example
+    ('gpu0', float32, True), will be packed together.
+    This speeds up gradient allreduce considerably in multi-machine training.
+
+    :param model: the module whose parameters will be packed.
+    :param nr_ignore_first: how many parameters will be left unpacked at first.
+    :param max_size_per_group: upper bound of packed parameters' size in MB.
+    :param max_nr_params_per_group: upper bound of the number of parameters of each group.
+
+    """
+
+    def __init__(
+        self,
+        model: Module,
+        nr_ignore_first: int = 8,
+        max_size_per_group: int = 10,
+        max_nr_params_per_group: int = 100,
+        group_func: Callable = lambda name, param: 0,
+    ):
+        super().__init__()
+        self._model = model
+        self._nr_ignore_first = nr_ignore_first
+        self._max_size_per_group = max_size_per_group
+        self._max_nr_params_per_group = max_nr_params_per_group
+        self._group_func = group_func
+        self._grouped_params = []
+        self._packed_params = []
+
+        params = model.named_parameters()
+        self._pack_params(params)
+
+    def parameters(self, requires_grad: Optional[bool] = None) -> Iterable[Parameter]:
+        for param in self._packed_params:
+            if requires_grad is None or param.requires_grad == requires_grad:
+                yield param
+
+    def named_parameters(
+        self, requires_grad: Optional[bool] = None
+    ) -> Iterable[Tuple[str, Parameter]]:
+        for idx, param in enumerate(self._packed_params):
+            if requires_grad is None or param.requires_grad == requires_grad:
+                yield "packed_param_" + str(idx), param
+
+    def _pack_params(self, params: Iterable[Tuple[str, Parameter]]):
+        groups = collections.defaultdict(list)
+        ignored = 0
+        param_id = 0
+        for name, param in params:
+            if self._nr_ignore_first > ignored:
+                ignored += 1
+                self._grouped_params.append([{"shape": param.shape, "id": param_id}])
+                param.pack_group_key = self._group_func(name, param)
+                self._packed_params.append(param)
+            else:
+                key = (
+                    param.dtype,
+                    param.device,
+                    param.requires_grad,
+                    self._group_func(name, param),
+                )
+                groups[key].append({"tensor": param, "id": param_id})
+            param_id += 1
+        for (dtype, device, requires_grad, group_key) in groups.keys():
+            dtype_sz = np.dtype(dtype).itemsize
+            align = device.mem_align
+            if align < dtype_sz:
+                align = 1
+            else:
+                assert align % dtype_sz == 0
+                align //= dtype_sz
+
+            group = groups[(dtype, device, requires_grad, group_key)]
+            while group:
+                aligned_pos = []
+                offset = 0
+                params = []
+                idx = 0
+                while idx < len(group):
+                    param = group[idx]
+                    assert param["tensor"].device == device
+                    padding = (align - (offset & (align - 1))) & (align - 1)
+                    offset += padding
+                    aligned_pos.append(offset)
+                    params.append(param)
+                    offset += int(np.prod(param["tensor"].shape))
+                    idx += 1
+
+                    if (
+                        offset * dtype_sz >= self._max_size_per_group * 1024 * 1024
+                        or idx >= self._max_nr_params_per_group
+                    ):
+                        break
+                group = group[idx:]
+                if idx == 1:
+                    # ignore param packs with only one item
+                    params[0]["tensor"].pack_group_key = group_key
+                    self._packed_params.append(params[0]["tensor"])
+                    self._grouped_params.append(
+                        [{"shape": params[0]["tensor"].shape, "id": params[0]["id"]}]
+                    )
+                    continue
+
+                packed_value = np.zeros((offset,), dtype=dtype)
+                for param, pos in zip(params, aligned_pos):
+                    val = param["tensor"].numpy()
+                    packed_value[pos : pos + val.size] = val.flatten()
+                new_param = Parameter(
+                    value=packed_value,
+                    device=device,
+                    dtype=dtype,
+                    requires_grad=requires_grad,
+                )
+                new_param.pack_group_key = group_key
+                self._packed_params.append(new_param)
+                self._grouped_params.append(
+                    [{"shape": i["tensor"].shape, "id": i["id"]} for i in params]
+                )
+
+    def forward(self, *args, **kwargs):
+        replace_param = dict()
+        for i in range(len(self._packed_params)):
+            packed_param = self._packed_params[i]
+            grouped_params = self._grouped_params[i]
+            if len(grouped_params) == 1:
+                continue
+            split = param_pack_split(
+                packed_param._symvar, [i["shape"] for i in grouped_params]
+            )
+            split = [
+                Parameter(Tensor(i, requires_grad=packed_param.requires_grad))
+                for i in split
+            ]
+            for j in range(len(split)):
+                replace_param[grouped_params[j]["id"]] = split[j]
+        self._model.replace_param(replace_param, 0)
+
+        return self._model.forward(*args, **kwargs)
diff --git a/imperative/python/megengine/module/pooling.py b/imperative/python/megengine/module/pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..8126ddc1f7ecd169efdcfdf620691da7f0f67140
--- /dev/null
+++ b/imperative/python/megengine/module/pooling.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from abc import abstractmethod
+from typing import Tuple, Union
+
+from ..functional import avg_pool2d, max_pool2d
+from .module import Module
+
+
+class _PoolNd(Module):
+    def __init__(
+        self,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: Union[int, Tuple[int, int]] = None,
+        padding: Union[int, Tuple[int, int]] = 0,
+    ):
+        super().__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride or kernel_size
+        self.padding = padding
+
+    @abstractmethod
+    def forward(self, inp):
+        pass
+
+
+class MaxPool2d(_PoolNd):
+    r"""Applies a 2D max pooling over an input.
+
+    For instance, given an input of the size :math:`(N, C, H, W)` and
+    :attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of
+    the size :math:`(N, C, H_{out}, W_{out})` through a process described as:
+
+    .. math::
+        \begin{aligned}
+            out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, kH-1} \max_{n=0, \ldots, kW-1}
+                \text{input}(N_i, C_j, \text{stride[0]} \times h + m,
+                \text{stride[1]} \times w + n)
+        \end{aligned}
+
+    If :attr:`padding` is non-zero, then the input is implicitly zero-padded on
+    both sides for :attr:`padding` number of points.
+
+    :param kernel_size: the size of the window to take a max over.
+    :param stride: the stride of the window. Default value is ``kernel_size``.
+    :param padding: implicit zero padding to be added on both sides.
+    """
+
+    def forward(self, inp):
+        return max_pool2d(inp, self.kernel_size, self.stride, self.padding)
+
+
+class AvgPool2d(_PoolNd):
+    r"""Applies a 2D average pooling over an input.
+ + For instance, given an input of the size :math:`(N, C, H, W)` and + :attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of + the size :math:`(N, C, H_{out}, W_{out})` through a process described as: + + .. math:: + + out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1} + input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + + If :attr:`padding` is non-zero, then the input is implicitly zero-padded on + both sides for :attr:`padding` number of points. + + :param kernel_size: the size of the window. + :param stride: the stride of the window. Default value is ``kernel_size``. + :param padding: implicit zero padding to be added on both sides. + """ + + def forward(self, inp): + return avg_pool2d(inp, self.kernel_size, self.stride, self.padding) diff --git a/imperative/python/megengine/module/qat/__init__.py b/imperative/python/megengine/module/qat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6adab4dc687a322fba6dd5652bdf8975933ad3a --- /dev/null +++ b/imperative/python/megengine/module/qat/__init__.py @@ -0,0 +1,14 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .concat import Concat +from .conv import Conv2d, ConvRelu2d +from .conv_bn import ConvBn2d, ConvBnRelu2d +from .elemwise import Elemwise +from .linear import Linear +from .module import QATModule +from .quant_dequant import DequantStub, QuantStub diff --git a/imperative/python/megengine/module/qat/concat.py b/imperative/python/megengine/module/qat/concat.py new file mode 100644 index 0000000000000000000000000000000000000000..a1f018938ee9fab92159c68a9e3fe7ddf5f5d3cc --- /dev/null +++ b/imperative/python/megengine/module/qat/concat.py @@ -0,0 +1,30 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Iterable + +from ...tensor import Tensor +from .. import concat as Float +from .module import QATModule + + +class Concat(Float.Concat, QATModule): + r""" + A :class:`~.QATModule` to do functional concat with QAT support. + Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. + """ + + def forward(self, inps: Iterable[Tensor], axis: int = 0): + return self.apply_quant_activation(super().forward(inps, axis)) + + @classmethod + def from_float_module(cls, float_module): + r""" + Return a :class:`~.QATModule` instance converted from + a float :class:`~.Module` instance. + """ + return cls() diff --git a/imperative/python/megengine/module/qat/conv.py b/imperative/python/megengine/module/qat/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..315da839ed278d811fbb5895516140b5f3060129 --- /dev/null +++ b/imperative/python/megengine/module/qat/conv.py @@ -0,0 +1,59 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from ... import functional as F
+from ...quantization.utils import fake_quant_bias
+from .. import conv as Float
+from .module import QATModule
+
+
+class Conv2d(Float.Conv2d, QATModule):
+    r"""
+    A :class:`~.QATModule` Conv2d with QAT support.
+    Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
+    """
+
+    def calc_conv_qat(self, inp):
+        w_qat = self.apply_quant_weight(self.weight)
+        b_qat = fake_quant_bias(self.bias, inp, w_qat)
+        conv = self.calc_conv(inp, w_qat, b_qat)
+        return conv
+
+    @classmethod
+    def from_float_module(cls, float_module: Float.Conv2d):
+        r"""
+        Return a :class:`~.QATModule` instance converted from
+        a float :class:`~.Module` instance.
+        """
+        qat_module = cls(
+            float_module.in_channels,
+            float_module.out_channels,
+            float_module.kernel_size,
+            float_module.stride,
+            float_module.padding,
+            float_module.dilation,
+            float_module.groups,
+            float_module.bias is not None,
+            float_module.conv_mode.name,
+            float_module.compute_mode.name,
+        )
+        qat_module.weight = float_module.weight
+        qat_module.bias = float_module.bias
+        return qat_module
+
+    def forward(self, inp):
+        return self.apply_quant_activation(self.calc_conv_qat(inp))
+
+
+class ConvRelu2d(Conv2d):
+    r"""
+    A :class:`~.QATModule` including Conv2d and ReLU with QAT support.
+    Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
+    """
+
+    def forward(self, inp):
+        return self.apply_quant_activation(F.relu(self.calc_conv_qat(inp)))
diff --git a/imperative/python/megengine/module/qat/conv_bn.py b/imperative/python/megengine/module/qat/conv_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..baa0d769ca2034e80f77a1f890d52bdbac13ea48
--- /dev/null
+++ b/imperative/python/megengine/module/qat/conv_bn.py
@@ -0,0 +1,193 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from ...functional import add_update, ones, relu, sqrt, sum, zeros
+from ...quantization.utils import fake_quant_bias
+from .. import conv_bn as Float
+from .module import QATModule
+
+
+class _ConvBnActivation2d(Float._ConvBnActivation2d, QATModule):
+    def get_batch_mean_var(self, inp):
+        def _sum_channel(inp, axis=0, keepdims=True):
+            if isinstance(axis, int):
+                out = sum(inp, axis=axis, keepdims=keepdims)
+            elif isinstance(axis, tuple):
+                for idx, elem in enumerate(axis):
+                    out = sum(inp if idx == 0 else out, axis=elem, keepdims=keepdims)
+            return out
+
+        sum1 = _sum_channel(inp, (0, 2, 3))
+        sum2 = _sum_channel(inp ** 2, (0, 2, 3))
+        reduce_size = inp.size / inp.shape[1]
+        batch_mean = sum1 / reduce_size
+        batch_var = (sum2 - sum1 ** 2 / reduce_size) / reduce_size
+        return batch_mean, batch_var
+
+    def fold_weight_bias(self, bn_mean, bn_var):
+        # get folded conv params from bn:
+        # bn_istd = 1 / bn_std
+        # w_fold = gamma / bn_std * W
+        # b_fold = gamma * (b - bn_mean) / bn_std + beta
+        gamma = self.bn.weight
+        if gamma is None:
+            gamma = ones((self.bn.num_features), dtype="float32")
+        gamma = gamma.reshape(1, -1, 1, 1)
+        beta = self.bn.bias
+        if beta is None:
+            beta = zeros((self.bn.num_features), dtype="float32")
+        beta = beta.reshape(1, -1, 1, 1)
+
+        if bn_mean is None:
+            bn_mean = zeros((1, self.bn.num_features, 1, 1), dtype="float32")
+        if bn_var is None:
+            bn_var = ones((1, self.bn.num_features, 1, 1), dtype="float32")
+
+        conv_bias = self.conv.bias
+        if conv_bias is None:
+            conv_bias = zeros(self.conv._infer_bias_shape(), dtype="float32")
+
+        bn_istd = 1.0 / sqrt(bn_var + self.bn.eps)
+        # bn_istd = 1 / bn_std
+        # w_fold = gamma / bn_std * W
+        scale_factor = gamma * bn_istd
+        if self.conv.groups == 1:
+            w_fold = self.conv.weight * scale_factor.reshape(-1, 1, 1, 1)
+        else:
+            w_fold = self.conv.weight * scale_factor.reshape(
+                self.conv.groups, -1, 1, 1, 1
+            )
+
+        w_fold = self.apply_quant_weight(w_fold)
+        # b_fold = gamma * (b - bn_mean) / bn_std + beta
+        b_fold = beta + gamma * (conv_bias - bn_mean) * bn_istd
+        return w_fold, b_fold
+
+    def update_running_mean_and_running_var(
+        self, bn_mean, bn_var, num_elements_per_channel
+    ):
+        # update running mean and running var. no grad, use unbiased bn var
+        bn_mean = bn_mean.detach()
+        bn_var = (
+            bn_var.detach() * num_elements_per_channel / (num_elements_per_channel - 1)
+        )
+        exponential_average_factor = 1 - self.bn.momentum
+        add_update(
+            self.bn.running_mean,
+            delta=bn_mean,
+            alpha=1 - exponential_average_factor,
+            beta=exponential_average_factor,
+        )
+        add_update(
+            self.bn.running_var,
+            delta=bn_var,
+            alpha=1 - exponential_average_factor,
+            beta=exponential_average_factor,
+        )
+
+    def calc_conv_bn_qat(self, inp, approx=True):
+        if self.training and not approx:
+            conv = self.conv(inp)
+            bn_mean, bn_var = self.get_batch_mean_var(conv)
+            num_elements_per_channel = conv.size / conv.shape[1]
+            self.update_running_mean_and_running_var(
+                bn_mean, bn_var, num_elements_per_channel
+            )
+        else:
+            bn_mean, bn_var = self.bn.running_mean, self.bn.running_var
+
+        # get gamma and beta in BatchNorm
+        gamma = self.bn.weight
+        if gamma is None:
+            gamma = ones((self.bn.num_features), dtype="float32")
+        gamma = gamma.reshape(1, -1, 1, 1)
+        beta = self.bn.bias
+        if beta is None:
+            beta = zeros((self.bn.num_features), dtype="float32")
+        beta = beta.reshape(1, -1, 1, 1)
+        # conv_bias
+        conv_bias = self.conv.bias
+        if conv_bias is None:
+            conv_bias = zeros(self.conv._infer_bias_shape(), dtype="float32")
+
+        bn_istd = 1.0 / sqrt(bn_var + self.bn.eps)
+        # bn_istd = 1 / bn_std
+        # w_fold = gamma / bn_std * W
+        scale_factor = gamma * bn_istd
+        if self.conv.groups == 1:
+            w_fold = self.conv.weight * scale_factor.reshape(-1, 1, 1, 1)
+        else:
+            w_fold = self.conv.weight * scale_factor.reshape(
+                self.conv.groups, -1, 1, 1, 1
+            )
+        b_fold = None
+        if not (self.training and approx):
+            # b_fold = gamma * (conv_bias - bn_mean) / bn_std + beta
+            b_fold = beta + gamma * (conv_bias - bn_mean) * bn_istd
+
+        w_qat = self.apply_quant_weight(w_fold)
+        b_qat = fake_quant_bias(b_fold, inp, w_qat)
+        conv = self.conv.calc_conv(inp, w_qat, b_qat)
+        if not (self.training and approx):
+            return conv
+
+        # rescale conv to get original conv output
+        orig_conv = conv / scale_factor.reshape(1, -1, 1, 1)
+        if self.conv.bias is not None:
+            orig_conv = orig_conv + self.conv.bias
+        # calculate batch norm
+        bn_mean, bn_var = self.get_batch_mean_var(orig_conv)
+        bn_istd = 1.0 / sqrt(bn_var + self.bn.eps)
+        conv = gamma * bn_istd * (orig_conv - bn_mean) + beta
+        num_elements_per_channel = conv.size / conv.shape[1]
+        self.update_running_mean_and_running_var(
+            bn_mean, bn_var, num_elements_per_channel
+        )
+        return conv
+
+    @classmethod
+    def from_float_module(cls, float_module: Float._ConvBnActivation2d):
+        r"""
+        Return a :class:`~.QATModule` instance converted from
+        a float :class:`~.Module` instance.
+        """
+        qat_module = cls(
+            float_module.conv.in_channels,
+            float_module.conv.out_channels,
+            float_module.conv.kernel_size,
+            float_module.conv.stride,
+            float_module.conv.padding,
+            float_module.conv.dilation,
+            float_module.conv.groups,
+            float_module.conv.bias is not None,
+            float_module.conv.conv_mode.name,
+            float_module.conv.compute_mode.name,
+        )
+        qat_module.conv.weight = float_module.conv.weight
+        qat_module.conv.bias = float_module.conv.bias
+        qat_module.bn = float_module.bn
+        return qat_module
+
+
+class ConvBn2d(_ConvBnActivation2d):
+    r"""
+    A fused :class:`~.QATModule` including Conv2d and BatchNorm2d with QAT support.
+    Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`.
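+
+    Normally obtained by converting a float net rather than being constructed
+    directly, e.g. (a sketch; ``float_net`` is a placeholder and the import
+    path follows the :func:`~.quantize.quantize_qat` reference above):
+
+    .. code-block::
+
+        from megengine.quantization.quantize import quantize_qat
+
+        qat_net = quantize_qat(float_net)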
+ """ + + def forward(self, inp): + return self.apply_quant_activation(self.calc_conv_bn_qat(inp)) + + +class ConvBnRelu2d(_ConvBnActivation2d): + r""" + A fused :class:`~.QATModule` including Conv2d, BatchNorm2d and relu with QAT support. + Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. + """ + + def forward(self, inp): + return self.apply_quant_activation(relu(self.calc_conv_bn_qat(inp))) diff --git a/imperative/python/megengine/module/qat/elemwise.py b/imperative/python/megengine/module/qat/elemwise.py new file mode 100644 index 0000000000000000000000000000000000000000..f99583bdeaf8d9a4739088920a99bb8ab7973e29 --- /dev/null +++ b/imperative/python/megengine/module/qat/elemwise.py @@ -0,0 +1,31 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .. import elemwise as Float +from .module import QATModule + + +class Elemwise(Float.Elemwise, QATModule): + r""" + A :class:`~.QATModule` to do elemwise operator with QAT support. + Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. + + :param method: the elemwise method, see :class:`~.module.elemwise.Elemwise` for detail. + """ + + with_weight = False + + def forward(self, *inps): + return self.apply_quant_activation(super().forward(*inps)) + + @classmethod + def from_float_module(cls, float_module: Float.Elemwise): + r""" + Return a :class:`~.QATModule` instance converted from + a float :class:`~.Module` instance. + """ + return cls(float_module.method.name) diff --git a/imperative/python/megengine/module/qat/linear.py b/imperative/python/megengine/module/qat/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..4067d51c6386aeb601f78591f9f609f7495f5751 --- /dev/null +++ b/imperative/python/megengine/module/qat/linear.py @@ -0,0 +1,39 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from ...quantization.utils import fake_quant_bias +from .. import linear as Float +from .module import QATModule + + +class Linear(Float.Linear, QATModule): + r""" + A :class:`~.QATModule` version of :class:`~.module.linear.Linear`. + Could be applied with :class:`~.Observer` and :class:`~.FakeQuantize`. + + :param in_features: size of each input sample. + :param out_features: size of each output sample. + :param bias: If set to ``False``, the layer will not learn an additive bias. + Default: ``True`` + + """ + + def forward(self, x): + w_qat = self.apply_quant_weight(self.weight) + b_qat = fake_quant_bias(self.bias, x, w_qat) + return self.apply_quant_activation(self._calc_linear(x, w_qat, b_qat)) + + @classmethod + def from_float_module(cls, float_module: Float.Linear): + r""" + Return a :class:`~.QATModule` instance converted from + a float :class:`~.Module` instance. 
+        """
+        qmod = cls(float_module.in_features, float_module.out_features)
+        qmod.weight = float_module.weight
+        qmod.bias = float_module.bias
+        return qmod
diff --git a/imperative/python/megengine/module/qat/module.py b/imperative/python/megengine/module/qat/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..544e04aff63fb6f73dedf832a6d186d09749bd15
--- /dev/null
+++ b/imperative/python/megengine/module/qat/module.py
@@ -0,0 +1,154 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from abc import abstractmethod
+
+from ...quantization import FakeQuantize, Observer, QConfig
+from ...tensor import Tensor
+from ..module import Module
+
+
+class QATModule(Module):
+    r"""
+    Base class of quantized-float related Module, basically for QAT and Calibration.
+
+    Use :meth:`~.QATModule.from_float_module` to generate an instance from a float
+    :class:`~.Module`. Or use :func:`~.quantize.quantize_qat` to do it recursively and
+    automatically.
+
+    It can be further converted to :class:`~.QuantizedModule` for deployment using
+    :func:`~.quantize.quantize`.
+    """
+
+    with_weight = True
+    with_act = True
+
+    def __init__(self):
+        super().__init__()
+
+        self.weight_observer = None  # type: Observer
+        self.act_observer = None  # type: Observer
+
+        self.weight_fake_quant = None  # type: FakeQuantize
+        self.act_fake_quant = None  # type: FakeQuantize
+
+    def set_qconfig(self, qconfig: QConfig):
+        r"""
+        Set quantization related configs with ``qconfig``, including
+        observer and fake_quant for weight and activation.
+        """
+
+        def safe_call(func):
+            return func() if func is not None else None
+
+        if self.with_act:
+            self.act_observer = safe_call(qconfig.act_observer)
+            self.act_fake_quant = safe_call(qconfig.act_fake_quant)
+        if self.with_weight:
+            self.weight_observer = safe_call(qconfig.weight_observer)
+            self.weight_fake_quant = safe_call(qconfig.weight_fake_quant)
+
+    def _enable_exec(self, with_module, func, enable):
+        if not with_module:
+            return
+        if enable:
+            func.enable()
+        else:
+            func.disable()
+
+    def set_fake_quant(self, enable):
+        self._enable_exec(self.with_act, self.act_fake_quant, enable)
+        self._enable_exec(self.with_weight, self.weight_fake_quant, enable)
+
+    def set_observer(self, enable):
+        self._enable_exec(self.with_act, self.act_observer, enable)
+        self._enable_exec(self.with_weight, self.weight_observer, enable)
+
+    def _apply_fakequant_with_observer(
+        self, target: Tensor, fake_quant: FakeQuantize, observer: Observer
+    ):
+        # do observer
+        if observer is None:
+            oup = target
+            q_dict = None
+        else:
+            oup = observer(target)
+            q_dict = observer.get_qparams()
+        # do fake quant
+        if fake_quant is not None:
+            oup = fake_quant(oup, q_dict)
+            # use qparams of fake_quant if available.
+            if hasattr(fake_quant, "get_qparams"):
+                q_dict = fake_quant.get_qparams()
+        # attach the qparams to the output tensor.
+        if q_dict is not None:
+            oup.q_dict.update(q_dict)
+        return oup
+
+    def apply_quant_weight(self, target: Tensor):
+        r"""
+        Apply weight's observer and fake_quant from ``qconfig`` on ``target``.
+        """
+        return self._apply_fakequant_with_observer(
+            target, self.weight_fake_quant, self.weight_observer
+        )
+
+    def apply_quant_activation(self, target: Tensor):
+        r"""
+        Apply activation's observer and fake_quant from ``qconfig`` on ``target``.
+        """
+        return self._apply_fakequant_with_observer(
+            target, self.act_fake_quant, self.act_observer
+        )
+
+    def _get_method_result(
+        self, method: str, fake_quant: FakeQuantize, observer: Observer
+    ):
+        if hasattr(fake_quant, method):
+            return getattr(fake_quant, method)()
+        elif hasattr(observer, method):
+            return getattr(observer, method)()
+        return None
+
+    def get_weight_dtype(self):
+        r"""
+        Get weight's quantization dtype as the method from ``qconfig``.
+        """
+        return self._get_method_result(
+            "get_dtype", self.weight_fake_quant, self.weight_observer
+        )
+
+    def get_activation_dtype(self):
+        r"""
+        Get activation's quantization dtype as the method from ``qconfig``.
+        """
+        return self._get_method_result(
+            "get_dtype", self.act_fake_quant, self.act_observer
+        )
+
+    def get_weight_qparams(self):
+        r"""
+        Get weight's quantization parameters.
+        """
+        return self._get_method_result(
+            "get_qparams", self.weight_fake_quant, self.weight_observer
+        )
+
+    def get_activation_qparams(self):
+        r"""
+        Get activation's quantization parameters.
+        """
+        return self._get_method_result(
+            "get_qparams", self.act_fake_quant, self.act_observer
+        )
+
+    @classmethod
+    @abstractmethod
+    def from_float_module(cls, float_module: Module):
+        r"""
+        Return a :class:`~.QATModule` instance converted from
+        a float :class:`~.Module` instance.
+        """
diff --git a/imperative/python/megengine/module/qat/quant_dequant.py b/imperative/python/megengine/module/qat/quant_dequant.py
new file mode 100644
index 0000000000000000000000000000000000000000..0baa3e1c7822085e520c01017d8219104905d6ec
--- /dev/null
+++ b/imperative/python/megengine/module/qat/quant_dequant.py
@@ -0,0 +1,50 @@
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .. import quant_dequant as Float
+from .module import QATModule
+
+
+class QuantStub(Float.QuantStub, QATModule):
+    r"""
+    A helper :class:`~.QATModule` that simply returns input, but will quantize
+    input after being converted to :class:`~.QuantizedModule`.
+    """
+
+    with_weight = False
+
+    def forward(self, inp):
+        return self.apply_quant_activation(inp)
+
+    @classmethod
+    def from_float_module(cls, float_module: Float.QuantStub):
+        r"""
+        Return a :class:`~.QATModule` instance converted from
+        a float :class:`~.Module` instance.
+        """
+        return cls()
+
+
+class DequantStub(Float.DequantStub, QATModule):
+    r"""
+    A helper :class:`~.QATModule` that simply returns input, but will de-quantize
+    input after being converted to :class:`~.QuantizedModule`.
+    """
+
+    with_weight = False
+    with_act = False
+
+    def forward(self, inp):
+        return inp
+
+    @classmethod
+    def from_float_module(cls, float_module: Float.DequantStub):
+        r"""
+        Return a :class:`~.QATModule` instance converted from
+        a float :class:`~.Module` instance.
+ """ + return cls() diff --git a/imperative/python/megengine/module/quant_dequant.py b/imperative/python/megengine/module/quant_dequant.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf2b0cc3ae333c34574dfc3e6284a70c99bc3eb --- /dev/null +++ b/imperative/python/megengine/module/quant_dequant.py @@ -0,0 +1,28 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .module import Module + + +class QuantStub(Module): + r""" + A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule` + version :class:`~.qat.QuantStub` using :func:`~.quantize.quantize_qat`. + """ + + def forward(self, inp): + return inp + + +class DequantStub(Module): + r""" + A helper :class:`~.Module` simply returning input. Could be replaced with :class:`~.QATModule` + version :class:`~.qat.DequantStub` using :func:`~.quantize.quantize_qat`. + """ + + def forward(self, inp): + return inp diff --git a/imperative/python/megengine/module/quantized/__init__.py b/imperative/python/megengine/module/quantized/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e641476d6a363a609660fb2495bf946e91b7b6c8 --- /dev/null +++ b/imperative/python/megengine/module/quantized/__init__.py @@ -0,0 +1,14 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .concat import Concat +from .conv import Conv2d, ConvRelu2d +from .conv_bn import ConvBn2d, ConvBnRelu2d +from .elemwise import Elemwise +from .linear import Linear +from .module import QuantizedModule +from .quant_dequant import DequantStub, QuantStub diff --git a/imperative/python/megengine/module/quantized/concat.py b/imperative/python/megengine/module/quantized/concat.py new file mode 100644 index 0000000000000000000000000000000000000000..5815d7d9ee885cda3965beb4a5171590bcd1eb9b --- /dev/null +++ b/imperative/python/megengine/module/quantized/concat.py @@ -0,0 +1,35 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Iterable + +from ... import functional as F +from ...tensor import Tensor +from ..qat import concat as QAT +from .module import QuantizedModule + + +class Concat(QuantizedModule): + r""" + A :class:`~.QuantizedModule` to do quantized concat, inference only. 
+ """ + + def __init__(self, dtype=None): + super().__init__() + self.output_dtype = dtype + + def forward(self, inps: Iterable[Tensor], axis: int = 0): + new_inps = (x.astype(self.output_dtype) for x in inps) + return F.concat(new_inps, axis) + + @classmethod + def from_qat_module(cls, qat_module: QAT.Concat): + r""" + return a :class:`~.QuantizedModule` instance converted from a + :class:`~.QATModule` instance. + """ + return cls(qat_module.get_activation_dtype()) diff --git a/imperative/python/megengine/module/quantized/conv.py b/imperative/python/megengine/module/quantized/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..696e4f63ec62577c726cc43934c43aa30b27e995 --- /dev/null +++ b/imperative/python/megengine/module/quantized/conv.py @@ -0,0 +1,107 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Tuple, Union + +import numpy as np + +from ... import module as Float +from ...core.tensor import dtype +from ...functional import conv_bias_activation +from ...tensor_nn import Parameter +from ..qat import conv as QAT +from .module import QuantizedModule + + +class Conv2d(Float.Conv2d, QuantizedModule): + r"""quantized version of :class:`~.qat.conv.Conv2d`.""" + r"""Applies a 2D convolution over an quantized input tensor, inference only. + + The parameter is same with :class: `~.Conv2d` + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + conv_mode: str = "CROSS_CORRELATION", + compute_mode: str = "DEFAULT", + dtype=None, + ): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + True, + conv_mode, + compute_mode, + ) + self.output_dtype = dtype + + def calc_conv_quantized(self, inp, nonlinear_mode="IDENTITY"): + inp_scale = dtype.get_scale(inp.dtype) + w_scale = dtype.get_scale(self.weight.dtype) + bias_scale = inp_scale * w_scale + return conv_bias_activation( + inp, + self.weight, + self.bias.astype(dtype.qint32(bias_scale)), + self.output_dtype, + self.stride, + self.padding, + self.dilation, + self.groups, + conv_mode=self.conv_mode, + compute_mode=self.compute_mode, + nonlinear_mode=nonlinear_mode, + ) + + @classmethod + def from_qat_module(cls, qat_module: QAT.Conv2d): + r""" + return a :class:`~.QuantizedModule` instance converted from a + :class:`~.QATModule` instance. 
+ """ + output_dtype = qat_module.get_activation_dtype() + qconv = cls( + qat_module.in_channels, + qat_module.out_channels, + qat_module.kernel_size, + qat_module.stride, + qat_module.padding, + qat_module.dilation, + qat_module.groups, + dtype=output_dtype, + ) + weight = qat_module.weight.astype(qat_module.get_weight_dtype()) + qconv.weight = Parameter(weight.numpy()) + if qat_module.bias is not None: + qconv.bias = Parameter(qat_module.bias.numpy()) + else: + qconv.bias = Parameter( + np.zeros(qat_module._infer_bias_shape(), dtype=np.float32) + ) + return qconv + + def forward(self, inp): + return self.calc_conv_quantized(inp, nonlinear_mode="IDENTITY") + + +class ConvRelu2d(Conv2d): + r"""quantized version of :class:`~.qat.conv.ConvRelu2d`.""" + + def forward(self, inp): + return self.calc_conv_quantized(inp, nonlinear_mode="RELU") diff --git a/imperative/python/megengine/module/quantized/conv_bn.py b/imperative/python/megengine/module/quantized/conv_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..e7c1de08aec800101d613a16637c89cf215da70d --- /dev/null +++ b/imperative/python/megengine/module/quantized/conv_bn.py @@ -0,0 +1,56 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from ...tensor_nn import Parameter +from ..qat import conv_bn as QAT +from .conv import Conv2d + + +class _ConvBnActivation2d(Conv2d): + r"""Applies a 2D convolution over an quantized input tensor, inference only. + + The parameter is same with :class: `~.Conv2d` + """ + + @classmethod + def from_qat_module(cls, qat_module: QAT._ConvBnActivation2d): + r""" + return a :class:`~.QuantizedModule` instance converted from a + :class:`~.QATModule` instance. + """ + output_dtype = qat_module.get_activation_dtype() + qconv = cls( + qat_module.conv.in_channels, + qat_module.conv.out_channels, + qat_module.conv.kernel_size, + qat_module.conv.stride, + qat_module.conv.padding, + qat_module.conv.dilation, + qat_module.conv.groups, + dtype=output_dtype, + ) + w_fold, b_fold = qat_module.fold_weight_bias( + qat_module.bn.running_mean, qat_module.bn.running_var + ) + weight = w_fold.astype(qat_module.get_weight_dtype()) + qconv.weight = Parameter(weight.numpy()) + qconv.bias = Parameter(b_fold.numpy()) + return qconv + + +class ConvBn2d(_ConvBnActivation2d): + r"""quantized version of :class:`~.qat.conv_bn.ConvBn2d`.""" + + def forward(self, inp): + return self.calc_conv_quantized(inp, nonlinear_mode="IDENTITY") + + +class ConvBnRelu2d(_ConvBnActivation2d): + r"""quantized version of :class:`~.qat.conv_bn.ConvBnRelu2d`.""" + + def forward(self, inp): + return self.calc_conv_quantized(inp, nonlinear_mode="RELU") diff --git a/imperative/python/megengine/module/quantized/elemwise.py b/imperative/python/megengine/module/quantized/elemwise.py new file mode 100644 index 0000000000000000000000000000000000000000..8caee62ed8ff04ffc1520967ccf6f61fb1be8448 --- /dev/null +++ b/imperative/python/megengine/module/quantized/elemwise.py @@ -0,0 +1,36 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from ...core.ops._internal import param_defs as P +from ...functional.elemwise import _elemwise_multi_type +from ...tensor import Tensor +from ..qat import elemwise as QAT +from .module import QuantizedModule + + +class Elemwise(QuantizedModule): + r"""quantized version of :class:`~.qat.elemwise.Elemwise`.""" + + _elemwise_multi_type_mode = P.ElemwiseMultiType.Mode + + def __init__(self, method, dtype=None): + super().__init__() + self.method = self._elemwise_multi_type_mode.convert("Q" + method) + self.output_dtype = dtype + + def forward(self, *inps): + if self.training: + raise ValueError("quantized module only support inference.") + return _elemwise_multi_type(*inps, mode=self.method, dtype=self.output_dtype) + + @classmethod + def from_qat_module(cls, qat_module: QAT.Elemwise): + r""" + return a :class:`~.QuantizedModule` instance converted from a + :class:`~.QATModule` instance. + """ + return cls(qat_module.method.name, qat_module.get_activation_dtype()) diff --git a/imperative/python/megengine/module/quantized/linear.py b/imperative/python/megengine/module/quantized/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..e42fe266b9e051923a20c62ab22e2a5b07ebb18e --- /dev/null +++ b/imperative/python/megengine/module/quantized/linear.py @@ -0,0 +1,52 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np + +from ... import functional as F +from ...core.tensor import dtype +from ...tensor_nn import Parameter +from ..qat import linear as QAT +from .module import QuantizedModule + + +class Linear(QuantizedModule): + r"""quantized version of :class:`~.qat.linear.Linear`.""" + + def __init__( + self, dtype: np.dtype = None, + ): + super().__init__() + self.weight = None + self.bias = None + self.output_dtype = dtype + + def forward(self, inp): + if self.training: + raise ValueError("quantized module only support inference.") + inp_scale = dtype.get_scale(inp.dtype) + w_scale = dtype.get_scale(self.weight.dtype) + bias_dtype = dtype.qint32(inp_scale * w_scale) + return F.linear( + inp, + self.weight, + None if self.bias is None else self.bias.astype(bias_dtype), + ).astype(self.output_dtype) + + @classmethod + def from_qat_module(cls, qat_module: QAT.Linear): + r""" + return a :class:`~.QuantizedModule` instance converted from a + :class:`~.QATModule` instance. 
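 
Lowering a QAT elemwise op therefore needs only two pieces of information: the mode name, which gains a `Q` prefix when looked up in `ElemwiseMultiType.Mode` (for example `ADD` becomes `QADD`), and the calibrated output dtype. A sketch, with `qat_add` standing in for a calibrated `qat.Elemwise("ADD")`:

```python
from megengine.module.quantized import Elemwise

# qat_add: a megengine.module.qat.Elemwise("ADD") whose observers have already run.
q_add = Elemwise.from_qat_module(qat_add)   # mode -> Mode.QADD, dtype from the observer
out = q_add(x_q, y_q)                       # x_q, y_q: quantized input tensors
```
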
+ """ + output_dtype = qat_module.get_activation_dtype() + qmod = cls(dtype=output_dtype) + weight = qat_module.weight.astype(qat_module.get_weight_dtype()) + qmod.weight = Parameter(weight.numpy()) + if qat_module.bias is not None: + qmod.bias = Parameter(qat_module.bias.numpy()) + return qmod diff --git a/imperative/python/megengine/module/quantized/module.py b/imperative/python/megengine/module/quantized/module.py new file mode 100644 index 0000000000000000000000000000000000000000..4fccdbfa27b1b17a8de486bbb41213f3585556a5 --- /dev/null +++ b/imperative/python/megengine/module/quantized/module.py @@ -0,0 +1,31 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from abc import abstractmethod + +from ..module import Module +from ..qat import QATModule + + +class QuantizedModule(Module): + r""" + Base class of quantized Module, which should be converted from QATModule + and not support traning. + """ + + def __call__(self, *inputs, **kwargs): + if self.training: + raise ValueError("quantized module only support inference.") + return super().__call__(*inputs, **kwargs) + + @classmethod + @abstractmethod + def from_qat_module(cls, qat_module: QATModule): + r""" + return a :class:`~.QuantizedModule` instance converted from a + :class:`~.QATModule` instance. + """ diff --git a/imperative/python/megengine/module/quantized/quant_dequant.py b/imperative/python/megengine/module/quantized/quant_dequant.py new file mode 100644 index 0000000000000000000000000000000000000000..0c245011f80c1b509eb7490633d3bfc921254799 --- /dev/null +++ b/imperative/python/megengine/module/quantized/quant_dequant.py @@ -0,0 +1,49 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from ..qat import quant_dequant as QAT +from .module import QuantizedModule + + +class QuantStub(QuantizedModule): + r""" + quantized version of :class:`~.qat.quant_dequant.QuantStub`, + will convert input to quantized dtype. + """ + + def __init__(self, dtype=None): + super().__init__() + self.output_dtype = dtype + + def forward(self, inp): + return inp.astype(self.output_dtype) + + @classmethod + def from_qat_module(cls, qat_module: QAT.QuantStub): + r""" + return a :class:`~.QuantizedModule` instance converted from a + :class:`~.QATModule` instance. + """ + return cls(qat_module.get_activation_dtype()) + + +class DequantStub(QuantizedModule): + r""" + quantized version of :class:`~.qat.quant_dequant.DequantStub`, + will restore quantized input to float32 dtype. + """ + + def forward(self, inp): + return inp.astype("float32") + + @classmethod + def from_qat_module(cls, qat_module: QAT.DequantStub): + r""" + return a :class:`~.QuantizedModule` instance converted from a + :class:`~.QATModule` instance. 
+ """ + return cls() diff --git a/imperative/python/megengine/module/sequential.py b/imperative/python/megengine/module/sequential.py new file mode 100644 index 0000000000000000000000000000000000000000..03afd48a7e3f0b4012e2fd59e2b6bff4d66b602f --- /dev/null +++ b/imperative/python/megengine/module/sequential.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from collections import OrderedDict + +from .module import Module + + +class Sequential(Module): + r"""A sequential container. + Modules will be added to it in the order they are passed in the constructor. + Alternatively, an ordered dict of modules can also be passed in. + + To make it easier to understand, here is a small example: + + .. testcode:: + + import numpy as np + import megengine.nn as nn + import megengine.nn.functional as F + + batch_size = 64 + data = nn.Input("data", shape=(batch_size, 1, 28, 28), dtype=np.float32, value=np.zeros((batch_size, 1, 28, 28))) + label = nn.Input("label", shape=(batch_size,), dtype=np.int32, value=np.zeros(batch_size,)) + + data = data.reshape(batch_size, -1) + net = nn.Sequential( + nn.Linear(28 * 28, 320), + nn.Linear(320, 500), + nn.Linear(500, 320), + nn.Linear(320, 10) + ) + pred = net(data) + + loss = F.cross_entropy_with_softmax(pred, label) + + """ + + def __init__(self, *args): + super().__init__() + self.layer_keys = [] + self.layer_values = [] + if len(args) == 1 and isinstance(args[0], OrderedDict): + for key, module in args[0].items(): + # self.add_module(key, module) + setattr(self, key, module) + self.layer_keys.append(key) + self.layer_values.append(module) + else: + for idx, module in enumerate(args): + # self.add_module(str(idx), module) + setattr(self, str(idx), module) + self.layer_keys.append(str(idx)) + self.layer_values.append(module) + + def __getitem__(self, idx): + if isinstance(idx, slice): + return self.__class__( + OrderedDict(zip(self.layer_keys[idx], self.layer_values[idx])) + ) + else: + return self.layer_values[idx] + + def __setitem__(self, idx, module): + key = self.layer_keys[idx] + self.layer_values[idx] = module + return setattr(self, key, module) + + def __delitem__(self, idx): + if isinstance(idx, slice): + for key in self.layer_keys[idx]: + delattr(self, key) + del self.layer_keys[idx] + del self.layer_values[idx] + else: + delattr(self, self.layer_keys[idx]) + del self.layer_keys[idx] + del self.layer_values[idx] + + def __len__(self): + return len(self.layer_keys) + + def __iter__(self): + return iter(self.layer_values) + + def forward(self, inp): + for layer in self.layer_values: + inp = layer(inp) + return inp diff --git a/imperative/python/megengine/optimizer/__init__.py b/imperative/python/megengine/optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ad783e0605e0308354d5e2ef3ba21327086f3938 --- /dev/null +++ b/imperative/python/megengine/optimizer/__init__.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from .adadelta import Adadelta +from .adagrad import Adagrad +from .adam import Adam +from .lr_scheduler import LRScheduler +from .multi_step_lr import MultiStepLR +from .optimizer import Optimizer +from .sgd import SGD diff --git a/imperative/python/megengine/optimizer/adadelta.py b/imperative/python/megengine/optimizer/adadelta.py new file mode 100644 index 0000000000000000000000000000000000000000..9de92fa9b8631f4b25d2e8b4e2293a3d1c292260 --- /dev/null +++ b/imperative/python/megengine/optimizer/adadelta.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Iterable, Union + +import numpy as np + +from ..functional import sqrt +from ..tensor_nn import Buffer, Parameter +from .distributed_optimizer import DistributedOptimizer + + +class Adadelta(DistributedOptimizer): + r"""Implements Adadelta algorithm. + + It has been proposed in `"ADADELTA: An Adaptive Learning Rate Method" `_. + + :param params: iterable of parameters to optimize or dicts defining + parameter groups. + :param lr: coefficient that scale delta before it is applied + to the parameters (default: 1.0). + :param rho: coefficient used for computing a running average + of squared gradients (default: 0.9). + :param eps: term added to the denominator to improve + numerical stability (default: 1e-6). + :param weight_decay: weight decay (L2 penalty) (default: 0). + """ + + def __init__( + self, + params: Union[Iterable[Parameter], dict], + lr: float = 1.0, + rho: float = 0.9, + eps: float = 1e-6, + weight_decay: float = 0.0, + **kwargs + ): + assert lr >= 0.0, "Invalid learning rate: {}".format(lr) + assert rho >= 0.0 and rho <= 1.0, "Invalid rho value: {}".format(rho) + assert eps >= 0.0, "Invalid epsilon value: {}".format(eps) + assert weight_decay >= 0.0, "Invalid weight_decay value: {}".format( + weight_decay + ) + + defaults = dict(lr=lr, rho=rho, eps=eps, weight_decay=weight_decay) + super().__init__(params, defaults, **kwargs) + + def _create_state(self, param_group): + for param in param_group["params"]: + self._add_state(param, "square_avg") + self._add_state(param, "acc_delta") + self._add_state(param, "step", initializer=0.0) + + def _updates(self, param_group): + lr = param_group["lr"] + weight_decay = param_group["weight_decay"] + rho = param_group["rho"] + eps = param_group["eps"] + + for param in param_group["params"]: + + if param.__wrapped__ in self._grad_skip: + self._grad_skip.remove(param.__wrapped__) + continue + + if not isinstance(param.grad, Buffer): + raise TypeError( + "grad must be a Buffer, maybe you forget to call backward()?" 
+ ) + + if not param.requires_grad: + continue + + states = self._state[param] + step = states["step"] + step += 1.0 + grad = param.grad + if weight_decay != 0.0: + grad += param * weight_decay + + square_avg = states["square_avg"] + acc_delta = states["acc_delta"] + square_avg = rho * square_avg + (1 - rho) * grad ** 2 + std = sqrt(square_avg + eps) + delta = sqrt(acc_delta + eps) / std * grad + param -= lr * delta + acc_delta = rho * acc_delta + (1 - rho) * delta ** 2 + states["square_avg"]._reset(square_avg) + states["acc_delta"]._reset(acc_delta) + + assert len(self._grad_skip) == 0 diff --git a/imperative/python/megengine/optimizer/adagrad.py b/imperative/python/megengine/optimizer/adagrad.py new file mode 100644 index 0000000000000000000000000000000000000000..804c7abe7d0eed8fcd87ac6805ffbec4016947f5 --- /dev/null +++ b/imperative/python/megengine/optimizer/adagrad.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Iterable, Union + +import numpy as np + +from ..functional import sqrt +from ..tensor_nn import Buffer, Parameter +from .distributed_optimizer import DistributedOptimizer + + +class Adagrad(DistributedOptimizer): + r"""Implements Adagrad algorithm. + + It has been proposed in `"Adaptive Subgradient Methods for Online Learning + and Stochastic Optimization" `_. + + :param params: iterable of parameters to optimize or dicts defining + parameter groups. + :param lr: coefficient that scale delta before it is applied + to the parameters (default: 1e-2). + :param lr_decay: learning rate decay (default: 0) + :param eps: term added to the denominator to improve + numerical stability (default: 1e-10). + :param weight_decay: weight decay (L2 penalty) (default: 0). + """ + + def __init__( + self, + params: Union[Iterable[Parameter], dict], + lr: float = 1e-2, + lr_decay: float = 0.0, + eps: float = 1e-10, + weight_decay: float = 0.0, + **kwargs + ): + assert lr >= 0.0, "Invalid learning rate: {}".format(lr) + assert lr_decay >= 0, "Invalid learning rate decay: {}".format(lr_decay) + assert eps >= 0.0, "Invalid epsilon value: {}".format(eps) + assert weight_decay >= 0.0, "Invalid weight_decay value: {}".format( + weight_decay + ) + + defaults = dict(lr=lr, lr_decay=lr_decay, eps=eps, weight_decay=weight_decay) + super().__init__(params, defaults, **kwargs) + + def _create_state(self, param_group): + for param in param_group["params"]: + self._add_state(param, "square_avg") + self._add_state(param, "step", initializer=0.0) + + def _updates(self, param_group): + lr = param_group["lr"] + lr_decay = param_group["lr_decay"] + weight_decay = param_group["weight_decay"] + eps = param_group["eps"] + + for param in param_group["params"]: + + if param.__wrapped__ in self._grad_skip: + self._grad_skip.remove(param.__wrapped__) + continue + + if not isinstance(param.grad, Buffer): + raise TypeError( + "grad must be a Buffer, maybe you forget to call backward()?" 
+ ) + + if not param.requires_grad: + continue + + states = self._state[param] + step = states["step"] + step += 1.0 + grad = param.grad + if weight_decay != 0.0: + grad += param * weight_decay + + square_avg = states["square_avg"] + square_avg += grad ** 2 + delta = grad / sqrt(square_avg + eps) + clr = lr / (1 + (step - 1) * lr_decay) + + param -= clr * delta + assert len(self._grad_skip) == 0 diff --git a/imperative/python/megengine/optimizer/adam.py b/imperative/python/megengine/optimizer/adam.py new file mode 100644 index 0000000000000000000000000000000000000000..fac9f4cb88fa56b0d89751c3d4700df3b4fcb649 --- /dev/null +++ b/imperative/python/megengine/optimizer/adam.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Iterable, Tuple, Union + +from ..tensor_nn import Buffer, Parameter +from .distributed_optimizer import DistributedOptimizer + + +class Adam(DistributedOptimizer): + r"""Implements Adam algorithm proposed in `"Adam: A Method for Stochastic Optimization" `_. + + :param params: iterable of parameters to optimize or dicts defining + parameter groups. + :param lr: learning rate. + :param betas: coefficients used for computing running averages of gradient + and its square. Default: (0.9, 0.999) + :param eps: term added to the denominator to improve numerical stability + Default: 1e-8 + :param weight_decay: weight decay (L2 penalty). Default: 0 + """ + + def __init__( + self, + params: Union[Iterable[Parameter], dict], + lr: float, + betas: Tuple[float, float] = (0.9, 0.999), + eps: float = 1e-8, + weight_decay: float = 0.0, + **kwargs + ): + if lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if weight_decay < 0.0: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + defaults = dict(lr=lr, weight_decay=weight_decay, betas=betas, eps=eps) + super().__init__(params, defaults, **kwargs) + + def _create_state(self, param_group): + for param in param_group["params"]: + self._add_state(param, "exp_avg") + self._add_state(param, "exp_avg_sq") + self._add_state(param, "step", initializer=0.0) + + def _updates(self, param_group): + lr = param_group["lr"] + weight_decay = param_group["weight_decay"] + eps = param_group["eps"] + beta0, beta1 = param_group["betas"] + + for param in param_group["params"]: + + if param.__wrapped__ in self._grad_skip: + self._grad_skip.remove(param.__wrapped__) + continue + + if not param.requires_grad: + continue + + if not isinstance(param.grad, Buffer): + raise TypeError( + "grad must be a Buffer, maybe you forget to call backward()?" 
+ ) + + grad = param.grad + if weight_decay != 0.0: + grad += param * weight_decay + + states = self._state[param] + step = states["step"] + step += 1.0 + exp_avg = states["exp_avg"] + exp_avg_sq = states["exp_avg_sq"] + exp_avg = beta0 * exp_avg + grad * (1 - beta0) + exp_avg_sq = beta1 * exp_avg_sq + (1 - beta1) * (grad * grad) + + delta = (exp_avg / (1 - beta0 ** step)) / ( + (exp_avg_sq / (1 - beta1 ** step)) ** 0.5 + eps + ) + param -= lr * delta + + # not inplace change, need to update underlying tensor handler in state + states["exp_avg"]._reset(exp_avg) + states["exp_avg_sq"]._reset(exp_avg_sq) + + assert len(self._grad_skip) == 0 diff --git a/imperative/python/megengine/optimizer/distributed_optimizer.py b/imperative/python/megengine/optimizer/distributed_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..86168c9ad844f9d378f277f1721b77521b021746 --- /dev/null +++ b/imperative/python/megengine/optimizer/distributed_optimizer.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Iterable as Iter +from typing import Optional, Union + +from ..device import get_default_device +from ..distributed.group import get_client, is_distributed +from ..functional import add_update +from ..functional.distributed import WORLD, Group, all_reduce_sum, broadcast +from ..functional.utils import copy +from ..tensor import Tensor, TensorDict +from ..tensor_nn import Parameter +from .optimizer import Optimizer +from .param_pack import get_pack_list, pack_allreduce_split + + +class DistributedOptimizer(Optimizer): + r"""Add Distributed Func for distributed training. + + :param params: specifies what Tensors should be optimized. + :param defaults: a dict of default parameters of Optimizer, like learning rate or momentum. + :param reduce_method: use all_reduce_sum or all_reduce_mean to reduce gradients + :param bcast_period: broadcasts params every *bcast_period* iterations. + if it equals to 0, it will broadcast params only at the beginning. Default: 500 + :param param_pack: whether to pack gradients to avoid small packages send/recv. Default: False + :param param_pack_thd: max size of packed gradients by bytes. 
Default: 10 * 1024 * 1024 + """ + + def __init__( + self, + params: Union[Iter[Parameter], dict], + defaults: dict, + reduce_method: Optional[str] = None, + dist_group: Optional[Group] = WORLD, + bcast_period: int = 0, + param_pack: bool = False, + param_pack_thd: int = 10 * 1024 * 1024, + ): + if is_distributed(): + assert reduce_method in ["sum", "mean"], "reduce_method must be specified" + defaults["orders"] = [] + defaults["dist_group"] = dist_group + super().__init__(params, defaults) + self._bcast_period = bcast_period + self._param_pack = param_pack + self._param_pack_thd = param_pack_thd + self._reduce_method = reduce_method + + self.add_save_load_state_ignore_keys( + {"grads", "orders", "pack_list", "shape_list", "dist_group"} + ) + + if is_distributed() and bcast_period != -1: + self.bcast_param() + + def grad_callback(self, grad, i, group): + if is_distributed() and group["dist_group"] is not None: + dist_group = group["dist_group"] + if self._param_pack and "pack_list" in group: + for pack, shapes in zip(group["pack_list"], group["shape_list"]): + if i == pack[-1]: + pack_allreduce_split(group, pack, shapes, self._reduce_method) + else: + group["orders"].append(i) + group["grads"][i] = all_reduce_sum( + grad, dist_group, dist_group.comp_node + ) + if self._reduce_method == "mean": + group["grads"][i] /= dist_group.size + + def _gen_pack_list(self, group): + if "pack_list" not in group: + dist_group = group["dist_group"] + if dist_group.rank == 0: + pack_list, shape_list = get_pack_list(group, self._param_pack_thd) + get_client().set_pack_list(dist_group.key, (pack_list, shape_list)) + else: + pack_list, shape_list = get_client().get_pack_list(dist_group.key) + group["pack_list"] = pack_list + group["shape_list"] = shape_list + + def backward(self, loss: Tensor): + ret = super().backward(loss) + if is_distributed(): + for group in self.param_groups: + if self._param_pack and group["dist_group"] is not None: + self._gen_pack_list(group) + return ret + + def step(self): + if is_distributed(): + for group in self.param_groups: + device = get_default_device() + for param in group["params"]: + if param.__wrapped__ not in self._grad_skip: + if param.grad.device != device: + param.grad = copy(param.grad, device) + if self._bcast_period > 0: + self._bcast_iter += 1 + if self._bcast_iter == self._bcast_period: + self.bcast_param() + self._bcast_iter = 0 + super().step() + + def bcast_param(self): + device = get_default_device() + for group in self.param_groups: + for param in group["params"]: + dist_group = group["dist_group"] + new_param = broadcast(param, dist_group) + if new_param.device != device: + new_param = copy(new_param, device) + add_update(param, new_param, alpha=0) + param._reset(new_param) diff --git a/imperative/python/megengine/optimizer/lr_scheduler.py b/imperative/python/megengine/optimizer/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..46d08d5dc0d4a7a125dc49b108ebca9af43b3018 --- /dev/null +++ b/imperative/python/megengine/optimizer/lr_scheduler.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
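
Subclasses such as `SGD` forward extra keyword arguments here through `**kwargs`, so distributed behaviour is configured at construction time: `grad_callback` all-reduces each gradient as it becomes available, and `reduce_method="mean"` divides by world size so the update matches single-process training. An illustrative wiring, with `net` as a hypothetical module replicated on each worker (under `is_distributed()` the assertion above makes `reduce_method` mandatory):

```python
import megengine.optimizer as optim

opt = optim.SGD(
    net.parameters(),
    lr=0.05,
    momentum=0.9,
    reduce_method="mean",  # average gradients across workers
    param_pack=True,       # concat small grads so fewer all_reduce calls are issued
)
```
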
+from abc import ABCMeta + +from .distributed_optimizer import DistributedOptimizer + + +class LRScheduler(metaclass=ABCMeta): + r"""Base class for all learning rate based schedulers. + + :param optimizer: Wrapped optimizer. + :param current_epoch: The index of current epoch. Default: -1 + """ + + def __init__( # pylint: disable=too-many-branches + self, optimizer: DistributedOptimizer, current_epoch: int = -1 + ): + if not isinstance(optimizer, DistributedOptimizer): + raise TypeError( + "optimizer argument given to the lr_scheduler should be Optimizer" + ) + self.optimizer = optimizer + self.current_epoch = current_epoch + if current_epoch == -1: + for group in self.optimizer.param_groups: + group.setdefault("initial_lr", group["lr"]) + else: + for i, group in enumerate(optimizer.param_groups): + if "initial_lr" not in group: + raise KeyError( + "param 'initial_lr' is not specified in " + "param_groups[{}] when resuming an optimizer".format(i) + ) + self.base_lrs = list( + map(lambda group: group["initial_lr"], self.optimizer.param_groups) + ) + + self.step() + + def state_dict(self): + r"""Returns the state of the scheduler as a :class:`dict`. + It contains an entry for every variable in self.__dict__ which + is not the optimizer. + """ + raise NotImplementedError + + def load_state_dict(self, state_dict): + r"""Loads the schedulers state. + + :param state_dict (dict): scheduler state. + """ + raise NotImplementedError + + def get_lr(self): + r""" Compute current learning rate for the scheduler. + """ + raise NotImplementedError + + def step(self, epoch=None): + if epoch is None: + self.current_epoch += 1 + else: + self.current_epoch = epoch + + values = self.get_lr() + for param_group, lr in zip(self.optimizer.param_groups, values): + param_group["lr"] = lr diff --git a/imperative/python/megengine/optimizer/multi_step_lr.py b/imperative/python/megengine/optimizer/multi_step_lr.py new file mode 100644 index 0000000000000000000000000000000000000000..45cc74c3dde18b5feecd775dfa401d38b2de13a4 --- /dev/null +++ b/imperative/python/megengine/optimizer/multi_step_lr.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from bisect import bisect_right +from typing import Iterable as Iter + +from .distributed_optimizer import DistributedOptimizer +from .lr_scheduler import LRScheduler + + +class MultiStepLR(LRScheduler): + r"""Decays the learning rate of each parameter group by gamma once the + number of epoch reaches one of the milestones. + + :param optimizer: Wrapped optimizer. + :param milestones (list): List of epoch indices. Must be increasing. + :param gamma (float): Multiplicative factor of learning rate decay. Default: 0.1. + :param current_epoch: The index of current epoch. Default: -1. + """ + + def __init__( + self, + optimizer: DistributedOptimizer, + milestones: Iter[int], + gamma: float = 0.1, + current_epoch: int = -1, + ): + if not list(milestones) == sorted(milestones): + raise ValueError( + "Milestones should be a list of increasing integers. 
Got {}".format( + milestones + ) + ) + + self.milestones = milestones + self.gamma = gamma + super().__init__(optimizer, current_epoch) + + def state_dict(self): + r"""Returns the state of the scheduler as a :class:`dict`. + It contains an entry for every variable in self.__dict__ which + is not the optimizer. + """ + return { + key: value + for key, value in self.__dict__.items() + if key in ["milestones", "gamma", "current_epoch"] + } + + def load_state_dict(self, state_dict): + r"""Loads the schedulers state. + + :param state_dict (dict): scheduler state. + """ + tmp_dict = {} + for key in ["milestones", "gamma", "current_epoch"]: + if not key in state_dict.keys(): + raise KeyError( + "key '{}'' is not specified in " + "state_dict when loading state dict".format(key) + ) + tmp_dict[key] = state_dict[key] + + self.__dict__.update(tmp_dict) + + def get_lr(self): + return [ + base_lr * self.gamma ** bisect_right(self.milestones, self.current_epoch) + for base_lr in self.base_lrs + ] diff --git a/imperative/python/megengine/optimizer/optimizer.py b/imperative/python/megengine/optimizer/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..f5bf18b0797f2cf739b7a41f0f2d33bf09635bf5 --- /dev/null +++ b/imperative/python/megengine/optimizer/optimizer.py @@ -0,0 +1,347 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from abc import ABCMeta, abstractmethod +from collections import Iterable +from contextlib import contextmanager +from typing import Dict +from typing import Iterable as Iter +from typing import Set, Union + +import numpy as np + +from ..core.autodiff.grad import Grad +from ..device import get_default_device +from ..distributed.group import get_client, is_distributed +from ..functional import add_update +from ..functional.distributed import all_reduce_sum, broadcast +from ..functional.utils import copy +from ..logger import get_logger +from ..tensor import Tensor, TensorDict +from ..tensor_nn import Buffer, Parameter + +logger = get_logger(__name__) + + +class _RequiredParameter: + def __repr__(self): + return "" + + +required = _RequiredParameter() + + +class Optimizer(metaclass=ABCMeta): + r"""Base class for all optimizers. + + :param params: specifies what Tensors should be optimized. + :param defaults: a dict of default parameters of Optimizer, like learning rate or momentum. 
+ """ + + _recording = None + _grad = None + _gradients = None + + def __init__( # pylint: disable=too-many-branches + self, params: Union[Iter[Parameter], dict], defaults: dict, + ): + self._state = TensorDict() + self._defaults = defaults + + if isinstance(params, (Parameter, dict)): + params = [params] + else: + if not isinstance(params, Iterable): + raise TypeError( + "params argument given to the optimizer should be " + "Parameter or dict, or Iterable of them" + ) + + self.param_groups = [] # type: list + self.save_load_state_ignore_keys = set() + + param_groups = list(params) + if len(param_groups) == 0: + raise ValueError("optimizer got an empty parameter list") + + param_type = type(param_groups[0]) + for param in param_groups: + if not isinstance(param, param_type): + raise TypeError( + "types of params argument given to the optimizer shoud be same" + ) + + if not isinstance(param_groups[0], dict): + param_groups = [{"params": param_groups}] + + for group in param_groups: + self.add_param_group(group) + + for group in self.param_groups: + self._create_state(group) + + def add_param_group(self, param_group: dict): + r"""Add a param group to ``param_groups`` of the :class:`~megengine.optim.optimizer.Optimizer`. + + This can be useful when fine tuning a pre-trained network as frozen layers can be made + trainable and added to the :class:`~megengine.optim.optimizer.Optimizer` as training progresses. + + :param param_group: specifies what tensors should be optimized along with group. + + """ + assert isinstance(param_group, dict), "param group must be a dict" + + if isinstance(param_group["params"], Parameter): + param_group["params"] = [param_group["params"]] + else: + param_group["params"] = list(param_group["params"]) + + for param in param_group["params"]: + if not isinstance(param, Parameter): + raise TypeError( + "optimizer can only optimize Parameters, but one of the params is " + + type(param) + ) + if not param.requires_grad: + raise ValueError( + "optimizer can only optimize Parameters with requires_grad=True" + ) + + for name, default in self._defaults.items(): + if default is required and name not in param_group: + raise ValueError( + "parameter group didn't specify a value of " + "required optimization parameter " + name + ) + param_group.setdefault(name, default) + + param_set = set() + + for group in self.param_groups: + param_set.update(set(map(id, group["params"]))) + + assert param_set.isdisjoint( + set(map(id, param_group["params"])) + ), "some parameters appear in more than one parameter group" + + self.param_groups.append(param_group) + + def _add_state(self, param, state_name, initializer=None): + if initializer is None: + initializer = np.zeros(param.shape, dtype=np.float32) + state_dict = self._state.setdefault(param, {}) + assert state_name not in state_dict + state = Buffer(initializer) + state_dict[state_name] = state + + @abstractmethod + def _create_state(self, param_group): + pass + + @abstractmethod + def _updates(self, param_group): + pass + + def _get_params(self): + params = [] + for group in self.param_groups: + for param in group["params"]: + params.append(param) + return params + + def grad_callback(self, grad, i, group): + pass + + def record(self): + @contextmanager + def recorder(): + params = self._get_params() + grad = Grad() + gradients = [None] * len(params) + if self._recording: + raise RuntimeError("already recording!") + try: + self._recording = True + self._grad = grad + for group in self.param_groups: + group["grads"] = [None] * 
len(group["params"]) + for i, param in enumerate(group["params"]): + + def callback(tensor, grad, i=i, group=group, self=self): + group["grads"][i] = grad + self.grad_callback(grad, i, group) + + grad.wrt(param, callback=callback) + with grad: + yield + finally: + self._recording = False + self._grad = None + for group in self.param_groups: + group["grads"] = [] + + return recorder() + + def _calculate_gradients(self, loss: Tensor): + if not self._recording: + raise RuntimeError( + "no computation history. " + "did you forget record() or " + "call a method that clears the history?" + ) + assert self._grad is not None + + if len(loss.__wrapped__._extra_data) == 0: # in case loss depends on no tensor + self._grad = None + return + + one = Tensor([1.0], dtype=loss.dtype, device=loss.device) + one = one.reshape(loss.shape) + try: + self._grad(loss, one) + finally: + self._grad = None + + def minimize(self, loss: Tensor): + self.backward(loss) + self.step() + + def backward(self, loss: Tensor): + """Computes the back-propagation of the network given loss. + + :param loss: The obtained loss tensor + """ + rst = [] + self._calculate_gradients(loss) + + # _grad_skip records the parameters which are not in the path of backward + self._grad_skip = set() + for group in self.param_groups: + # _grad_skip is consumed in optimizer.step() + # XXX: assumptions + # 1. Assume the same execution sequence for all GPUs in data parallel + # 2. If backward is called by multiple times to accumulate grad, + # it's also assumed same _grad_skip for all backward() calls + # Please change the code if any assumption is invalid + for param, grad in zip(group["params"], group["grads"]): + if grad is None: + self._grad_skip.add(param.__wrapped__) + continue + grad = Buffer(grad) + if getattr(param, "grad", None) is None: + param.grad = grad + else: + assert isinstance(param.grad, Buffer) + param.grad += grad + rst.append(param.grad) + if len(self._grad_skip) > 0: + get_logger(__name__).warning( + "{} parameters have no grad! " + "Make sure you pass the right parameters list".format( + len(self._grad_skip) + ) + ) + return rst + + def step(self): + r"""Performs a single optimization step. + + """ + for group in self.param_groups: + if isinstance(group["params"], set): + raise TypeError( + "optimized parameters need to be organized in ordered collections, " + "but the ordering of parameters in sets will change between runs. " + "Please use a list instead." + ) + self._updates(group) + + def zero_grad(self): + r"""Reset the grad to zeros. + + """ + for param_group in self.param_groups: + for param in param_group["params"]: + if getattr(param, "grad", None) is not None: + param.grad = None + + def add_save_load_state_ignore_keys(self, keys: Set[str]): + self.save_load_state_ignore_keys |= keys + + def state_dict(self) -> Dict: + r"""Export the optimizer state. + + :return: optimizer state. Can be loaded by :meth:`load_state_dict`. 
+ """ + param_groups = [] + state = dict() + param2id = TensorDict() + + cur_id = 0 + for group in self.param_groups: + for param in group["params"]: + if param not in param2id: + param2id[param] = cur_id + cur_id += 1 + + for param, st in self._state.items(): + state[param2id[param]] = st + + for group in self.param_groups: + param_group = { + k: v + for k, v in group.items() + if k != "params" and k not in self.save_load_state_ignore_keys + } + param_group["params"] = [param2id[param] for param in group["params"]] + param_groups.append(param_group) + + return {"param_groups": param_groups, "state": state} + + def load_state_dict(self, state: dict): + r"""Loads the optimizer state. + + :param state: optimizer state. Should be an object returned + from a call to :meth:`state_dict`. + """ + if len(self.param_groups) != len(state["param_groups"]): + raise ValueError( + "loaded state dict has a different number of parameter groups" + ) + parameter_map = dict() # type: Dict + for group_new, group_saved in zip(self.param_groups, state["param_groups"]): + if len(group_new["params"]) != len(group_saved["params"]): + raise ValueError( + "loaded state dict contains a parameter group that " + "doesn't match the size of optimizer's group" + ) + for param_new, param_saved in zip( + group_new["params"], group_saved["params"] + ): + p = param_new + self._state[p] = state["state"][param_saved].copy() + for k, v in self._state[p].items(): + if isinstance(v, Buffer): + self._state[p][k] = Buffer(v.numpy()) + + new_keys = set(group_new.keys()) - self.save_load_state_ignore_keys + saved_keys = set(group_saved.keys()) - self.save_load_state_ignore_keys + if new_keys != saved_keys: + raise ValueError( + "loaded state dict contains a parameter group that " + "doesn't match the keys of optimizer's group" + ) + for key in saved_keys: + if key != "params": + group_new[key] = group_saved[key] + + if len(self._state.keys()) != len(state["state"].keys()): + raise ValueError( + "loaded state dict contains a state that doesn't match " + "the size of optimizer's state" + ) diff --git a/imperative/python/megengine/optimizer/param_pack.py b/imperative/python/megengine/optimizer/param_pack.py new file mode 100644 index 0000000000000000000000000000000000000000..ea117aa84b1c80faaf9bbf9a30c508b4c27c4335 --- /dev/null +++ b/imperative/python/megengine/optimizer/param_pack.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import numpy as np + +from ..functional import param_pack_concat, param_pack_split +from ..functional.distributed import all_reduce_sum +from ..tensor import Tensor + + +def get_offsets(shapes): + offsets = [] + offset = 0 + for shape in shapes: + offsets.append(offset) + offset += int(np.prod(shape)) + offsets.append(offset) + return offsets + + +def get_pack_list(param_group, param_pack_thd): + pack_list = dict() + shape_list = dict() + pack_sum = dict() + pack_ret, shape_ret = [], [] + ignore_first = 8 + ignore_last = 0 + orders_len = len(param_group["orders"]) + for i, idx in enumerate(param_group["orders"]): + param = param_group["params"][idx] + dtype = str(np.dtype(param.dtype)) + dtype_size = np.dtype(param.dtype).itemsize + shape = param.shape + if ignore_first > 0: + ignore_first -= 1 + pack_ret.append([idx]) + shape_ret.append([shape]) + continue + if dtype in pack_list.keys(): + pack_list[dtype].append(idx) + shape_list[dtype].append(shape) + pack_sum[dtype] += int(np.prod(shape)) + else: + pack_list[dtype] = [idx] + shape_list[dtype] = [shape] + pack_sum[dtype] = int(np.prod(shape)) + if ( + pack_sum[dtype] * dtype_size > param_pack_thd + or i + ignore_last > orders_len + ): + pack_ret.append(pack_list[dtype]) + shape_ret.append(shape_list[dtype]) + pack_list[dtype] = [] + shape_list[dtype] = [] + pack_sum[dtype] = 0 + for key in sorted(pack_list.keys()): + if len(pack_list[key]) > 0: + pack_ret.append(pack_list[key]) + shape_ret.append(shape_list[key]) + return pack_ret, shape_ret + + +def pack_allreduce_split(group, pack, shapes, reduce_method): + dist_group = group["dist_group"] + grads = [group["grads"][idx] for idx in pack] + offsets_val = get_offsets(shapes) + offsets = Tensor(offsets_val) + packed_grads = param_pack_concat(grads, offsets, offsets_val) + packed_grads = all_reduce_sum(packed_grads, dist_group, dist_group.comp_node) + if reduce_method == "mean": + packed_grads /= dist_group.size + grads = param_pack_split(packed_grads, offsets_val, shapes) + for i, grad in enumerate(grads): + group["grads"][pack[i]] = grad diff --git a/imperative/python/megengine/optimizer/sgd.py b/imperative/python/megengine/optimizer/sgd.py new file mode 100644 index 0000000000000000000000000000000000000000..4dfb485bb49b26db03733c7d864643b30aab7a03 --- /dev/null +++ b/imperative/python/megengine/optimizer/sgd.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Iterable, Union + +from ..tensor_nn import Buffer, Parameter +from .distributed_optimizer import DistributedOptimizer + + +class SGD(DistributedOptimizer): + r"""Implements stochastic gradient descent. + + Nesterov momentum is based on the formula from + `"On the importance of initialization and momentum in deep learning" `_ . + + :param params: iterable of parameters to optimize or dicts defining + parameter groups. + :param lr: learning rate. + :param momentum: momentum factor. Default: 0.0 + :param weight_decay: weight decay (L2 penalty). 
Default: 0.0 + """ + + def __init__( + self, + params: Union[Iterable[Parameter], dict], + lr: float, + momentum: float = 0.0, + weight_decay: float = 0.0, + **kwargs + ): + assert lr >= 0.0, "Invalid learning rate: {}".format(lr) + assert momentum >= 0.0, "Invalid momentum value: {}".format(momentum) + assert weight_decay >= 0.0, "Invalid weight_decay value: {}".format( + weight_decay + ) + + defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay) + super().__init__(params, defaults, **kwargs) + + def _create_state(self, param_group): + if param_group["momentum"] != 0.0: + for param in param_group["params"]: + self._add_state(param, "momentum_buffer") + + def _updates(self, param_group): + lr = param_group["lr"] + weight_decay = param_group["weight_decay"] + momentum = param_group["momentum"] + + for param in param_group["params"]: + + if param.__wrapped__ in self._grad_skip: + self._grad_skip.remove(param.__wrapped__) + continue + + if not isinstance(param.grad, Buffer): + raise TypeError( + "grad must be a Buffer, maybe you forget to call backward()?" + ) + + if not param.requires_grad: + continue + + grad = param.grad + if weight_decay != 0.0: + grad += param * weight_decay + + if momentum: + v = self._state[param]["momentum_buffer"] + v = momentum * v + grad + param -= lr * v + self._state[param]["momentum_buffer"]._reset(v) + else: + param -= lr * grad + + assert len(self._grad_skip) == 0 diff --git a/imperative/python/megengine/quantization/__init__.py b/imperative/python/megengine/quantization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9c8a0e0da5f9f7c8609584653f68b1d3ab584c85 --- /dev/null +++ b/imperative/python/megengine/quantization/__init__.py @@ -0,0 +1,20 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +from .fake_quant import FakeQuantize +from .internal_fake_quant import * +from .observer import HistogramObserver, Observer +from .qconfig import ( + QConfig, + calibration_qconfig, + ema_fakequant_qconfig, + ema_lowbit_fakequant_qconfig, + min_max_fakequant_qconfig, + tqt_quant_qconfig, +) +from .utils import QuantMode diff --git a/imperative/python/megengine/quantization/fake_quant.py b/imperative/python/megengine/quantization/fake_quant.py new file mode 100644 index 0000000000000000000000000000000000000000..941445c310eefe9f5a650578db3a79a75c8cc1b9 --- /dev/null +++ b/imperative/python/megengine/quantization/fake_quant.py @@ -0,0 +1,154 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import math +from typing import Iterable + +import numpy as np + +from .. 
import functional as F
+from ..core.tensor.dtype import _metadata_dict, get_quantized_dtype
+from ..core.tensor.function import Function
+from ..module import Module
+from ..tensor import Tensor
+from ..tensor_nn import Parameter
+from .utils import QuantMode, fake_quant_tensor, get_qparam_dict
+
+
+class _FakeQuantize(Module):
+    r"""
+    A basic fake quantization module.
+
+    :param dtype: A string indicating the target quantization type of input.
+    :param narrow_range: Whether the absolute value of ``qmin`` is the same as ``qmax``,
+        instead of 1 greater. Usually True for weight and False for activation.
+    :param enable: Whether to do ``fake_quant_forward`` (enabled) or ``normal_forward`` (disabled).
+    """
+
+    def __init__(self, dtype: str, narrow_range: bool = False, enable: bool = True):
+        super().__init__()
+        if dtype not in _metadata_dict.keys():
+            raise ValueError(
+                "unknown dtype: {}, only support {}".format(
+                    dtype, _metadata_dict.keys()
+                )
+            )
+        self.dtype = dtype
+        self.narrow_range = narrow_range
+        self.qmin = (
+            -_metadata_dict[dtype].qmax if narrow_range else _metadata_dict[dtype].qmin
+        )
+        self.qmax = _metadata_dict[dtype].qmax
+        self.enabled = enable
+
+    def enable(self):
+        self.enabled = True
+
+    def disable(self):
+        self.enabled = False
+
+    def fake_quant_forward(self, inp, q_dict=None):
+        return inp
+
+    def normal_forward(self, inp, q_dict=None):
+        return inp
+
+    def forward(self, inp, q_dict=None):
+        if self.enabled:
+            return self.fake_quant_forward(inp, q_dict=q_dict)
+        else:
+            return self.normal_forward(inp, q_dict=q_dict)
+
+
+class TQT_Function(Function):
+    def __init__(self, lowerbound, upperbound):
+        super().__init__()
+        self.lowerbound = lowerbound
+        self.upperbound = upperbound
+        self.saved_tensors = ()
+
+    def save_for_backward(self, *tensors: Iterable[Tensor]):
+        """
+        Saves tensors needed for gradient computation. This method should be called only
+        once in :meth:`~.function.Function.forward`; additional calls will replace values saved previously.
+
+        The saved tensors can be accessed through the ``saved_tensors`` attribute.
+        """
+        self.saved_tensors = tensors
+
+    def forward(self, inp, scale):
+        t = 2 ** scale
+        # t = F.maximum(t, 1e-4)
+        inp_scaled = inp / t
+        inp_clipped = F.maximum(F.minimum(inp_scaled, self.upperbound), self.lowerbound)
+        inp_rounded = F.round(inp_clipped)
+        inp_flq = inp_rounded * t
+        self.save_for_backward(inp_scaled, inp_rounded, t)
+        return inp_flq
+
+    def backward(self, grad_inp_flq):
+        (inp_scaled, inp_rounded, t) = self.saved_tensors
+        mask_clip = F.logical_or(
+            inp_scaled < -0.5 + self.lowerbound, inp_scaled > self.upperbound + 0.5
+        )  # mask for accumulating the gradients of |data_scaled| > L
+        mask_quant = F.logical_not(mask_clip)
+        grad_quant = (
+            grad_inp_flq * mask_quant * (inp_rounded - inp_scaled)
+        )  # gradient within |data_scaled| <= L
+        grad_clip = (
+            grad_inp_flq * mask_clip * inp_rounded
+        )  # gradient with |data_scaled| > L
+        grad_s = grad_clip.sum() + grad_quant.sum()
+        # dL/ds = dL/dt * t * ln(2)
+        grad_s = grad_s * t * math.log(2)
+        grad_inp = grad_inp_flq * mask_quant
+        return grad_inp, grad_s
+
+
+class TQT(_FakeQuantize):
+    r"""
+    TQT: `Trained Quantization Thresholds for Accurate and Efficient Fixed-Point
+    Inference of Deep Neural Networks <https://arxiv.org/abs/1903.08066>`_.
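+
+    A minimal usage sketch (``inp`` is assumed to be an existing :class:`~.Tensor`;
+    the surrounding training loop is omitted):
+
+    .. code-block::
+
+        fq = TQT(dtype="qint8", narrow_range=True)
+        fq.enable()       # fake-quant forward; ``scale`` is finetuned via STE gradients
+        out = fq(inp)     # quantize-round-dequantize with step size ``2 ** scale``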
+ """ + + def __init__(self, dtype: str, narrow_range: bool = False, enable: bool = True): + super().__init__(dtype, narrow_range, enable) + self.scale = Parameter(0.0, dtype=np.float32) + + def fake_quant_forward(self, inp, q_dict=None): + # when enable, TQT will do fakequant forward, finetune the scale + return TQT_Function(self.qmin, self.qmax)(inp, self.scale) + + def normal_foward(self, inp, q_dict=None): + if q_dict["enable_observer"]: + # when disable, TQT will do normal forward, initialize scale weight + tmp_scale = F.maximum(F.abs(q_dict["min_val"]), F.abs(q_dict["max_val"])) + tmp_scale = F.log(tmp_scale / 127) / math.log(2) + F.add_update(self.scale, tmp_scale, alpha=0.0, beta=1.0, bias=0.0) + return inp + + def get_qparams(self): + q_dict = get_qparam_dict(QuantMode.TQT) + q_dict["scale"] = 2 ** self.scale + return q_dict + + def get_dtype(self): + q_dict = self.get_qparams() + scale = None if "scale" not in q_dict else q_dict["scale"].numpy()[0] + zero_point = ( + None if "zero_point" not in q_dict else q_dict["zero_point"].numpy()[0] + ) + return get_quantized_dtype(self.dtype, scale, zero_point) + + +class FakeQuantize(_FakeQuantize): + r""" + A module to do quant and dequant according to observer's scale and zero_point. + """ + + def fake_quant_forward(self, inp, q_dict=None): + return fake_quant_tensor(inp, self.qmin, self.qmax, q_dict) diff --git a/imperative/python/megengine/quantization/internal_fake_quant.py b/imperative/python/megengine/quantization/internal_fake_quant.py new file mode 100644 index 0000000000000000000000000000000000000000..02d1d89767eab1ba801c075a7cea3e53edcaed39 --- /dev/null +++ b/imperative/python/megengine/quantization/internal_fake_quant.py @@ -0,0 +1,19 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import copy +import math +from functools import partial + +import numpy as np + +from .. import functional as F +from ..core.tensor.function import Function +from .fake_quant import _FakeQuantize +from .observer import MinMaxObserver +from .qconfig import QConfig + diff --git a/imperative/python/megengine/quantization/observer.py b/imperative/python/megengine/quantization/observer.py new file mode 100644 index 0000000000000000000000000000000000000000..3aa610820e59bff872e91123c2823b7b3ab0bf24 --- /dev/null +++ b/imperative/python/megengine/quantization/observer.py @@ -0,0 +1,404 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import math +from abc import abstractmethod + +import numpy as np + +from .. import functional as F +from ..core.tensor.dtype import _metadata_dict, get_quantized_dtype +from ..module import Module +from ..tensor_nn import Buffer +from .utils import QuantMode, Round, get_qparam_dict + + +class Observer(Module): + r""" + A base class for Observer Module. 
+
+    :param dtype: a string indicating which dtype the scale and zero_point are collected for.
+    :param narrow_range: Whether the absolute value of ``qmin`` is the same as ``qmax``,
+        instead of 1 greater. Usually True for weight and False for activation.
+    """
+
+    def __init__(self, dtype: str, narrow_range: bool = False):
+        super().__init__()
+        if dtype not in _metadata_dict.keys():
+            raise ValueError(
+                "unknown dtype: {}, only support {}".format(
+                    dtype, _metadata_dict.keys()
+                )
+            )
+        self.dtype = dtype
+        self.narrow_range = narrow_range
+        self.qmin = (
+            -_metadata_dict[dtype].qmax if narrow_range else _metadata_dict[dtype].qmin
+        )
+        self.qmax = _metadata_dict[dtype].qmax
+        self.enabled = True
+
+    def get_dtype(self):
+        q_dict = self.get_qparams()
+        numpy_scale = None if "scale" not in q_dict else q_dict["scale"].numpy()[0]
+        numpy_zero_point = (
+            None if "zero_point" not in q_dict else q_dict["zero_point"].numpy()[0]
+        )
+        return get_quantized_dtype(self.dtype, numpy_scale, numpy_zero_point)
+
+    def enable(self):
+        self.enabled = True
+
+    def disable(self):
+        self.enabled = False
+
+    def train(self, mode: bool = True, recursive: bool = True) -> None:
+        super().train(mode, recursive)
+        if mode:
+            self.enable()
+        else:
+            self.disable()
+
+    @abstractmethod
+    def forward(self, x):
+        pass
+
+    @abstractmethod
+    def get_qparams(self, **kwargs):
+        pass
+
+
+class MinMaxObserver(Observer):
+    def __init__(
+        self,
+        mode=QuantMode.SYMMERTIC,
+        eps=0.00001,
+        dtype="qint8",
+        narrow_range: bool = False,
+    ):
+        super().__init__(dtype, narrow_range)
+        self.mode = mode
+        self.min_val = Buffer(np.finfo(np.float32).max, dtype=np.float32)
+        self.max_val = Buffer(np.finfo(np.float32).min, dtype=np.float32)
+        self.scale_limit = eps
+
+    def _calculate_qparams(self, inp_min_val, inp_max_val):
+        min_val = F.minimum(0.0, inp_min_val)
+        max_val = F.maximum(0.0, inp_max_val)
+        q_dict = get_qparam_dict(self.mode)
+        q_dict["min_val"] = inp_min_val
+        q_dict["max_val"] = inp_max_val
+        q_dict["enable_observer"] = self.enabled
+        if self.mode == QuantMode.SYMMERTIC:
+            symmetric_max_vals = F.maximum(-min_val, max_val)
+            # use maximum to avoid the scale being too small at the beginning
+            q_dict["scale"] = F.maximum(
+                symmetric_max_vals / ((self.qmax - self.qmin) / 2), self.scale_limit
+            )
+            # zero_point = self.zero_point
+        else:
+            # use maximum to avoid the scale being too small at the beginning
+            q_dict["scale"] = F.maximum(
+                (max_val - min_val) / (self.qmax - self.qmin), self.scale_limit,
+            )
+            # calculate zero_point
+            q_dict["zero_point"] = self.qmin - Round()((min_val / q_dict["scale"]))
+
+        return q_dict
+
+    def get_qparams(self):
+        return self._calculate_qparams(self.min_val, self.max_val)
+
+    def forward(self, x_orig):
+        if self.enabled:
+            # stop gradient
+            x = x_orig.detach()
+            # find max and min
+            self.min_val = F.minimum(self.min_val, x.min())
+            self.max_val = F.maximum(self.max_val, x.max())
+        return x_orig
+
+
+class ExponentialMovingAverageObserver(MinMaxObserver):
+    def __init__(
+        self,
+        momentum=0.9,
+        mode=QuantMode.SYMMERTIC,
+        eps=0.00001,
+        dtype="qint8",
+        narrow_range: bool = False,
+    ):
+        super().__init__(mode, eps, dtype, narrow_range)
+        self.momentum = Buffer(momentum)
+        self.runtime_momentum = Buffer(0.0)
+
+    def set_momentum(self, momentum):
+        self.momentum.set_value(momentum)
+
+    def forward(self, x_orig):
+        if self.enabled:
+            # stop gradient
+            x = x_orig.detach()
+            # Exponential Moving Average
+            self.min_val = (
+                self.min_val * self.runtime_momentum
+                + (1 - self.runtime_momentum) * x.min()
+            )
+            self.max_val = (
self.max_val * self.runtime_momentum + + (1 - self.runtime_momentum) * x.max() + ) + self.runtime_momentum = self.momentum + + return x_orig + + +class HistogramObserver(MinMaxObserver): + def __init__( + self, + bins=2048, + upsample_rate=128, + mode=QuantMode.SYMMERTIC, + eps=0.00001, + dtype="qint8", + narrow_range: bool = False, + ): + super().__init__(mode, eps, dtype, narrow_range) + self.bins = bins + self.upsample_rate = upsample_rate + self.dst_nbins = _metadata_dict[dtype].qmax - _metadata_dict[dtype].qmin + 1 + self.histogram = Buffer([-1] + [0.0] * (bins - 1)) + + def _non_linear_param_search(self): + r"""Non-linear parameter search. + An approximation for L2 error minimization for selecting min/max. + By selecting new min/max, we filter out outliers in input distribution. + """ + + np_min_val = self.min_val.numpy()[0] + np_max_val = self.max_val.numpy()[0] + np_histogram = self.histogram.numpy() + assert len(np_histogram) == self.bins, "bins mistmatch" + bin_width = (np_max_val - np_min_val) / self.bins + + def _get_norm(delta_begin, delta_end, density, norm_type): + r""" + Compute the norm of the values uniformaly distributed between + delta_begin and delta_end. + norm = density * (integral_{begin, end} x^2) + = density * (end^3 - begin^3) / 3 + """ + assert norm_type == "L2", "Only L2 norms are currently supported" + norm = 0.0 + if norm_type == "L2": + norm = ( + delta_end * delta_end * delta_end + - delta_begin * delta_begin * delta_begin + ) / 3 + return density * norm + + def _compute_quantization_error(next_start_bin, next_end_bin, norm_type): + r""" + Compute the quantization error if we use start_bin to end_bin as the + min and max to do the quantization. + """ + + norm = 0.0 + dst_bin_width = ( + bin_width * (next_end_bin - next_start_bin + 1) / self.dst_nbins + ) + if dst_bin_width == 0.0: + return 0.0 + for src_bin in range(self.bins): + # distances from the beginning of first dst_bin to the beginning and + # end of src_bin + src_bin_begin = (src_bin - next_start_bin) * bin_width + src_bin_end = src_bin_begin + bin_width + + # which dst_bins the beginning and end of src_bin belong to? 
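+                # (a src_bin may lie entirely inside one dst_bin or straddle several;
+                # the branches below split its L2 contribution accordingly)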
+ dst_bin_of_begin = min( + self.dst_nbins - 1, + max(0.0, math.floor(src_bin_begin / dst_bin_width)), + ) + dst_bin_of_end = min( + self.dst_nbins - 1, + max(0.0, math.floor(src_bin_end / dst_bin_width)), + ) + dst_bin_of_begin_center = ( + dst_bin_of_begin * dst_bin_width + dst_bin_width / 2 + ) + + density = np_histogram[src_bin] / bin_width + if dst_bin_of_begin == dst_bin_of_end: + # if src_bin is entirely within 1 dst_bin + delta_begin = src_bin_begin - dst_bin_of_begin_center + delta_end = src_bin_end - dst_bin_of_begin_center + norm = norm + _get_norm(delta_begin, delta_end, density, norm_type) + else: + delta_begin = src_bin_begin - dst_bin_of_begin_center + delta_end = dst_bin_width / 2 + norm = norm + _get_norm(delta_begin, delta_end, density, norm_type) + + norm = norm + (dst_bin_of_end - dst_bin_of_begin - 1) * _get_norm( + -dst_bin_width / 2, dst_bin_width / 2, density, norm_type + ) + + dst_bin_of_end_center = ( + dst_bin_of_end * dst_bin_width + dst_bin_width / 2 + ) + + delta_begin = -dst_bin_width / 2 + delta_end = src_bin_end - dst_bin_of_end_center + norm = norm + _get_norm(delta_begin, delta_end, density, norm_type) + return norm + + # cumulative sum + total = sum(np_histogram) + cSum = np.cumsum(np_histogram, axis=0) + + stepsize = 1e-5 # granularity + alpha = 0.0 # lower bound + beta = 1.0 # upper bound + start_bin = 0 + end_bin = self.bins - 1 + norm_min = float("inf") + + while alpha < beta: + # Find the next step + next_alpha = alpha + stepsize + next_beta = beta - stepsize + + # find the left and right bins between the quantile bounds + l = start_bin + r = end_bin + while l < end_bin and cSum[l] < next_alpha * total: + l = l + 1 + while r > start_bin and cSum[r] > next_beta * total: + r = r - 1 + + # decide the next move + next_start_bin = start_bin + next_end_bin = end_bin + if (l - start_bin) > (end_bin - r): + # move the start bin + next_start_bin = l + alpha = next_alpha + else: + # move the end bin + next_end_bin = r + beta = next_beta + + if next_start_bin == start_bin and next_end_bin == end_bin: + continue + + # calculate the quantization error using next_start_bin and next_end_bin + norm = _compute_quantization_error(next_start_bin, next_end_bin, "L2") + + if norm > norm_min: + break + norm_min = norm + start_bin = next_start_bin + end_bin = next_end_bin + + new_min = self.min_val + bin_width * start_bin + new_max = self.min_val + bin_width * (end_bin + 1) + return new_min, new_max + + def get_qparams(self): + new_min, new_max = self._non_linear_param_search() + return self._calculate_qparams(new_min, new_max) + + def _combine_histograms( + self, orig_hist, new_hist, upsample_rate, downsample_rate, start_idx, Nbins + ): + # First up-sample the histogram with new data by a factor of L + # This creates an approximate probability density thats piecwise constant + upsampled_histogram = new_hist.repeat(upsample_rate) + # Now insert the upsampled histogram into the output + # histogram, which is initialized with zeros. 
+ # The offset at which the histogram is introduced is determined + # by the start index as the output histogram can cover a wider range + histogram_with_output_range = np.zeros((Nbins * downsample_rate)) + histogram_with_output_range[ + start_idx : Nbins * upsample_rate + start_idx + ] = upsampled_histogram + # Compute integral histogram, double precision is needed to ensure + # that there are no overflows + integral_histogram = np.cumsum(histogram_with_output_range, 0)[ + downsample_rate - 1 :: downsample_rate + ] + # Finally perform interpolation + shifted_integral_histogram = np.zeros((Nbins)) + shifted_integral_histogram[1:Nbins] = integral_histogram[0:-1] + interpolated_histogram = ( + integral_histogram - shifted_integral_histogram + ) / upsample_rate + orig_hist = orig_hist + interpolated_histogram + return orig_hist + + def _adjust_min_max(self, combined_min, combined_max, upsample_rate): + # We ensure that: + # (combined_max - combined_min)/(downsample_rate*Nbins) = (max - min)/(upsample_rate*Nbins) + # This allows us to have a common grid of resolution s, where we can align + # the input histogram + # start_idx maps min_val to the histogram bin index. + np_min_val = self.min_val.numpy()[0] + np_max_val = self.max_val.numpy()[0] + + hist_bin_width = (np_max_val - np_min_val) / (self.bins * upsample_rate) + downsample_rate = int( + np.ceil((combined_max - combined_min) / (self.bins * hist_bin_width)) + ) + e = downsample_rate * (self.bins * hist_bin_width) - ( + combined_max - combined_min + ) + combined_max = combined_max + e / 2 + combined_min = combined_min - e / 2 + start_idx = int(np.round((np_min_val - combined_min) / hist_bin_width)) + + return combined_min, combined_max, downsample_rate, start_idx + + def sideeffect_forward(self, x_orig): + x = x_orig.numpy() + min_val = self.min_val.numpy()[0] + max_val = self.max_val.numpy()[0] + histogram = self.histogram.numpy() + new_min = x.min() + new_max = x.max() + if histogram[0] == -1: + new_histogram, _ = np.histogram(x, self.bins, (new_min, new_max)) + else: + new_min = min(new_min, min_val) + new_max = max(new_max, max_val) + # combine the existing histogram and new histogram into 1 histogram + # We do this by first upsampling the histogram to a dense grid + # and then downsampling the histogram efficiently + (new_min, new_max, downsample_rate, start_idx,) = self._adjust_min_max( + new_min, new_max, self.upsample_rate + ) + + new_histogram, _ = np.histogram(x, self.bins, (new_min, new_max)) + new_histogram = new_histogram.astype(np.float64) + if new_min == min_val and new_max == max_val: + new_histogram += histogram + else: + new_histogram = self._combine_histograms( + new_histogram, + histogram, + self.upsample_rate, + downsample_rate, + start_idx, + self.bins, + ) + + self.histogram.set_value(new_histogram) + self.min_val.set_value(new_min) + self.max_val.set_value(new_max) + + def forward(self, x_orig): + self.sideeffect_forward(x_orig) + return x_orig diff --git a/imperative/python/megengine/quantization/qconfig.py b/imperative/python/megengine/quantization/qconfig.py new file mode 100644 index 0000000000000000000000000000000000000000..6606c1a513be2cf3d1a766a7c044f550b6c8480d --- /dev/null +++ b/imperative/python/megengine/quantization/qconfig.py @@ -0,0 +1,109 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from functools import partial
+
+from ..module import Module
+from .fake_quant import TQT, FakeQuantize
+from .observer import (
+    ExponentialMovingAverageObserver,
+    HistogramObserver,
+    MinMaxObserver,
+)
+
+
+class QConfig:
+    r"""
+    A config class indicating how to quantize a :class:`~.QATModule`'s
+    ``activation`` and ``weight``. See :meth:`~.QATModule.set_qconfig` for detailed usage.
+
+    :param weight_observer: interface to instantiate an :class:`~.Observer` indicating
+        how to collect scales and zero_point of weight.
+    :param act_observer: similar to ``weight_observer`` but for activation.
+    :param weight_fake_quant: interface to instantiate a :class:`~.FakeQuantize` indicating
+        how to do fake_quant calculation.
+    :param act_fake_quant: similar to ``weight_fake_quant`` but for activation.
+
+    Examples:
+
+    .. code-block::
+
+        # Default EMA QConfig for QAT.
+        ema_fakequant_qconfig = QConfig(
+            weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True),
+            act_observer=partial(ExponentialMovingAverageObserver, dtype="qint8", narrow_range=False),
+            weight_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=True),
+            act_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=False),
+        )
+
+    Each parameter is a ``class`` rather than an instance, and we recommend using
+    ``functools.partial`` to bind initialization parameters of the ``class``, so that you
+    don't need to provide them in :meth:`~.QATModule.set_qconfig`.
+
+    Usually we set ``narrow_range`` of weight-related parameters to ``True`` and of
+    activation-related parameters to ``False``. Consider a multiply-accumulate such as
+    ``a * b + c * d``: if all four variables are -128 of dtype ``qint8``, the result is
+    ``2^15`` and causes overflow. Weights commonly enter such computations, so their
+    range needs to be narrowed.
+    """
+
+    def __init__(
+        self, weight_observer, act_observer, weight_fake_quant, act_fake_quant
+    ):
+        if isinstance(act_observer, Module) or isinstance(weight_observer, Module):
+            raise ValueError(
+                "QConfig must not receive observer instance, please pass observer"
+                " class generator using `partial(Observer, ...)` instead.
Use" + " partial(MyObserver, x=1) to override arguments to constructor if needed" + ) + self.weight_observer = weight_observer + self.act_observer = act_observer + self.weight_fake_quant = weight_fake_quant + self.act_fake_quant = act_fake_quant + + +tqt_quant_qconfig = QConfig( + weight_observer=partial( + ExponentialMovingAverageObserver, dtype="qint8", narrow_range=True + ), + act_observer=partial( + ExponentialMovingAverageObserver, dtype="qint8", narrow_range=False + ), + weight_fake_quant=partial(TQT, dtype="qint8", narrow_range=True), + act_fake_quant=partial(TQT, dtype="qint8", narrow_range=False), +) + +min_max_fakequant_qconfig = QConfig( + weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True), + act_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=False), + weight_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=True), + act_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=False), +) + +ema_fakequant_qconfig = QConfig( + weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True), + act_observer=partial( + ExponentialMovingAverageObserver, dtype="qint8", narrow_range=False + ), + weight_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=True), + act_fake_quant=partial(FakeQuantize, dtype="qint8", narrow_range=False), +) + +ema_lowbit_fakequant_qconfig = QConfig( + weight_observer=partial(MinMaxObserver, dtype="qint4", narrow_range=False), + act_observer=partial( + ExponentialMovingAverageObserver, dtype="qint4", narrow_range=False + ), + weight_fake_quant=partial(FakeQuantize, dtype="qint4", narrow_range=False), + act_fake_quant=partial(FakeQuantize, dtype="qint4", narrow_range=False), +) + +calibration_qconfig = QConfig( + weight_observer=partial(MinMaxObserver, dtype="qint8", narrow_range=True), + act_observer=partial(HistogramObserver, dtype="qint8", narrow_range=False), + weight_fake_quant=None, + act_fake_quant=None, +) diff --git a/imperative/python/megengine/quantization/quantize.py b/imperative/python/megengine/quantization/quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..5dab2ae4e47e6d01dd557ead8b6ea95a73c90898 --- /dev/null +++ b/imperative/python/megengine/quantization/quantize.py @@ -0,0 +1,191 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from copy import copy, deepcopy +from typing import Callable, Dict, Tuple + +from .. 
import module as Float
+from ..module import Module
+from ..module import qat as QAT
+from ..module import quantized as Quantized
+from ..module.qat import QATModule
+from ..module.quantized import QuantizedModule
+from .fake_quant import TQT
+from .qconfig import QConfig, ema_fakequant_qconfig
+
+
+def _get_quantable_module_names():
+    def is_quantable(key: str):
+        value = getattr(Quantized, key)
+        return (
+            isinstance(value, type)
+            and issubclass(value, QuantizedModule)
+            and value != QuantizedModule
+        )
+
+    # source should have all quantable modules' names
+    quantable_module_names = [key for key in dir(Quantized) if is_quantable(key)]
+    return quantable_module_names
+
+
+def _get_convert_dict() -> Tuple[
+    Dict[Module, QATModule], Dict[QATModule, QuantizedModule]
+]:
+    quantable_module_names = _get_quantable_module_names()
+
+    quantable_modules = [getattr(Float, key) for key in quantable_module_names]
+    qat_modules = [getattr(QAT, key) for key in quantable_module_names]
+    quantized_modules = [getattr(Quantized, key) for key in quantable_module_names]
+
+    float2qat_dict = dict(zip(quantable_modules, qat_modules))
+    qat2quantized_dict = dict(zip(qat_modules, quantized_modules))
+    return float2qat_dict, qat2quantized_dict
+
+
+_float2qat_dict, _qat2quantized_dict = _get_convert_dict()
+
+
+def quantize(module: Module, inplace: bool = True, mapping: dict = None):
+    r"""
+    Recursively convert :class:`~.QATModule` to :class:`~.QuantizedModule`
+    through :meth:`~.Module.apply`.
+
+    :param module: root module to do convert recursively.
+    :param inplace: whether to convert submodules in-place.
+    :param mapping: a dict indicating how to convert custom modules from QATModule to
+        QuantizedModule. Will be combined with the internal default convert mapping dict.
+    """
+
+    if not inplace:
+        module = deepcopy(module)
+
+    convert_dict = copy(_qat2quantized_dict)
+    if mapping is not None:
+        convert_dict.update(mapping)
+    qat_modules = tuple(convert_dict.keys())
+
+    def is_qat(mod: Module):
+        return isinstance(mod, qat_modules)
+
+    # must use list to avoid replacement influencing successor modules
+    for key, submodule, parent in list(
+        module._flatten(with_key=True, with_parent=True, predicate=is_qat)
+    ):
+        new_mod = convert_dict[type(submodule)].from_qat_module(submodule)
+        if isinstance(parent, Float.Sequential):
+            # cannot use setattr; must stay compatible with Sequential's ``__setitem__``
+            parent[int(key.split(".")[-1])] = new_mod
+        else:
+            setattr(parent, key.split(".")[-1], new_mod)
+
+    return module
+
+
+def quantize_qat(
+    module: Module,
+    inplace: bool = True,
+    qconfig: QConfig = ema_fakequant_qconfig,
+    mapping: dict = None,
+):
+    r"""
+    Recursively convert a float :class:`~.Module` to a :class:`~.QATModule`
+    through :meth:`~.Module.apply` and set its qconfig accordingly.
+
+    :param module: root module to do convert recursively.
+    :param inplace: whether to convert submodules in-place.
+    :param qconfig: an instance of :class:`~.QConfig` to be set as submodules' qconfig.
+        Default: ``ema_fakequant_qconfig``.
+    :param mapping: a dict indicating how to convert custom modules from Module to QATModule.
+        Will be combined with the internal default convert mapping dict.
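+
+    A typical workflow sketch (``build_float_model`` is a hypothetical helper that
+    returns an ordinary float :class:`~.Module`):
+
+    .. code-block::
+
+        from megengine.quantization import ema_fakequant_qconfig
+        from megengine.quantization.quantize import quantize, quantize_qat
+
+        model = build_float_model()
+        quantize_qat(model, qconfig=ema_fakequant_qconfig)
+        # ... run quantization-aware finetuning ...
+        quantize(model)  # convert QATModules to QuantizedModules for deployment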
+ """ + + if not inplace: + module = deepcopy(module) + + convert_dict = copy(_float2qat_dict) + if mapping is not None: + convert_dict.update(mapping) + quantable_modules = tuple(convert_dict.keys()) + + def is_quantable(mod: Module): + return isinstance(mod, quantable_modules) + + # must use list to avoid replacement influencing successor modules + for key, submodule, parent in list( + module._flatten(with_key=True, with_parent=True, predicate=is_quantable) + ): + # only convert top quantable module. + if is_quantable(parent) or submodule.quantize_disabled: + continue + + new_mod = convert_dict[type(submodule)].from_float_module(submodule) + if isinstance(parent, Float.Sequential): + # cannnot use setattr to be compatible with Sequential's ``__setitem__`` + parent[int(key.split(".")[-1])] = new_mod + else: + setattr(parent, key.split(".")[-1], new_mod) + + propagate_qconfig(module, qconfig) + return module + + +def _propagate(module: Module, func_str: str, *args, **kargs): + def fn(mod: Module): + if isinstance(mod, QATModule): + getattr(mod, func_str)(*args, **kargs) + + module.apply(fn) + + +def propagate_qconfig(module: QATModule, qconfig: QConfig): + r""" + Recursively set ``module``'s qconfig through :meth:`~.Module.apply`. + + :param module: root module to traverse recursively. + :param qconfig: a instance of :class:`~.QConfig` to be set as submodules' qconfig. + """ + _propagate(module, "set_qconfig", qconfig) + + +def disable_fake_quant(module: Module): + r""" + Recursively disable ``module`` fake quantization in QATModule through :meth:`~.Module.apply` + + :param module: root module to do disable fake quantization recursively. + """ + + _propagate(module, "set_fake_quant", False) + + +def disable_observer(module: Module): + r""" + Recursively disable ``module`` observer in QATModule through :meth:`~.Module.apply` + + :param module: root module to do disable observer recursively. + """ + + _propagate(module, "set_observer", False) + + +def enable_fake_quant(module: Module): + r""" + Recursively enable ``module`` fake quantization in QATModule through :meth:`~.Module.apply` + + :param module: root module to do enable fake quantization recursively. + """ + + _propagate(module, "set_fake_quant", True) + + +def enable_observer(module: Module): + r""" + Recursively enable ``module`` observer in QATModule through :meth:`~.Module.apply` + + :param module: root module to do enable observer recursively. + """ + + _propagate(module, "set_observer", True) diff --git a/imperative/python/megengine/quantization/utils.py b/imperative/python/megengine/quantization/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..810bbbb3502c8ff44a1b75d4162bb5659c4b3998 --- /dev/null +++ b/imperative/python/megengine/quantization/utils.py @@ -0,0 +1,116 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from enum import Enum +from functools import partial, update_wrapper, wraps +from typing import Dict + +from .. import functional as F +from ..core.tensor.dtype import _metadata_dict +from ..core.tensor.function import Function +from ..tensor import Tensor + + +class Round(Function): + """ + The functional round have no grad and can not use for quantization-aware-training. 
We use Function and STE (Straight-Through Estimator) to implement backward propagation.
+    """
+
+    def forward(self, x):
+        return F.round(x)
+
+    def backward(self, output_grads):
+        # STE: pass gradients through unchanged
+        return output_grads
+
+
+def register_method_to_class(cls):
+    # attach ``func`` to ``cls`` as a method while returning the original function
+    def decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            return func(self, *args, **kwargs)
+
+        if isinstance(func, partial):
+            update_wrapper(func, func.func)
+        setattr(cls, func.__name__, wrapper)
+        return func
+
+    return decorator
+
+
+class QuantMode(Enum):
+    """Quantization mode enumeration class.
+    """
+
+    SYMMERTIC = 1
+    ASYMMERTIC = 2
+    TQT = 3
+
+
+qparam_dict = {
+    QuantMode.SYMMERTIC: {"mode": QuantMode.SYMMERTIC, "scale": None,},
+    QuantMode.ASYMMERTIC: {
+        "mode": QuantMode.ASYMMERTIC,
+        "scale": None,
+        "zero_point": None,
+    },
+    QuantMode.TQT: {"mode": QuantMode.TQT, "scale": None,},
+}
+
+
+def get_qparam_dict(mode: QuantMode):
+    """Return the quantization parameter dictionary according to the mode.
+    """
+    return qparam_dict.get(mode, None)
+
+
+def fake_quant_tensor(inp: Tensor, qmin: int, qmax: int, q_dict: Dict) -> Tensor:
+    """Apply fake quantization to the input tensor.
+
+    :param inp: the input tensor to be fake-quantized.
+    :param qmin: the lower bound of the quantized integer range.
+    :param qmax: the upper bound of the quantized integer range.
+    :param q_dict: the quantization parameter dict.
+
+    """
+    scale = q_dict["scale"]
+    zero_point = 0
+    if q_dict["mode"] == QuantMode.ASYMMERTIC:
+        zero_point = q_dict["zero_point"]
+    # Quant
+    oup = Round()(inp / scale) + zero_point
+    # Clip
+    oup = F.minimum(F.maximum(oup, qmin), qmax)
+    # Dequant
+    oup = (oup - zero_point) * scale
+    return oup
+
+
+def fake_quant_bias(bias: Tensor, inp: Tensor, w_qat: Tensor) -> Tensor:
+    """Apply fake quantization to bias, using a scale derived from the input and
+    weight tensors; the quantized dtype is fixed to qint32.
+
+    :param bias: the bias tensor to be fake-quantized.
+    :param inp: the input tensor which contains the quantization parameters.
+    :param w_qat: the weight tensor which contains the quantization parameters.
+
+    .. warning::
+        Only works with the symmetric quantization method for now.
+
+    """
+    b_qat = bias
+    if hasattr(inp, "q_dict") and b_qat is not None:
+        if inp.q_dict["scale"] is not None and w_qat.q_dict["scale"] is not None:
+            # use the same mode as the weight.
+            b_dict = get_qparam_dict(w_qat.q_dict["mode"])
+            b_dict["scale"] = inp.q_dict["scale"] * w_qat.q_dict["scale"]
+            # TODO: add zero_point for ASYMMERTIC mode.
+            qmax = _metadata_dict["qint32"].qmax
+            qmin = _metadata_dict["qint32"].qmin
+            b_qat = fake_quant_tensor(b_qat, qmin, qmax, b_dict)
+
+    return b_qat
diff --git a/imperative/python/megengine/random/__init__.py b/imperative/python/megengine/random/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..86c8d7979c229c7c9697098a5a70135743f15748
--- /dev/null
+++ b/imperative/python/megengine/random/__init__.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from .distribution import gaussian, uniform +from .rng import manual_seed + +# pylint: disable=undefined-variable +del distribution, rng # type: ignore[name-defined] diff --git a/imperative/python/megengine/random/distribution.py b/imperative/python/megengine/random/distribution.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4655af75a18cf7551ea4a07e14f6ed73829d47 --- /dev/null +++ b/imperative/python/megengine/random/distribution.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from typing import Iterable, Optional + +from .. import Tensor +from ..core._imperative_rt import invoke_op +from ..core.ops.builtin import GaussianRNG, UniformRNG +from ..core.tensor import utils +from ..core.tensor.core import apply +from .rng import _random_seed_generator + +__all__ = ["gaussian", "uniform"] + + +def gaussian(shape: Iterable[int], mean: float = 0, std: float = 1,) -> Tensor: + r"""Random variable with Gaussian distribution $N(\mu, \sigma)$ + + :param shape: Output tensor shape + :param mean: The mean or expectation of the distribution + :param std: The standard deviation of the distribution (variance = $\sigma ^ 2$) + :return: The output tensor + + Examples: + + .. testcode:: + + import megengine as mge + import megengine.random as rand + + x = rand.gaussian((2, 2), mean=0, std=1) + print(x.numpy()) + + .. testoutput:: + :options: +SKIP + + [[-0.20235455 -0.6959438 ] + [-1.4939808 -1.5824696 ]] + + """ + seed = _random_seed_generator().__next__() + op = GaussianRNG(seed=seed, mean=mean, std=std) + shape = Tensor(shape, dtype="int32") + (output,) = apply(op, shape) + return output + + +def uniform(shape: Iterable[int], low: float = 0, high: float = 1,) -> Tensor: + r"""Random variable with uniform distribution $U(0, 1)$ + + :param shape: Output tensor shape + :param low: Lower range + :param high: Upper range + :return: The output tensor + + Examples: + + .. testcode:: + + import megengine as mge + import megengine.random as rand + + x = rand.uniform((2, 2)) + print(x.numpy()) + + .. testoutput:: + :options: +SKIP + + [[0.76901674 0.70496535] + [0.09365904 0.62957656]] + + """ + assert low < high, "Uniform is not defined when low >= high" + + seed = _random_seed_generator().__next__() + op = UniformRNG(seed=seed) + shape = Tensor(shape, dtype="int32") + (output,) = apply(op, shape) + + return low + (high - low) * output diff --git a/imperative/python/megengine/random/rng.py b/imperative/python/megengine/random/rng.py new file mode 100644 index 0000000000000000000000000000000000000000..992c6a49565b1349ae5b171156b680cf1e644a4b --- /dev/null +++ b/imperative/python/megengine/random/rng.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import time + +from numpy.random import MT19937 + +_rng = None + + +def _random_seed_generator(): + if _rng is None: + from ..distributed.group import get_rank + + manual_seed(seed=int(time.time()) + get_rank()) + while True: + yield _rng.random_raw() + + +def manual_seed(seed: int): + global _rng # pylint: disable=global-statement + _rng = MT19937(seed=seed) diff --git a/imperative/python/megengine/serialization.py b/imperative/python/megengine/serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..300d92b5d0c8e9f6dd91c2b11589cc4f254e2b5b --- /dev/null +++ b/imperative/python/megengine/serialization.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import pickle + +from .device import _valid_device, get_default_device +from .tensor import Tensor +from .utils.max_recursion_limit import max_recursion_limit + + +def save(obj, f, pickle_module=pickle, pickle_protocol=pickle.HIGHEST_PROTOCOL): + r"""Save an object to disk file. + + :type obj: object + :param obj: object to save. Only ``module`` or ``state_dict`` are allowed. + :type f: text file object + :param f: a string of file name or a text file object to which ``obj`` is saved to. + :type pickle_module: + :param pickle_module: Default: ``pickle``. + :type pickle_protocol: + :param pickle_protocol: Default: ``pickle.HIGHEST_PROTOCOL``. + + """ + if isinstance(f, str): + with open(f, "wb") as fout: + save( + obj, fout, pickle_module=pickle_module, pickle_protocol=pickle_protocol + ) + return + + with max_recursion_limit(): + assert hasattr(f, "write"), "{} does not support write".format(f) + pickle_module.dump(obj, f, pickle_protocol) + + +class dmap: + def __init__(self, map_location): + self.map_location = map_location + + def __enter__(self): + Tensor.dmap_callback = staticmethod(self.map_location) + return self + + def __exit__(self, type, value, traceback): + Tensor.dmap_callback = None + + +def _get_callable_map_location(map_location): + if map_location is None: + + def callable_map_location(state): + return str(get_default_device()) + + elif isinstance(map_location, str): + + def callable_map_location(state): + return map_location + + elif isinstance(map_location, dict): + for key, value in map_location.items(): + # dict key and values can only be "xpux", "cpux", "gpu0", etc. + assert _valid_device(key), "Invalid locator_map key value {}".format(key) + assert _valid_device(value), "Invalid locator_map key value {}".format( + value + ) + + def callable_map_location(state): + if state[:4] in map_location.keys(): + state = map_location[state[:4]] + return state + + else: + assert callable(map_location), "map_location should be str, dict or function" + callable_map_location = map_location + return callable_map_location + + +def load(f, map_location=None, pickle_module=pickle): + r"""Load an object saved with save() from a file. + + :type f: text file object + :param f: a string of file name or a text file object from which to load. + :type map_location: str, dict or a function specifying the map rules + :param map_location: Default: ``None``. + + .. note:: + + map_location defines device mapping. See examples for usage. 
+
+    :type pickle_module:
+    :param pickle_module: Default: ``pickle``.
+
+    .. note::
+
+        If you will call :func:`mge.set_default_device()`, please do it
+        before :func:`mge.load()`.
+
+    Examples:
+
+    .. testcode::
+
+        import megengine as mge
+        # Load tensors to the same device as defined in model.mge
+        mge.load('model.mge')
+        # Load all tensors to gpu0.
+        mge.load('model.mge', map_location='gpu0')
+        # Load all tensors originally on gpu0 to cpu0
+        mge.load('model.mge', map_location={'gpu0':'cpu0'})
+        # Load all tensors to cpu0
+        mge.load('model.mge', map_location=lambda dev: 'cpu0')
+
+    """
+    if isinstance(f, str):
+        with open(f, "rb") as fin:
+            return load(fin, map_location=map_location, pickle_module=pickle_module)
+
+    map_location = _get_callable_map_location(map_location)  # callable map_location
+
+    with dmap(map_location):
+        return pickle_module.load(f)
diff --git a/imperative/python/megengine/tensor.py b/imperative/python/megengine/tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..1848463c39605ee460ccdad7f555e178735a7648
--- /dev/null
+++ b/imperative/python/megengine/tensor.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+
+import collections.abc
+
+from .core import Tensor as _Tensor
+from .device import get_default_device
+
+
+class Tensor(_Tensor):
+    requires_grad = False
+    dmap_callback = None
+
+    def __init__(self, data, dtype=None, device=None):
+        if device is None:
+            device = get_default_device()
+        self.q_dict = {"mode": None, "scale": None, "zero_point": None}
+        super().__init__(data, dtype=dtype, device=device)
+
+    def set_value(self, value):
+        self._reset(value)
+
+    def reset_zero(self):
+        self *= 0
+
+    def __getstate__(self):
+        r""" __getstate__ will be called for pickle serialization or deep copy
+        """
+
+        state = {
+            "data": self.numpy(),
+            "device": str(self.device),
+            "dtype": self.dtype,
+            "qdict": self.q_dict,
+        }
+        return state
+
+    def __setstate__(self, state):
+        data = state.pop("data")
+        device = state.pop("device")
+        if self.dmap_callback is not None:
+            assert isinstance(device, str)
+            device = self.dmap_callback(device)
+        dtype = state.pop("dtype")
+        self.q_dict = state.pop("qdict")
+        super().__init__(data, dtype=dtype, device=device)
+
+    def detach(self):
+        r"""
+        Returns a new tensor which is treated as constant during backward gradient calculation,
+        i.e. its gradient is zero.
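+
+        A small sketch of the intended use (values are illustrative):
+
+        .. code-block::
+
+            x = Tensor([1.0, 2.0])
+            y = x.detach()  # same data as ``x``, but constant w.r.t. gradients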
+        """
+        Wrapper = type(self)
+        Tensor = type(self.__wrapped__)
+        return Wrapper(Tensor(self.__wrapped__._data))
+
+
+tensor = Tensor
+
+
+class Dict(collections.abc.MutableMapping):
+    def __init__(self, *args, key=None, **kwargs):
+        self.data = {}
+        if key:
+            self.keyfn = key
+        for i in args:
+            self.update(i)
+        self.update(**kwargs)
+
+    @staticmethod
+    def keyfn(key):  # pylint: disable=method-hidden
+        return key
+
+    def __getitem__(self, key):
+        _, v = self.data[self.keyfn(key)]
+        return v
+
+    def __setitem__(self, key, value):
+        self.data[self.keyfn(key)] = key, value
+
+    def __delitem__(self, key):
+        del self.data[self.keyfn(key)]
+
+    def __iter__(self):
+        for _, (k, _) in self.data.items():
+            yield k
+
+    def __len__(self):
+        return len(self.data)
+
+
+class TensorDict(Dict):  # pylint: disable=too-many-ancestors
+    class keyfn:
+        def __new__(cls, x: Tensor):
+            if not isinstance(x, Tensor):
+                return x
+            return super().__new__(cls)
+
+        def __init__(self, x: Tensor):
+            self._data = x  # do not save id directly to make pickle work
+
+        def __hash__(self):
+            return id(self._data)
+
+        def __eq__(self, other):
+            # pylint: disable=undefined-variable
+            return isinstance(other, __class__) and id(self._data) == id(other._data)
+
+    def __init__(self, *args):
+        super().__init__(*args)
diff --git a/imperative/python/megengine/tensor_nn.py b/imperative/python/megengine/tensor_nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c4916fb4a8e14b8bc4bf7f4464202603143b4ef
--- /dev/null
+++ b/imperative/python/megengine/tensor_nn.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from . import Tensor, tensor
+
+
+class Buffer(Tensor):
+    r"""A kind of Tensor with ``requires_grad=False``.
+    """
+
+
+class Parameter(Tensor):
+    r"""A kind of Tensor that is to be considered a module parameter.
+    """
+    requires_grad = True
diff --git a/imperative/python/megengine/test/__init__.py b/imperative/python/megengine/test/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..44ed54c22e810586429e5aed6b0ae41da066e629
--- /dev/null
+++ b/imperative/python/megengine/test/__init__.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import numpy as np
+
+
+def assertTensorClose(
+    v0, v1, *, max_err: float = 1e-6, allow_special_values: bool = False, name=None
+):
+    """
+    :param allow_special_values: whether to allow :attr:`v0` and :attr:`v1` to contain inf and nan values.
+    :param max_err: maximum allowed relative error.
+    """
+    __tracebackhide__ = True  # pylint: disable=unused-variable
+
+    assert (
+        v0.dtype == v1.dtype
+    ), "The two tensors must have the same dtype, but got {} and {}".format(
+        v0.dtype, v1.dtype
+    )
+    v0 = np.ascontiguousarray(v0, dtype=np.float32).copy()
+    v1 = np.ascontiguousarray(v1, dtype=np.float32).copy()
+    if allow_special_values:
+        # check that NaNs match, then remove them
+        v0_nan_mask = np.isnan(v0)
+        if np.any(v0_nan_mask):
+            assert np.array_equiv(v0_nan_mask, np.isnan(v1)), (v0, v1)
+            v0[v0_nan_mask] = 0
+            v1[v0_nan_mask] = 0
+        # check that infs match, then remove them
+        v0_inf_mask = v0 == float("inf")
+        if np.any(v0_inf_mask):
+            assert np.array_equiv(v0_inf_mask, v1 == float("inf")), (v0, v1)
+            v0[v0_inf_mask] = 0
+            v1[v0_inf_mask] = 0
+        # check that -infs match, then remove them
+        v0_inf_mask = v0 == float("-inf")
+        if np.any(v0_inf_mask):
+            assert np.array_equiv(v0_inf_mask, v1 == float("-inf")), (v0, v1)
+            v0[v0_inf_mask] = 0
+            v1[v0_inf_mask] = 0
+    else:
+        assert np.isfinite(v0.sum()) and np.isfinite(v1.sum()), (v0, v1)
+
+    assert v0.shape == v1.shape, "The two tensors must have the same shape ({} vs. {})".format(
+        v0.shape, v1.shape
+    )
+    vdiv = np.max([np.abs(v0), np.abs(v1), np.ones_like(v0)], axis=0)
+    err = np.abs(v0 - v1) / vdiv
+    check = err > max_err
+    if check.sum():
+        idx = tuple(i[0] for i in np.nonzero(check))
+        if name is None:
+            name = "tensor"
+        else:
+            name = "tensor {}".format(name)
+        raise AssertionError(
+            "{} not equal: "
+            "shape={} nonequal_idx={} v0={} v1={} err={}".format(
+                name, v0.shape, idx, v0[idx], v1[idx], err[idx]
+            )
+        )
diff --git a/imperative/python/megengine/utils/__init__.py b/imperative/python/megengine/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1207b5d98cd3578bc39e9ce600a1254a434880c8
--- /dev/null
+++ b/imperative/python/megengine/utils/__init__.py
@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/imperative/python/megengine/utils/_timed_func_fork_exec_entry.py b/imperative/python/megengine/utils/_timed_func_fork_exec_entry.py
new file mode 100644
index 0000000000000000000000000000000000000000..b962d365eb4379c1cbcb79b234b7f6ed04f151a1
--- /dev/null
+++ b/imperative/python/megengine/utils/_timed_func_fork_exec_entry.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import argparse
+import os
+import sys
+
+from megengine.core._imperative_rt.utils import _timed_func_exec_cb
+
+try:
+    from setproctitle import setproctitle
+except ImportError:
+    setproctitle = None
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="entry point for fork-exec callback in TimedFuncInvoker;"
+        " this file should not be used directly by normal users."
+ ) + parser.add_argument("user_data") + args = parser.parse_args() + + if setproctitle: + setproctitle("megbrain:timed_func_exec:ppid={}".format(os.getppid())) + _timed_func_exec_cb(args.user_data) + raise SystemError("_timed_func_exec_cb returned") + + +if __name__ == "__main__": + main() diff --git a/imperative/python/megengine/utils/hook.py b/imperative/python/megengine/utils/hook.py new file mode 100644 index 0000000000000000000000000000000000000000..9864a94a1f22b81b7a0e50a19fe4febf54386a17 --- /dev/null +++ b/imperative/python/megengine/utils/hook.py @@ -0,0 +1,23 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import weakref + + +class HookHandler: + hook_num = 0 + + def __init__(self, source_dict, hook): + self.id = HookHandler.hook_num + HookHandler.hook_num += 1 + source_dict[self.id] = hook + self.source_ref = weakref.ref(source_dict) + + def remove(self): + source_dict = self.source_ref() + if source_dict is not None and self.id in source_dict: + del source_dict[self.id] diff --git a/imperative/python/megengine/utils/http_download.py b/imperative/python/megengine/utils/http_download.py new file mode 100644 index 0000000000000000000000000000000000000000..add2a649e815eff774ff5ad43bf01a86b931881c --- /dev/null +++ b/imperative/python/megengine/utils/http_download.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import hashlib
+import os
+import shutil
+from tempfile import NamedTemporaryFile
+
+import requests
+from tqdm import tqdm
+
+from ..logger import get_logger
+
+logger = get_logger(__name__)
+
+CHUNK_SIZE = 1024
+HTTP_CONNECTION_TIMEOUT = 5
+
+
+class HTTPDownloadError(BaseException):
+    """The class that represents an HTTP request error"""
+
+
+def download_from_url(url: str, dst: str, http_read_timeout=120):
+    """
+    Downloads a file from the given url to ``dst``
+
+    :param url: source URL
+    :param dst: saving path
+    :param http_read_timeout: how many seconds to wait for data before giving up
+    """
+    dst = os.path.expanduser(dst)
+    dst_dir = os.path.dirname(dst)
+
+    resp = requests.get(
+        url, timeout=(HTTP_CONNECTION_TIMEOUT, http_read_timeout), stream=True
+    )
+    if resp.status_code != 200:
+        raise HTTPDownloadError("An error occurred when downloading from {}".format(url))
+
+    md5 = hashlib.md5()
+    total_size = int(resp.headers.get("Content-Length", 0))
+    bar = tqdm(
+        total=total_size, unit="iB", unit_scale=True, ncols=80
+    )  # pylint: disable=blacklisted-name
+    try:
+        with NamedTemporaryFile("w+b", delete=False, suffix=".tmp", dir=dst_dir) as f:
+            logger.info("Download file to temp file %s", f.name)
+            for chunk in resp.iter_content(CHUNK_SIZE):
+                if not chunk:
+                    break
+                bar.update(len(chunk))
+                f.write(chunk)
+                md5.update(chunk)
+            bar.close()
+        shutil.move(f.name, dst)
+    finally:
+        # ensure tmp file is removed
+        if os.path.exists(f.name):
+            os.remove(f.name)
+    return md5.hexdigest()
diff --git a/imperative/python/megengine/utils/max_recursion_limit.py b/imperative/python/megengine/utils/max_recursion_limit.py
new file mode 100644
index 0000000000000000000000000000000000000000..0870b7fa0e48bff3bc53aa98d2206ae81b1d2aaa
--- /dev/null
+++ b/imperative/python/megengine/utils/max_recursion_limit.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import resource
+import sys
+import threading
+
+
+class AlternativeRecursionLimit:
+    r"""A reentrant context manager for setting global recursion limits.
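+
+    Entering the context raises both the Python recursion limit and the process
+    stack size rlimit; uses are reference-counted, so nested ``with`` blocks
+    restore the original limits only when the outermost one exits.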
+ """ + + def __init__(self, new_py_limit): + self.new_py_limit = new_py_limit + self.count = 0 + self.lock = threading.Lock() + + self.orig_py_limit = 0 + self.orig_rlim_stack_soft = 0 + self.orig_rlim_stack_hard = 0 + + def __enter__(self): + with self.lock: + if self.count == 0: + self.orig_py_limit = sys.getrecursionlimit() + ( + self.orig_rlim_stack_soft, + self.orig_rlim_stack_hard, + ) = resource.getrlimit(resource.RLIMIT_STACK) + resource.setrlimit( + resource.RLIMIT_STACK, + (self.orig_rlim_stack_hard, self.orig_rlim_stack_hard), + ) + # increase recursion limit + sys.setrecursionlimit(self.new_py_limit) + self.count += 1 + + def __exit__(self, type, value, traceback): + with self.lock: + self.count -= 1 + if self.count == 0: + sys.setrecursionlimit(self.orig_py_limit) + resource.setrlimit( + resource.RLIMIT_STACK, + (self.orig_rlim_stack_soft, self.orig_rlim_stack_hard), + ) + + +_max_recursion_limit_context_manager = AlternativeRecursionLimit(2 ** 31 - 1) + + +def max_recursion_limit(): + r"""Sets recursion limit to the max possible value + """ + return _max_recursion_limit_context_manager diff --git a/imperative/python/megengine/utils/net_stats.py b/imperative/python/megengine/utils/net_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..c8a81d9909ffedc727b99a7be8dd33105879f156 --- /dev/null +++ b/imperative/python/megengine/utils/net_stats.py @@ -0,0 +1,280 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from functools import partial + +import numpy as np +import tabulate + +import megengine as mge +import megengine.core.tensor.dtype as dtype +import megengine.module as m +import megengine.module.qat as qatm +import megengine.module.quantized as qm +from megengine.functional.tensor import zeros + +try: + mge.logger.MegEngineLogFormatter.max_lines = float("inf") +except AttributeError as e: + raise ValueError("set logger max lines failed") + +logger = mge.get_logger(__name__) +logger.setLevel("INFO") + + +CALC_FLOPS = {} + + +def _register_modules(*modules): + def callback(impl): + for module in modules: + CALC_FLOPS[module] = impl + return impl + + return callback + + +@_register_modules( + m.Conv2d, + m.ConvTranspose2d, + m.LocalConv2d, + qm.Conv2d, + qm.ConvRelu2d, + qm.ConvBn2d, + qm.ConvBnRelu2d, + qatm.Conv2d, + qatm.ConvRelu2d, + qatm.ConvBn2d, + qatm.ConvBnRelu2d, +) +def count_convNd(module, input, output): + bias = 1 if module.bias is not None else 0 + group = module.groups + ic = input[0].shape[1] + oc = output[0].shape[1] + goc = oc // group + gic = ic // group + N = output[0].shape[0] + HW = np.prod(output[0].shape[2:]) + # N x Cout x H x W x (Cin x Kw x Kh + bias) + return N * HW * goc * (gic * np.prod(module.kernel_size) + bias) + + +@_register_modules(m.ConvTranspose2d) +def count_deconvNd(module, input, output): + return np.prod(input[0].shape) * output[0].shape[1] * np.prod(module.kernel_size) + + +@_register_modules(m.Linear, qatm.Linear, qm.Linear) +def count_linear(module, input, output): + return np.prod(output[0].shape) * module.in_features + + +# does not need import qat and quantized module since they inherit from float module. 
+hook_modules = (
+    m.Conv2d,
+    m.ConvTranspose2d,
+    m.LocalConv2d,
+    m.BatchNorm2d,
+    m.Linear,
+)
+
+
+def net_stats(model, input_size, bar_length_max=20, log_params=True, log_flops=True):
+    def dict2table(list_of_dict, header):
+        table_data = [header]
+        for d in list_of_dict:
+            row = []
+            for h in header:
+                v = ""
+                if h in d:
+                    v = d[h]
+                row.append(v)
+            table_data.append(row)
+        return table_data
+
+    def sizeof_fmt(num, suffix="B"):
+        for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
+            if abs(num) < 1024.0:
+                return "{:3.3f} {}{}".format(num, unit, suffix)
+            num /= 1024.0
+        sign_str = "-" if num < 0 else ""
+        return "{}{:.1f} {}{}".format(sign_str, abs(num), "Yi", suffix)
+
+    def get_byteswidth(tensor):
+        if dtype.is_quantize(tensor.dtype):
+            return 1
+        # elif dtype.is_bfloat16(tensor.dtype):
+        #     return 2
+        else:
+            return 4
+
+    def print_flops_stats(flops):
+        flops_list = [i["flops_num"] for i in flops]
+        max_flops_num = max(flops_list + [0])
+        # calc total flops and set flops_cum
+        total_flops_num = 0
+        for d in flops:
+            total_flops_num += int(d["flops_num"])
+            d["flops_cum"] = sizeof_fmt(total_flops_num, suffix="OPs")
+
+        for i in flops:
+            f = i["flops_num"]
+            i["flops"] = sizeof_fmt(f, suffix="OPs")
+            r = i["ratio"] = f / total_flops_num
+            i["percentage"] = "{:.2f}%".format(r * 100)
+            bar_length = int(f / max_flops_num * bar_length_max)
+            i["bar"] = "#" * bar_length
+
+        header = [
+            "name",
+            "class_name",
+            "input_shapes",
+            "output_shapes",
+            "flops",
+            "flops_cum",
+            "percentage",
+            "bar",
+        ]
+
+        total_flops_str = sizeof_fmt(total_flops_num, suffix="OPs")
+        total_var_size = sum(sum(s[1] for s in i["output_shapes"]) for i in flops)
+        flops.append(
+            dict(name="total", flops=total_flops_str, output_shapes=total_var_size)
+        )
+
+        logger.info(
+            "flops stats: \n" + tabulate.tabulate(dict2table(flops, header=header))
+        )
+
+        return total_flops_num
+
+    def print_params_stats(params):
+        total_param_dims, total_param_size = 0, 0
+        for d in params:
+            total_param_dims += int(d["param_dim"])
+            total_param_size += int(d["size"])
+            d["size"] = sizeof_fmt(d["size"])
+            d["size_cum"] = sizeof_fmt(total_param_size)
+
+        for d in params:
+            ratio = d["param_dim"] / total_param_dims
+            d["ratio"] = ratio
+            d["percentage"] = "{:.2f}%".format(ratio * 100)
+
+        # construct bar
+        max_ratio = max([d["ratio"] for d in params])
+        for d in params:
+            bar_length = int(d["ratio"] / max_ratio * bar_length_max)
+            d["size_bar"] = "#" * bar_length
+
+        param_size = sizeof_fmt(total_param_size)
+        params.append(dict(name="total", param_dim=total_param_dims, size=param_size,))
+
+        header = [
+            "name",
+            "shape",
+            "mean",
+            "std",
+            "param_dim",
+            "bits",
+            "size",
+            "size_cum",
+            "percentage",
+            "size_bar",
+        ]
+
+        logger.info(
+            "param stats: \n" + tabulate.tabulate(dict2table(params, header=header))
+        )
+
+        return total_param_size
+
+    def net_stats_hook(module, input, output, name=""):
+        class_name = str(module.__class__).split(".")[-1].split("'")[0]
+
+        flops_fun = CALC_FLOPS.get(type(module))
+        if callable(flops_fun):
+            flops_num = flops_fun(module, input, output)
+
+            if not isinstance(output, (list, tuple)):
+                output = [output]
+
+            flops.append(
+                dict(
+                    name=name,
+                    class_name=class_name,
+                    input_shapes=[i.shape for i in input],
+                    output_shapes=[o.shape for o in output],
+                    flops_num=flops_num,
+                    flops_cum=0,
+                )
+            )
+
+        if hasattr(module, "weight") and module.weight is not None:
+            w = module.weight
+            value = w.numpy()
+            param_dim = np.prod(w.shape)
+            param_bytes = get_byteswidth(w)
+            params.append(
+                dict(
+ name=name + "-w", + shape=w.shape, + param_dim=param_dim, + bits=param_bytes * 8, + size=param_dim * param_bytes, + size_cum=0, + mean="{:.2g}".format(value.mean()), + std="{:.2g}".format(value.std()), + ) + ) + + if hasattr(module, "bias") and module.bias is not None: + b = module.bias + value = b.numpy() + param_dim = np.prod(b.shape) + param_bytes = get_byteswidth(b) + params.append( + dict( + name=name + "-b", + shape=b.shape, + param_dim=param_dim, + bits=param_bytes * 8, + size=param_dim * param_bytes, + size_cum=0, + mean="{:.2g}".format(value.mean()), + std="{:.2g}".format(value.std()), + ) + ) + + # multiple inputs to the network + if not isinstance(input_size[0], tuple): + input_size = [input_size] + + params = [] + flops = [] + hooks = [] + + for (name, module) in model.named_modules(): + if isinstance(module, hook_modules): + hooks.append( + module.register_forward_hook(partial(net_stats_hook, name=name)) + ) + + inputs = [zeros(in_size, dtype=np.float32) for in_size in input_size] + model.eval() + model(*inputs) + for h in hooks: + h.remove() + + total_flops, total_params = 0, 0 + if log_params: + total_params = print_params_stats(params) + if log_flops: + total_flops = print_flops_stats(flops) + + return total_params, total_flops diff --git a/imperative/python/megengine/utils/profile_analyze.py b/imperative/python/megengine/utils/profile_analyze.py new file mode 100755 index 0000000000000000000000000000000000000000..8041c0d8fc66a696f4d6fe5011f1fb63bbe280bf --- /dev/null +++ b/imperative/python/megengine/utils/profile_analyze.py @@ -0,0 +1,424 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import argparse +import collections +import json +import re +import textwrap + +import numpy as np +from tabulate import tabulate + +from megengine.utils.profile_analyzer import ( + NonExistNum, + ProfileAnalyzer, + TimeFuncHelper, +) + + +def _tabulate_ml(tab, **kwargs): + """Tabulate profile output with multi-line support.""" + new_tab = [] + new_tab_is_row = [] + for row in tab: + col_lines = [str(i).split("\n") for i in row] + max_nr_line = max(map(len, col_lines)) + new_tab_is_row.append(True) + if max_nr_line > 1: + new_tab_is_row.extend([False] * (max_nr_line - 1)) + for i in col_lines: + if len(i) < max_nr_line: + i.extend([""] * (max_nr_line - len(i))) + new_tab.extend(zip(*col_lines)) + else: + new_tab.append(row) + + assert len(new_tab_is_row) == len(new_tab) + ret = [i + "\n" for i in tabulate(new_tab, **kwargs).split("\n")] + for idx, val in enumerate(new_tab_is_row): + if not val: + ret[idx * 2 + 2] = "" + return "".join(ret)[:-1] + + +def _tabulate_confluence(tab, **kwargs): + """Tabulate profile output.""" + kwargs.pop("tablefmt", None) + s = tabulate(tab, tablefmt="orgtbl", **kwargs) + lines = s.split("\n") + lines[1] = lines[1].replace("+", "|") + return "\n".join(lines) + + +def main(passed_args=None): # pylint: disable=too-many-statements + """Analyses profile info from :mod:`~.utils.profile_analyzer` . + + Run this file with ``--help`` to get more usage. 
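[Editor's note: a usage sketch of net_stats, not part of the patch; it assumes the usual megengine Module API, and the model is hypothetical.]

    import megengine.module as M
    from megengine.utils.net_stats import net_stats

    class TinyNet(M.Module):
        def __init__(self):
            super().__init__()
            self.conv = M.Conv2d(3, 8, 3, padding=1)
            self.fc = M.Linear(8 * 32 * 32, 10)

        def forward(self, x):
            x = self.conv(x)
            return self.fc(x.reshape(x.shape[0], -1))

    # logs a parameter table and a FLOPs table, and returns the totals
    total_params, total_flops = net_stats(TinyNet(), (1, 3, 32, 32))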
+ """ + parser = argparse.ArgumentParser( + description="analyze analyzer result", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("dump") + parser.add_argument( + "-t", + "--top", + type=int, + default=3, + help="number of most time-consuming operators to print", + ) + parser.add_argument( + "--type", action="append", help="filter oprs in the top list by type" + ) + parser.add_argument( + "--aggregate-by", + default=None, + choices=["type"], + help="aggragate profiling result by", + ) + parser.add_argument( + "--opr-name", help="filter oprs in the top list by regex of name" + ) + parser.add_argument( + "--input-dtype", type=str, help="filter oprs in the top list by input dtype" + ) + parser.add_argument( + "--top-end-key", + default="end", + choices=["end", "kern"], + help="how time in top is calculated; end corresponds " + "to total device time, and kern corresponds to only " + "wait time", + ) + parser.add_argument( + "--aggregate", + default=None, + help="aggregate operations", + choices=["max", "min", "sum", "mean"], + ) + parser.add_argument( + "--order-by", + default="time", + help="sort result according to given column; the param can be " + " or +, meaning sorting in descending or " + "ascending order respectively", + ) + parser.add_argument( + "--copy-time", action="store_true", help="show copy time related result" + ) + parser.add_argument( + "--min-time", + type=float, + default=float("-inf"), + help="minimal time of a result to be printed", + ) + parser.add_argument( + "--max-time", + type=float, + default=float("inf"), + help="maximal time of a result to be printed", + ) + parser.add_argument( + "--show-host", action="store_true", help="show host profiling info" + ) + parser.add_argument( + "--dump-only-opr", + action="store_true", + help="only dump operator info as plaintext; useful " + "for diff between two filtered profile results", + ) + parser.add_argument( + "--confluence", + "--wiki", + action="store_true", + help="output confluence-markdown-compatible table", + ) + parser.add_argument( + "--print-only", + choices={"summary", "device", "host"}, + help="print only chosen info", + ) + + args = parser.parse_args(passed_args) + + opr_filters = [] + if args.type: + opr_filters.append(lambda o, a, b: o["type"] in args.type) + if args.opr_name: + opr_filters.append( + lambda o, a, b, r=re.compile(args.opr_name): r.match(o["name"]) + ) + if args.input_dtype: + opr_filters.append( + lambda o, a, b: any( + [i["mem_plan"]["layout"]["dtype"] == args.input_dtype for i in a] + ) + ) + if not opr_filters: + + def opr_filter(o, a, b): # pylint: disable=unused-argument + return True + + else: + + def opr_filter(o, a, b): + return all(i(o, a, b) for i in opr_filters) + + with open(args.dump) as fin: + dump = json.load(fin) + + analyzer = ProfileAnalyzer(dump, opr_filter) + analyzer_tot = ProfileAnalyzer(dump, lambda _, __, ___: True) + + def summary(): + device_end_func = TimeFuncHelper.eval_time_func("device", "end", np.max) + device_kern_func = TimeFuncHelper.eval_time_func("device", "kern", np.max) + host_end_func = TimeFuncHelper.eval_time_func("host", "end", np.max) + + def get_tot_time(func): + rec = analyzer_tot.select(func, aggregate=np.sum) + if not rec: + return "N/A" + rec = rec[0] + return rec.time + + tab = [] + tot_dev_time = get_tot_time(device_end_func) + tot_host_time = get_tot_time(host_end_func) + tab.append(("total device time", tot_dev_time)) + tab.append(("total host time", tot_host_time)) + if args.copy_time: + + def fmt(a, b): + 
+                a = a[0]
+                b = b[0]
+                return "tot={:.4f} avg={:.4f}".format(a.time, b.time)
+
+            tab.append(
+                (
+                    "copy time",
+                    fmt(
+                        analyzer.select(
+                            device_end_func,
+                            lambda opr: opr.opr_info["type"] == "Copy",
+                            aggregate=np.sum,
+                        ),
+                        analyzer.select(
+                            device_end_func,
+                            lambda opr: opr.opr_info["type"] == "Copy",
+                            aggregate=np.mean,
+                        ),
+                    ),
+                )
+            )
+            tab.append(
+                (
+                    "copy wait time",
+                    fmt(
+                        analyzer.select(
+                            device_kern_func,
+                            lambda opr: opr.opr_info["type"] == "Copy",
+                            aggregate=np.sum,
+                        ),
+                        analyzer.select(
+                            device_kern_func,
+                            lambda opr: opr.opr_info["type"] == "Copy",
+                            aggregate=np.mean,
+                        ),
+                    ),
+                )
+            )
+
+        if args.confluence:
+            tab_str = _tabulate_confluence(tab, headers=["name", "value"])
+        else:
+            tab_str = tabulate(tab)
+
+        return tab_str, tot_dev_time, tot_host_time
+
+    def prof_details(prof_type, tot_time):
+        tab = []
+
+        def func(
+            opr,
+            *,
+            f0=TimeFuncHelper.eval_time_func(prof_type, args.top_end_key, np.max)
+        ):
+            t = f0(opr)
+            if t is not None and (t < args.min_time or t > args.max_time):
+                return None
+            return t
+
+        records = analyzer.select(
+            func,
+            aggregate=args.aggregate,
+            aggregate_by=args.aggregate_by,
+            top_k=args.top,
+            sort_by=args.order_by,
+        )
+
+        if args.dump_only_opr:
+            ret = []
+            for i in records:
+                ret.append(" ".join(i.info.values()))
+            return "\n".join(ret)
+
+        def format_shapes(shapes, layouts=None, sep="\n"):
+            if isinstance(shapes, NonExistNum) or shapes is None:
+                return repr(shapes)
+            if layouts is None:
+                layouts = [None] * len(shapes)
+
+            comp = []
+            for i, j in zip(shapes, layouts):
+                i = "{" + ",".join(map(str, i)) + "}"
+                if j:
+                    i += "\n -[" + ",".join(map(str, j)) + "]"
+                comp.append(i)
+            return sep.join(comp)
+
+        def fix_num_and_find_unit(x, base):
+            if isinstance(x, NonExistNum) or (
+                isinstance(x, float) and not np.isfinite(x)
+            ):
+                return x, ""
+            unit = iter(["", "K", "M", "G", "T", "P"])
+            while x >= base:
+                x /= base
+                next(unit)
+            return x, next(unit)
+
+        def get_number_with_unit(num, unit, base, sep="\n"):
+            num, unit_prefix = fix_num_and_find_unit(num, base)
+            if isinstance(unit, list):
+                unit = unit[int(unit_prefix != "")]
+            return ("{:.2f}" + sep + "{}{}").format(num, unit_prefix, unit)
+
+        if args.confluence:
+            rows = []
+            cum_time = 0
+
+            max_time = max([r.time for r in records])
+            max_bandwidth = max([r.bandwidth for r in records])
+            max_flops = max(
+                [r.flops for r in records if not isinstance(r.flops, NonExistNum)]
+            )
+
+            bar_length = 15
+            for idx, record in enumerate(records):
+                cum_time += record.time
+
+                opr_info = [("opr " + k, v) for k, v in record.info.items()]
+
+                row = collections.OrderedDict(
+                    [
+                        ("#", idx),
+                        ("time", "{:.3}".format(record.time)),
+                        ("ratio", "{:.1f}%".format(record.time / tot_time * 100)),
+                        ("time bar", "#" * int(record.time / max_time * bar_length)),
+                        ("cum-time", cum_time),
+                        ("cum-time ratio", cum_time / tot_time),
+                    ]
+                    + opr_info
+                    + [
+                        (
+                            "computation (MFLO)",
+                            "{:.1f}".format(record.computation / 1000 ** 2),
+                        ),
+                        ("MFLOPS", "{:.1f}".format(record.flops / 1000 ** 2)),
+                        (
+                            "MFLOPS-bar",
+                            ""
+                            if isinstance(record.flops, NonExistNum)
+                            else ("#" * int(record.flops / max_flops * bar_length)),
+                        ),
+                        ("memory (MiB)", "{:.1f}".format(record.memory / 1024 ** 2)),
+                        (
+                            "bandwidth (MiB/s)",
+                            "{:.1f}".format(record.bandwidth / 1024 ** 2),
+                        ),
+                        (
+                            "bandwidth bar",
+                            "#" * int(record.bandwidth / max_bandwidth * bar_length),
+                        ),
+                        (
+                            "in_shapes",
+                            format_shapes(
+                                record.in_shapes, record.in_layouts, sep=", "
+                            ),
+                        ),
+                        ("out_shapes",
+                         format_shapes(record.out_shapes, sep=", ")),
+                    ]
+                )
+                rows.append(row)
+            headers = list(rows[0].keys())
+            tab = [[row[i] for i in headers] for row in rows]
+
+            return _tabulate_confluence(tab, headers=headers)
+
+        else:
+            cum_time = 0
+            for idx, record in enumerate(records):
+                cum_time += record.time
+                tab.append(
+                    (
+                        "#{}\n{:.3}\n{:.1f}%".format(
+                            idx, record.time, record.time / tot_time * 100
+                        ),
+                        "{:.3}\n{:.1f}%".format(cum_time, cum_time / tot_time * 100),
+                        "\n".join(
+                            "\n- ".join(textwrap.wrap(str(i), width=30))
+                            for i in record.info.values()
+                        ),
+                        get_number_with_unit(record.computation, "FLO", 1000),
+                        get_number_with_unit(record.flops, "FLOPS", 1000),
+                        get_number_with_unit(record.memory, ["byte", "iB"], 1024),
+                        get_number_with_unit(
+                            record.bandwidth, ["byte/s", "iB/s"], 1024
+                        ),
+                        format_shapes(record.in_shapes, record.in_layouts),
+                        format_shapes(record.out_shapes),
+                    )
+                )
+            return _tabulate_ml(
+                tab,
+                headers=[
+                    "{} self time".format(prof_type),
+                    "cumulative",
+                    "operator info",
+                    "computation",
+                    "FLOPS",
+                    "memory",
+                    "bandwidth",
+                    "in_shapes",
+                    "out_shapes",
+                ],
+                tablefmt="fancy_grid",
+            )
+
+    summary_tab, tot_dev_time, tot_host_time = summary()
+    if args.print_only:
+        print(
+            {
+                "summary": lambda: summary_tab,
+                "device": lambda: prof_details("device", tot_dev_time),
+                "host": lambda: prof_details("host", tot_host_time),
+            }[args.print_only]()
+        )
+    else:
+        print(summary_tab)
+        print()
+        print(prof_details("device", tot_dev_time))
+        if args.show_host:
+            print()
+            print(prof_details("host", tot_host_time))
+
+
+if __name__ == "__main__":
+    main()
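[Editor's note: a usage sketch, not part of the patch; prof.json is a hypothetical profiler dump.]

    # as a CLI:
    #   python -m megengine.utils.profile_analyze prof.json -t 10 \
    #       --aggregate sum --aggregate-by type
    # or programmatically, through the passed_args parameter of main():
    from megengine.utils.profile_analyze import main

    main(["prof.json", "--top", "5", "--order-by", "time"])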
diff --git a/imperative/python/megengine/utils/profile_analyzer.py b/imperative/python/megengine/utils/profile_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..75cc0c0c7c6511dd0e5b232b0acd102e4ffe456d
--- /dev/null
+++ b/imperative/python/megengine/utils/profile_analyzer.py
@@ -0,0 +1,401 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import collections
+import copy
+import functools
+from typing import Callable, List, Optional, Union
+
+import numpy as np
+
+
+class NonExistNum:
+    """An object that behaves like a number but denotes a missing field; it
+    compares greater than any real number.
+    """
+
+    def __truediv__(self, _):
+        return self
+
+    def __add__(self, rhs):
+        return rhs
+
+    def __radd__(self, lhs):
+        return lhs
+
+    def __neg__(self):
+        return self
+
+    def __gt__(self, rhs):
+        if isinstance(rhs, NonExistNum):
+            return id(self) > id(rhs)
+        return True
+
+    def __ge__(self, rhs):
+        return self > rhs or self == rhs
+
+    def __lt__(self, rhs):
+        if isinstance(rhs, NonExistNum):
+            return id(self) < id(rhs)
+        return False
+
+    def __le__(self, rhs):
+        return self < rhs or self == rhs
+
+    def __eq__(self, rhs):
+        return self is rhs
+
+    def __format__(self, spec):
+        return "N/A"
+
+    def __repr__(self):
+        return "N/A"
+
+
+class OprProfRst:
+    """Opr profiling result dumped from the megengine profiler."""
+
+    opr_info = None
+    """A dict containing operator info: name, id and type."""
+
+    time_dict = None
+    """A mapping from ``"host"`` or ``"device"`` to list of profiling
+    results."""
+
+    footprint = None
+    """A mapping from ``"memory"`` or ``"computation"`` to the actual number
+    of corresponding operations."""
+
+    def __init__(self, entry: dict):
+        """Opr profiling initialization, which sets up name, type and id of opr_info.
+
+        :param entry: profiling json exec_graph items
+        """
+        assert isinstance(entry, dict)
+        self.opr_info = collections.OrderedDict()
+        for key in ["name", "type", "id"]:
+            self.opr_info[key] = entry[key]
+        self.time_dict = collections.defaultdict(list)
+        self.footprint = collections.defaultdict(NonExistNum)
+
+    def update_device_prof_info(self, dev_time: dict):
+        """Updates device profiling info.
+
+        :param dev_time: device time for a single opr,
+            an attribute of the profiling result.
+        """
+        assert isinstance(dev_time, dict)
+        self.time_dict["device"].append(copy.deepcopy(dev_time))
+
+    def update_host_prof_info(self, host_time: dict):
+        """Updates host profiling info.
+
+        :param host_time: host time for a single opr,
+            an attribute of the profiling result.
+        """
+        assert isinstance(host_time, dict)
+        self.time_dict["host"].append(copy.deepcopy(host_time))
+
+    def update_footprint(self, footprint: dict):
+        """Updates opr footprint.
+
+        :param footprint: footprint for a single opr,
+            an attribute of the profiling result.
+        """
+        assert isinstance(footprint, dict)
+        self.footprint.update(footprint)
+
+
+class Record:
+    """A record of the analyzing result."""
+
+    __slots__ = [
+        "time",
+        "info",
+        "computation",
+        "memory",
+        "in_shapes",
+        "in_layouts",
+        "out_shapes",
+        "flops",
+        "bandwidth",
+        "opr_id",
+    ]
+
+    def __init__(self, time: float, info: dict, footprint: dict):
+        """Initializes a single record.
+
+        :param time: opr running time, evaluated by applying the user-provided
+            function to OprProfRst.
+        :param info: opr information, either the original opr information or
+            aggregated information if aggregation is enabled.
+        :param footprint: contains footprint information; for now, we have
+            ``"computation"``, ``"memory"``, ``"in_shapes"``, ``"out_shapes"``.
+ """ + + assert isinstance(footprint, dict) + self.time = time + self.info = collections.OrderedDict(copy.deepcopy(info)) + self.computation = footprint["computation"] or NonExistNum() + self.memory = footprint["memory"] + self.in_shapes = footprint["in_shapes"] + self.in_layouts = footprint.get("in_layouts") + self.out_shapes = footprint["out_shapes"] + self.flops = self.computation / self.time + self.bandwidth = self.memory / self.time + self.opr_id = info.get("id") + if isinstance(self.opr_id, str) and self.opr_id != "N/A": + self.opr_id = int(self.opr_id) + + def get_column_by_name(self, name: str = None): + """extracts column value by its column name + + :param name: column name, None for time. + """ + + if name is None: + name = "time" + return getattr(self, name) + + +class ProfileAnalyzer: + def __init__(self, obj: dict, opr_filter: Callable = lambda opr, inp, out: True): + """Initializes ProfileAnalyzer + + :param obj: dict dumped from json str. + :param opr_filter: function that filter oprs. + """ + self._opr_set = dict() # type: dict + assert isinstance(obj, dict) + varz = obj["graph_exec"]["var"] + for opr_id, entry in obj["graph_exec"]["operator"].items(): + inp = [varz[i] for i in entry["input"]] + out = [varz[i] for i in entry["output"]] + if opr_filter(entry, inp, out): + self._opr_set[opr_id] = OprProfRst(entry) + + for opr_id, entry in obj["profiler"]["device"].items(): + if opr_id not in self._opr_set: + continue + opr = self._opr_set[opr_id] + for _, time in entry.items(): + opr.update_device_prof_info(time) + + for opr_id, entry in obj["profiler"]["host"].items(): + if opr_id not in self._opr_set: + continue + opr = self._opr_set[opr_id] + for _, time in entry.items(): + opr.update_host_prof_info(time) + + for opr_id, entry in obj["profiler"].get("opr_footprint", {}).items(): + if opr_id not in self._opr_set: + continue + opr = self._opr_set[opr_id] + opr.update_footprint(entry) + + def _aggregate( + self, records: List[Record], aop: Union[str, Callable], atype: Optional[str] + ) -> List[Record]: + """Aggregate operation + + :param records: selected records + :param aop: aggregate operation, if aop is str, we would replace it + with associated numpy function wth aop name" + :param atype: the type aggregated by, None for aggregating all into single + record. + """ + if aop is None: + assert atype is None, "must specify aggregate op" + return records + if isinstance(aop, str): + aop = getattr(np, aop) + type2stat = collections.defaultdict(lambda: [[], [], []]) # type: dict + for item in records: + if atype == "type": + d = type2stat[item.info["type"]] + else: + d = type2stat["all"] + d[0].append(item.time) + d[1].append(item.computation) + d[2].append(item.memory) + + rst = [] + for opr_type in type2stat.keys(): + time, computation, memory = type2stat[opr_type] + nr_oprs = len(time) + time_rst = aop(time) + comp_rst = aop(computation) + mem_rst = aop(memory) + + item = Record( + time_rst, + {"type": opr_type, "count": nr_oprs, "id": "N/A"}, + { + "computation": comp_rst, + "memory": mem_rst, + "in_shapes": None, + "out_shapes": None, + }, + ) + rst.append(item) + return rst + + def _sort(self, records: List[Record], sort_by: str) -> List[Record]: + """sort operation + + :param records: the records after aggregate operation. 
+        :param sort_by: keyword for sorting the list.
+        """
+        if sort_by is None:
+            return records
+        if sort_by.startswith("+"):
+            sort_by = sort_by[1:]
+            key = lambda record: record.get_column_by_name(sort_by)
+        else:
+            key = lambda record: -record.get_column_by_name(sort_by)
+        records.sort(key=key)
+        return records
+
+    def select(
+        self,
+        time_func: Callable,
+        opr_filter: Callable = lambda opr: True,
+        aggregate: Callable = None,
+        aggregate_by: str = None,
+        sort_by: str = None,
+        top_k: int = 0,
+    ) -> List[Record]:
+        """Select operation.
+
+        :param time_func: time function provided by the user, applied to every
+            OprProfRst.
+        :param opr_filter: filters operators.
+        :param aggregate: function applied to the lists of records that are
+            aggregated by atype.
+        :param aggregate_by: the type to aggregate by.
+        :param sort_by: keyword for sorting all records.
+        :param top_k: specify the maximum number of records.
+        :return: the records that went through select, aggregate and sort.
+        """
+
+        records = []
+        for opr in self._opr_set.values():
+            if opr_filter(opr):
+                time = time_func(opr)
+                if time is None:
+                    continue
+                item = Record(time, opr.opr_info, opr.footprint)
+                records.append(item)
+
+        records = self._aggregate(records, aggregate, aggregate_by)
+        if not records:
+            return records
+        return self._sort(records, sort_by)[0 : len(records) if top_k == 0 else top_k]
+
+
+class TimeFuncHelper:
+    """Time function helpers for users."""
+
+    @staticmethod
+    def _eval_time(prof_type, end_key, func, opr_prof):
+        """Eval time.
+
+        :type prof_type: str
+        :param prof_type: 'host' or 'device'
+        :type end_key: str
+        :param end_key: 'kern' or 'end'
+        :type func: function
+        :param func: applied to the list of all thread/gpu times.
+        :type opr_prof: `class OprProfRst`
+        :param opr_prof: operator profiling result
+        :rtype: float
+        :return: time
+        """
+
+        if prof_type not in opr_prof.time_dict:
+            return None
+        time = [time[end_key] - time["start"] for time in opr_prof.time_dict[prof_type]]
+        return func(time)
+
+    @staticmethod
+    def eval_time_func(prof_type: str, end_key: str, func: Callable) -> Callable:
+        """Makes a function that evals operator profile time.
+
+        :param prof_type: 'host' or 'device'
+        :param end_key: 'kern' or 'end'
+        :param func: applied to the list of all thread/gpu times.
+        :return: eval time function
+        """
+        return functools.partial(TimeFuncHelper._eval_time, prof_type, end_key, func)
+
+    @staticmethod
+    def _min_start(
+        prof_type, end_key, func, opr_prof
+    ):  # pylint: disable=unused-argument
+        """Eval minimum start time.
+
+        :type prof_type: str
+        :param prof_type: 'host' or 'device'
+        :type end_key: str
+        :param end_key: 'kern' or 'end'
+        :type func: function
+        :param func: applied to the list of all thread/gpu times.
+        :type opr_prof: `class OprProfRst`
+        :param opr_prof: operator profiling result
+        :rtype: float
+        :return: time
+        """
+        if prof_type not in opr_prof.time_dict:
+            return None
+        time = [time["start"] for time in opr_prof.time_dict[prof_type]]
+        return np.min(time)
+
+    @staticmethod
+    def min_start_func(
+        prof_type: str, end_key: str, func: Callable
+    ) -> Callable:  # pylint: disable=unused-argument
+        """Makes a function that evals operator profile min start time.
+
+        :param prof_type: 'host' or 'device'
+        :param end_key: 'kern' or 'end'
+        :param func: applied to the list of all thread/gpu times.
+        :return: eval time function
+        """
+        return functools.partial(TimeFuncHelper._min_start, prof_type, end_key, func)
+
+    @staticmethod
+    def _max_end(prof_type, end_key, func, opr_prof):  # pylint: disable=unused-argument
+        """Eval maximum end time.
+
+        :type prof_type: str
+        :param prof_type: 'host' or 'device'
+        :type end_key: str
+        :param end_key: 'kern' or 'end'
+        :type func: function
+        :param func: applied to the list of all thread/gpu times.
+        :type opr_prof: `class OprProfRst`
+        :param opr_prof: operator profiling result
+        :rtype: float
+        :return: time
+        """
+        if prof_type not in opr_prof.time_dict:
+            return None
+        time = [time["end"] for time in opr_prof.time_dict[prof_type]]
+        return np.max(time)
+
+    @staticmethod
+    def max_end_func(prof_type: str, end_key: str, func: Callable) -> Callable:
+        """Makes a function that evals operator profile max end time.
+
+        :param prof_type: 'host' or 'device'
+        :param end_key: 'kern' or 'end'
+        :param func: applied to the list of all thread/gpu times.
+        :return: eval time function
+        """
+        return functools.partial(TimeFuncHelper._max_end, prof_type, end_key, func)
diff --git a/imperative/python/megengine/utils/profiler.py b/imperative/python/megengine/utils/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..12dae2d2ff6406f4f26ea3ed9fc5c807973060a0
--- /dev/null
+++ b/imperative/python/megengine/utils/profiler.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+from typing import Optional
+
+from ..core._imperative_rt import ProfilerImpl
+from ..core._imperative_rt.imperative import sync
+
+
+class Profiler:
+    def __init__(self, path: Optional[str] = None):
+        self.impl = ProfilerImpl(path)
+
+    def __enter__(self):
+        sync()
+        self.impl.enable()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        sync()
+        self.impl.disable()
+
+    def dump(self, path: Optional[str] = None):
+        self.impl.dump(path)
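[Editor's note: a usage sketch, not part of the patch; run_one_step stands for arbitrary code issuing megengine operations, and the output path is hypothetical. The resulting JSON can then be fed to profile_analyze above.]

    from megengine.utils.profiler import Profiler

    p = Profiler("prof.json")   # hypothetical output path
    with p:
        run_one_step()          # any code executing megengine ops (assumed)
    p.dump()                    # re-dump explicitly if needed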
diff --git a/imperative/python/megengine/utils/types.py b/imperative/python/megengine/utils/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..465ca03ce68f02d3944ddb87f5b0d4abde5ef9f9
--- /dev/null
+++ b/imperative/python/megengine/utils/types.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import collections.abc
+import functools
+
+
+def get_ndtuple(value, *, n, allow_zero=True):
+    r"""Converts a scalar or a possibly 1d tuple to an n-d tuple.
+
+    :type allow_zero: bool
+    :param allow_zero: whether zero tuple values are allowed"""
+    if not isinstance(value, collections.abc.Iterable):
+        value = int(value)
+        value = tuple([value for i in range(n)])
+    else:
+        assert len(value) == n, "tuple len is not equal to n: {}".format(value)
+        spatial_axis = map(int, value)
+        value = tuple(spatial_axis)
+    if allow_zero:
+        minv = 0
+    else:
+        minv = 1
+    assert min(value) >= minv, "invalid value: {}".format(value)
+    return value
+
+
+_single = functools.partial(get_ndtuple, n=1, allow_zero=True)
+_pair = functools.partial(get_ndtuple, n=2, allow_zero=True)
+_pair_nonzero = functools.partial(get_ndtuple, n=2, allow_zero=False)
+_triple = functools.partial(get_ndtuple, n=3, allow_zero=True)
+_quadruple = functools.partial(get_ndtuple, n=4, allow_zero=True)
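[Editor's note: a quick sketch of the helpers above, not part of the patch.]

    from megengine.utils.types import _pair, _pair_nonzero, get_ndtuple

    _pair(3)              # -> (3, 3)
    _pair((2, 4))         # -> (2, 4)
    get_ndtuple(5, n=3)   # -> (5, 5, 5)
    _pair_nonzero(0)      # AssertionError: zero rejected when allow_zero=False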
diff --git a/imperative/python/megengine/version.py b/imperative/python/megengine/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ce488fb676a8fee1611889924a2064f7e100f9
--- /dev/null
+++ b/imperative/python/megengine/version.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+__version__ = "0.8.0"
+
diff --git a/imperative/python/requires-style.txt b/imperative/python/requires-style.txt
new file mode 100644
index 0000000000000000000000000000000000000000..899aac5275ae83a271837340ac99fddea04f6b1e
--- /dev/null
+++ b/imperative/python/requires-style.txt
@@ -0,0 +1,4 @@
+black==19.10b0
+isort==4.3.21
+pylint==2.4.3
+mypy==0.750
diff --git a/imperative/python/requires-test.txt b/imperative/python/requires-test.txt
new file mode 100644
index 0000000000000000000000000000000000000000..545de8af81210195442467c3e0c245ea83dc6e3a
--- /dev/null
+++ b/imperative/python/requires-test.txt
@@ -0,0 +1 @@
+pytest==5.3.0
diff --git a/imperative/python/requires.txt b/imperative/python/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a2d8a55df078d1b40a2d8b46a75035f485c0a263
--- /dev/null
+++ b/imperative/python/requires.txt
@@ -0,0 +1,8 @@
+numpy>=1.18
+multipledispatch==0.6.0
+opencv-python
+pyarrow
+requests
+tabulate
+tqdm
+redispy
diff --git a/imperative/python/setup.py b/imperative/python/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..e583cce4412f89331ef8de62eb1455b46f75a524
--- /dev/null
+++ b/imperative/python/setup.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import os
+import pathlib
+from distutils.file_util import copy_file
+from setuptools import setup, find_packages, Extension
+from setuptools.command.build_ext import build_ext as _build_ext
+
+class PrecompiledExtension(Extension):
+    def __init__(self, name):
+        super().__init__(name, sources=[])
+
+class build_ext(_build_ext):
+
+    def build_extension(self, ext):
+        if not isinstance(ext, PrecompiledExtension):
+            return super().build_extension(ext)
+
+        # copy the prebuilt .so into the build tree (nothing to do in-place,
+        # since the module is already next to the sources)
+        if not self.inplace:
+            fullpath = self.get_ext_fullpath(ext.name)
+            extdir = pathlib.Path(fullpath)
+            extdir.parent.mkdir(parents=True, exist_ok=True)
+
+            modpath = self.get_ext_fullname(ext.name).split('.')
+            modpath[-1] += '.so'
+            modpath = str(pathlib.Path(*modpath).resolve())
+
+            copy_file(modpath, fullpath, verbose=self.verbose, dry_run=self.dry_run)
+
+package_name = 'MegEngine'
+
+v = {}
+with open("megengine/version.py") as fp:
+    exec(fp.read(), v)
+__version__ = v['__version__']
+
+email = 'megengine@megvii.com'
+local_version = os.environ.get('LOCAL_VERSION')
+if local_version:
+    __version__ = '{}+{}'.format(__version__, local_version)
+
+packages = find_packages(exclude=['test'])
+
+with open('requires.txt') as f:
+    requires = f.read().splitlines()
+with open('requires-style.txt') as f:
+    requires_style = f.read().splitlines()
+with open('requires-test.txt') as f:
+    requires_test = f.read().splitlines()
+
+setup_kwargs = dict(
+    name=package_name,
+    version=__version__,
+    description='Framework for numerical evaluation with '
+    'auto-differentiation',
+    author='Megvii Engine Team',
+    author_email=email,
+    packages=packages,
+    ext_modules=[PrecompiledExtension('megengine.core._imperative_rt')],
+    install_requires=requires,
+    extras_require={
+        'dev': requires_style + requires_test,
+        'ci': requires_test,
+    },
+    cmdclass={'build_ext': build_ext},
+)
+
+setup_kwargs.update(dict(
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Education',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: Apache Software License',
+        'Programming Language :: C++',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Topic :: Scientific/Engineering',
+        'Topic :: Scientific/Engineering :: Mathematics',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence',
+        'Topic :: Software Development',
+        'Topic :: Software Development :: Libraries',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+    ],
+    license='Apache 2.0',
+    keywords='megengine deep learning',
+    data_files=[("megengine", [
+        "../LICENSE",
+        "../ACKNOWLEDGMENTS",
+    ])]
+))
+
+setup(**setup_kwargs)
diff --git a/imperative/python/src/common.cpp b/imperative/python/src/common.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..047a0f80f2e45ed8b3973fcc1d3b4d4bcd0615ab
--- /dev/null
+++ b/imperative/python/src/common.cpp
@@ -0,0 +1,117 @@
+#include "./common.h"
+
+#include <pybind11/operators.h>
+
+#include "megbrain/comp_node.h"
+#include "megbrain/graph.h"
+#include "megbrain/imperative/physical_tensor.h"
+#include "./numpy_dtypes.h"
+#include "./helper.h"
+
+namespace py = pybind11;
+using namespace mgb;
+using namespace imperative;
+
+void init_common(py::module m) {
+    py::class_<CompNode>(m, "CompNode")
+        .def(py::init<>())
+        .def(py::init(py::overload_cast<const std::string&>(&CompNode::load)))
+        .def("__str__", &CompNode::to_string)
.def_static("_sync_all", &CompNode::sync_all) + .def(py::self == py::self) + .def_static("_get_device_count", &CompNode::get_device_count, + "Get total number of specific devices on this system") + .def(py::pickle( + [](const CompNode& cn) { + return py::str(cn.to_string_logical()); + }, + [](py::str cn) { + return CompNode::load(cn); + })); + + py::implicitly_convertible(); + + py::class_(m, "DeviceTensorND") + .def(py::init()) + .def_property_readonly("shape", py::overload_cast<>(&DeviceTensorND::shape, py::const_)) + .def_property_readonly("dtype", py::overload_cast<>(&DeviceTensorND::dtype, py::const_)) + .def_property_readonly("comp_node", py::overload_cast<>(&DeviceTensorND::comp_node, py::const_)) + .def("numpy", [](const DeviceTensorND& self) { + HostTensorND hv; + hv.copy_from(self).sync(); + return py::handle(npy::ndarray_from_tensor(hv, npy::ShareType::TRY_SHARE)); + }); + + py::class_(m, "OperatorNodeConfig") + .def(py::init()) + .def_property("name", + [](const OperatorNodeConfig& config) -> py::object { + auto name = config.name(); + if (name.valid()) { + return py::str(name.val()); + } else { + return py::none(); + } + }, + [](OperatorNodeConfig& config, std::string name){ + config.name(std::move(name)); + }) + .def_property("dtype", + [](const OperatorNodeConfig& config) { + return config.output_dtype(); + }, + [](OperatorNodeConfig& config, DType dtype) { + config.output_dtype(dtype); + }) + .def_property("comp_node_arr", + [](const OperatorNodeConfig& config) -> py::tuple { + auto arr = config.comp_node(); + std::vector tmp(arr.begin(), arr.end()); + return py::cast(tmp); + }, + [](OperatorNodeConfig& config, std::vector cns) { + config.comp_node_arr({cns.begin(), cns.end()}); + }) + .def_property("comp_node", + [](const OperatorNodeConfig& config) { + auto arr = config.comp_node(); + if (arr.size() != 1) { + throw py::value_error("invalid number of comp_node"); + } + return arr[0]; + }, + [](OperatorNodeConfig& config, CompNode cn) { + OperatorNodeConfig::CompNodeArray arr{cn}; + config.comp_node_arr(arr); + }); + + py::class_(m, "TensorAttr") + .def(py::init()) + .def(py::init([](const TensorShape& shape, const DType& dtype, const CompNode& comp_node){ + return LogicalTensorDesc{TensorLayout{shape, dtype}, comp_node}; + })) + .def_property("shape", + [](const LogicalTensorDesc& desc) { + return static_cast(desc.layout); + }, + [](LogicalTensorDesc& desc, TensorShape shape) { + }) + .def_property("dtype", + [](const LogicalTensorDesc& desc) { + return desc.layout.dtype; + }, + [](LogicalTensorDesc& desc, DType dtype) { + desc.layout.dtype = dtype; + }) + .def_readwrite("comp_node", &LogicalTensorDesc::comp_node); + + py::enum_(m, "DeviceType") + .value("UNSPEC", CompNode::DeviceType::UNSPEC) + .value("CUDA", CompNode::DeviceType::CUDA) + .value("CPU", CompNode::DeviceType::CPU) + .value("MULTITHREAD", CompNode::DeviceType::MULTITHREAD) + .value("MAX_DEVICE_ID", CompNode::DeviceType::MAX_DEVICE_ID); + + init_npy_num_bfloat16(m); + init_npy_num_intbx(m); +} diff --git a/imperative/python/src/common.h b/imperative/python/src/common.h new file mode 100644 index 0000000000000000000000000000000000000000..582019b817ce137e1c30d008bb1dcd37977322be --- /dev/null +++ b/imperative/python/src/common.h @@ -0,0 +1,5 @@ +#pragma once + +#include "./helper.h" + +void init_common(pybind11::module m); diff --git a/imperative/python/src/graph_rt.cpp b/imperative/python/src/graph_rt.cpp new file mode 100644 index 
index 0000000000000000000000000000000000000000..022bbf8c90e7d64442747454a989be9e4b94e706
--- /dev/null
+++ b/imperative/python/src/graph_rt.cpp
@@ -0,0 +1,191 @@
+#include "./graph_rt.h"
+
+#include "megbrain/imperative/opr_utility.h"
+#include "megbrain/opr/basic_arith.h"
+#include "megbrain/imperative.h"
+#include "./helper.h"
+
+namespace py = pybind11;
+
+using namespace mgb;
+using namespace imperative;
+
+#define DEF_READWRITE(name) .def_readwrite(#name, &CURRENT_CLASS::name)
+
+template <typename T>
+auto def_rendezvous(py::object m, const char* name) {
+    return py::class_<Rendezvous<T>, std::shared_ptr<Rendezvous<T>>>(m, name)
+        .def(py::init([](){return std::make_shared<Rendezvous<T>>();}))
+        .def("set", [](Rendezvous<T>& r, T v) {r.set(std::move(v));})
+        .def("get", [](Rendezvous<T>& r) {return r.get();}, py::call_guard<py::gil_scoped_release>())
+        .def("reset", &Rendezvous<T>::reset);
+}
+
+using TensorAttr = LogicalTensorDesc;
+
+void init_graph_rt(py::module m) {
+    def_rendezvous<DeviceTensorND>(m, "DeviceTensorNDRendezvous");
+
+    def_rendezvous<TensorAttr>(m, "TensorAttrRendezvous");
+
+    py::class_<cg::VarNode, GraphNodePtr<cg::VarNode>>(m, "VarNode")
+        .def_property_readonly("owner", [](cg::VarNode* v) {return v->owner_opr();})
+        .def_property_readonly("graph", [](cg::VarNode* v) {return v->owner_graph();})
+        .def_property_readonly("dtype", [](cg::VarNode* v) {return v->dtype();})
+        .def_property_readonly("comp_node", [](cg::VarNode* v) {return v->comp_node();});
+
+    py::class_<cg::OperatorNodeBase, GraphNodePtr<cg::OperatorNodeBase>>(m, "OperatorNode")
+        .def_property_readonly("graph", [](cg::OperatorNodeBase* opr) {return opr->owner_graph();})
+        .def_property_readonly("inputs", [](cg::OperatorNodeBase* opr) {
+                return to_tuple(opr->input());
+            })
+        .def_property_readonly("outputs", [](cg::OperatorNodeBase* opr) {
+                return to_tuple(opr->output());
+            });
+
+    py::class_<cg::AsyncExecutable>(m, "AsyncExecutable")
+        .def("execute", &cg::AsyncExecutable::execute, py::call_guard<py::gil_scoped_release>())
+        .def("wait", &cg::AsyncExecutable::wait, py::call_guard<py::gil_scoped_release>());
+
+    auto PyComputingGraph = py::class_<cg::ComputingGraph, std::shared_ptr<cg::ComputingGraph>>(m, "ComputingGraph")
+        .def(py::init(py::overload_cast<>(&cg::ComputingGraph::make)))
+        .def("compile", [](cg::ComputingGraph& graph, const std::vector<cg::VarNode*>& dest_vars) {
+                mgb_assert(!dest_vars.empty());
+                cg::ComputingGraph::OutputSpec spec;
+                for (auto v : dest_vars) {
+                    spec.emplace_back(v, nullptr);
+                }
+                return graph.compile(spec);
+            })
+        .def_property_readonly("options", py::overload_cast<>(&cg::ComputingGraph::options));
+
+#define CURRENT_CLASS cg::ComputingGraph::Options
+
+    auto PyComputingGraphOptions = py::class_<cg::ComputingGraph::Options>(PyComputingGraph, "Options")
+        // DEF_READWRITE(opr_attribute)
+        DEF_READWRITE(seq_opt)
+        DEF_READWRITE(graph_opt)
+        DEF_READWRITE(graph_opt_level)
+        DEF_READWRITE(log_level)
+        DEF_READWRITE(async_exec_level)
+        DEF_READWRITE(force_dynamic_alloc)
+        DEF_READWRITE(var_sanity_check_first_run)
+        DEF_READWRITE(allocate_static_mem_after_graph_compile)
+        DEF_READWRITE(fake_next_exec)
+        DEF_READWRITE(enable_sublinear_memory_opt)
+        DEF_READWRITE(no_profiling_on_shape_change)
+        DEF_READWRITE(enable_var_mem_defragment)
+        DEF_READWRITE(enable_grad_var_static_reshape)
+        DEF_READWRITE(enable_memory_swap)
+        DEF_READWRITE(comp_node_seq_record_level)
+        // DEF_READWRITE(eager_evaluation)
+        // DEF_READWRITE(imperative_proxy_graph)
+        // DEF_READWRITE(extra_vardeps)
+        // DEF_READWRITE(user_data)
+        ;
+
+#undef CURRENT_CLASS
+#define CURRENT_CLASS cg::ComputingGraph::Options::SeqOpt
+
+    py::class_<cg::ComputingGraph::Options::SeqOpt>(PyComputingGraphOptions, "SeqOpt")
+        DEF_READWRITE(enable_mem_plan_opt)
+        DEF_READWRITE(enable_mem_reuse_alloc)
+        DEF_READWRITE(enable_seq_comp_node_opt);
+
+#undef CURRENT_CLASS
+#define CURRENT_CLASS cg::ComputingGraph::Options::GraphOpt
+
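+    // Editor's note: CURRENT_CLASS together with DEF_READWRITE is a small
+    // macro idiom: DEF_READWRITE(x) expands to
+    // .def_readwrite("x", &CURRENT_CLASS::x), so redefining CURRENT_CLASS
+    // lets the same one-line list be reused for Options, SeqOpt and GraphOpt
+    // without repeating the class name each time.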
+    py::class_<cg::ComputingGraph::Options::GraphOpt>(PyComputingGraphOptions, "GraphOpt")
+        DEF_READWRITE(jit)
+        DEF_READWRITE(tensorrt);
+
+#undef CURRENT_CLASS
+
+    auto common = rel_import("common", m, 1);
+
+    common.def("invoke_op", [](const OpDef& def, const std::vector<cg::VarNode*> inputs, cg::ComputingGraph* graph) {
+            cg::VarNodeArray vinputs(inputs.begin(), inputs.end());
+            auto opr = OpDef::apply_on_var_node(def, vinputs);
+            auto outputs = opr->output();
+            return to_tuple(outputs);
+        },
+        py::arg(), py::arg(), py::arg("graph") = py::none());
+
+    auto input_callback = [](auto callback,
+                             const CompNode& comp_node,
+                             const DType& dtype,
+                             const std::vector<cg::VarNode*>& inputs,
+                             cg::ComputingGraph* graph) {
+        if (!graph) {
+            graph = inputs[0]->owner_graph();
+        }
+        SymbolVarArray sinputs;
+        for (auto i : inputs) {
+            sinputs.emplace_back(i);
+        }
+        static_assert(!std::is_reference<decltype(callback)>::value);
+        auto soutputs = opr::InputCallback::make(*graph, std::move(callback), comp_node, dtype, sinputs);
+        std::vector<cg::VarNode*> outputs;
+        outputs.reserve(soutputs.size());
+        for (auto i : soutputs) {
+            outputs.push_back(i.node());
+        }
+        return outputs;
+    };
+
+    m.def("input_callback", [input_callback](std::function<DeviceTensorND()> callback,
+                                             const CompNode& comp_node,
+                                             const DType& dtype,
+                                             const std::vector<cg::VarNode*>& inputs,
+                                             cg::ComputingGraph* graph) {
+            return input_callback([f=std::move(callback)](){py::gil_scoped_acquire _; return f();}, comp_node, dtype, inputs, graph);
+        },
+        py::arg(), py::arg(), py::arg(), py::arg() = py::tuple(), py::arg("graph") = py::none());
+
+    m.def("input_callback", [input_callback](std::shared_ptr<Rendezvous<DeviceTensorND>> p,
+                                             const CompNode& comp_node,
+                                             const DType& dtype,
+                                             const std::vector<cg::VarNode*>& inputs,
+                                             cg::ComputingGraph* graph) {
+            auto f = [p]() -> DeviceTensorND {
+                return p->get();
+            };
+            return input_callback(std::move(f), comp_node, dtype, inputs, graph);
+        },
+        py::arg(), py::arg(), py::arg(), py::arg() = py::tuple(), py::arg("graph") = py::none());
+
+    auto output_callback = [](auto callback, const std::vector<cg::VarNode*>& inputs, bool borrow = false) {
+        SymbolVarArray sinputs;
+        for (auto i : inputs) {
+            sinputs.emplace_back(i);
+        }
+        static_assert(!std::is_reference<decltype(callback)>::value);
+        opr::OutputCallback::Param param{std::move(callback), borrow};
+        auto output = opr::OutputCallback::make(std::move(param), sinputs);
+        return output.node();
+    };
+
+    m.def("output_callback", [output_callback](std::function<void(DeviceTensorND)> callback, std::vector<cg::VarNode*> inputs) {
+        auto f = [f=std::move(callback)](DeviceTensorND dv) {
+            auto task = [f=std::move(f), dv=std::move(dv)]() {
+                f(dv);
+            };
+            py_task_q.add_task(std::move(task));
+        };
+        return output_callback(std::move(f), std::move(inputs));
+    });
+
+    m.def("output_callback", [output_callback](std::shared_ptr<Rendezvous<DeviceTensorND>> p, std::vector<cg::VarNode*> inputs) {
+        auto f = [p](DeviceTensorND dv) {
+            p->set(std::move(dv));
+        };
+        return output_callback(std::move(f), std::move(inputs));
+    });
+
+    m.def("attr_output_callback", [output_callback](std::shared_ptr<Rendezvous<TensorAttr>> p, std::vector<cg::VarNode*> inputs) {
+        auto f = [p](DeviceTensorND dv) {
+            p->set(TensorAttr{TensorLayout{dv.shape(), dv.dtype()}, dv.comp_node()});
+        };
+        return output_callback(std::move(f), std::move(inputs), true);
+    });
+}
diff --git a/imperative/python/src/graph_rt.h b/imperative/python/src/graph_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbc127c45dc071d2be0f75c5cfeb855b3323d9a6
--- /dev/null
+++ b/imperative/python/src/graph_rt.h
@@ -0,0 +1,78 @@
+#pragma once
+
+#include "./helper.h"
+
+#include <mutex>
+#include <future>
+#include <memory>
+
+#include "megbrain/graph.h"
+
+template <typename T>
+class GraphNodePtr {
+    std::shared_ptr<mgb::cg::ComputingGraph> m_graph;
+    T* m_node;
+public:
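+    // Editor's note: this holder pins the owning ComputingGraph via
+    // shared_from_this() for as long as Python holds the node, so VarNode /
+    // OperatorNode objects handed to Python cannot dangle.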
+    GraphNodePtr(T* node) :
+            m_graph(node ? node->owner_graph()->shared_from_this() : nullptr),
+            m_node(node) {}
+    T* operator->() {return m_node;}
+    T& operator*() {return *m_node;}
+    operator bool() {return m_node;}
+    T* get() {return m_node;}
+};
+
+PYBIND11_DECLARE_HOLDER_TYPE(T, GraphNodePtr<T>, true);
+
+template <typename R>
+class Rendezvous {
+    std::mutex m_lock;
+    int m_read_ahead = 0;
+    std::promise<R> m_promise;
+public:
+    Rendezvous() = default;
+    Rendezvous(const Rendezvous& rhs) = delete;
+    Rendezvous(Rendezvous&& rhs) = default;
+    Rendezvous& operator=(const Rendezvous& rhs) = delete;
+    Rendezvous& operator=(Rendezvous&& rhs) {
+        MGB_LOCK_GUARD(m_lock);
+        m_read_ahead = rhs.m_read_ahead;
+        m_promise = std::move(rhs.m_promise);
+        return *this;
+    }
+
+    R get() {
+        std::future<R> f;
+        {
+            MGB_LOCK_GUARD(m_lock);
+            mgb_assert(m_read_ahead <= 0);
+            mgb_assert(m_read_ahead >= -1);
+            f = m_promise.get_future();
+            if (m_read_ahead == -1) {
+                m_promise = {};
+            }
+            ++m_read_ahead;
+        }
+        return f.get();
+    }
+
+    template <typename T>
+    void set(T&& value) {
+        MGB_LOCK_GUARD(m_lock);
+        mgb_assert(m_read_ahead >= 0);
+        mgb_assert(m_read_ahead <= 1);
+        m_promise.set_value(std::forward<T>(value));
+        if (m_read_ahead == 1) {
+            m_promise = {};
+        }
+        --m_read_ahead;
+    }
+
+    void reset() {
+        MGB_LOCK_GUARD(m_lock);
+        m_promise = {};
+        m_read_ahead = 0;
+    }
+};
+
+void init_graph_rt(pybind11::module m);
diff --git a/imperative/python/src/helper.cpp b/imperative/python/src/helper.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a1b8b27759e7b3873d60223023fa84b1049dd687
--- /dev/null
+++ b/imperative/python/src/helper.cpp
@@ -0,0 +1,705 @@
+#include "./helper.h"
+
+#include <Python.h>
+
+#include "megbrain/graph/exc_extra_info.h"
+#include "megbrain/graph/event.h"
+#include "megbrain/graph/cg.h"
+#include "megbrain/tensor.h"
+#include "megbrain/utils/mempool.h"
+#include "./numpy_dtypes.h"
+
+/*
+ * demangle typeid, see
+ * http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname
+ */
+#ifdef __GNUG__
+#include <cstdlib>
+#include <memory>
+#include <cxxabi.h>
+
+namespace py = pybind11;
+
+PyTaskDipatcher py_task_q = {};
+
+py::module submodule(py::module parent, const char* name, const char* doc) {
+    auto m = parent.def_submodule(name, doc);
+    m.attr("__package__") = parent.attr("__name__");
+    m.attr("__builtins__") = py::module::import("builtins");
+    return m;
+}
+
+py::module rel_import(py::str name, py::module m, int level) {
+    py::object import = py::module::import("builtins").attr("__import__");
+    return import(name, m.attr("__dict__"), py::arg("level")=level);
+}
+
+namespace {
+
+std::string demangle_typeid(const char* name) {
+
+    int status = -4; // some arbitrary value to eliminate the compiler warning
+
+    // enable c++11 by passing the flag -std=c++11 to g++
+    std::unique_ptr<char, void(*)(void*)> res {
+        abi::__cxa_demangle(name, nullptr, nullptr, &status),
+        std::free
+    };
+
+    return (status==0) ?
+            res.get() : name ;
+}
+}
+#else
+
+namespace {
+// does nothing if not g++
+std::string demangle_typeid(const char* name) {
+    return name;
+}
+}
+
+#endif
+
+using namespace mgb;
+using namespace cg;
+
+namespace {
+
+    std::string repr_pyobj(PyObject *obj) {
+        if (!obj)
+            return "<null PyObject>";
+        PYTHON_GIL;
+        auto str = PyObject_Repr(obj);
+        if (!str)
+            return ssprintf("<PyObject at %p (repr failed)>", obj);
+        std::string ret{PyUnicode_AsUTF8(str)};
+        Py_DECREF(str);
+        return ret;
+    }
+
+    template <typename T>
+    std::string typeid_name(const T &t) {
+        return demangle_typeid(typeid(t).name());
+    }
+
+} // anonymous namespace
+
+/* ============== PyExceptionForward ============== */
+
+PyExceptionForward::~PyExceptionForward() {
+    PYTHON_GIL;
+    PyObjRefKeeper::deleter(m_type);
+    PyObjRefKeeper::deleter(m_value);
+    PyObjRefKeeper::deleter(m_traceback);
+}
+
+void PyExceptionForward::restore() {
+    PyErr_Restore(m_type, m_value, m_traceback);
+    m_type = m_value = m_traceback = nullptr;
+}
+
+void PyExceptionForward::throw_() {
+    PyObject *etype, *obj, *trace;
+    PyErr_Fetch(&etype, &obj, &trace);
+    PyErr_NormalizeException(&etype, &obj, &trace);
+
+    std::string msg{"python exception"};
+    bool succ = false;
+    if (etype && obj && trace) {
+        auto run = [&]() {
+#define DEF(name, expr) \
+    PyObjRefKeeper name{expr}; \
+    if (!name.get()) \
+        return
+            DEF(mod, PyImport_ImportModule("traceback"));
+            DEF(result, PyObject_CallMethod(mod.get(), "format_exception",
+                                            "(OOO)", etype, obj, trace));
+            if (!PyList_Check(result.get()))
+                return;
+            auto size = PyList_Size(result.get());
+            msg.append(":\n");
+            for (Py_ssize_t i = 0; i < size; ++i) {
+                msg.append(" ");
+                msg.append(PyUnicode_AsUTF8(PyList_GetItem(result.get(), i)));
+            }
+            msg.pop_back(); // remove last \n
+            succ = true;
+#undef DEF
+        };
+        run();
+    }
+    if (!succ) {
+        PyObject* obj_str_py;
+        if (obj && (obj_str_py = PyObject_Repr(obj))) {
+            msg.append(" with message ");
+            msg.append(PyUnicode_AsUTF8(obj_str_py));
+            Py_DECREF(obj_str_py);
+        } else {
+            msg.append(" with unknown message");
+        }
+    }
+    // throwing exception may cause abort due to unknown reasons; so we first
+    // log the message
+    mgb_log_error("caught exception from python callback: %s", msg.c_str());
+    fflush(stdout);
+    fflush(stderr);
+    throw PyExceptionForward{etype, obj, trace, msg};
+}
+
+/* ============== namespace npy ============== */
+
+namespace {
+
+int to_mgb_supported_dtype_raw(int dtype) {
+    if (dtype == NPY_INT64)
+        return NPY_INT32;
+    if (dtype == NPY_FLOAT64)
+        return NPY_FLOAT32;
+    return dtype;
+}
+
+#define FOREACH_NPY_DTYPE_PAIR(cb) \
+    cb(Uint8, NPY_UINT8) \
+    cb(Int8, NPY_INT8) \
+    cb(Int16, NPY_INT16) \
+    cb(Int32, NPY_INT32) \
+    cb(Float16, NPY_FLOAT16) \
+    cb(Float32, NPY_FLOAT32) \
+    cb(Bool, NPY_BOOL)
+
+#define FOREACH_NPY_MGB_DTYPE_PAIR(cb) \
+    FOREACH_NPY_DTYPE_PAIR(cb) \
+    FOREACH_MGB_DTYPE_PAIR(cb)
+
+
+
+//! convert megbrain dtype to numpy dtype
+int dtype_mgb2np_raw(DType dtype) {
+    mgb_assert(dtype.valid(), "attempt to convert from invalid dtype");
+    switch (dtype.enumv()) {
+#define cb(_m, _n) \
+    case DTypeEnum::_m: \
+        return _n;
+    FOREACH_NPY_MGB_DTYPE_PAIR(cb)
+#undef cb
+        default:
+            break;
+    }
+    throw ConversionError(ssprintf(
+            "can not convert dtype %s to numpy dtype", dtype.name()));
+}
+
+struct PyArrayDescrDeleter {
+    void operator()(PyArray_Descr* obj) {
+        Py_XDECREF(obj);
+    }
+};
+
+//! Convert MegBrain DType to NumPy DType descriptor; the caller receives a new
+//! reference to the descriptor.
+std::unique_ptr<PyArray_Descr, PyArrayDescrDeleter> dtype_mgb2np_descr(
+        DType dtype) {
+    PYTHON_GIL;
+    mgb_assert(dtype.valid(), "attempt to convert from invalid dtype");
+    auto build_mgb_dtype_dict =
+            [](const char* name,
+               const std::vector<std::pair<const char*, PyObject*>>& data) {
+                PyObject* metadata = PyDict_New();
+                PyObject* mgb_dtype_metadata = PyDict_New();
+                PyDict_SetItemString(mgb_dtype_metadata, "name",
+                                     PyUnicode_FromString(name));
+                for (const auto& d : data) {
+                    PyDict_SetItemString(mgb_dtype_metadata, d.first, d.second);
+                }
+                PyDict_SetItemString(metadata, "mgb_dtype", mgb_dtype_metadata);
+                return metadata;
+            };
+    if (dtype.has_param()) {
+        PyArray_Descr* type_descr;
+        switch (dtype.enumv()) {
+            case DTypeEnum::Quantized4Asymm: {
+                auto& param = dtype.param<dtype::Quantized4Asymm>();
+                type_descr = PyArray_DescrNewFromType(NPY_UINT8);
+                type_descr->metadata = build_mgb_dtype_dict(
+                        DTypeTrait<dtype::Quantized4Asymm>::name,
+                        {{"scale", PyFloat_FromDouble(param.scale)},
+                         {"zero_point", PyLong_FromLong(param.zero_point)}});
+                break;
+            }
+            case DTypeEnum::QuantizedS4: {
+                auto& param = dtype.param<dtype::QuantizedS4>();
+                type_descr = PyArray_DescrNewFromType(NPY_INT8);
+                type_descr->metadata = build_mgb_dtype_dict(
+                        DTypeTrait<dtype::QuantizedS4>::name,
+                        {{"scale", PyFloat_FromDouble(param.scale)}});
+                break;
+            }
+            case DTypeEnum::Quantized8Asymm: {
+                auto& param = dtype.param<dtype::Quantized8Asymm>();
+                type_descr = PyArray_DescrNewFromType(NPY_UINT8);
+                type_descr->metadata = build_mgb_dtype_dict(
+                        DTypeTrait<dtype::Quantized8Asymm>::name,
+                        {{"scale", PyFloat_FromDouble(param.scale)},
+                         {"zero_point", PyLong_FromLong(param.zero_point)}});
+                break;
+            }
+            case DTypeEnum::QuantizedS8: {
+                auto& param = dtype.param<dtype::QuantizedS8>();
+                type_descr = PyArray_DescrNewFromType(NPY_INT8);
+                type_descr->metadata = build_mgb_dtype_dict(
+                        DTypeTrait<dtype::QuantizedS8>::name,
+                        {{"scale", PyFloat_FromDouble(param.scale)}});
+                break;
+            }
+            case DTypeEnum::QuantizedS32: {
+                auto& param = dtype.param<dtype::QuantizedS32>();
+                type_descr = PyArray_DescrNewFromType(NPY_INT32);
+                type_descr->metadata = build_mgb_dtype_dict(
+                        DTypeTrait<dtype::QuantizedS32>::name,
+                        {{"scale", PyFloat_FromDouble(param.scale)}});
+                break;
+            }
+            default:
+                mgb_throw(ConversionError, "unhandled parameterized DType %s",
+                          dtype.name());
+        }
+        return std::unique_ptr<PyArray_Descr, PyArrayDescrDeleter>(type_descr);
+    }
+    PyArray_Descr* basic_descr = PyArray_DescrFromType(dtype_mgb2np_raw(dtype));
+    mgb_assert(basic_descr != nullptr,
+               "failed to convert expected dtype to numpy type descriptor");
+    return std::unique_ptr<PyArray_Descr, PyArrayDescrDeleter>(basic_descr);
+}
+
+DType dtype_np2mgb_raw(int npt) {
+    switch (npt) {
+#define cb(_m, _n) \
+    case _n: \
+        return dtype::_m();
+    FOREACH_NPY_DTYPE_PAIR(cb)
+#undef cb
+    }
+#define cb(_m, _n) \
+    if (_n == npt) return dtype::_m();
+    FOREACH_MGB_DTYPE_PAIR(cb)
+#undef cb
+
+    PYTHON_GIL;
+    std::string msg;
+    auto py_obj = PyArray_TypeObjectFromType(npt);
+    if (!py_obj) {
+        msg = ssprintf("unknown numpy dtype enum %d", npt);
+    } else {
+        msg = ssprintf("unsupported numpy dtype %s",
+                       repr_pyobj(py_obj).c_str());
+    }
+    Py_XDECREF(py_obj);
+    throw ConversionError(msg);
+}
+
+DType dtype_np2mgb_descr(PyArray_Descr* descr) {
+    PYTHON_GIL;
+    auto handle_parameterized_dtype = [](PyObject* metadata) -> DType {
+        mgb_assert(PyDict_Check(metadata),
+                   "Invalid parameterized DType metadata: should be a dict");
+        PyObject* dtype_name_py = PyDict_GetItemString(metadata, "name");
+        mgb_assert(
+                PyUnicode_Check(dtype_name_py),
+                "Invalid parameterized DType metadata: name should be a str");
+        std::string dtype_name(PyUnicode_AsUTF8(dtype_name_py));
+        if (dtype_name == "Quantized8Asymm") {
+            PyObject* scale_py = PyDict_GetItemString(metadata, "scale");
+            PyObject* zero_point_py =
"zero_point"); + mgb_assert(scale_py && zero_point_py, + "Invalid Quantized8Asymm metadata: missing scale or " + "zero_point."); + mgb_assert( + PyFloat_Check(scale_py), + "Invalid Quantized8Asymm metadata: scale should be float"); + mgb_assert(PyLong_Check(zero_point_py), + "Invalid Quantized8Asymm metadata: zero_point should be " + "integer"); + auto zero_point = PyLong_AS_LONG(zero_point_py); + mgb_assert(zero_point >= 0 && zero_point < 256, + "Invalid Quantized8Asymm metadata: zero_point should be " + "in [0, 256)"); + return dtype::Quantized8Asymm( + static_cast(PyFloat_AS_DOUBLE(scale_py)), + static_cast(zero_point)); + } + if (dtype_name == "Quantized4Asymm") { + PyObject* scale_py = PyDict_GetItemString(metadata, "scale"); + PyObject* zero_point_py = + PyDict_GetItemString(metadata, "zero_point"); + mgb_assert(scale_py && zero_point_py, + "Invalid Quantized4Asymm metadata: missing scale or " + "zero_point."); + mgb_assert( + PyFloat_Check(scale_py), + "Invalid Quantized4Asymm metadata: scale should be float"); + mgb_assert(PyLong_Check(zero_point_py), + "Invalid Quantized4Asymm metadata: zero_point should be " + "integer"); + auto zero_point = PyLong_AS_LONG(zero_point_py); + mgb_assert(zero_point >= 0 && zero_point < 15, + "Invalid Quantized4Asymm metadata: zero_point should be " + "in [0, 15)"); + return dtype::Quantized4Asymm( + static_cast(PyFloat_AS_DOUBLE(scale_py)), + static_cast(zero_point)); + } + if (dtype_name == "QuantizedS32" || dtype_name == "QuantizedS8" || + dtype_name == "QuantizedS4") { + PyObject* scale_py = PyDict_GetItemString(metadata, "scale"); + mgb_assert(scale_py, "Invalid metadata: missing scale"); + mgb_assert(PyFloat_Check(scale_py), + "Invalid metadata: scale should be float"); + float scale = static_cast(PyFloat_AS_DOUBLE(scale_py)); + if (dtype_name == "QuantizedS32") { + return dtype::QuantizedS32(scale); + } else if (dtype_name == "QuantizedS8"){ + return dtype::QuantizedS8(scale); + } else { + return dtype::QuantizedS4(scale); + } + } + throw ConversionError( + ssprintf("Unknown parameterized DType: %s", dtype_name.c_str()) + .c_str()); + }; + PyObject* dtype_metadata; + if (descr->metadata && PyDict_Check(descr->metadata) && + (dtype_metadata = PyDict_GetItemString(descr->metadata, "mgb_dtype"))) { + return handle_parameterized_dtype(dtype_metadata); + } + return dtype_np2mgb_raw(descr->type_num); +} + +HostTensorND lowbit_ndarray_to_host_tensor( + CompNode comp_node, TensorLayout &layout, PyArrayObject *input) { + auto src_ptr = reinterpret_cast(PyArray_DATA(input)); + if (!layout.ndim) { + // numpy scalar + mgb_assert(src_ptr, "can not convert from null numpy array"); + layout.init_contiguous_stride({1}); + } else { + mgb_assert(layout.ndim && layout.ndim <= TensorShape::MAX_NDIM, + "unsupported ndim %zu", layout.ndim); + for (size_t i = 0; i < layout.ndim; ++ i) { + layout.shape[i] = PyArray_SHAPE(input)[i]; + layout.stride[i] = PyArray_STRIDE(input, i); + mgb_assert(layout.shape[i], "zero shape not supported"); + } + mgb_assert(layout.is_contiguous()); + } + HostTensorND ret{comp_node, layout}; + lowbit_memcpy_byte2compact(layout.dtype, ret.raw_ptr(), src_ptr, + layout.total_nr_elems()); + return ret; +} + +/*! 
+/*!
+ * \brief convert a python object to tensor and try to borrow memory if the
+ *      original object is a contiguous numpy array
+ * \param dtype see np2tensor
+ * \return the megbrain tensor, and whether memory is borrowed
+ */
+std::pair<HostTensorND, bool> np2tensor_try_borrow(
+        PyObject* obj, const npy::Meth& meth, DType dtype) {
+    auto dest_cn = meth.dest_cn_;
+    mgb_assert(dest_cn.valid());
+
+    PYTHON_GIL;
+
+    PyArray_Descr* expected_descr = nullptr;
+    if (dtype.valid()) {
+        // The reference to expected_descr will be stolen later.
+        expected_descr = dtype_mgb2np_descr(dtype).release();
+    }
+
+    // make result from PyArrayObject; its reference may be stolen
+    auto make_from_arr = [&](PyArrayObject* input, bool allow_borrow) {
+
+        TensorLayout layout;
+        layout.dtype = dtype_np2mgb_descr(PyArray_DESCR(input));
+        if (dtype.valid())
+            mgb_assert(dtype == layout.dtype);
+        layout.ndim = PyArray_NDIM(input);
+
+        if (layout.dtype.is_low_bit()) {
+            auto ret = lowbit_ndarray_to_host_tensor(dest_cn, layout, input);
+            if (meth.dest_tensor_) {
+                meth.dest_tensor_->copy_from(ret);
+                ret = *meth.dest_tensor_;
+            }
+            return std::make_pair(ret, false);
+        }
+
+        auto data = reinterpret_cast<dt_byte*>(PyArray_DATA(input));
+        if (!layout.ndim) {
+            // numpy scalar
+            mgb_assert(data, "can not convert from null numpy array");
+            layout.init_contiguous_stride({1});
+        } else {
+            mgb_assert(layout.ndim && layout.ndim <= TensorShape::MAX_NDIM,
+                       "unsupported ndim %zu", layout.ndim);
+            auto dsize = layout.dtype.size();
+            bool is_empty = false;
+            for (size_t i = 0; i < layout.ndim; ++i) {
+                layout.shape[i] = PyArray_SHAPE(input)[i];
+                layout.stride[i] = PyArray_STRIDE(input, i);
+                if (!layout.shape[i]) {
+                    is_empty = true;
+                }
+                mgb_assert(layout.stride[i] % dsize == 0,
+                           "bad stride %zd", layout.stride[i]);
+                layout.stride[i] /= dsize;
+            }
+            mgb_assert(is_empty || layout.is_contiguous());
+        }
+
+        if (!meth.dest_tensor_ && allow_borrow) {
+            Py_INCREF(input);
+            PyObjRefKeeper ref_obj_cvt{reinterpret_cast<PyObject*>(input)};
+            HostTensorStorage storage;
+            auto input_ptr = ref_obj_cvt.make_shared(data);
+            storage.reset(dest_cn, layout.span().high_byte, input_ptr);
+            HostTensorND ret;
+            ret.reset(storage, layout);
+            return std::make_pair(ret, true);
+        } else {
+            auto storage = HostTensorStorage(dest_cn);
+            storage.ensure_size(layout.span().dist_byte());
+            memcpy(storage.ptr(), data, layout.span().dist_byte());
+            if (meth.dest_tensor_) {
+                meth.dest_tensor_->reset(storage, layout);
+                return std::make_pair(*meth.dest_tensor_, false);
+            } else {
+                HostTensorND ret;
+                ret.reset(storage, layout);
+                return std::make_pair(ret, false);
+            }
+        }
+    };
+
+    PyArrayObject* obj_as_arr = nullptr;
+    do {
+        // check contiguous and dtype, and borrow mem if ok
+        if (!PyArray_Check(obj))
+            break;
+        obj_as_arr = reinterpret_cast<PyArrayObject*>(obj);
+        int typenum = PyArray_DTYPE(obj_as_arr)->type_num;
+        // We have to check dtype.valid() and typenum first to avoid
+        // accidentally triggering ConversionError on incompatible dtypes which
+        // can be automatically converted into compatible ones (e.g. float64).
+        if (dtype.valid() &&
+            (expected_descr->type_num != typenum ||
+             dtype_np2mgb_descr(PyArray_DTYPE(obj_as_arr)) != dtype))
+            break;
+        if (typenum != to_mgb_supported_dtype_raw(typenum)) {
+            mgb_assert(!dtype.valid() && expected_descr == nullptr);
+            expected_descr =
+                    PyArray_DescrFromType(to_mgb_supported_dtype_raw(typenum));
+            break;
+        }
+        if (PyArray_ISCARRAY_RO(obj_as_arr)) {
+            return make_from_arr(obj_as_arr, true);
+        }
+    } while (0);
+
+    constexpr auto NP_FLAGS = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_FORCECAST;
+    PyObject* obj_cvt;
+    if (obj_as_arr) {
+        obj_cvt = PyArray_FromArray(obj_as_arr, expected_descr, NP_FLAGS);
+    } else {
+        obj_cvt = PyArray_FromAny(obj, expected_descr, 0, 0, NP_FLAGS, nullptr);
+    }
+
+    if (obj_cvt) {
+        // convert to mgb supported dtype
+        auto arr = reinterpret_cast<PyArrayObject*>(obj_cvt);
+        int dt0 = PyArray_TYPE(arr), dt1 = to_mgb_supported_dtype_raw(dt0);
+        if (dt0 != dt1) {
+            mgb_assert(expected_descr == nullptr);
+            expected_descr = PyArray_DescrFromType(dt1);
+            mgb_assert(expected_descr);
+            auto obj_cvt_new = PyArray_FromAny(
+                    obj_cvt, expected_descr, 0, 0, NP_FLAGS, nullptr);
+            Py_DECREF(obj_cvt);
+            obj_cvt = obj_cvt_new;
+        }
+    }
+
+    if (!obj_cvt) {
+        if (PyErr_Occurred()) {
+            PyExceptionForward::throw_();
+        }
+        throw ConversionError(ssprintf("can not convert to numpy array from %s",
+                                       repr_pyobj(obj).c_str()));
+    }
+
+    auto ret = make_from_arr(reinterpret_cast<PyArrayObject*>(obj_cvt), false);
+    Py_DECREF(obj_cvt);
+    return ret;
+}
+
+//! hold a reference to HostTensorND
+class HostTensorNDRefHolder final : public NonCopyableObj {
+    HostTensorND m_val;
+    static MemPool<HostTensorNDRefHolder> sm_mem_pool;
+
+    friend class MemPool<HostTensorNDRefHolder>;
+
+    HostTensorNDRefHolder(const HostTensorND& v) : m_val{v} {}
+
+public:
+    static HostTensorNDRefHolder* alloc(const HostTensorND& v) {
+        return sm_mem_pool.alloc(v);
+    }
+
+    static void free(HostTensorNDRefHolder* p) {
+        return sm_mem_pool.free(p);
+    }
+};
+MemPool<HostTensorNDRefHolder> HostTensorNDRefHolder::sm_mem_pool;
+
+void ndarray_shared_from_tensor_py_capsule_dtor(PyObject* cap) {
+    auto ptr = PyCapsule_GetPointer(cap, "HostTensorND");
+    mgb_assert(ptr, "not a PyCapsule: %s", repr_pyobj(cap).c_str());
+    HostTensorNDRefHolder::free(static_cast<HostTensorNDRefHolder*>(ptr));
+}
+
+} // anonymous namespace
+
+PyObject* npy::ndarray_from_tensor(
+        const HostTensorND& val, ShareType share_type) {
+    if (!val.layout().is_contiguous() && !val.shape().is_empty()) {
+        mgb_assert(share_type != ShareType::MUST_SHARE);
+        HostTensorND contig;
+        contig.copy_from(val);
+        return ndarray_from_tensor(contig, ShareType::TRY_SHARE);
+    }
+    PYTHON_GIL;
+    npy_intp dims[TensorLayout::MAX_NDIM];
+    for (size_t i = 0; i < val.layout().ndim; ++i)
+        dims[i] = val.shape()[i];
+    PyObject* ret = nullptr;
+
+    auto alloc_new_ret = [&]() {
+        mgb_assert(!ret);
+        ret = PyArray_NewFromDescr(
+                &PyArray_Type, dtype_mgb2np_descr(val.dtype()).release(),
+                val.layout().ndim, dims, nullptr, nullptr, 0, nullptr);
+        mgb_assert(ret, "failed to allocate array");
+        mgb_assert(PyArray_Check(ret));
+        return PyArray_DATA(reinterpret_cast<PyArrayObject*>(ret));
+    };
+    if (val.dtype().is_low_bit()) {
+        mgb_assert(share_type != ShareType::MUST_SHARE,
+                   "can not share memory for lowbit dtype");
+        lowbit_memcpy_compact2byte(val.dtype(), alloc_new_ret(), val.raw_ptr(),
+                                   val.layout().total_nr_elems());
+    } else if (share_type == ShareType::MUST_UNSHARE) {
+        memcpy(alloc_new_ret(), val.raw_ptr(), val.layout().span().dist_byte());
+    } else {
+        // share data
+        ret = PyArray_NewFromDescr(
+                &PyArray_Type, dtype_mgb2np_descr(val.dtype()).release(),
+                val.layout().ndim, dims, nullptr,
+                const_cast<dt_byte*>(val.raw_ptr()), 0, nullptr);
+        mgb_assert(ret, "failed to alloc ndarray");
+        auto capsule = PyCapsule_New(HostTensorNDRefHolder::alloc(val),
+                "HostTensorND", ndarray_shared_from_tensor_py_capsule_dtor);
+        mgb_assert(capsule, "failed to create PyCapsule");
+        auto err = PyArray_SetBaseObject(
+                reinterpret_cast<PyArrayObject*>(ret), capsule);
+        mgb_assert(!err);
+    }
+    return ret;
+}
+
+HostTensorND npy::np2tensor(PyObject* obj, const Meth& meth, DType dtype) {
+    auto ret_full = np2tensor_try_borrow(obj, meth, dtype);
+    if (meth.must_borrow_) {
+        mgb_assert(ret_full.second,
+                   "can not borrow from numpy array as contig array with dtype "
+                   "%s; src=%s",
+                   dtype.name(), repr_pyobj(obj).c_str());
+    }
+    return ret_full.first;
+}
+
+PyObject* npy::dtype_mgb2np(mgb::DType dtype) {
+    PYTHON_GIL;
+    // According to
+    // https://docs.scipy.org/doc/numpy/reference/c-api.array.html#c.PyArray_TypeObjectFromType
+    // the following is equivalent to PyArray_TypeObjectFromType for built-in
+    // types.
+    auto descr = dtype_mgb2np_descr(dtype);
+    if (descr == nullptr) {
+        return nullptr;
+    }
+    if (dtype.has_param()) {
+        return reinterpret_cast<PyObject*>(descr.release());
+    }
+    PyObject* typeobj = reinterpret_cast<PyObject*>(descr->typeobj);
+    Py_XINCREF(typeobj);
+    return typeobj;
+}
+
+mgb::DType npy::dtype_np2mgb(PyObject* obj) {
+    mgb_assert(obj && obj != Py_None,
+               "can not convert null PyObject to numpy dtype");
+    // see
+    // http://stackoverflow.com/questions/8477122/numpy-c-api-convert-type-object-to-type-number
+    PYTHON_GIL;
+
+    PyArray_Descr* dtype;
+    if (!PyArray_DescrConverter(obj, &dtype)) {
+        throw ConversionError(ssprintf("can not convert to np.dtype from %s",
+                                       repr_pyobj(obj).c_str()));
+    }
+
+    mgb::DType result = dtype_np2mgb_descr(dtype);
+    Py_DECREF(dtype);
+    return result;
+}
+
+PyObject* npy::to_mgb_supported_dtype(PyObject* dtype) {
+    PYTHON_GIL;
+
+    PyArray_Descr* descr;
+    if (!PyArray_DescrConverter(dtype, &descr)) {
+        throw ConversionError(ssprintf("can not convert to np.dtype from %s",
+                                       repr_pyobj(dtype).c_str()));
+    }
+    mgb_assert(!descr->metadata,
+               "unexpected metadata in dtype: "
+               "dtype_obj=%s metadata=%s",
+               repr_pyobj(dtype).c_str(), repr_pyobj(descr->metadata).c_str());
+    int type_num = to_mgb_supported_dtype_raw(descr->type_num);
+    return PyArray_TypeObjectFromType(type_num);
+}
+
+TensorShape npy::vec2shape(const std::vector<size_t>& vec) {
+    TensorShape shape;
+    mgb_assert(vec.size() <= TensorShape::MAX_NDIM,
+               "dim too large: %zu (max %zu)",
+               vec.size(), TensorShape::MAX_NDIM);
+    shape.ndim = vec.size();
+    for (size_t i = 0; i < vec.size(); i++) {
+        if (!vec[i]) {
+            shape.ndim = 0;
+            break;
+        }
+        shape[i] = vec[i];
+    }
+    mgb_assert(shape.ndim, "shape should not be empty");
+    return shape;
+}
diff --git a/imperative/python/src/helper.h b/imperative/python/src/helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..f97b6fd0c3ec349ba4ef3d3d5288b632df1ea147
--- /dev/null
+++ b/imperative/python/src/helper.h
@@ -0,0 +1,320 @@
+#pragma once
+
+#include "megbrain/graph.h"
+
+#include <Python.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#if __cplusplus > 201703L
+#include <ranges>
+#endif
+#include <string>
+#include <vector>
+#include <memory>
+#include <functional>
+
+pybind11::module submodule(pybind11::module parent, const char* name, const char* doc = nullptr);
+
+pybind11::module rel_import(pybind11::str name, pybind11::module m, int level);
+
+#if __cplusplus > 201703L
+using std::ranges::range_value_t;
+#else
+template <typename T>
+using range_value_t = std::remove_cv_t<std::remove_reference_t<
+        decltype(*std::declval<T>().begin())>>;
+#endif
+
+template <typename T>
+auto to_list(const T& x) {
+    using elem_t = range_value_t<T>;
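+    // materialize the range into a std::vector so pybind11 can cast it to a
+    // Python list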
std::vector ret(x.begin(), x.end()); + return pybind11::cast(ret); +} + +template +auto to_tuple(const T& x, pybind11::return_value_policy policy = pybind11::return_value_policy::automatic) { + auto ret = pybind11::tuple(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + ret[i] = pybind11::cast(x[i], policy); + } + return ret; +} + +template +auto to_tuple(T begin, T end, pybind11::return_value_policy policy = pybind11::return_value_policy::automatic) { + auto ret = pybind11::tuple(end - begin); + for (size_t i = 0; begin < end; ++begin, ++i) { + ret[i] = pybind11::cast(*begin, policy); + } + return ret; +} + +class PyTaskDipatcher { + struct Queue : mgb::AsyncQueueSC, Queue> { + using Task = std::function; + void process_one_task(Task& f) { + if (!Py_IsInitialized()) return; + pybind11::gil_scoped_acquire _; + f(); + } + }; + Queue queue; + bool finalized = false; +public: + template + void add_task(T&& task) { + // CPython never dlclose an extension so + // finalized means the interpreter has been shutdown + if (!finalized) { + queue.add_task(std::forward(task)); + } + } + void wait_all_task_finish() { + queue.wait_all_task_finish(); + } + ~PyTaskDipatcher() { + finalized = true; + queue.wait_all_task_finish(); + } +}; + +extern PyTaskDipatcher py_task_q; + +class GILManager { + PyGILState_STATE gstate; + + public: + GILManager(): + gstate(PyGILState_Ensure()) + { + } + + ~GILManager() { + PyGILState_Release(gstate); + } +}; +#define PYTHON_GIL GILManager __gil_manager + +//! wraps a shared_ptr and decr PyObject ref when destructed +class PyObjRefKeeper { + std::shared_ptr m_ptr; + +public: + static void deleter(PyObject* p) { + if (p) { + py_task_q.add_task([p](){Py_DECREF(p);}); + } + } + + PyObjRefKeeper() = default; + PyObjRefKeeper(PyObject* p) : m_ptr{p, deleter} {} + + PyObject* get() const { return m_ptr.get(); } + + //! create a shared_ptr as an alias of the underlying ptr + template + std::shared_ptr make_shared(T* ptr) const { + return {m_ptr, ptr}; + } +}; + +//! exception to be thrown when python callback fails +class PyExceptionForward : public std::exception { + PyObject *m_type, *m_value, *m_traceback; + std::string m_msg; + + PyExceptionForward(PyObject* type, PyObject* value, PyObject* traceback, + const std::string& msg) + : m_type{type}, + m_value{value}, + m_traceback{traceback}, + m_msg{msg} {} + +public: + PyExceptionForward(const PyExceptionForward&) = delete; + PyExceptionForward& operator=(const PyExceptionForward&) = delete; + ~PyExceptionForward(); + + PyExceptionForward(PyExceptionForward&& rhs) + : m_type{rhs.m_type}, + m_value{rhs.m_value}, + m_traceback{rhs.m_traceback}, + m_msg{std::move(rhs.m_msg)} { + rhs.m_type = rhs.m_value = rhs.m_traceback = nullptr; + } + + //! throw PyExceptionForward from current python error state + static void throw_() __attribute__((noreturn)); + + //! restore python error + void restore(); + + const char* what() const noexcept override { return m_msg.c_str(); } +}; + +//! numpy utils +namespace npy { + //! convert tensor shape to raw vector + static inline std::vector shape2vec(const mgb::TensorShape &shape) { + return {shape.shape, shape.shape + shape.ndim}; + } + + //! change numpy dtype to megbrain supported dtype + PyObject* to_mgb_supported_dtype(PyObject *dtype); + + //! convert raw vector to tensor shape + mgb::TensorShape vec2shape(const std::vector &vec); + + //! convert megbrain dtype to numpy dtype object; return new reference + PyObject* dtype_mgb2np(mgb::DType dtype); + + //! 
convert numpy dtype object or string to megbrain dtype + mgb::DType dtype_np2mgb(PyObject *obj); + + //! buffer sharing type + enum class ShareType { + MUST_SHARE, //!< must be shared + MUST_UNSHARE, //!< must not be shared + TRY_SHARE //!< share if possible + }; + + //! get ndarray from HostTensorND + PyObject* ndarray_from_tensor(const mgb::HostTensorND &val, + ShareType share_type); + + //! specify how to convert numpy array to tensor + struct Meth { + bool must_borrow_ = false; + mgb::HostTensorND *dest_tensor_ = nullptr; + mgb::CompNode dest_cn_; + + //! make a Meth that allows borrowing numpy array memory + static Meth borrow( + mgb::CompNode dest_cn = mgb::CompNode::default_cpu()) { + return {false, nullptr, dest_cn}; + } + + //! make a Meth that requires the numpy array to be borrowed + static Meth must_borrow( + mgb::CompNode dest_cn = mgb::CompNode::default_cpu()) { + return {true, nullptr, dest_cn}; + } + + //! make a Meth that requires copying the value into another + //! tensor + static Meth copy_into(mgb::HostTensorND *tensor) { + return {false, tensor, tensor->comp_node()}; + } + }; + /*! + * \brief convert an object to megbrain tensor + * \param meth specifies how the conversion should take place + * \param dtype desired dtype; it can be set as invalid to allow arbitrary + * dtype + */ + mgb::HostTensorND np2tensor(PyObject *obj, const Meth &meth, + mgb::DType dtype); +} + +// Note: following macro was copied from pybind11/detail/common.h +// Robust support for some features and loading modules compiled against different pybind versions +// requires forcing hidden visibility on pybind code, so we enforce this by setting the attribute on +// the main `pybind11` namespace. +#if !defined(PYBIND11_NAMESPACE) +# ifdef __GNUG__ +# define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden"))) +# else +# define PYBIND11_NAMESPACE pybind11 +# endif +#endif + +namespace PYBIND11_NAMESPACE { +namespace detail { + + template struct type_caster> + : list_caster, T> {}; + + template <> struct type_caster { + PYBIND11_TYPE_CASTER(mgb::DType, _("DType")); + public: + bool load(handle src, bool convert) { + auto obj = reinterpret_borrow(src); + if (!convert && !isinstance(obj)) { + return false; + } + if (obj.is_none()) { + return true; + } + try { + obj = pybind11::dtype::from_args(obj); + } catch (pybind11::error_already_set&) { + return false; + } + try { + value = npy::dtype_np2mgb(obj.ptr()); + } catch (...) 
{ + return false; + } + return true; + } + + static handle cast(mgb::DType dt, return_value_policy /* policy */, handle /* parent */) { + // ignore policy and parent because we always return a pure python object + return npy::dtype_mgb2np(std::move(dt)); + } + }; + + template <> struct type_caster { + PYBIND11_TYPE_CASTER(mgb::TensorShape, _("TensorShape")); + public: + bool load(handle src, bool convert) { + auto obj = reinterpret_steal(src); + if (!isinstance(obj)) { + return false; + } + value.ndim = len(obj); + mgb_assert(value.ndim <= mgb::TensorShape::MAX_NDIM); + size_t i = 0; + for (auto v : obj) { + mgb_assert(i < value.ndim); + value.shape[i] = reinterpret_borrow(v).cast(); + ++i; + } + return true; + } + + static handle cast(mgb::TensorShape shape, return_value_policy /* policy */, handle /* parent */) { + // ignore policy and parent because we always return a pure python object + return to_tuple(shape.shape, shape.shape + shape.ndim).release(); + } + }; + + // hack to make custom object implicitly convertible from None + template struct from_none_caster : public type_caster_base { + using base = type_caster_base; + bool load(handle src, bool convert) { + if (!convert || !src.is_none()) { + return base::load(src, convert); + } + // adapted from pybind11::implicitly_convertible + auto temp = reinterpret_steal(PyObject_Call( + (PyObject*) this->typeinfo->type, tuple().ptr(), nullptr)); + if (!temp) { + PyErr_Clear(); + return false; + } + // adapted from pybind11::detail::type_caster_generic + if (base::load(temp, false)) { + loader_life_support::add_patient(temp); + return true; + } + return false; + } + }; + + template<> struct type_caster : public from_none_caster {}; + +} // detail +} // PYBIND11_NAMESPACE + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/python/src/imperative_rt.cpp b/imperative/python/src/imperative_rt.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6e33832a69ce41726bfef63de760b84b7dd9390a --- /dev/null +++ b/imperative/python/src/imperative_rt.cpp @@ -0,0 +1,94 @@ +#include "./imperative_rt.h" + +#include +#include +#include +#include +#include + +#include "megbrain/imperative.h" +#include "megbrain/imperative/interpreter.h" +#include "megbrain/imperative/ops/opr_attr.h" +#include "./helper.h" + +namespace py = pybind11; + +using namespace mgb; +using namespace imperative; +using namespace interpreter; + + +namespace { + +std::optional, std::vector, std::vector>> +make_backward_graph( + const OpDef& opdef, std::vector inputs, + std::vector input_requires_grad, + std::vector output_has_grad) { + auto res = OpDef::make_backward_graph(opdef, + SmallVector(inputs.begin(), inputs.end()), + SmallVector(input_requires_grad.begin(), input_requires_grad.end()), + SmallVector(output_has_grad.begin(), output_has_grad.end())); + if (res.backward) { + return std::optional, std::vector, std::vector>>{ + std::in_place, res.backward, res.save_for_backward, res.input_has_grad}; + } else { + return {}; + } +} +} // namespace + +void init_imperative_rt(py::module m) { + py::class_(m, "Interpreter") + .def("put", [](Interpreter::Channel& self, py::array data, DType dtype, CompNode cn) { + if (!cn.valid()) { + cn = CompNode::load("xpux"); + } + constexpr int size_threshhold = TensorShape::MAX_NDIM; + if (data.size() > size_threshhold) { + return self.put(npy::np2tensor(data.ptr(), npy::Meth::borrow(cn), dtype)); + } else { + HostTensorND ret(cn); + return self.put(npy::np2tensor(data.ptr(), 
npy::Meth::copy_into(&ret), dtype)); + } + }, py::arg(), py::arg("dtype") = py::none(), py::arg("device") = py::none()) + .def("delete", [](Interpreter::Channel& self, Interpreter::Handle handle) { + return self.del(handle); + }) + .def("get_value", [](Interpreter::Channel& self, Interpreter::Handle handle) { + PyObject* optr = npy::ndarray_from_tensor(self.get_value(handle), npy::ShareType::TRY_SHARE); + return py::reinterpret_steal(optr); + }) + .def("get_dtype", &Interpreter::Channel::get_dtype) + .def("get_device", &Interpreter::Channel::get_device) + .def("get_shape", &Interpreter::Channel::get_shape) + .def("_get_dev_tensor", &Interpreter::Channel::get_dev_tensor) + .def("apply_op", &Interpreter::Channel::apply_op) + .def("sync", &Interpreter::Channel::sync); + + std::unique_ptr ch = Interpreter::inst().create_channel(); + m.attr("interpreter") = py::detail::make_caster::cast( + std::move(ch), py::return_value_policy::move, {}); + for (auto name : {"put", "delete", "get_value", "get_dtype", "get_device", "get_shape", "_get_dev_tensor", "apply_op"}) { + m.attr(name) = m.attr("interpreter").attr(name); + } + + m.def("sync", [m]() { + m.attr("interpreter").attr("sync")(); + py_task_q.wait_all_task_finish(); + }); + + m.def("make_backward_graph", &make_backward_graph); + + py::class_>(m, "OpDef") + .def("ctype", [](const OpDef& opdef) { + if (auto attr = opdef.try_cast_final()) { + return attr->type.c_str(); + } + return opdef.dyn_typeinfo()->name; + }) + .def("__eq__", [](const OpDef& lhs, const OpDef& rhs) { + return lhs.is_same(rhs); + }) + .def("__hash__", &OpDef::hash); +} diff --git a/imperative/python/src/imperative_rt.h b/imperative/python/src/imperative_rt.h new file mode 100644 index 0000000000000000000000000000000000000000..2194bdb4ebbeb4a4a746b8a6eb15fdd70847dbd4 --- /dev/null +++ b/imperative/python/src/imperative_rt.h @@ -0,0 +1,7 @@ +#pragma once + +#include "./helper.h" + +#include "megbrain/imperative.h" + +void init_imperative_rt(pybind11::module m); diff --git a/imperative/python/src/module.cpp b/imperative/python/src/module.cpp new file mode 100644 index 0000000000000000000000000000000000000000..309a0220899c63ffc36486e024983ce211cb69bb --- /dev/null +++ b/imperative/python/src/module.cpp @@ -0,0 +1,55 @@ +#include + +#define DO_IMPORT_ARRAY +#include "./numpy_dtypes.h" +#include "./helper.h" + +#include "./common.h" +#include "./utils.h" +#include "./imperative_rt.h" +#include "./graph_rt.h" +#include "./ops.h" + +namespace py = pybind11; + +#ifndef MODULE_NAME +#define MODULE_NAME imperative_rt +#endif + +PYBIND11_MODULE(MODULE_NAME, m) { + // initialize numpy + if ([]() {import_array1(1); return 0;}()) { + throw py::error_already_set(); + } + + py::module::import("sys").attr("modules")[m.attr("__name__")] = m; + + m.attr("__package__") = m.attr("__name__"); + m.attr("__builtins__") = py::module::import("builtins"); + + auto atexit = py::module::import("atexit"); + atexit.attr("register")(py::cpp_function([]() { + py::gil_scoped_release _; + py_task_q.wait_all_task_finish(); + })); + + auto common = submodule(m, "common"); + auto utils = submodule(m, "utils"); + auto imperative = submodule(m, "imperative"); + auto graph = submodule(m, "graph"); + auto ops = submodule(m, "ops"); + + init_common(common); + init_utils(utils); + init_imperative_rt(imperative); + init_graph_rt(graph); + init_ops(ops); + + py::exec(R"( + from .common import * + from .utils import * + from .imperative import * + from .graph import * + )", + py::getattr(m, "__dict__")); +} diff --git 
a/imperative/python/src/numpy_dtypes.h b/imperative/python/src/numpy_dtypes.h new file mode 100644 index 0000000000000000000000000000000000000000..d13780d582955a75c1d340fc56315be06ef400fe --- /dev/null +++ b/imperative/python/src/numpy_dtypes.h @@ -0,0 +1,45 @@ +/** + * \file imperative/python/src/numpy_dtypes.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \brief import numpy array with proper settings + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ +#pragma once + +#ifndef DO_IMPORT_ARRAY +#define NO_IMPORT_ARRAY +#endif +#define PY_ARRAY_UNIQUE_SYMBOL mgb_numpy_array_api +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include +#include + +#define FOREACH_MGB_LOW_BIT(cb) \ + cb(1) \ + cb(2) \ + cb(4) \ + +#define FOREACH_MGB_DTYPE_PAIR(cb) \ + cb(IntB1, npy_num_intb1()) \ + cb(IntB2, npy_num_intb2()) \ + cb(IntB4, npy_num_intb4()) \ + cb(BFloat16, npy_num_bfloat16()) \ + +namespace mgb { + //! numpy type num for intb1/2/4 type +#define DEFINE_NPY_INTBX(n) \ + int npy_num_intb##n(); +FOREACH_MGB_LOW_BIT(DEFINE_NPY_INTBX) +#undef DEFINE_NPY_INTBX + void init_npy_num_intbx(pybind11::module m); + + //! numpy type num for bfloat16 type + int npy_num_bfloat16(); + void init_npy_num_bfloat16(pybind11::module m); +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/python/src/numpy_dtypes_bfloat16.cpp b/imperative/python/src/numpy_dtypes_bfloat16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..52d122aa84faed567e60a45620bd21966b33cc7d --- /dev/null +++ b/imperative/python/src/numpy_dtypes_bfloat16.cpp @@ -0,0 +1,275 @@ +/** + * \file imperative/python/src/numpy_dtypes_bfloat16.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \brief numpy dtypes for bfloat16 + * + * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+ * + */ +#include "./numpy_dtypes.h" + +#include +#include +#include + +#include "megbrain/common.h" +#include "megbrain/dtype.h" + +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" + +namespace { + +struct BFloat16Type { + static int npy_typenum; + mgb::dt_bfloat16 value; + + struct PyObj; + struct NpyType; + + template + struct NpyCast; +}; + +int BFloat16Type::npy_typenum; + +/* ==================== BFloat16Type::NpyCast ==================== */ + +template +struct BFloat16Type::NpyCast { + static void apply(void* from_, void* to_, npy_intp n, void* /*fromarr*/, + void* /*toarr*/) { + auto from = static_cast(from_); + auto to = static_cast(to_); + for (npy_intp i = 0; i < n; ++i) { + float cur = static_cast(from[i]); + to[i].value = cur; + } + } +}; + +template +struct BFloat16Type::NpyCast { + static void apply(void* from_, void* to_, npy_intp n, void* /*fromarr*/, + void* /*toarr*/) { + auto from = static_cast(from_); + auto to = static_cast(to_); + for (npy_intp i = 0; i < n; ++i) { + to[i] = from[i].value; + } + } +}; + +/* ==================== BFloat16Type::PyObj ==================== */ +struct BFloat16Type::PyObj { + PyObject_HEAD BFloat16Type obj; + + static PyTypeObject py_type; + + static PyObject* from_bfloat16(BFloat16Type val) { + auto p = reinterpret_cast(py_type.tp_alloc(&py_type, 0)); + p->obj.value = val.value; + return reinterpret_cast(p); + } + + static PyObject* py_new(PyTypeObject* type, PyObject* args, PyObject* kwds); + static PyObject* py_repr(PyObject* obj); + static PyObject* py_richcompare(PyObject* a, PyObject* b, int op); +}; +PyTypeObject BFloat16Type::PyObj::py_type; + +PyObject* BFloat16Type::PyObj::py_new(PyTypeObject* type, PyObject* args, + PyObject* kwds) { + PyObj* self; + Py_ssize_t size; + + self = (PyObj*)type->tp_alloc(type, 0); + + size = PyTuple_GET_SIZE(args); + if (size > 1) { + PyErr_SetString(PyExc_TypeError, "BFloat16Type Only has 1 parameter"); + return NULL; + } + PyObject* x = PyTuple_GET_ITEM(args, 0); + if (PyObject_IsInstance(x, (PyObject*)&py_type)) { + Py_INCREF(x); + return x; + } + + if (!PyFloat_Check(x)) { + PyErr_SetString(PyExc_TypeError, + "BFloat16Type must be initialized wit float"); + return NULL; + } + + const float s = PyFloat_AsDouble(x); + + self->obj.value = s; + + return (PyObject*)self; +} + +PyObject* BFloat16Type::PyObj::py_repr(PyObject* obj) { + float fval = static_cast(((PyObj*)obj)->obj.value); + return PyUnicode_FromString(mgb::ssprintf("%f", fval).c_str()); +} + +PyObject* BFloat16Type::PyObj::py_richcompare(PyObject* a, PyObject* b, + int op) { + mgb_assert(PyObject_IsInstance(a, (PyObject*)&py_type)); + auto bval = PyFloat_AsDouble(b); + if (bval == -1 && PyErr_Occurred()) { + return NULL; + } + double aval = ((PyObj*)a)->obj.value; +#define OP(py, op) \ + case py: { \ + if (aval op bval) { \ + Py_RETURN_TRUE; \ + } else { \ + Py_RETURN_FALSE; \ + } \ + } + switch (op) { + OP(Py_LT, <) + OP(Py_LE, <=) + OP(Py_EQ, ==) + OP(Py_NE, !=) + OP(Py_GT, >) + OP(Py_GE, >=) + }; +#undef OP + return Py_NotImplemented; +} + +/* ==================== BFloat16Type::NpyType ==================== */ +struct BFloat16Type::NpyType { + static PyArray_ArrFuncs funcs; + static PyArray_Descr descr; + + static bool init(); + + static void copyswap(void* dst, void* src, int swap, void* /*arr*/) { + if (src) { + mgb_assert(!swap); + memcpy(dst, src, sizeof(BFloat16Type)); + } + } + static PyObject* getitem(void* data, void* ap) { + return BFloat16Type::PyObj::from_bfloat16( + *static_cast(data)); + } + static int 
setitem(PyObject* op, void* ov, void* ap); +}; + +PyArray_ArrFuncs BFloat16Type::NpyType::funcs; +PyArray_Descr BFloat16Type::NpyType::descr; + +int BFloat16Type::NpyType::setitem(PyObject* op, void* ov, void* ap) { + if (PyLong_Check(op)) { + int a = PyLong_AsLong(op); + static_cast(ov)->value = a; + } else if (PyFloat_Check(op)) { + float a = PyFloat_AsDouble(op); + static_cast(ov)->value = a; + } else if (PyObject_IsInstance( + op, (PyObject*)(&(BFloat16Type::PyObj::py_type)))) { + static_cast(ov)->value = ((PyObj*)op)->obj.value; + } else { + PyErr_SetString(PyExc_ValueError, + "input type must be int/float/bfloat16"); + return -1; + } + return 0; +} + +bool BFloat16Type::NpyType::init() { + descr = {PyObject_HEAD_INIT(0) & BFloat16Type::PyObj::py_type, + 'V', // kind + 'f', // type + '=', // byteorder + NPY_NEEDS_PYAPI | NPY_USE_GETITEM | NPY_USE_SETITEM, + 1, // type num + sizeof(BFloat16Type), + alignof(BFloat16Type), + NULL, + NULL, + NULL, + &funcs}; + Py_TYPE(&descr) = &PyArrayDescr_Type; + PyArray_InitArrFuncs(&funcs); + funcs.copyswap = copyswap; + funcs.getitem = getitem; + funcs.setitem = setitem; + npy_typenum = PyArray_RegisterDataType(&descr); + +#define REGISTER_CAST(From, To, From_descr, To_typenum, safe) \ + { \ + PyArray_Descr* from_descr = (From_descr); \ + if (PyArray_RegisterCastFunc(from_descr, (To_typenum), \ + NpyCast::apply) < 0) { \ + return false; \ + } \ + if (safe && PyArray_RegisterCanCast(from_descr, (To_typenum), \ + NPY_NOSCALAR) < 0) { \ + return false; \ + } \ + } +#define REGISTER_INT_CASTS(bits) \ + REGISTER_CAST(npy_int##bits, BFloat16Type, \ + PyArray_DescrFromType(NPY_INT##bits), \ + BFloat16Type::npy_typenum, 1) \ + REGISTER_CAST(BFloat16Type, npy_int##bits, &descr, NPY_INT##bits, 0) \ + REGISTER_CAST(npy_uint##bits, BFloat16Type, \ + PyArray_DescrFromType(NPY_UINT##bits), \ + BFloat16Type::npy_typenum, 1) \ + REGISTER_CAST(BFloat16Type, npy_uint##bits, &descr, NPY_UINT##bits, 0) + + REGISTER_INT_CASTS(8) + REGISTER_INT_CASTS(16) + REGISTER_INT_CASTS(32) + REGISTER_INT_CASTS(64) + REGISTER_CAST(BFloat16Type, float, &descr, NPY_FLOAT, 0) + REGISTER_CAST(float, BFloat16Type, PyArray_DescrFromType(NPY_FLOAT), + BFloat16Type::npy_typenum, 0) + REGISTER_CAST(BFloat16Type, double, &descr, NPY_DOUBLE, 1) + REGISTER_CAST(double, BFloat16Type, PyArray_DescrFromType(NPY_DOUBLE), + BFloat16Type::npy_typenum, 0) + return true; +} + +} // anonymous namespace + +// define a new python type: pybfloat16 +bool init_pytype_bfloat16() { + auto& py_type = BFloat16Type::PyObj::py_type; + py_type = {PyVarObject_HEAD_INIT(NULL, 0)}; + py_type.tp_name = "megengine.core._imperative_rt.pybfloat16"; + py_type.tp_basicsize = sizeof(BFloat16Type::PyObj); + py_type.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; + py_type.tp_doc = "bfloat16 type"; + py_type.tp_new = BFloat16Type::PyObj::py_new; + py_type.tp_str = BFloat16Type::PyObj::py_repr; + py_type.tp_repr = BFloat16Type::PyObj::py_repr; + py_type.tp_richcompare = BFloat16Type::PyObj::py_richcompare; + py_type.tp_base = &PyGenericArrType_Type; + return PyType_Ready(&py_type) >= 0; +} + +int mgb::npy_num_bfloat16() { + return BFloat16Type::npy_typenum; +} + +namespace py = pybind11; + +void mgb::init_npy_num_bfloat16(py::module m) { + mgb_assert(init_pytype_bfloat16()); + mgb_assert(BFloat16Type::NpyType::init()); + m.add_object("pybfloat16", reinterpret_cast( + &BFloat16Type::PyObj::py_type)); + m.add_object("bfloat16", reinterpret_cast( + PyArray_DescrFromType(npy_num_bfloat16()))); +} + +// vim: syntax=cpp.doxygen 
foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/python/src/numpy_dtypes_intbx.cpp b/imperative/python/src/numpy_dtypes_intbx.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d6d231cf01b78586739aff47d459bf3fd619e211 --- /dev/null +++ b/imperative/python/src/numpy_dtypes_intbx.cpp @@ -0,0 +1,333 @@ +/** + * \file imperative/python/src/numpy_dtypes_intbx.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \brief numpy dtypes for low bit + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ +#include "./numpy_dtypes.h" + +#include +#include + +#include "megbrain/common.h" + +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" + +namespace { + +template +struct LowBitType { + static_assert(N < 8, "low bit only supports less than 8 bits"); + static int npy_typenum; + //! numerical value (-3, -1, 1, 3) + int8_t value; + + struct PyObj; + struct NpyType; + + const static int32_t max_value = (1 << N) - 1; + + //! check whether val is (-3, -1, 1, 3) and set python error + static bool check_value_set_err(int val) { + int t = val + max_value; + if ((t & 1) || t < 0 || t > (max_value << 1)) { + PyErr_SetString(PyExc_ValueError, + mgb::ssprintf("low bit dtype number error: " + "value=%d; allowed {-3, -1, 1, 3}", + val) + .c_str()); + return false; + } + + return true; + } + + template + struct NpyCast; +}; + +template +int LowBitType::npy_typenum; + +/* ==================== LowBitType::NpyCast ==================== */ + +template +template +struct LowBitType::NpyCast> { + static void apply(void* from_, void* to_, npy_intp n, void* /*fromarr*/, + void* /*toarr*/) { + auto from = static_cast(from_); + auto to = static_cast*>(to_); + for (npy_intp i = 0; i < n; ++i) { + int cur = static_cast(from[i]); + if (!LowBitType::check_value_set_err(cur)) + return; + to[i].value = cur; + } + } +}; + +template +template +struct LowBitType::NpyCast, T> { + static void apply(void* from_, void* to_, npy_intp n, void* /*fromarr*/, + void* /*toarr*/) { + auto from = static_cast*>(from_); + auto to = static_cast(to_); + for (npy_intp i = 0; i < n; ++i) { + to[i] = from[i].value; + } + } +}; + +/* ==================== LowBitType::PyObj ==================== */ +template +struct LowBitType::PyObj { + PyObject_HEAD LowBitType obj; + + static PyTypeObject py_type; + + static PyObject* from_lowbit(LowBitType val) { + auto p = reinterpret_cast(py_type.tp_alloc(&py_type, 0)); + p->obj.value = val.value; + return reinterpret_cast(p); + } + + static PyObject* py_new(PyTypeObject* type, PyObject* args, PyObject* kwds); + static PyObject* py_repr(PyObject* obj); + static PyObject* py_richcompare(PyObject* a, PyObject* b, int op); +}; +template +PyTypeObject LowBitType::PyObj::py_type; + +template +PyObject* LowBitType::PyObj::py_new(PyTypeObject* type, PyObject* args, + PyObject* kwds) { + PyObj* self; + Py_ssize_t size; + + self = (PyObj*)type->tp_alloc(type, 0); + + size = PyTuple_GET_SIZE(args); + if (size > 1) { + PyErr_SetString(PyExc_TypeError, "LowBitType Only has 1 parameter"); + return NULL; + } + PyObject* x = PyTuple_GET_ITEM(args, 0); + if (PyObject_IsInstance(x, (PyObject*)&py_type)) { + Py_INCREF(x); + return x; + } + + if (!PyLong_Check(x)) { + PyErr_SetString(PyExc_TypeError, + "LowBitType must be initialized wit int"); + return NULL; + } + + const long s = PyLong_AsLong(x); + + self->obj.value = s; + + return (PyObject*)self; +} + +template +PyObject* LowBitType::PyObj::py_repr(PyObject* obj) { 
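+    // repr as the plain numerical value, e.g. "-3"/"3" for a 2-bit element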
+ return PyUnicode_FromFormat("%d", ((PyObj*)obj)->obj.value); +} + +template +PyObject* LowBitType::PyObj::py_richcompare(PyObject* a, PyObject* b, + int op) { + mgb_assert(PyObject_IsInstance(a, (PyObject*)&py_type)); + auto bval = PyFloat_AsDouble(b); + if (bval == -1 && PyErr_Occurred()) { + return NULL; + } + double aval = ((PyObj*)a)->obj.value; +#define OP(py, op) \ + case py: { \ + if (aval op bval) { \ + Py_RETURN_TRUE; \ + } else { \ + Py_RETURN_FALSE; \ + } \ + } + switch (op) { + OP(Py_LT, <) + OP(Py_LE, <=) + OP(Py_EQ, ==) + OP(Py_NE, !=) + OP(Py_GT, >) + OP(Py_GE, >=) + }; +#undef OP + return Py_NotImplemented; +} + +/* ==================== LowBitType::NpyType ==================== */ +template +struct LowBitType::NpyType { + static PyArray_ArrFuncs funcs; + static PyArray_Descr descr; + + static bool init(); + + static void copyswap(void* dst, void* src, int swap, void* /*arr*/) { + if (src) { + mgb_assert(!swap); + memcpy(dst, src, sizeof(LowBitType)); + } + } + static PyObject* getitem(void* data, void* ap) { + return LowBitType::PyObj::from_lowbit( + *static_cast*>(data)); + } + static int setitem(PyObject* op, void* ov, void* ap); + static int fill(void* data_, npy_intp length, void* arr); +}; + +template +PyArray_ArrFuncs LowBitType::NpyType::funcs; +template +PyArray_Descr LowBitType::NpyType::descr; + +template +int LowBitType::NpyType::setitem(PyObject* op, void* ov, void* ap) { + if (!PyLong_Check(op)) { + PyErr_SetString(PyExc_ValueError, "input type must be int"); + return -1; + } + + int a = PyLong_AsLong(op); + if (!check_value_set_err(a)) + return -1; + + static_cast*>(ov)->value = a; + return 0; +} + +template +int LowBitType::NpyType::fill(void* data_, npy_intp length, void* arr) { + auto data = static_cast*>(data_); + int8_t delta = data[1].value - data[0].value, r = data[1].value; + if (!check_value_set_err(data[0].value) || + !check_value_set_err(data[1].value)) + return -1; + for (npy_intp i = 2; i < length; i++) { + r += delta; + if (r > max_value) + r = -max_value; + else if (r < -max_value) + r = max_value; + data[i].value = r; + } + return 0; +} + +template +bool LowBitType::NpyType::init() { + descr = {PyObject_HEAD_INIT(0) & LowBitType::PyObj::py_type, + 'V', // kind + 'r', // type + '=', // byteorder + NPY_NEEDS_PYAPI | NPY_USE_GETITEM | NPY_USE_SETITEM, + 0, // type num + sizeof(LowBitType), + alignof(LowBitType), + NULL, + NULL, + NULL, + &funcs}; + Py_TYPE(&descr) = &PyArrayDescr_Type; + PyArray_InitArrFuncs(&funcs); + funcs.copyswap = copyswap; + funcs.getitem = getitem; + funcs.setitem = setitem; + funcs.fill = fill; + npy_typenum = PyArray_RegisterDataType(&descr); + +#define REGISTER_CAST(From, To, From_descr, To_typenum, safe) \ + { \ + PyArray_Descr* from_descr = (From_descr); \ + if (PyArray_RegisterCastFunc(from_descr, (To_typenum), \ + NpyCast::apply) < 0) { \ + return false; \ + } \ + if (safe && PyArray_RegisterCanCast(from_descr, (To_typenum), \ + NPY_NOSCALAR) < 0) { \ + return false; \ + } \ + } +#define REGISTER_INT_CASTS(bits) \ + REGISTER_CAST(npy_int##bits, LowBitType, \ + PyArray_DescrFromType(NPY_INT##bits), \ + LowBitType::npy_typenum, 1) \ + REGISTER_CAST(LowBitType, npy_int##bits, &descr, NPY_INT##bits, 0) \ + REGISTER_CAST(npy_uint##bits, LowBitType, \ + PyArray_DescrFromType(NPY_UINT##bits), \ + LowBitType::npy_typenum, 1) \ + REGISTER_CAST(LowBitType, npy_uint##bits, &descr, NPY_UINT##bits, 0) + + REGISTER_INT_CASTS(8) + REGISTER_INT_CASTS(16) + REGISTER_INT_CASTS(32) + REGISTER_INT_CASTS(64) + REGISTER_CAST(LowBitType, 
float, &descr, NPY_FLOAT, 0) + REGISTER_CAST(float, LowBitType, PyArray_DescrFromType(NPY_FLOAT), + LowBitType::npy_typenum, 0) + REGISTER_CAST(LowBitType, double, &descr, NPY_DOUBLE, 1) + REGISTER_CAST(double, LowBitType, PyArray_DescrFromType(NPY_DOUBLE), + LowBitType::npy_typenum, 0) + return true; +} + +} // anonymous namespace + +#define DEFINE_INTBX(n) using IntB##n = LowBitType; +FOREACH_MGB_LOW_BIT(DEFINE_INTBX) +#undef DEFINE_INTBX + +// define a new python type: pyintb1/2/4 +#define DEFINE_INIT_PYTYPE(n) \ + bool init_pytype_intb##n() { \ + auto& py_type = IntB##n::PyObj::py_type; \ + py_type = {PyVarObject_HEAD_INIT(NULL, 0)}; \ + py_type.tp_name = "megengine.core._imperative_rt.pyintb" #n; \ + py_type.tp_basicsize = sizeof(IntB##n::PyObj); \ + py_type.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE; \ + py_type.tp_doc = "an low bit int type"; \ + py_type.tp_new = IntB##n::PyObj::py_new; \ + py_type.tp_str = IntB##n::PyObj::py_repr; \ + py_type.tp_repr = IntB##n::PyObj::py_repr; \ + py_type.tp_richcompare = IntB##n::PyObj::py_richcompare; \ + py_type.tp_base = &PyGenericArrType_Type; \ + return PyType_Ready(&py_type) >= 0; \ + } +FOREACH_MGB_LOW_BIT(DEFINE_INIT_PYTYPE) +#undef DEFINE_INIT_PYTYPE + +#define DEFINE_NPY_INTBX(n) \ + int mgb::npy_num_intb##n() { return IntB##n::npy_typenum; } +FOREACH_MGB_LOW_BIT(DEFINE_NPY_INTBX) +#undef DEFINE_NPY_INTBX + +namespace py = pybind11; + +void mgb::init_npy_num_intbx(py::module m) { +#define ADD_OBJ_INTBX(n) \ + mgb_assert(init_pytype_intb##n()); \ + mgb_assert(IntB##n::NpyType::init()); \ + m.add_object("pyintb" #n, reinterpret_cast( \ + &IntB##n::PyObj::py_type)); \ + m.add_object("intb" #n, reinterpret_cast( \ + PyArray_DescrFromType(npy_num_intb##n()))); + FOREACH_MGB_LOW_BIT(ADD_OBJ_INTBX) +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/python/src/ops.cpp b/imperative/python/src/ops.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e2442538d90b49ba4c4980acece2761b28b5be9 --- /dev/null +++ b/imperative/python/src/ops.cpp @@ -0,0 +1,83 @@ +#include "./ops.h" + +#include "megbrain/imperative.h" +#include "megbrain/imperative/ops/backward_graph.h" +#include "megbrain/imperative/ops/opr_attr.h" +#include "megbrain/imperative/ops/tensor_manip.h" +#include "megbrain/imperative/ops/collective_comm.h" +#include "megbrain/imperative/ops/io_remote.h" +#include "megbrain/imperative/ops/cond_take.h" +#include "megbrain/imperative/ops/nms.h" + +namespace py = pybind11; + +void init_ops(py::module m) { + #include "opdef.inl" + + py::class_, OpDef>(m, "OprAttr") + .def(py::init<>()) + .def_readwrite("type", &OprAttr::type) + .def_readwrite("param", &OprAttr::param) + .def_readwrite("config", &OprAttr::config) + .def_property("param", + [](const OprAttr& attr) -> py::bytes { + return std::string(attr.param.begin(), attr.param.end()); + }, + [] (OprAttr& attr, py::bytes data) { + auto s = py::cast(data); + attr.param.clear(); + attr.param.insert(attr.param.end(), s.begin(), s.end()); + }); + + py::class_, OpDef>(m, "GetVarShape") + .def(py::init()); + + py::class_, OpDef>(m, "CollectiveComm") + .def(py::init<>()) + .def_readwrite("key", &CollectiveComm::key) + .def_readwrite("nr_devices", &CollectiveComm::nr_devices) + .def_readwrite("rank", &CollectiveComm::rank) + .def_readwrite("is_root", &CollectiveComm::is_root) + .def_readwrite("local_grad", &CollectiveComm::local_grad) + .def_readwrite("addr", &CollectiveComm::addr) + .def_readwrite("port", &CollectiveComm::port) + 
.def_readwrite("mode", &CollectiveComm::mode) + .def_readwrite("dtype", &CollectiveComm::dtype) + .def_readwrite("backend", &CollectiveComm::backend) + .def_readwrite("comp_node", &CollectiveComm::comp_node); + + py::class_, OpDef>(m, "RemoteSend") + .def(py::init<>()) + .def_readwrite("key", &RemoteSend::key) + .def_readwrite("addr", &RemoteSend::addr) + .def_readwrite("port", &RemoteSend::port) + .def_readwrite("rank_to", &RemoteSend::rank_to); + + py::class_, OpDef>(m, "RemoteRecv") + .def(py::init<>()) + .def_readwrite("key", &RemoteRecv::key) + .def_readwrite("addr", &RemoteRecv::addr) + .def_readwrite("port", &RemoteRecv::port) + .def_readwrite("rank_from", &RemoteRecv::rank_from) + .def_readwrite("shape", &RemoteRecv::shape) + .def_readwrite("cn", &RemoteRecv::cn) + .def_readwrite("dtype", &RemoteRecv::dtype); + + py::class_, OpDef>(m, "ParamPackSplit") + .def(py::init<>()) + .def_readwrite("offsets", &ParamPackSplit::offsets) + .def_readwrite("shapes", &ParamPackSplit::shapes); + + py::class_, OpDef>(m, "ParamPackConcat") + .def(py::init<>()) + .def_readwrite("offsets", &ParamPackConcat::offsets); + + py::class_, OpDef>(m, "BackwardGraph"); + py::class_, OpDef>(m, "CondTake") + .def(py::init<>()); + + py::class_, OpDef>(m, "NMSKeep") + .def(py::init()) + .def_readwrite("iou_thresh", &NMSKeep::iou_thresh) + .def_readwrite("max_output", &NMSKeep::max_output); +} diff --git a/imperative/python/src/ops.h b/imperative/python/src/ops.h new file mode 100644 index 0000000000000000000000000000000000000000..900b89e1aab14f66968330e6c92f90343c19be61 --- /dev/null +++ b/imperative/python/src/ops.h @@ -0,0 +1,5 @@ +#pragma once + +#include "./helper.h" + +void init_ops(pybind11::module m); diff --git a/imperative/python/src/utils.cpp b/imperative/python/src/utils.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b0e615a437e63d9aade2a1d17f2c1bc56ec0aa8b --- /dev/null +++ b/imperative/python/src/utils.cpp @@ -0,0 +1,236 @@ +#include "utils.h" + +#include +#include +#include +#include "./imperative_rt.h" +#include "megbrain/common.h" +#include "megbrain/comp_node.h" +#include "megbrain/imperative/blob_manager.h" +#include "megbrain/imperative/profiler.h" +#include "megbrain/serialization/helper.h" + +#if MGB_ENABLE_OPR_MM +#include "megbrain/opr/mm_handler.h" +#endif + +namespace py = pybind11; + +namespace { + +bool g_global_finalized = false; + +class LoggerWrapper { +public: + using LogLevel = mgb::LogLevel; + using LogHandler = mgb::LogHandler; + static void set_log_handler(py::object logger_p) { + logger = logger_p; + mgb::set_log_handler(py_log_handler); + } + static LogLevel set_log_level(LogLevel log_level) { + return mgb::set_log_level(log_level); + } + +private: + static py::object logger; + static void py_log_handler(mgb::LogLevel level, const char* file, + const char* func, int line, const char* fmt, + va_list ap) { + using mgb::LogLevel; + + MGB_MARK_USED_VAR(file); + MGB_MARK_USED_VAR(func); + MGB_MARK_USED_VAR(line); + + if (g_global_finalized) + return; + + const char* py_type; + switch (level) { + case LogLevel::DEBUG: + py_type = "debug"; + break; + case LogLevel::INFO: + py_type = "info"; + break; + case LogLevel::WARN: + py_type = "warning"; + break; + case LogLevel::ERROR: + py_type = "error"; + break; + default: + throw std::runtime_error("bad log level"); + } + + std::string msg = mgb::svsprintf(fmt, ap); + auto do_log = [msg = msg, py_type]() { + if (logger.is_none()) + return; + py::object _call = logger.attr(py_type); + _call(py::str(msg)); + }; + if 
(PyGILState_Check()) { + do_log(); + } else { + py_task_q.add_task(do_log); + } + } +}; +py::object LoggerWrapper::logger = py::none{}; + +uint32_t _get_dtype_num(py::object dtype) { + return static_cast(npy::dtype_np2mgb(dtype.ptr()).enumv()); +} + +py::bytes _get_serialized_dtype(py::object dtype) { + std::string sdtype; + auto write = [&sdtype](const void* data, size_t size) { + auto pos = sdtype.size(); + sdtype.resize(pos + size); + memcpy(&sdtype[pos], data, size); + }; + mgb::serialization::serialize_dtype(npy::dtype_np2mgb(dtype.ptr()), write); + return py::bytes(sdtype.data(), sdtype.size()); +} + +int fork_exec_impl(const std::string& arg0, const std::string& arg1, + const std::string& arg2) { +#ifdef WIN32 + STARTUPINFO si; + PROCESS_INFORMATION pi; + ZeroMemory(&si, sizeof(si)); + si.cb = sizeof(si); + ZeroMemory(&pi, sizeof(pi)); + auto args_str = " " + arg1 + " " + arg2; + + // Start the child process. + if (!CreateProcess(arg0.c_str(), // exe name + const_cast(args_str.c_str()), // Command line + NULL, // Process handle not inheritable + NULL, // Thread handle not inheritable + FALSE, // Set handle inheritance to FALSE + 0, // No creation flags + NULL, // Use parent's environment block + NULL, // Use parent's starting directory + &si, // Pointer to STARTUPINFO structure + &pi) // Pointer to PROCESS_INFORMATION structure + ) { + mgb_log_warn("CreateProcess failed (%lu).\n", GetLastError()); + fprintf(stderr, "[megbrain] failed to execl %s [%s, %s]\n", + arg0.c_str(), arg1.c_str(), arg2.c_str()); + __builtin_trap(); + } + return pi.dwProcessId; +#else + auto pid = fork(); + if (!pid) { + execl(arg0.c_str(), arg0.c_str(), arg1.c_str(), arg2.c_str(), nullptr); + fprintf(stderr, "[megbrain] failed to execl %s [%s, %s]: %s\n", + arg0.c_str(), arg1.c_str(), arg2.c_str(), std::strerror(errno)); + std::terminate(); + } + mgb_assert(pid > 0, "failed to fork: %s", std::strerror(errno)); + return pid; +#endif +} + +} // namespace + +void init_utils(py::module m) { + auto atexit = py::module::import("atexit"); + atexit.attr("register")(py::cpp_function([]() { + g_global_finalized = true; + })); + + py::class_>(m, "AtomicUint64") + .def(py::init<>()) + .def(py::init()) + .def("load", + [](const std::atomic& self) { return self.load(); }) + .def("store", [](std::atomic& self, + uint64_t value) { return self.store(value); }) + .def("fetch_add", + [](std::atomic& self, uint64_t value) { + return self.fetch_add(value); + }) + .def("fetch_sub", + [](std::atomic& self, uint64_t value) { + return self.fetch_sub(value); + }) + .def(py::self += uint64_t()) + .def(py::self -= uint64_t()); + + // FIXME!!! 
Should add a submodule instead of using a class for logger + py::class_ logger(m, "Logger"); + logger.def(py::init<>()) + .def_static("set_log_level", &LoggerWrapper::set_log_level) + .def_static("set_log_handler", &LoggerWrapper::set_log_handler); + + py::enum_(logger, "LogLevel") + .value("Debug", LoggerWrapper::LogLevel::DEBUG) + .value("Info", LoggerWrapper::LogLevel::INFO) + .value("Warn", LoggerWrapper::LogLevel::WARN) + .value("Error", LoggerWrapper::LogLevel::ERROR); + + m.def("_get_dtype_num", &_get_dtype_num, + "Convert numpy dtype to internal dtype"); + + m.def("_get_serialized_dtype", &_get_serialized_dtype, + "Convert numpy dtype to internal dtype for serialization"); + + m.def("_get_device_count", &mgb::CompNode::get_device_count, + "Get total number of specific devices on this system"); + + using mgb::imperative::Profiler; + + py::class_(m, "ProfilerImpl") + .def(py::init<>()) + .def(py::init()) + .def("enable", + [](Profiler& profiler) -> Profiler& { + profiler.enable(); + return profiler; + }) + .def("disable", + [](Profiler& profiler) { + if (profiler.get_dump_count() == 0) { + profiler.dump(); + } + profiler.disable(); + }) + .def("dump", + [](Profiler& profiler, std::optional path) { + if (path.has_value()) { + profiler.dump(path.value()); + } else { + profiler.dump(); + } + }, + py::arg("path") = std::optional()); + +#if MGB_ENABLE_OPR_MM + m.def("create_mm_server", &create_zmqrpc_server, py::arg("addr"), + py::arg("port") = 0); +#else + m.def("create_mm_server", []() {}); +#endif + + // Debug code, internal only + m.def("_set_defrag", [](bool enable) { + mgb::imperative::BlobManager::inst()->set_enable(enable); + }); + m.def("_defrag", [](const mgb::CompNode& cn) { + mgb::imperative::BlobManager::inst()->defrag(cn); + }); + m.def("_set_fork_exec_path_for_timed_func", [](const std::string& arg0, + const ::std::string arg1) { + using namespace std::placeholders; + mgb::sys::TimedFuncInvoker::ins().set_fork_exec_impl(std::bind( + fork_exec_impl, std::string{arg0}, std::string{arg1}, _1)); + }); + m.def("_timed_func_exec_cb", [](const std::string& user_data){ + mgb::sys::TimedFuncInvoker::ins().fork_exec_impl_mainloop(user_data.c_str()); + }); +} diff --git a/imperative/python/src/utils.h b/imperative/python/src/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..0a4d69e560401111e3abd2dcb39e9cb77e99bd0c --- /dev/null +++ b/imperative/python/src/utils.h @@ -0,0 +1,5 @@ +#pragma once + +#include "./helper.h" + +void init_utils(pybind11::module m); diff --git a/imperative/python/test/integration/mnist_model_with_test.mge b/imperative/python/test/integration/mnist_model_with_test.mge new file mode 100644 index 0000000000000000000000000000000000000000..126837d41f24151b9bad560e0b496908e1e5e9af Binary files /dev/null and b/imperative/python/test/integration/mnist_model_with_test.mge differ diff --git a/imperative/python/test/integration/mnist_model_with_test_cpu.mge b/imperative/python/test/integration/mnist_model_with_test_cpu.mge new file mode 100644 index 0000000000000000000000000000000000000000..b0e8ad5c98b17584cbbcdf50c395c553ca1f74ef Binary files /dev/null and b/imperative/python/test/integration/mnist_model_with_test_cpu.mge differ diff --git a/imperative/python/test/integration/test_advance_indexing.py b/imperative/python/test/integration/test_advance_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..261f6daf8110d3adfe586a5c252b4b9259c08b4a --- /dev/null +++ 
b/imperative/python/test/integration/test_advance_indexing.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np + +import megengine +import megengine.optimizer as optimizer +from megengine import Parameter, tensor +from megengine.module import Module + + +class Simple(Module): + def __init__(self): + super().__init__() + self.a = Parameter(1.0, dtype=np.float32) + + def forward(self, x, y): + x = x[y] * self.a + return x + + +class Simple2(Module): + def __init__(self): + super().__init__() + self.a = Parameter(1.0, dtype=np.float32) + + def forward(self, x): + x = x[1, ..., :, 0:4:2, 0:2] * self.a + return x + + +def test_advance_indexing(): + net = Simple() + + optim = optimizer.SGD(net.parameters(), lr=1.0) + optim.zero_grad() + + dshape = (10, 10) + raw_data = np.arange(100).reshape(dshape).astype(np.float32) + raw_mask = (np.random.random_sample(dshape) > 0.5).astype(np.bool_) + data = tensor(raw_data) + mask = tensor(raw_mask) + answer = 1.0 - raw_data[raw_mask].sum() + with optim.record(): + loss = net(data, mask).sum() + optim.backward(loss) + optim.step() + np.testing.assert_almost_equal(net.a.numpy(), np.array([answer]).astype(np.float32)) + + +def test_advance_indexing_with_subtensor(): + net = Simple2() + + optim = optimizer.SGD(net.parameters(), lr=1.0) + optim.zero_grad() + + dshape = (2, 3, 4, 3, 4, 2) + raw_data = np.arange(576).reshape(dshape).astype(np.float32) + data = tensor(raw_data) + answer = 1.0 - raw_data[1, ..., :, 0:4:2, 0:2].sum() + with optim.record(): + loss = net(data).sum() + optim.backward(loss) + optim.step() + np.testing.assert_almost_equal(net.a.numpy(), np.array([answer]).astype(np.float32)) diff --git a/imperative/python/test/integration/test_ai.py b/imperative/python/test/integration/test_ai.py new file mode 100644 index 0000000000000000000000000000000000000000..3e40bac9271b6894fdb3e4333794b6d9664396e8 --- /dev/null +++ b/imperative/python/test/integration/test_ai.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
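+# Gradient check through slicing: loss = sum(x[:, 0] * a) gives dloss/da = 10
+# for the all-ones (10, 10) input below, so one SGD step with lr=1.0 moves a
+# from 1 to 1 - dshape[0] = -9.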
+import numpy as np + +import megengine +import megengine.optimizer as optimizer +from megengine import Parameter, tensor +from megengine.module import Module + + +class Simple(Module): + def __init__(self): + super().__init__() + self.a = Parameter(1.0, dtype=np.float32) + + def forward(self, x): + x = x[:, 0] * self.a + return x + + +def test_ai(): + net = Simple() + + optim = optimizer.SGD(net.parameters(), lr=1.0) + optim.zero_grad() + + dshape = (10, 10) + data = tensor(np.ones(dshape).astype(np.float32)) + with optim.record(): + loss = net(data).sum() + optim.backward(loss) + optim.step() + np.testing.assert_almost_equal( + net.a.numpy(), np.array([1.0 - dshape[0]]).astype(np.float32) + ) diff --git a/imperative/python/test/integration/test_bn.py b/imperative/python/test/integration/test_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..779b2ef9e77b4900e024d5eb81d6085f6e3427e1 --- /dev/null +++ b/imperative/python/test/integration/test_bn.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np +import pytest + +import megengine +import megengine.optimizer as optimizer +from megengine import Parameter, tensor +from megengine.module import BatchNorm2d + + +def test_frozen_bn(): + nchannel = 3 + m = BatchNorm2d(nchannel, freeze=True) + + saved_var = m.running_var.numpy() + saved_mean = m.running_mean.numpy() + saved_wt = m.weight.numpy() + saved_bias = m.bias.numpy() + + optim = optimizer.SGD(m.parameters(), lr=1.0) + optim.zero_grad() + + data = np.random.random((6, nchannel, 2, 2)).astype("float32") + with optim.record(): + loss = m(data).mean() + optim.backward(loss) + optim.step() + + np.testing.assert_equal(m.running_var.numpy(), saved_var) + np.testing.assert_equal(m.running_mean.numpy(), saved_mean) + np.testing.assert_equal(m.weight.numpy(), saved_wt) + np.testing.assert_equal(m.bias.numpy(), saved_bias) + np.testing.assert_almost_equal(loss.numpy(), data.mean(), 5) + + +def test_bn_no_track_stat(): + nchannel = 3 + m = BatchNorm2d(nchannel, track_running_stats=False) + + optim = optimizer.SGD(m.parameters(), lr=1.0) + optim.zero_grad() + + data = np.random.random((6, nchannel, 2, 2)).astype("float32") + with optim.record(): + loss = m(data).sum() + optim.backward(loss) + optim.step() + + +def test_bn_no_track_stat2(): + nchannel = 3 + m = BatchNorm2d(nchannel) # Init with track_running_stat = True + m.track_running_stats = False + + # m.running_var and m.running_mean created during init time + saved_var = m.running_var.numpy() + assert saved_var is not None + saved_mean = m.running_mean.numpy() + assert saved_mean is not None + + optim = optimizer.SGD(m.parameters(), lr=1.0) + optim.zero_grad() + + data = np.random.random((6, nchannel, 2, 2)).astype("float32") + with optim.record(): + loss = m(data).sum() + optim.backward(loss) + optim.step() + + np.testing.assert_equal(m.running_var.numpy(), saved_var) + np.testing.assert_equal(m.running_mean.numpy(), saved_mean) + + +def test_bn_no_track_stat3(): + nchannel = 3 + m = BatchNorm2d(nchannel, track_running_stats=False) + m.track_running_stats = True + data = np.random.random((6, nchannel, 2, 2)).astype("float32") + with 
pytest.raises(Exception): + m(data) diff --git a/imperative/python/test/integration/test_converge.py b/imperative/python/test/integration/test_converge.py new file mode 100644 index 0000000000000000000000000000000000000000..7778c6a9eb1cad7a51ef496f703b1c645a3ec4f4 --- /dev/null +++ b/imperative/python/test/integration/test_converge.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import itertools + +import numpy as np +import pytest + +import megengine as mge +import megengine.functional as F +from megengine import Tensor +from megengine.module import Linear, Module +from megengine.optimizer import SGD + +batch_size = 64 +data_shape = (batch_size, 2) +label_shape = (batch_size,) + + +def minibatch_generator(): + while True: + inp_data = np.zeros((batch_size, 2)) + label = np.zeros(batch_size, dtype=np.int32) + for i in range(batch_size): + # [x0, x1], sampled from U[-1, 1] + inp_data[i, :] = np.random.rand(2) * 2 - 1 + label[i] = 0 if np.prod(inp_data[i]) < 0 else 1 + yield inp_data.astype(np.float32), label.astype(np.int32) + + +def calculate_precision(data: np.ndarray, pred: np.ndarray) -> float: + """ Calculate precision for given data and prediction. + + :type data: [[x, y], ...] + :param data: Input data + :type pred: [[x_pred, y_pred], ...] + :param pred: Network output data + """ + correct = 0 + assert len(data) == len(pred) + for inp_data, pred_output in zip(data, pred): + label = 0 if np.prod(inp_data) < 0 else 1 + pred_label = np.argmax(pred_output) + if pred_label == label: + correct += 1 + return float(correct) / len(data) + + +class XORNet(Module): + def __init__(self): + self.mid_layers = 14 + self.num_class = 2 + super().__init__() + + self.fc0 = Linear(self.num_class, self.mid_layers, bias=True) + self.fc1 = Linear(self.mid_layers, self.mid_layers, bias=True) + + self.fc2 = Linear(self.mid_layers, self.num_class, bias=True) + + def forward(self, x): + x = self.fc0(x) + x = F.tanh(x) + x = self.fc1(x) + x = F.tanh(x) + x = self.fc2(x) + return x + + +def test_training_converge(): + net = XORNet() + opt = SGD( + net.parameters(requires_grad=True), lr=0.01, momentum=0.9, weight_decay=5e-4 + ) + + def train(data, label): + with opt.record(): + pred = net(data) + loss = F.cross_entropy_with_softmax(pred, label) + opt.backward(loss) + return loss + + def infer(data): + return net(data) + + train_dataset = minibatch_generator() + losses = [] + + for data, label in itertools.islice(train_dataset, 2000): + data = Tensor(data, dtype=np.float32) + label = Tensor(label, dtype=np.int32) + opt.zero_grad() + loss = train(data, label) + opt.step() + losses.append(loss.numpy()) + + assert np.mean(losses[-100:]) < 0.1, "Final training Loss must be low enough" + + ngrid = 10 + x = np.linspace(-1.0, 1.0, ngrid) + xx, yy = np.meshgrid(x, x) + xx = xx.reshape((ngrid * ngrid, 1)) + yy = yy.reshape((ngrid * ngrid, 1)) + data = np.concatenate((xx, yy), axis=1).astype(np.float32) + + pred = infer(data).numpy() + precision = calculate_precision(data, pred) + assert precision == 1.0, "Test precision must be high enough, get {}".format( + precision + ) diff --git a/imperative/python/test/integration/test_correctness.py 
b/imperative/python/test/integration/test_correctness.py new file mode 100644 index 0000000000000000000000000000000000000000..73d3fbed247021c1a708a5645bd8814f09144bb2 --- /dev/null +++ b/imperative/python/test/integration/test_correctness.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import os +import re +import subprocess +import sys + +import numpy as np +import pytest + +import megengine as mge +import megengine.functional as F +from megengine.functional.debug_param import set_conv_execution_strategy +from megengine.module import AvgPool2d, BatchNorm2d, Conv2d, Linear, Module +from megengine.optimizer import SGD +from megengine.tensor import Tensor +from megengine.test import assertTensorClose + + +def get_gpu_name(): + try: + gpu_info = subprocess.check_output( + ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"] + ) + gpu_info = gpu_info.decode("ascii").split("\n")[0] + except: + gpu_info = "None" + return gpu_info + + +def get_cpu_name(): + cpu_info = "None" + try: + cpu_info = subprocess.check_output(["cat", "/proc/cpuinfo"]).decode("ascii") + for line in cpu_info.split("\n"): + if "model name" in line: + return re.sub(".*model name.*:", "", line, 1).strip() + except: + pass + return cpu_info + + +def get_xpu_name(): + if mge.is_cuda_available(): + return get_gpu_name() + else: + return get_cpu_name() + + +class MnistNet(Module): + def __init__(self, has_bn=False): + super().__init__() + self.conv0 = Conv2d(1, 20, kernel_size=5, bias=True) + self.pool0 = AvgPool2d(2) + self.conv1 = Conv2d(20, 20, kernel_size=5, bias=True) + self.pool1 = AvgPool2d(2) + self.fc0 = Linear(20 * 4 * 4, 500, bias=True) + self.fc1 = Linear(500, 10, bias=True) + self.bn0 = None + self.bn1 = None + if has_bn: + self.bn0 = BatchNorm2d(20) + self.bn1 = BatchNorm2d(20) + + def forward(self, x): + x = self.conv0(x) + if self.bn0: + x = self.bn0(x) + x = F.relu(x) + x = self.pool0(x) + x = self.conv1(x) + if self.bn1: + x = self.bn1(x) + x = F.relu(x) + x = self.pool1(x) + x = F.flatten(x, 1) + x = self.fc0(x) + x = F.relu(x) + x = self.fc1(x) + return x + + +def train(data, label, net, opt): + with opt.record(): + pred = net(data) + loss = F.cross_entropy_with_softmax(pred, label) + opt.backward(loss) + return loss + + +def update_model(model_path): + """ + Update the dumped model with test cases for new reference values. + + The model with pre-trained weights is trained for one iter with the test data attached. + The loss and updated net state dict is dumped. + + .. 
code-block:: python + + from test_correctness import update_model + update_model('mnist_model_with_test.mge') # for gpu + update_model('mnist_model_with_test_cpu.mge') # for cpu + + """ + net = MnistNet(has_bn=True) + checkpoint = mge.load(model_path) + net.load_state_dict(checkpoint["net_init"]) + lr = checkpoint["sgd_lr"] + opt = SGD(net.parameters(), lr=lr) + + data = Tensor(checkpoint["data"], dtype=np.float32) + label = Tensor(checkpoint["label"], dtype=np.int32) + + opt.zero_grad() + loss = train(data, label, net=net, opt=opt) + opt.step() + + xpu_name = get_xpu_name() + + checkpoint.update( + {"net_updated": net.state_dict(), "loss": loss.numpy(), "xpu": xpu_name} + ) + mge.save(checkpoint, model_path) + + +def run_test( + model_path, use_jit, use_symbolic, sublinear_memory_config=None, max_err=None, +): + + """ + Load the model with test cases and run the training for one iter. + The loss and updated weights are compared with reference value to verify the correctness. + + Dump a new file with updated result by calling update_model + if you think the test fails due to numerical rounding errors instead of bugs. + Please think twice before you do so. + + """ + net = MnistNet(has_bn=True) + checkpoint = mge.load(model_path) + net.load_state_dict(checkpoint["net_init"]) + lr = checkpoint["sgd_lr"] + opt = SGD(net.parameters(), lr=lr) + + data = Tensor(checkpoint["data"], dtype=np.float32) + label = Tensor(checkpoint["label"], dtype=np.int32) + + if max_err is None: + max_err = 1e-5 + + train_func = train + if use_jit: + train_func = jit.trace( + train_func, + symbolic=use_symbolic, + sublinear_memory_config=sublinear_memory_config, + ) + + opt.zero_grad() + loss = train_func(data, label, net=net, opt=opt) + opt.step() + + assertTensorClose(loss.numpy(), checkpoint["loss"], max_err=max_err) + + for param, param_ref in zip( + net.state_dict().items(), checkpoint["net_updated"].items() + ): + assert param[0] == param_ref[0] + assertTensorClose(param[1], param_ref[1], max_err=max_err) + + +def test_correctness(): + if mge.is_cuda_available(): + model_name = "mnist_model_with_test.mge" + else: + model_name = "mnist_model_with_test_cpu.mge" + model_path = os.path.join(os.path.dirname(__file__), model_name) + set_conv_execution_strategy("HEURISTIC_REPRODUCIBLE") + + run_test(model_path, False, False, max_err=1e-5) + # run_test(model_path, True, False) + # run_test(model_path, True, True) + + # sublinear + # config = SublinearMemoryConfig(genetic_nr_iter=10) + # run_test( + # model_path, True, True, sublinear_memory_config=config, max_err=1e-5, + # ) diff --git a/imperative/python/test/integration/test_detach.py b/imperative/python/test/integration/test_detach.py new file mode 100644 index 0000000000000000000000000000000000000000..0d0b3d5c7fbed7ed9ed9c3d6268d031b1abe99a0 --- /dev/null +++ b/imperative/python/test/integration/test_detach.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
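Reviewer note: two caveats on `run_test` above. First, the `use_jit=True` path references `jit.trace`, but the module never imports `megengine.jit`, so only the `use_jit=False` call in `test_correctness` is currently runnable (the jit and sublinear invocations are commented out accordingly). Second, the state-dict comparison zips two `.items()` iterators and therefore relies on both dicts having identical key order. A stricter, key-based variant is sketched below; `state` and `ref` are hypothetical stand-ins for `net.state_dict()` and `checkpoint["net_updated"]`:

```python
# Sketch only: compare state dicts by key instead of by iteration order.
# `assert_close` stands for megengine.test.assertTensorClose.
def compare_state_dicts(state, ref, assert_close, max_err=1e-5):
    assert set(state.keys()) == set(ref.keys()), "state dict keys diverged"
    for name in sorted(state):
        assert_close(state[name], ref[name], max_err=max_err)
```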
+import numpy as np + +import megengine +import megengine.optimizer as optimizer +from megengine import Parameter, tensor +from megengine.module import Module + + +class Simple(Module): + def __init__(self): + super().__init__() + self.a = Parameter(1.0, dtype=np.float32) + self.b = Parameter(1.0, dtype=np.float32) + + def forward(self, x): + x = x * self.a + x = x.detach() * self.b + return x + + +def test_detach(): + net = Simple() + + optim = optimizer.SGD(net.parameters(), lr=1.0) + optim.zero_grad() + + dshape = (10, 10) + data = tensor(np.ones(dshape).astype(np.float32)) + with optim.record(): + loss = net(data).sum() + optim.backward(loss) + optim.step() + np.testing.assert_equal(net.a.numpy(), np.array([1.0]).astype(np.float32)) + np.testing.assert_equal( + net.b.numpy(), np.array([1.0 - 10.0 * 10.0]).astype(np.float32) + ) diff --git a/imperative/python/test/integration/test_dp_correctness.py b/imperative/python/test/integration/test_dp_correctness.py new file mode 100644 index 0000000000000000000000000000000000000000..5719136942cced84a8e17f0bc0351f1b5d5c618c --- /dev/null +++ b/imperative/python/test/integration/test_dp_correctness.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import multiprocessing as mp +import os +import re +import subprocess +import sys +from math import ceil + +import numpy as np +import pytest + +import megengine as mge +import megengine.distributed as dist +import megengine.functional as F +from megengine.device import get_default_device, set_default_device +from megengine.functional.debug_param import set_conv_execution_strategy +from megengine.module import AvgPool2d, BatchNorm2d, Conv2d, Linear, Module +from megengine.optimizer import SGD +from megengine.tensor import Tensor +from megengine.test import assertTensorClose + +p_num = 4 + + +def get_gpu_name(): + try: + gpu_info = subprocess.check_output( + ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"] + ) + gpu_info = gpu_info.decode("ascii").split("\n")[0] + except: + gpu_info = "None" + return gpu_info + + +def get_cpu_name(): + cpu_info = "None" + try: + cpu_info = subprocess.check_output(["cat", "/proc/cpuinfo"]).decode("ascii") + for line in cpu_info.split("\n"): + if "model name" in line: + return re.sub(".*model name.*:", "", line, 1).strip() + except: + pass + return cpu_info + + +def get_xpu_name(): + if mge.is_cuda_available(): + return get_gpu_name() + else: + return get_cpu_name() + + +class MnistNet(Module): + def __init__(self, has_bn=True): + super().__init__() + self.conv0 = Conv2d(1, 20, kernel_size=5, bias=True) + self.pool0 = AvgPool2d(2) + self.conv1 = Conv2d(20, 20, kernel_size=5, bias=True) + self.pool1 = AvgPool2d(2) + self.fc0 = Linear(20 * 4 * 4, 500, bias=True) + self.fc1 = Linear(500, 10, bias=True) + self.bn0 = None + self.bn1 = None + if has_bn: + self.bn0 = BatchNorm2d(20) + self.bn1 = BatchNorm2d(20) + + def forward(self, x): + x = self.conv0(x) + if self.bn0: + x = self.bn0(x) + x = F.relu(x) + x = self.pool0(x) + x = self.conv1(x) + if self.bn1: + x = self.bn1(x) + x = F.relu(x) + x = self.pool1(x) + x = F.flatten(x, 1) + x = self.fc0(x) + x = F.relu(x) + x = self.fc1(x) + return x + + +def train(data, 
label, net, opt): + with opt.record(): + pred = net(data) + loss = F.cross_entropy_with_softmax(pred, label) + opt.backward(loss) + return loss + + +def update_model(model_path): + """ + Update the dumped model with test cases for new reference values. + + The model with pre-trained weights is trained for one iter with the test data attached. + The loss and updated net state dict is dumped. + + .. code-block:: python + + from test_correctness import update_model + update_model('mnist_model_with_test.mge') # for gpu + update_model('mnist_model_with_test_cpu.mge') # for cpu + + """ + net = MnistNet(has_bn=True) + checkpoint = mge.load(model_path) + net.load_state_dict(checkpoint["net_init"]) + lr = checkpoint["sgd_lr"] + opt = SGD(net.parameters(), lr=lr) + + data = Tensor(checkpoint["data"], dtype=np.float32) + label = Tensor(checkpoint["label"], dtype=np.int32) + + opt.zero_grad() + loss = train(data, label, net=net, opt=opt) + opt.step() + + xpu_name = get_xpu_name() + + checkpoint.update( + {"net_updated": net.state_dict(), "loss": loss.numpy(), "xpu": xpu_name} + ) + mge.serialization.save(checkpoint, model_path) + + +def run_test( + model_path, use_jit, use_symbolic, sublinear_memory_config=None, max_err=None, +): + + """ + Load the model with test cases and run the training for one iter. + The loss and updated weights are compared with reference value to verify the correctness. + + Dump a new file with updated result by calling update_model + if you think the test fails due to numerical rounding errors instead of bugs. + Please think twice before you do so. + + """ + checkpoint = mge.load(model_path) + data = checkpoint["data"] + label = checkpoint["label"] + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, max_err): + dist.init_process_group("localhost", port, p_num, rank, rank) + set_default_device(device="gpu{}".format(dist.get_rank())) + net = MnistNet(has_bn=True) + net.load_state_dict(checkpoint["net_init"]) + lr = checkpoint["sgd_lr"] + opt = SGD(net.parameters(), reduce_method="mean", lr=lr) + + # use same data and label for all gpu's + # such that the result does not depend on number of gpu + data_train = Tensor(data) + label_train = Tensor(label) + + train_func = train + + opt.zero_grad() + loss = train_func(data_train, label_train, net=net, opt=opt) + opt.step() + + print("{} loss {}".format(get_default_device(), loss.numpy()[0])) + assertTensorClose(loss.numpy(), checkpoint["loss"], max_err=max_err) + + if dist.get_rank(): + return + for param, param_ref in zip( + net.state_dict().items(), checkpoint["net_updated"].items() + ): + assert param[0] == param_ref[0] + assertTensorClose(param[1], param_ref[1], max_err=max_err) + + procs = [] + for rank in range(p_num): + p = mp.Process(target=worker, args=(rank, max_err,)) + p.start() + procs.append(p) + + for p in procs: + p.join(20) + assert p.exitcode == 0 + + +@pytest.mark.isolated_distributed +def test_dp_correctness(): + model_name = "mnist_model_with_test.mge" + model_path = os.path.join(os.path.dirname(__file__), model_name) + set_conv_execution_strategy("HEURISTIC_REPRODUCIBLE") + run_test(model_path, False, False, max_err=1e-5) diff --git a/imperative/python/test/integration/test_hello_world.py b/imperative/python/test/integration/test_hello_world.py new file mode 100644 index 0000000000000000000000000000000000000000..033d28544b6a1254163abb598382ec68e4849e88 --- /dev/null +++ b/imperative/python/test/integration/test_hello_world.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +# MegEngine 
is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import subprocess + +import numpy as np +import pytest + +import megengine +import megengine.optimizer as optimizer +from megengine import Parameter, tensor +from megengine.module import Module + + +class Simple(Module): + def __init__(self): + super().__init__() + self.a = Parameter(1.23, dtype=np.float32) + + def forward(self, x): + x = x * self.a + return x + + +def test_hello_world(): + net = Simple() + + optim = optimizer.SGD(net.parameters(), lr=1.0) + optim.zero_grad() + + data = tensor([2.34]) + with optim.record(): + loss = net(data) + optim.backward(loss) + optim.step() + np.testing.assert_almost_equal( + net.a.numpy(), np.array([1.23 - 2.34]).astype(np.float32) + ) diff --git a/imperative/python/test/integration/test_lr_scheduler.py b/imperative/python/test/integration/test_lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..a0f788f6887a946de5c12da526cbf32ce3f18e6c --- /dev/null +++ b/imperative/python/test/integration/test_lr_scheduler.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from bisect import bisect_right + +import numpy as np + +from megengine import Parameter, tensor +from megengine.module import Module +from megengine.optimizer import SGD, MultiStepLR + + +class Simple(Module): + def __init__(self): + super().__init__() + self.a = Parameter(1.23, dtype=np.float32) + + def forward(self, x): + x = x * self.a + return x + + +def test_multi_step_lr(): + net = Simple() + opt = SGD(net.parameters(), lr=0.01, momentum=0.9) + scheduler = MultiStepLR(opt, [3, 6, 8]) + + lr = np.array(0.01, dtype=np.float32) + for i in range(10): + for group in opt.param_groups: + np.testing.assert_almost_equal( + np.array(group["lr"], dtype=np.float32), + (lr * 0.1 ** bisect_right([3, 6, 8], i)).astype(np.float32), + ) + scheduler.step() diff --git a/imperative/python/test/integration/test_optimizer.py b/imperative/python/test/integration/test_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..388a881485e4e63e69a6c84b0410ac430277c50a --- /dev/null +++ b/imperative/python/test/integration/test_optimizer.py @@ -0,0 +1,206 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
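Reviewer note: `test_multi_step_lr` above pins down the MultiStepLR contract: at epoch `i`, the learning rate equals the base rate times `0.1` raised to the number of milestones already passed, which `bisect_right` counts. A plain-Python sketch of the schedule the assertions encode (the `0.1` decay factor is the default the test assumes):

```python
from bisect import bisect_right

def expected_lr(base_lr, milestones, epoch, gamma=0.1):
    # Decay once per milestone whose value is <= the current epoch.
    return base_lr * gamma ** bisect_right(milestones, epoch)

assert abs(expected_lr(0.01, [3, 6, 8], 2) - 0.01) < 1e-12   # before any milestone
assert abs(expected_lr(0.01, [3, 6, 8], 3) - 0.001) < 1e-12  # after the first
assert abs(expected_lr(0.01, [3, 6, 8], 9) - 1e-5) < 1e-12   # after all three
```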
+import numpy as np + +import megengine.functional as F +from megengine import Parameter, optimizer +from megengine.module import Linear, Module +from megengine.tensor import TensorDict, tensor + + +class MLP(Module): + def __init__(self): + super().__init__() + self.dense0 = Linear(28, 50) + self.dense1 = Linear(50, 20) + + def forward(self, x): + x = self.dense0(x) + x = F.relu(x) + x = self.dense1(x) + return x + + +class Simple(Module): + def __init__(self): + super().__init__() + self.a = Parameter(1.23, dtype=np.float32) + + def forward(self, x): + x = x * self.a + return x + + +def _test_optimizer(opt_str, test_case, check_class, update_lr=False): + iter_num = 3 + net = Simple() + opt = getattr(optimizer, opt_str)(net.parameters(), **test_case) + check_func = check_class(net, **test_case) + + step = 0 + data_shape = (2, 28) + + for i in range(iter_num): + if update_lr and i == 1: # change learning rate + for group in opt.param_groups: + group["lr"] += 0.01 + check_func.lr += 0.01 + data = tensor(np.random.random(data_shape).astype(np.float32)) + + opt.zero_grad() + with opt.record(): + pred = net(data) + loss = pred.sum() + opt.backward(loss) + + ori_params = TensorDict() + for param in net.parameters(): + ori_params[param] = np.copy(param.numpy()) + opt.step() + step += 1 + check_func(ori_params, net.parameters(), step) + + +def test_sgd(): + class CheckValue: + def __init__(self, net, **kwarg): + self.slots = TensorDict() + for param in net.parameters(): + self.slots[param] = np.zeros(param.shape).astype(np.float32) + for k, v in kwarg.items(): + setattr(self, k, v) + + def __call__(self, ori_params, new_params, step): + for param in new_params: + grad = param.grad.numpy() + if hasattr(self, "momentum"): + self.slots[param] = grad + self.slots[param] * self.momentum + delta = -self.lr * self.slots[param] + else: + delta = -self.lr * grad + np.testing.assert_almost_equal(param.numpy(), ori_params[param] + delta) + + cases = [ + {"momentum": 0.9, "lr": 0.01}, # SGD with momentum + {"lr": 0.01}, # simple SGD + {"weight_decay": 0.1, "lr": 0.01}, # with weight_decay + ] + for case in cases: + _test_optimizer("SGD", case, CheckValue) + _test_optimizer("SGD", case, CheckValue, update_lr=True) + + +def test_adam(): + class CheckValue: + def __init__(self, net, **kwarg): + self.m_slots = TensorDict() + self.v_slots = TensorDict() + for param in net.parameters(): + self.m_slots[param] = np.zeros(param.shape).astype(np.float32) + self.v_slots[param] = np.zeros(param.shape).astype(np.float32) + for k, v in kwarg.items(): + setattr(self, k, v) + + def __call__(self, ori_params, new_params, step): + for param in new_params: + grad = param.grad.numpy() + m = self.m_slots[param] + v = self.v_slots[param] + m *= self.betas[0] + m += (1 - self.betas[0]) * grad + v *= self.betas[1] + v += (1 - self.betas[1]) * grad * grad + delta = (m / (1 - self.betas[0] ** step)) / ( + np.sqrt(v / (1 - self.betas[1] ** step)) + self.eps + ) + np.testing.assert_almost_equal( + param.numpy(), ori_params[param] - self.lr * delta + ) + + cases = [ + {"betas": (0.8, 0.9), "eps": 1e-04, "lr": 0.01}, + { + "betas": (0.8, 0.9), + "eps": 1e-04, + "lr": 0.01, + "weight_decay": 0.1, + }, # with weight_decay + ] + for case in cases: + _test_optimizer("Adam", case, CheckValue) + _test_optimizer("Adam", case, CheckValue, update_lr=True) + + +def test_adagrad(): + class CheckValue: + def __init__(self, net, **kwarg): + self.s_slots = TensorDict() + for param in net.parameters(): + self.s_slots[param] = 
np.zeros(param.shape).astype(np.float32) + for k, v in kwarg.items(): + setattr(self, k, v) + + def __call__(self, ori_params, new_params, step): + for param in new_params: + grad = param.grad.numpy() + self.s_slots[param] += grad ** 2 + delta = grad / (self.s_slots[param] + self.eps) ** 0.5 + delta *= -(self.lr / (1 + (step - 1) * self.lr_decay)) + np.testing.assert_almost_equal(param.numpy(), ori_params[param] + delta) + + cases = [ + {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.01}, + {"lr": 0.01, "eps": 1e-06, "lr_decay": 0.0}, # without lr_decay + { + "lr": 0.01, + "eps": 1e-06, + "lr_decay": 0.01, + "weight_decay": 0.1, + }, # with weight_decay + ] + for case in cases: + _test_optimizer("Adagrad", case, CheckValue) + _test_optimizer("Adagrad", case, CheckValue, update_lr=True) + + +def test_adadelta(): + class CheckValue: + def __init__(self, net, **kwarg): + self.s_slots = TensorDict() + self.a_slots = TensorDict() + for param in net.parameters(): + self.s_slots[param] = np.zeros(param.shape).astype(np.float32) + self.a_slots[param] = np.zeros(param.shape).astype(np.float32) + for k, v in kwarg.items(): + setattr(self, k, v) + + def __call__(self, ori_params, new_params, step): + for param in new_params: + grad = param.grad.numpy() + self.s_slots[param] = self.s_slots[param] * self.rho + grad ** 2 * ( + 1 - self.rho + ) + delta = ( + grad + * ((self.a_slots[param] + self.eps) ** 0.5) + / (self.s_slots[param] + self.eps) ** 0.5 + ) + self.a_slots[param] = self.a_slots[param] * self.rho + delta ** 2 * ( + 1 - self.rho + ) + delta *= -self.lr + np.testing.assert_almost_equal(param.numpy(), ori_params[param] + delta) + + cases = [ + {"lr": 1.0, "eps": 1e-06, "rho": 0.9}, + {"lr": 1.0, "eps": 1e-06, "rho": 0.9, "weight_decay": 0.9}, # with weight_decay + ] + for case in cases: + _test_optimizer("Adadelta", case, CheckValue) + _test_optimizer("Adadelta", case, CheckValue, update_lr=True) diff --git a/imperative/python/test/integration/test_save_load.py b/imperative/python/test/integration/test_save_load.py new file mode 100644 index 0000000000000000000000000000000000000000..11bbcf58a69dd2147a7c9b8afcc36a70884ee0eb --- /dev/null +++ b/imperative/python/test/integration/test_save_load.py @@ -0,0 +1,58 @@ +import numpy as np + +import megengine as mge +import megengine.optimizer as optimizer +from megengine import Parameter, tensor +from megengine.core.tensor.raw_tensor import RawTensor +from megengine.module import Module + + +class Simple(Module): + def __init__(self): + self.a = Parameter(1.23, dtype=np.float32) + + def forward(self, x): + x = x * self.a + return x + + +def test_save_load(): + net = Simple() + + optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9) + optim.zero_grad() + + data = tensor([2.34]) + + with optim.record(): + loss = net(data) + optim.backward(loss) + + optim.step() + + model_name = "simple.pkl" + print("save to {}".format(model_name)) + + mge.save( + { + "name": "simple", + "state_dict": net.state_dict(), + "opt_state": optim.state_dict(), + }, + model_name, + ) + + # Load param to cpu + checkpoint = mge.load(model_name, map_location="cpu0") + mge.set_default_device("cpu0") + net = Simple() + net.load_state_dict(checkpoint["state_dict"]) + optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9) + optim.load_state_dict(checkpoint["opt_state"]) + print("load done") + + with optim.record(): + loss = net([1.23]) + optim.backward(loss) + + optim.step() diff --git a/imperative/python/test/integration/test_sgd_momentum.py 
b/imperative/python/test/integration/test_sgd_momentum.py new file mode 100644 index 0000000000000000000000000000000000000000..33944150e1de1cdebb37a3a0eb1e37a688f54fce --- /dev/null +++ b/imperative/python/test/integration/test_sgd_momentum.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np + +import megengine +import megengine.optimizer as optimizer +from megengine import Parameter, tensor +from megengine.module import Module + + +class Simple(Module): + def __init__(self): + super().__init__() + self.a = Parameter(1.23, dtype=np.float32) + + def forward(self, x): + x = x * self.a + return x + + +def test_sgd_momentum(): + net = Simple() + + optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9) + optim.zero_grad() + + data = tensor([2.34]) + + # do a step of train + with optim.record(): + loss = net(data) + optim.backward(loss) + optim.step() + + np.testing.assert_almost_equal(optim._state[net.a]["momentum_buffer"].numpy(), 2.34) + + # do a step of infer + loss = net(data) + np.testing.assert_almost_equal(loss.numpy(), 2.34 * (1.23 - 2.34), 5) + + np.testing.assert_almost_equal(optim._state[net.a]["momentum_buffer"].numpy(), 2.34) + + # do a step of train + optim.zero_grad() + with optim.record(): + loss = net(data) + optim.backward(loss) + optim.step() + + np.testing.assert_almost_equal(loss.numpy(), 2.34 * (1.23 - 2.34), 5) + np.testing.assert_almost_equal( + optim._state[net.a]["momentum_buffer"].numpy(), 0.9 * 2.34 + 2.34 + ) diff --git a/imperative/python/test/pytest.ini b/imperative/python/test/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..da914aaacff993b4fa4723aeb86ad65af70fe953 --- /dev/null +++ b/imperative/python/test/pytest.ini @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +[pytest] +markers = + isolated_distributed: marks distributed tests that should run without CUDA use + in the main thread (deselect with '-m "not isolated_distributed"') diff --git a/imperative/python/test/unit/functional/__init__.py b/imperative/python/test/unit/functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1207b5d98cd3578bc39e9ce600a1254a434880c8 --- /dev/null +++ b/imperative/python/test/unit/functional/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
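Reviewer note: the buffer values asserted in `test_sgd_momentum` above come from the standard momentum recurrence `buf <- momentum * buf + grad`. Here `loss = x * a` with `x = 2.34`, so the gradient w.r.t. `a` is `2.34` on every recorded step, and the pure-inference forward pass in the middle must leave the buffer untouched. A dependency-free sketch of the bookkeeping:

```python
momentum, grad = 0.9, 2.34
buf = 0.0
buf = momentum * buf + grad  # after the first optim.step(): 2.34
assert abs(buf - 2.34) < 1e-6
# The inference-only forward pass records no gradient and skips the update.
buf = momentum * buf + grad  # after the second optim.step(): 0.9 * 2.34 + 2.34
assert abs(buf - (0.9 * 2.34 + 2.34)) < 1e-6
```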
diff --git a/imperative/python/test/unit/functional/test_distributed.py b/imperative/python/test/unit/functional/test_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..9ff2031907b51240faccb2ea30dd23619bb88d41 --- /dev/null +++ b/imperative/python/test/unit/functional/test_distributed.py @@ -0,0 +1,463 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import multiprocessing as mp +import platform + +import numpy as np +import pytest + +import megengine as mge +import megengine.distributed as dist +from megengine import Parameter, Tensor, tensor +from megengine.functional.distributed import ( + all_gather, + all_reduce_max, + all_reduce_min, + all_reduce_sum, + all_to_all, + broadcast, + gather, + reduce_scatter_sum, + reduce_sum, + remote_recv, + remote_send, + scatter, +) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_reduce_sum(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = reduce_sum(inp) + if rank == 0: + assert np.allclose(output.numpy(), expect) + else: + assert np.allclose(output.numpy(), 0) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = np.random.rand(*shape).astype("float32") + z = x + y + p0 = mp.Process(target=worker, args=(0, x, z, port)) + p1 = mp.Process(target=worker, args=(1, y, None, port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 3), (8, 10), (99, 77)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_broadcast(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = broadcast(inp) + assert np.allclose(output.numpy(), expect) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = x + 1 + p0 = mp.Process(target=worker, args=(0, x, x, port)) + p1 = mp.Process(target=worker, args=(1, y, x, port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 3), (8, 10), (99, 77)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_all_gather(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def 
worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = all_gather(inp) + assert np.allclose(output.numpy(), expect) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = np.random.rand(*shape).astype("float32") + z = np.concatenate((x, y)) + p0 = mp.Process(target=worker, args=(0, x, z, port)) + p1 = mp.Process(target=worker, args=(1, y, z, port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 3), (8, 10), (99, 77)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_reduce_scatter_sum(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = reduce_scatter_sum(inp) + assert np.allclose(output.numpy(), expect) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = np.random.rand(*shape).astype("float32") + z = x + y + p0 = mp.Process(target=worker, args=(0, x, z[: shape[0] // 2], port)) + p1 = mp.Process(target=worker, args=(1, y, z[shape[0] // 2 :], port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 4), (8, 10), (88, 44)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_all_reduce_sum(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = all_reduce_sum(inp) + assert np.allclose(output.numpy(), expect) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = np.random.rand(*shape).astype("float32") + z = x + y + p0 = mp.Process(target=worker, args=(0, x, z, port)) + p1 = mp.Process(target=worker, args=(1, y, z, port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 3), (8, 10), (99, 77)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_all_reduce_max(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = all_reduce_max(inp) + assert np.allclose(output.numpy(), expect) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = np.random.rand(*shape).astype("float32") + z = np.maximum(x, y) + p0 = mp.Process(target=worker, args=(0, 
x, z, port)) + p1 = mp.Process(target=worker, args=(1, y, z, port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 3), (8, 10), (99, 77)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_all_reduce_min(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = all_reduce_min(inp) + assert np.allclose(output.numpy(), expect) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = np.random.rand(*shape).astype("float32") + z = np.minimum(x, y) + p0 = mp.Process(target=worker, args=(0, x, z, port)) + p1 = mp.Process(target=worker, args=(1, y, z, port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 3), (8, 10), (99, 77)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_gather(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = gather(inp) + if rank == 0: + assert np.allclose(output.numpy(), expect) + else: + assert np.allclose(output.numpy(), 0) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = np.random.rand(*shape).astype("float32") + z = np.concatenate((x, y)) + p0 = mp.Process(target=worker, args=(0, x, z, port)) + p1 = mp.Process(target=worker, args=(1, y, None, port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 3), (8, 10), (99, 77)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_scatter(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = scatter(inp) + assert np.allclose(output.numpy(), expect) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = x + 1 + p0 = mp.Process(target=worker, args=(0, x, x[: shape[0] // 2], port)) + p1 = mp.Process(target=worker, args=(1, y, x[shape[0] // 2 :], port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 3), (8, 10), (100, 77)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows 
now" +) +@pytest.mark.isolated_distributed +def test_all_to_all(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, data, expect, port): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + inp = tensor(data) + output = all_to_all(inp) + assert np.allclose(output.numpy(), expect) + + def check(shape): + x = np.random.rand(*shape).astype("float32") + y = np.random.rand(*shape).astype("float32") + a = np.concatenate((x[: shape[0] // 2], y[: shape[0] // 2])) + b = np.concatenate((x[shape[0] // 2 :], y[shape[0] // 2 :])) + p0 = mp.Process(target=worker, args=(0, x, a, port)) + p1 = mp.Process(target=worker, args=(1, y, b, port)) + + p0.start() + p1.start() + + p0.join(10) + p1.join(10) + + assert p0.exitcode == 0 and p1.exitcode == 0 + + for shape in [(2, 3), (8, 10), (100, 77)]: + check(shape) + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_io_remote(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + val = np.random.rand(4, 5).astype(np.float32) + + def worker(rank): + if mge.get_device_count("gpu") < world_size: + return + if rank == 0: # remote send + dist.init_process_group("localhost", port, world_size, rank, rank) + x = Tensor(val, device="gpu0") + y = remote_send(x, 1) + assert y.numpy()[0] == 0 + else: # remote recv + dist.init_process_group("localhost", port, world_size, rank, rank) + y = remote_recv(0, val.shape, val.dtype, cn="gpu1") + np.testing.assert_almost_equal(val, y.numpy()) + + procs = [] + for rank in range(world_size): + p = mp.Process(target=worker, args=(rank,)) + p.start() + procs.append(p) + + for p in procs: + p.join(10) + assert p.exitcode == 0 diff --git a/imperative/python/test/unit/functional/test_elemwise.py b/imperative/python/test/unit/functional/test_elemwise.py new file mode 100644 index 0000000000000000000000000000000000000000..75d6874dbb6a74617716701d7543af4cdda57b44 --- /dev/null +++ b/imperative/python/test/unit/functional/test_elemwise.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+import numpy as np + +import megengine.functional as F +from megengine import tensor +from megengine.test import assertTensorClose + + +def test_abs(): + assertTensorClose( + F.abs(tensor([-3.0, -4.0, -5.0])).numpy(), + np.abs(np.array([-3.0, -4.0, -5.0], dtype=np.float32)), + ) + + # assertTensorClose(F.abs(-3.0), np.abs(np.float32(-3.0))) + + +def test_multiply(): + # assertTensorClose( + # F.mul(-3.0, -4.0), np.multiply(np.float32(-3.0), np.float32(-4.0)) + # ) + + assertTensorClose( + F.mul(tensor([3.0, 4.0]), 4.0).numpy(), + np.multiply(np.array([3.0, 4.0], dtype=np.float32), 4.0), + ) + + assertTensorClose( + F.mul(4.0, tensor([3.0, 4.0])).numpy(), + np.multiply(4.0, np.array([3.0, 4.0], dtype=np.float32)), + ) + + assertTensorClose( + F.mul(tensor([3.0, 4.0]), tensor([3.0, 4.0])).numpy(), + np.multiply( + np.array([3.0, 4.0], dtype=np.float32), + np.array([3.0, 4.0], dtype=np.float32), + ), + ) + + +def test_clamp(): + """Fix an issue when `lower` or `upper` is 0, it will be recognized as `False` and + `F.clamp` will fall into wrong conditions unexpectedly. + """ + x = np.linspace(-6, 6, dtype="float32") + assertTensorClose(F.clamp(tensor(x) + 3, 0, 6).numpy(), np.clip(x + 3, 0, 6)) + assertTensorClose(F.clamp(tensor(x) - 3, -6, 0).numpy(), np.clip(x - 3, -6, 0)) + + +# def test_isnan(): +# for case in [[1, float("nan"), 0]]: +# assertTensorClose(F.isnan(tensor(case)), np.isnan(case).astype("uint8")) + + +def test_isinf(): + for case in [[1, float("inf"), 0]]: + assertTensorClose(F.isinf(tensor(case)).numpy(), np.isinf(case).astype("uint8")) + + +def test_cosh(): + np.random.seed(42) + x = np.random.randn(100).astype("float32") + y_np = np.cosh(x) + y_mge = F.cosh(tensor(x)).numpy() + np.testing.assert_allclose(y_np, y_mge, rtol=1e-5) + + +def test_sinh(): + np.random.seed(42) + x = np.random.randn(100).astype("float32") + y_np = np.sinh(x) + y_mge = F.sinh(tensor(x)).numpy() + np.testing.assert_allclose(y_np, y_mge, rtol=1e-5) + + +def test_asinh(): + np.random.seed(42) + x = np.random.randn(100).astype("float32") + y_np = np.arcsinh(x) + y_mge = F.asinh(tensor(x)).numpy() + np.testing.assert_almost_equal(y_np, y_mge, decimal=5) + + +def test_acosh(): + x = np.arange(0, 10000).astype("float32") / 100 + 1 + y_np = np.arccosh(x) + y_mge = F.acosh(tensor(x)).numpy() + np.testing.assert_almost_equal(y_np, y_mge, decimal=6) + + +def test_atanh(): + np.random.seed(42) + x = np.random.rand(100).astype("float32") * 2 - 1 + y_np = np.arctanh(x) + y_mge = F.atanh(tensor(x)).numpy() + np.testing.assert_almost_equal(y_np, y_mge, decimal=5) + + +def test_fast_tanh(): + np.random.seed(42) + x = np.random.randn(100).astype("float32") + y_np = x * (27.0 + x * x) / (27.0 + 9.0 * x * x) + y_mge = F.fast_tanh(tensor(x)).numpy() + np.testing.assert_almost_equal(y_np, y_mge, decimal=6) + + +def test_hswish(): + np.random.seed(42) + x = np.random.randn(100).astype("float32") + y_np = x * np.minimum(np.maximum(x + 3, 0), 6) / 6 + y_mge = F.hswish(tensor(x)).numpy() + np.testing.assert_almost_equal(y_np, y_mge, decimal=6) + + +def test_hsigmoid(): + np.random.seed(42) + x = np.random.randn(100).astype("float32") + y_np = np.minimum(np.maximum(x + 3, 0), 6) / 6 + y_mge = F.hsigmoid(tensor(x)).numpy() + np.testing.assert_equal(y_np, y_mge) + + +def test_logical_oprs(): + x = np.array([[True, False], [False, True]]) + y = np.array([[True, True], [False, False]]) + xx = tensor(x) + yy = tensor(y) + np.testing.assert_equal(~x, (F.logical_not(xx)).numpy()) + np.testing.assert_equal(x & y, F.logical_and(xx, 
yy).numpy()) + np.testing.assert_equal(x | y, F.logical_or(xx, yy).numpy()) + np.testing.assert_equal(x ^ y, F.logical_xor(xx, yy).numpy()) diff --git a/imperative/python/test/unit/functional/test_functional.py b/imperative/python/test/unit/functional/test_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..beaff64845f59fb48e8f9ffd1de54de272e21b68 --- /dev/null +++ b/imperative/python/test/unit/functional/test_functional.py @@ -0,0 +1,623 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import itertools + +import numpy as np +import pytest + +import megengine.core.tensor.dtype as dtype +import megengine.functional as F +from megengine import Buffer, Parameter, is_cuda_available, tensor +from megengine.core.autodiff.grad import Grad +from megengine.test import assertTensorClose + + +def _default_compare_fn(x, y): + assertTensorClose(x.numpy(), y) + + +def opr_test(cases, func, compare_fn=_default_compare_fn, ref_fn=None, **kwargs): + """ + func: the function to run opr. + compare_fn: the function to compare the result and expected, use assertTensorClose if None. + ref_fn: the function to generate expected data, should assign output if None. + cases: the list which have dict element, the list length should be 2 for dynamic shape test. + and the dict should have input, + and should have output if ref_fn is None. + should use list for multiple inputs and outputs for each case. + kwargs: The additional kwargs for opr func. 
+ + simple examples: + + dtype = np.float32 + cases = [{"input": [10, 20]}, {"input": [20, 30]}] + opr_test(cases, + F.eye, + ref_fn=lambda n, m: np.eye(n, m).astype(dtype), + dtype=dtype) + + """ + + def check_results(results, expected): + if not isinstance(results, (tuple, list)): + results = (results,) + for r, e in zip(results, expected): + compare_fn(r, e) + + def get_param(cases, idx): + case = cases[idx] + inp = case.get("input", None) + outp = case.get("output", None) + if inp is None: + raise ValueError("the test case should have input") + if not isinstance(inp, (tuple, list)): + inp = (inp,) + if ref_fn is not None and callable(ref_fn): + outp = ref_fn(*inp) + if outp is None: + raise ValueError("the test case should have output or reference function") + if not isinstance(outp, (tuple, list)): + outp = (outp,) + + return inp, outp + + if len(cases) == 0: + raise ValueError("should give one case at least") + + if not callable(func): + raise ValueError("the input func should be callable") + + inp, outp = get_param(cases, 0) + inp_tensor = [tensor(inpi) for inpi in inp] + + results = func(*inp_tensor, **kwargs) + check_results(results, outp) + + +def test_flatten(): + data0_shape = (2, 3, 4, 5) + data1_shape = (4, 5, 6, 7) + data0 = np.random.random(data0_shape).astype(np.float32) + data1 = np.random.random(data1_shape).astype(np.float32) + + def compare_fn(x, y): + assert x.numpy().shape == y + + output0 = (2 * 3 * 4 * 5,) + output1 = (4 * 5 * 6 * 7,) + cases = [ + {"input": data0, "output": (output0,)}, + {"input": data1, "output": (output1,)}, + ] + opr_test(cases, F.flatten, compare_fn=compare_fn) + + output0 = (2, 3 * 4 * 5) + output1 = (4, 5 * 6 * 7) + cases = [ + {"input": data0, "output": (output0,)}, + {"input": data1, "output": (output1,)}, + ] + opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1) + + output0 = (2, 3, 4 * 5) + output1 = (4, 5, 6 * 7) + cases = [ + {"input": data0, "output": (output0,)}, + {"input": data1, "output": (output1,)}, + ] + opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=2) + + output0 = (2, 3 * 4, 5) + output1 = (4, 5 * 6, 7) + cases = [ + {"input": data0, "output": (output0,)}, + {"input": data1, "output": (output1,)}, + ] + opr_test(cases, F.flatten, compare_fn=compare_fn, start_axis=1, end_axis=2) + + +# def test_where(): +# maskv0 = np.array([[1, 0], [0, 1]], dtype=np.int32) +# xv0 = np.array([[1, np.inf], [np.nan, 4]], dtype=np.float32) +# yv0 = np.array([[5, 6], [7, 8]], dtype=np.float32) + +# maskv1 = np.array([[1, 0, 1], [1, 0, 0], [1, 1, 0]], dtype=np.int32) +# xv1 = np.array([[1, np.inf, 2], [0, np.nan, 4], [1, 5, 7]], dtype=np.float32) +# yv1 = np.array([[5, 6, 9], [2, 7, 8], [2, 1, 9]], dtype=np.float32) + +# cases = [ +# {"input": [maskv0, xv0, yv0]}, +# {"input": [maskv1, xv1, yv1]}, +# ] +# opr_test(cases, F.where, ref_fn=np.where) + +# maskv2 = np.array([1, 1, 1], dtype=np.int32) +# xv2 = np.array([1, 3, 2], dtype=np.float32) +# yv2 = np.array([5, 6, 9], dtype=np.float32) + +# maskv3 = np.array([0, 0, 0], dtype=np.int32) +# xv3 = np.array([1, 3, 2], dtype=np.float32) +# yv3 = np.array([5, 6, 9], dtype=np.float32) + +# cases = [ +# {"input": [maskv2, xv2, yv2]}, +# {"input": [maskv3, xv3, yv3]}, +# ] +# opr_test(cases, F.where, ref_fn=np.where) + + +def test_matmul(): + shape1 = 3 + shape2 = 3 + shape3 = (3, 5) + shape4 = (5, 6) + data1 = np.random.random(shape1).astype("float32") + data2 = np.random.random(shape2).astype("float32") + data3 = np.random.random(shape3).astype("float32") + data4 = 
np.random.random(shape4).astype("float32") + + cases = [ + {"input": [data1, data2]}, + {"input": [data2, data3]}, + {"input": [data3, data4]}, + ] + opr_test(cases, F.matmul, ref_fn=np.matmul) + + batch_size = 10 + shape1 = (batch_size, 2, 3) + shape2 = (batch_size, 3, 4) + shape3 = (batch_size, 10, 4, 5) + data1 = np.random.random(shape1).astype("float32") + data2 = np.random.random(shape2).astype("float32") + data3 = np.random.random(shape3).astype("float32") + + cases = [{"input": [data1, data2]}, {"input": [data2, data3]}] + for i in range(0, batch_size): + + def compare_fn(x, y): + x.numpy()[i, ...] == y + + opr_test( + cases, + F.matmul, + compare_fn=compare_fn, + ref_fn=lambda x, y: np.matmul(x[i, ...], y[i, ...]), + ) + + +def test_interpolate(): + def linear_interpolate(): + inp = tensor(np.arange(1, 3, dtype=np.float32).reshape(1, 1, 2)) + + out = F.interpolate(inp, scale_factor=2.0, mode="LINEAR") + out2 = F.interpolate(inp, 4, mode="LINEAR") + + assertTensorClose( + out.numpy(), np.array([[[1.0, 1.25, 1.75, 2.0]]], dtype=np.float32) + ) + assertTensorClose( + out2.numpy(), np.array([[[1.0, 1.25, 1.75, 2.0]]], dtype=np.float32) + ) + + def many_batch_interpolate(): + inp = tensor(np.arange(1, 9, dtype=np.float32).reshape(2, 1, 2, 2)) + + out = F.interpolate(inp, [4, 4]) + out2 = F.interpolate(inp, scale_factor=2.0) + + assertTensorClose(out.numpy(), out2.numpy()) + + def assign_corner_interpolate(): + inp = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) + + out = F.interpolate(inp, [4, 4], align_corners=True) + out2 = F.interpolate(inp, scale_factor=2.0, align_corners=True) + + assertTensorClose(out.numpy(), out2.numpy()) + + def error_shape_linear_interpolate(): + inp = tensor(np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)) + + with pytest.raises(ValueError): + F.interpolate(inp, scale_factor=2.0, mode="LINEAR") + + def inappropriate_scale_linear_interpolate(): + inp = tensor(np.arange(1, 3, dtype=np.float32).reshape(1, 1, 2)) + + with pytest.raises(ValueError): + F.interpolate(inp, scale_factor=[2.0, 3.0], mode="LINEAR") + + linear_interpolate() + many_batch_interpolate() + assign_corner_interpolate() + error_shape_linear_interpolate() + inappropriate_scale_linear_interpolate() + + +def _save_to(self, name="grad"): + def callback(tensor, grad): + setattr(self, name, grad) + + return callback + + +def _gen_roi_inp(): + inp_feat = np.random.randn(2, 32, 256, 256) + rois = np.zeros((4, 5)) + rois[:, 0] = [0, 0, 1, 1] + rois[:, 1:3] = np.random.rand(4, 2) * 100 + rois[:, 3:] = np.random.rand(4, 2) * 100 + 150 + + inp_feat = tensor(inp_feat) + rois = tensor(rois) + return inp_feat, rois + + +def test_roi_align(): + inp_feat, rois = _gen_roi_inp() + grad = Grad().wrt(inp_feat, callback=_save_to(inp_feat)) + + output_shape = (7, 7) + out_feat = F.roi_align( + inp_feat, + rois, + output_shape=output_shape, + mode="average", + spatial_scale=1.0 / 4, + sample_points=2, + aligned=True, + ) + assert out_feat.shape == (rois.shape[0], inp_feat.shape[1], *output_shape) + + grad(out_feat, tensor(F.ones_like(out_feat))) + assert inp_feat.grad.shape == inp_feat.shape + + +def test_roi_pooling(): + inp_feat, rois = _gen_roi_inp() + grad = Grad().wrt(inp_feat, callback=_save_to(inp_feat)) + output_shape = (7, 7) + out_feat = F.roi_pooling( + inp_feat, rois, output_shape=output_shape, mode="max", scale=1.0 / 4, + ) + assert out_feat.shape == (rois.shape[0], inp_feat.shape[1], *output_shape) + + grad(out_feat, tensor(F.ones_like(out_feat))) + assert inp_feat.grad.shape == 
inp_feat.shape + + +# def test_one_hot(): +# def onehot_low_dimension(): +# inp = tensor(np.arange(1, 4, dtype=np.int32)) +# out = F.one_hot(inp, num_classes=4) + +# assertTensorClose( +# out.numpy(), np.eye(4, dtype=np.int32)[np.arange(1, 4, dtype=np.int32)] +# ) + + +# def onehot_high_dimension(): +# arr = np.array( +# [[3, 2, 4, 4, 2, 4, 0, 4, 4, 1], [4, 1, 1, 3, 2, 2, 4, 2, 4, 3]], dtype=np.int32 +# ) + +# inp = tensor(arr) +# out = F.one_hot(inp, 10) + +# assertTensorClose(out.numpy(), np.eye(10, dtype=np.int32)[arr]) + +# onehot_low_dimension() +# onehot_high_dimension() + + +def test_add_update(): + shape = (2, 3) + v = np.random.random(shape).astype(np.float32) + b = Buffer(v) + + u = F.add_update(b, 1) + assertTensorClose(u.numpy(), v + 1) + u = F.add_update(b, 1) + assertTensorClose(u.numpy(), v + 2) + + x = np.ones((2, 2), dtype=np.float32) + y = x * 0.5 + dest = tensor(x) + delta = tensor(y) + r = F.add_update(dest, delta, alpha=0.9, beta=0.1, bias=0.1) + assertTensorClose(r.numpy(), x * 0.9 + y * 0.1 + 0.1) + + +def test_add_update_params(): + b = np.random.random((2, 3)).astype(np.float32) + y = Buffer(b) + + # @jit.trace + def f(x): + return F.add_update(y, x) + + f(np.zeros((2, 3)).astype(np.float32)) + + z = Buffer(np.zeros((2, 3)).astype(np.float32)) + F.add_update(y, z, beta=0.1) + + res = f(np.ones((2, 3)).astype(np.float32)) + assertTensorClose(res.numpy(), b + 1) + + +# def test_cross_entropy_with_softmax(): +# data1_shape = (1, 2) +# label1_shape = (1,) +# data2_shape = (1, 3) +# label2_shape = (1,) + +# data1 = np.array([1, 0.5], dtype=np.float32).reshape(data1_shape) +# label1 = np.array([1], dtype=np.int32).reshape(label1_shape) +# expect1 = F.cross_entropy(F.softmax(tensor(data1)), tensor(label1)).numpy() + +# data2 = np.array([0.3, 0.4, 0.3], dtype=np.float32).reshape(data2_shape) +# label2 = np.array([1], dtype=np.int32).reshape(label2_shape) +# expect2 = F.cross_entropy(F.softmax(tensor(data2)), tensor(label2)).numpy() + +# cases = [ +# {"input": [data1, label1], "output": expect1,}, +# {"input": [data2, label2], "output": expect2,}, +# ] +# opr_test(cases, F.cross_entropy_with_softmax) + + +# def test_cross_entropy(): +# data1_shape = (1, 2) +# label1_shape = (1,) +# data2_shape = (1, 3) +# label2_shape = (1,) + +# data1 = np.array([0.5, 0.5], dtype=np.float32).reshape(data1_shape) +# label1 = np.array([1], dtype=np.int32).reshape(label1_shape) +# expect1 = np.array([-np.log(0.5)], dtype=np.float32) + +# data2 = np.array([0.3, 0.4, 0.3], dtype=np.float32).reshape(data2_shape) +# label2 = np.array([1], dtype=np.int32).reshape(label2_shape) +# expect2 = np.array([-np.log(0.4)], dtype=np.float32) + +# cases = [ +# {"input": [data1, label1], "output": expect1,}, +# {"input": [data2, label2], "output": expect2,}, +# ] +# opr_test(cases, F.cross_entropy) + + +def test_binary_cross_entropy(): + data1_shape = (2, 2) + label1_shape = (2, 2) + data2_shape = (2, 3) + label2_shape = (2, 3) + + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + + def compare_fn(x, y): + assertTensorClose(x.numpy(), y, max_err=5e-4) + + np.random.seed(123) + data1 = sigmoid(np.random.uniform(size=data1_shape).astype(np.float32)) + label1 = np.random.uniform(size=label1_shape).astype(np.float32) + expect1 = np.array([0.6361], dtype=np.float32) + + np.random.seed(123) + data2 = sigmoid(np.random.uniform(size=data2_shape).astype(np.float32)) + label2 = np.random.uniform(size=label2_shape).astype(np.float32) + expect2 = np.array([0.6750], dtype=np.float32) + + cases = [ + {"input": [data1, 
label1], "output": expect1,}, + {"input": [data2, label2], "output": expect2,}, + ] + opr_test(cases, F.binary_cross_entropy, compare_fn=compare_fn) + + +def test_hinge_loss(): + np.random.seed(123) + # case with L1 norm + cases = [] + for shape in [(2, 2), (2, 3)]: + data = np.random.uniform(size=shape).astype(np.float32) + label = 2 * np.random.randint(0, 1, size=shape).astype(np.float32) - 1 + expect = np.clip(0, np.inf, 1 - data * label).sum(axis=1).mean() + cases.append({"input": [data, label], "output": expect}) + + opr_test(cases, F.hinge_loss) + + # cases with L2 norm + cases = [] + for shape in [(2, 2), (2, 3)]: + data = np.random.uniform(size=shape).astype(np.float32) + label = 2 * np.random.randint(0, 1, size=shape).astype(np.float32) - 1 + expect = ((np.clip(0, np.inf, 1 - data * label) ** 2).sum(axis=1)).mean() + cases.append({"input": [data, label], "output": expect}) + + def hinge_loss_with_l2_norm(pred, label): + return F.hinge_loss(pred, label, "L2") + + opr_test(cases, hinge_loss_with_l2_norm) + + +def test_nms(): + x = np.array( + [ + [0, 0, 100, 100], + [10, 10, 100, 100], + [50, 50, 100, 100], + [100, 100, 150, 150], + ], + dtype=np.float32, + ) + inp = tensor(x) + scores = tensor([0.5, 0.8, 0.9, 0.6], dtype=np.float32) + result = F.nms(inp, iou_thresh=0.5, scores=scores) + np.testing.assert_equal(result.numpy(), np.array([2, 1, 3], dtype=np.int32)) + + +def test_batched_nms(): + x = np.array( + [ + [0, 0, 100, 100], + [0.5, 0.5, 1.5, 1.5], + [20, 20, 100, 100], + [0.5, 0.5, 1.0, 1.0], + [10, 10, 100, 100], + [0.5, 0.5, 1.0, 1.0], + ], + dtype=np.float32, + ) + inp = tensor(x) + scores = tensor([0.6, 0.9, 0.5, 0.6, 0.8, 0.7], dtype=np.float32) + idxs = tensor([0, 1, 0, 1, 0, 1], dtype=np.int32) + results = F.batched_nms(inp, iou_thresh=0.5, idxs=idxs, scores=scores) + np.testing.assert_equal(results.numpy(), np.array([1, 4, 5], dtype=np.int32)) + + +# def test_smooth_l1_loss(): +# np.random.seed(123) +# cases = [] +# for shape in [(2, 2), (2, 3)]: +# data = np.random.uniform(size=shape).astype(np.float32) +# label = np.random.uniform(size=shape).astype(np.float32) +# diff = np.abs(data - label) +# expect = np.where(diff < 1, 0.5 * diff ** 2, diff - 0.5).mean() +# cases.append({"input": [data, label], "output": tensor(expect)}) + +# opr_test(cases, F.smooth_l1_loss) + + +def test_conv_bias(): + inp_scale = 1.5 + w_scale = 2.5 + outp_scale = 1.5 + inp_dtype = dtype.qint8(inp_scale) + w_dtype = dtype.qint8(w_scale) + b_dtype = dtype.qint32(inp_scale * w_scale) + out_dtype = dtype.qint8(outp_scale) + + def run( + N, + IC, + OC, + IH, + IW, + KH, + KW, + PH, + PW, + SH, + SW, + has_bias=True, + nonlinear_mode="IDENTITY", + ): + inp_v = np.random.normal(size=(N, IC, IH, IW)) + w_v = np.random.normal(size=(OC, IC, KW, KW)) + b_v = np.random.normal(size=(1, OC, 1, 1)) + inp_scale = dtype.get_scale(inp_dtype) + w_scale = dtype.get_scale(w_dtype) + b_scale = dtype.get_scale(b_dtype) + + inpv = dtype.convert_to_qint8(inp_v * inp_scale, inp_dtype) + wv = dtype.convert_to_qint8(w_v * w_scale, w_dtype) + bv = dtype.convert_to_qint32(b_v * b_scale, b_dtype) + + inp_int8 = tensor(inpv, dtype=inp_dtype) + w_int8 = Parameter(wv, dtype=w_dtype) + b_int32 = Parameter(bv, dtype=b_dtype) + + inp_fp32 = inp_int8.astype("float32") + w_fp32 = w_int8.astype("float32") + b_fp32 = b_int32.astype("float32") + + def convert_to_nchw4(var): + var = F.reshape( + var, (var.shape[0], var.shape[1] // 4, 4, var.shape[2], var.shape[3]) + ) + var = F.dimshuffle(var, (0, 1, 3, 4, 2)) + return var + + def 
run_conv2d(inp, w, b): + O = F.conv2d( + inp, w, b if has_bias else None, stride=(SH, SW), padding=(PH, PW), + ) + if nonlinear_mode == "RELU": + return F.relu(O) + else: + return O + + def run_conv_bias(inp, w, b, format="NCHW"): + b = b if has_bias else Parameter(np.zeros_like(b.numpy())) + if format == "NCHW4": + inp = convert_to_nchw4(inp) + w = convert_to_nchw4(w) + b = convert_to_nchw4(b) + return F.conv_bias_activation( + inp, + w, + b, + stride=(SH, SW), + padding=(PH, PW), + format=format, + dtype=out_dtype, + nonlinear_mode=nonlinear_mode, + ) + + format = "NCHW4" if is_cuda_available() else "NCHW" + + expected = run_conv2d(inp_fp32, w_fp32, b_fp32) + expected = expected.astype(out_dtype).astype("float32") + result = run_conv_bias(inp_int8, w_int8, b_int32, format=format).astype( + "float32" + ) + if format == "NCHW4": + result = F.dimshuffle(result, (0, 1, 4, 2, 3)) + expected = F.flatten(expected) + result = F.flatten(result) + assertTensorClose(result.numpy(), expected.numpy(), max_err=outp_scale) + + run(1, 4, 4, 24, 33, 1, 1, 2, 3, 1, 1, False) + run(10, 12, 24, 46, 46, 1, 1, 2, 1, 3, 1, False) + run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, False) + + run(1, 4, 4, 24, 33, 1, 1, 2, 3, 1, 1) + run(10, 12, 24, 46, 46, 1, 1, 2, 1, 3, 1) + run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2) + + run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, False, "RELU") + run(10, 36, 8, 46, 26, 2, 2, 2, 1, 1, 2, True, "RELU") + + +# def test_softplus(): +# x = np.arange(1000).astype(np.float32) +# out = F.softplus(tensor(x)) +# mask = x <= 20 +# with np.errstate(over="ignore"): +# expected = np.where(mask, np.log(1 + np.exp(x)), x) +# assertTensorClose(out, expected) +# beta = 2 +# out = F.softplus(tensor(x), beta=beta, threshold=30) +# mask = beta * x <= 30 +# # ignore overflow +# with np.errstate(over="ignore"): +# expected = np.where(mask, np.log(1 + np.exp(x * beta)) / beta, x) +# assertTensorClose(out, expected) + + +def test_condtake(): + x = np.array([[1, 2, 3], [4, 5, 6]]) + y = np.array([[True, False, True], [False, True, True]]) + xx = tensor(x) + yy = tensor(y) + val, idx = F.cond_take(yy, xx) + np.testing.assert_equal(val.numpy(), x[y]) + np.testing.assert_equal(idx.numpy(), np.where(y.reshape(-1))[0]) diff --git a/imperative/python/test/unit/functional/test_math.py b/imperative/python/test/unit/functional/test_math.py new file mode 100644 index 0000000000000000000000000000000000000000..d693f36b9d1e508bba8328c9ef7f51890ff9272e --- /dev/null +++ b/imperative/python/test/unit/functional/test_math.py @@ -0,0 +1,258 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from functools import partial + +import numpy as np + +import megengine.functional as F +from megengine import tensor +from megengine.test import assertTensorClose + +# from helpers import opr_test + + +def _default_compare_fn(x, y): + assertTensorClose(x.numpy(), y) + + +def opr_test(cases, func, compare_fn=_default_compare_fn, ref_fn=None, **kwargs): + """ + func: the function to run opr. + compare_fn: the function to compare the result and expected, use assertTensorClose if None. + ref_fn: the function to generate expected data, should assign output if None. 
+ cases: the list which have dict element, the list length should be 2 for dynamic shape test. + and the dict should have input, + and should have output if ref_fn is None. + should use list for multiple inputs and outputs for each case. + kwargs: The additional kwargs for opr func. + + simple examples: + + dtype = np.float32 + cases = [{"input": [10, 20]}, {"input": [20, 30]}] + opr_test(cases, + F.eye, + ref_fn=lambda n, m: np.eye(n, m).astype(dtype), + dtype=dtype) + + """ + + def check_results(results, expected): + if not isinstance(results, tuple): + results = (results,) + for r, e in zip(results, expected): + compare_fn(r, e) + + def get_param(cases, idx): + case = cases[idx] + inp = case.get("input", None) + outp = case.get("output", None) + if inp is None: + raise ValueError("the test case should have input") + if not isinstance(inp, list): + inp = (inp,) + else: + inp = tuple(inp) + if ref_fn is not None and callable(ref_fn): + outp = ref_fn(*inp) + if outp is None: + raise ValueError("the test case should have output or reference function") + if not isinstance(outp, list): + outp = (outp,) + else: + outp = tuple(outp) + + return inp, outp + + if len(cases) == 0: + raise ValueError("should give one case at least") + + if not callable(func): + raise ValueError("the input func should be callable") + + inp, outp = get_param(cases, 0) + inp_tensor = [tensor(inpi) for inpi in inp] + + results = func(*inp_tensor, **kwargs) + check_results(results, outp) + + +def common_test_reduce(opr, ref_opr): + data1_shape = (5, 6, 7) + data2_shape = (2, 9, 12) + data1 = np.random.random(data1_shape).astype(np.float32) + data2 = np.random.random(data2_shape).astype(np.float32) + cases = [{"input": data1}, {"input": data2}] + + if opr not in (F.argmin, F.argmax): + # test default axis + opr_test(cases, opr, ref_fn=ref_opr) + # test all axises in range of input shape + for axis in range(-3, 3): + # test keepdims False + opr_test(cases, opr, ref_fn=lambda x: ref_opr(x, axis=axis), axis=axis) + # test keepdims True + opr_test( + cases, + opr, + ref_fn=lambda x: ref_opr(x, axis=axis, keepdims=True), + axis=axis, + keepdims=True, + ) + else: + # test defaut axis + opr_test(cases, opr, ref_fn=lambda x: ref_opr(x).astype(np.int32)) + # test all axises in range of input shape + for axis in range(0, 3): + opr_test( + cases, + opr, + ref_fn=lambda x: ref_opr(x, axis=axis).astype(np.int32), + axis=axis, + ) + + +def test_sum(): + common_test_reduce(opr=F.sum, ref_opr=np.sum) + + +def test_prod(): + common_test_reduce(opr=F.prod, ref_opr=np.prod) + + +def test_mean(): + common_test_reduce(opr=F.mean, ref_opr=np.mean) + + +def test_var(): + common_test_reduce(opr=F.var, ref_opr=np.var) + + +def test_std(): + common_test_reduce(opr=F.std, ref_opr=np.std) + + +def test_min(): + common_test_reduce(opr=F.min, ref_opr=np.min) + + +def test_max(): + common_test_reduce(opr=F.max, ref_opr=np.max) + + +def test_argmin(): + common_test_reduce(opr=F.argmin, ref_opr=np.argmin) + + +def test_argmax(): + common_test_reduce(opr=F.argmax, ref_opr=np.argmax) + + +def test_sqrt(): + d1_shape = (15,) + d2_shape = (25,) + d1 = np.random.random(d1_shape).astype(np.float32) + d2 = np.random.random(d2_shape).astype(np.float32) + + cases = [{"input": d1}, {"input": d2}] + opr_test(cases, F.sqrt, ref_fn=np.sqrt) + + +def test_sort(): + data1_shape = (10, 3) + data2_shape = (12, 2) + data1 = np.random.random(data1_shape).astype(np.float32) + data2 = np.random.random(data2_shape).astype(np.float32) + output0 = [np.sort(data1), 
np.argsort(data1).astype(np.int32)] + output1 = [np.sort(data2), np.argsort(data2).astype(np.int32)] + + cases = [ + {"input": data1, "output": output0}, + {"input": data2, "output": output1}, + ] + opr_test(cases, F.sort) + + +def test_normalize(): + + cases = [ + {"input": np.random.random((2, 3, 12, 12)).astype(np.float32)} for i in range(2) + ] + + def np_normalize(x, p=2, axis=None, eps=1e-12): + if axis is None: + norm = np.sum(x ** p) ** (1.0 / p) + else: + norm = np.sum(x ** p, axis=axis, keepdims=True) ** (1.0 / p) + return x / np.clip(norm, a_min=eps, a_max=np.inf) + + # Test L-2 norm along all dimensions + opr_test(cases, F.normalize, ref_fn=np_normalize) + + # Test L-1 norm along all dimensions + opr_test(cases, partial(F.normalize, p=1), ref_fn=partial(np_normalize, p=1)) + + # Test L-2 norm along the second dimension + opr_test(cases, partial(F.normalize, axis=1), ref_fn=partial(np_normalize, axis=1)) + + # Test some norm == 0 + cases[0]["input"][0, 0, 0, :] = 0 + cases[1]["input"][0, 0, 0, :] = 0 + opr_test(cases, partial(F.normalize, axis=3), ref_fn=partial(np_normalize, axis=3)) + + +# def test_logsumexp(): +# x = np.arange(10).astype(np.float32) +# expected = np.log(np.sum(np.exp(x))) +# cases = [{"input": x, "output": expected}] +# compare_fn = partial(assertTensorClose, allow_special_values=True) +# # large value check +# n = 100 +# x = np.full(n, 10000, dtype=np.float32) +# expected = 10000 + np.log(n) +# cases.append({"input": x, "output": expected.astype(np.float32)}) +# opr_test(cases, F.logsumexp, axis=0, compare_fn=compare_fn) + +# # special value check +# x = np.array([np.inf], dtype=np.float32) +# expected = x +# cases = [{"input": x, "output": expected}] + +# x = np.array([-np.inf, 0.0], dtype=np.float32) +# expected = np.zeros(1).astype(np.float32) +# cases.append({"input": x, "output": expected}) +# opr_test(cases, F.logsumexp, axis=0, compare_fn=compare_fn) + +# x = np.array([np.nan], dtype=np.float32) +# expected = x +# cases = [{"input": x, "output": expected}] + +# x = np.array([-np.inf, 1], dtype=np.float32) +# expected = np.array([1.0], dtype=np.float32) +# cases.append({"input": x, "output": expected}) + +# opr_test(cases, F.logsumexp, axis=0, compare_fn=compare_fn) + +# # keepdims check +# x = np.array([[1e10, 1e-10], [-1e10, -np.inf]], dtype=np.float32) +# expected = np.array([[1e10], [-1e10]], dtype=np.float32) +# cases = [{"input": x, "output": expected}] +# x = np.array([[1e10, -1e-10, 1e-10], [1e10, 1e-10, np.inf]], dtype=np.float32) +# expected = np.array([[1e10], [np.inf]], dtype=np.float32) +# cases.append({"input": x, "output": expected}) +# opr_test(cases, F.logsumexp, axis=1, keepdims=True, compare_fn=compare_fn) + +# # multiple axes check +# x = np.array([[1e10, 1e-10], [-1e10, -np.inf]], dtype=np.float32) +# expected = np.array([1e10], dtype=np.float32) +# cases = [{"input": x, "output": expected}] +# x = np.array([[1e10, -1e-10, 1e-10], [1e10, 1e-10, np.inf]], dtype=np.float32) +# expected = np.array([np.inf], dtype=np.float32) +# cases.append({"input": x, "output": expected}) +# opr_test(cases, F.logsumexp, axis=(0, 1), keepdims=False, compare_fn=compare_fn) diff --git a/imperative/python/test/unit/functional/test_tensor.py b/imperative/python/test/unit/functional/test_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..018871a20706782d37001c94d39053ad45293611 --- /dev/null +++ b/imperative/python/test/unit/functional/test_tensor.py @@ -0,0 +1,313 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the 
Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import numpy as np
+import pytest
+
+import megengine.functional as F
+from megengine import Buffer, Parameter, is_cuda_available, tensor
+from megengine.core.tensor.utils import astensor1d
+from megengine.test import assertTensorClose
+
+
+def _default_compare_fn(x, y):
+    assertTensorClose(x.numpy(), y)
+
+
+def opr_test(cases, func, compare_fn=_default_compare_fn, ref_fn=None, **kwargs):
+    """
+    func: the function to run opr.
+    compare_fn: the function to compare the result and expected; defaults to assertTensorClose.
+    ref_fn: the function to generate expected data; each case must carry "output" if it is None.
+    cases: a list of dicts; the list should hold two cases for the dynamic shape test.
+           Each dict must have "input",
+           and must also have "output" if ref_fn is None.
+           Use lists for multiple inputs and outputs in a single case.
+    kwargs: the additional kwargs for the opr func.
+
+    simple examples:
+
+        dtype = np.float32
+        cases = [{"input": [10, 20]}, {"input": [20, 30]}]
+        opr_test(cases,
+                 F.eye,
+                 ref_fn=lambda n, m: np.eye(n, m).astype(dtype),
+                 dtype=dtype)
+
+    """
+
+    def check_results(results, expected):
+        if not isinstance(results, tuple):
+            results = (results,)
+        for r, e in zip(results, expected):
+            compare_fn(r, e)
+
+    def get_param(cases, idx):
+        case = cases[idx]
+        inp = case.get("input", None)
+        outp = case.get("output", None)
+        if inp is None:
+            raise ValueError("the test case should have input")
+        if not isinstance(inp, list):
+            inp = (inp,)
+        else:
+            inp = tuple(inp)
+        if ref_fn is not None and callable(ref_fn):
+            outp = ref_fn(*inp)
+        if outp is None:
+            raise ValueError("the test case should have output or reference function")
+        if not isinstance(outp, list):
+            outp = (outp,)
+        else:
+            outp = tuple(outp)
+
+        return inp, outp
+
+    if len(cases) == 0:
+        raise ValueError("should give at least one case")
+
+    if not callable(func):
+        raise ValueError("the input func should be callable")
+
+    inp, outp = get_param(cases, 0)
+    inp_tensor = [tensor(inpi) for inpi in inp]
+
+    results = func(*inp_tensor, **kwargs)
+    check_results(results, outp)
+
+
+def test_eye():
+    dtype = np.float32
+    cases = [{"input": [10, 20]}, {"input": [20, 30]}]
+    for case in cases:
+        assertTensorClose(
+            F.eye(case["input"], dtype=dtype).numpy(),
+            np.eye(*case["input"]).astype(dtype),
+        )
+
+
+def test_concat():
+    def get_data_shape(length: int):
+        return (length, 2, 3)
+
+    data1 = np.random.random(get_data_shape(5)).astype("float32")
+    data2 = np.random.random(get_data_shape(6)).astype("float32")
+    data3 = np.random.random(get_data_shape(7)).astype("float32")
+
+    def run(data1, data2):
+        return F.concat([data1, data2])
+
+    cases = [{"input": [data1, data2]}, {"input": [data1, data3]}]
+    opr_test(cases, run, ref_fn=lambda x, y: np.concatenate([x, y]))
+
+
+def test_stack():
+    data1 = np.random.random((3, 2, 2)).astype("float32")
+    data2 = np.random.random((3, 2, 2)).astype("float32")
+    data3 = np.random.random((3, 2, 2)).astype("float32")
+
+    cases = [{"input": [data1, data2]}, {"input": [data1, data3]}]
+    for ai in range(3):
+
+        def run(data1, data2):
+            return F.stack([data1, data2], axis=ai)
+
+        opr_test(cases, run, ref_fn=lambda x, y: np.stack([x, y], axis=ai))
+
+
+def test_split():
+ data = np.random.random((2, 3, 4, 5)).astype(np.float32) + mge_out1 = F.split(tensor(data), 2, axis=3) + mge_out2 = F.split(tensor(data), [3, 5], axis=3) + + np_out = np.split(data, [3, 5], axis=3) + + np.testing.assert_equal(mge_out1[0].numpy(), mge_out2[0].numpy()) + np.testing.assert_equal(mge_out1[0].numpy(), np_out[0]) + + +def test_reshape(): + x = np.arange(6, dtype="float32") + xx = tensor(x) + y = x.reshape(1, 2, 3) + + for shape in [ + (1, 2, 3), + (1, -1, 3), + (1, tensor(-1), 3), + np.array([1, -1, 3], dtype="int32"), + tensor([1, -1, 3]), + ]: + yy = F.reshape(xx, shape) + np.testing.assert_equal(yy.numpy(), y) + + +def test_squeeze(): + x = np.arange(6, dtype="float32").reshape(1, 2, 3, 1) + xx = tensor(x) + + for axis in [None, 3, -4, (3, -4)]: + y = np.squeeze(x, axis) + yy = F.squeeze(xx, axis) + np.testing.assert_equal(y, yy.numpy()) + + +def test_expand_dims(): + x = np.arange(6, dtype="float32").reshape(2, 3) + xx = tensor(x) + + for axis in [2, -3, (3, -4), (1, -4)]: + y = np.expand_dims(x, axis) + yy = F.expand_dims(xx, axis) + np.testing.assert_equal(y, yy.numpy()) + + +def test_elemwise_dtype_promotion(): + x = np.random.rand(2, 3).astype("float32") + y = np.random.rand(1, 3).astype("float16") + xx = tensor(x) + yy = tensor(y) + z = xx * yy + np.testing.assert_equal(z.numpy(), x * y) + + z = xx + y + np.testing.assert_equal(z.numpy(), x + y) + + z = x - yy + np.testing.assert_equal(z.numpy(), x - y) + + +def test_linspace(): + cases = [ + {"input": [1, 9, 9]}, + {"input": [3, 10, 8]}, + ] + opr_test( + cases, + F.linspace, + ref_fn=lambda start, end, step: np.linspace(start, end, step, dtype=np.float32), + ) + + cases = [ + {"input": [9, 1, 9]}, + {"input": [10, 3, 8]}, + ] + opr_test( + cases, + F.linspace, + ref_fn=lambda start, end, step: np.linspace(start, end, step, dtype=np.float32), + ) + + +def test_arange(): + cases = [ + {"input": [1, 9, 1]}, + {"input": [2, 10, 2]}, + ] + opr_test( + cases, + F.arange, + ref_fn=lambda start, end, step: np.arange(start, end, step, dtype=np.float32), + ) + + cases = [ + {"input": [9, 1, -1]}, + {"input": [10, 2, -2]}, + ] + opr_test( + cases, + F.arange, + ref_fn=lambda start, end, step: np.arange(start, end, step, dtype=np.float32), + ) + + cases = [ + {"input": [9.3, 1.2, -0.5]}, + {"input": [10.3, 2.1, -1.7]}, + ] + opr_test( + cases, + F.arange, + ref_fn=lambda start, end, step: np.arange(start, end, step, dtype=np.float32), + ) + + +def test_round(): + data1_shape = (15,) + data2_shape = (25,) + data1 = np.random.random(data1_shape).astype(np.float32) + data2 = np.random.random(data2_shape).astype(np.float32) + + cases = [{"input": data1}, {"input": data2}] + opr_test(cases, F.round, ref_fn=np.round) + + +def test_broadcast(): + input1_shape = (20, 30) + output1_shape = (30, 20, 30) + data1 = np.random.random(input1_shape).astype(np.float32) + + input2_shape = (10, 20) + output2_shape = (20, 10, 20) + data2 = np.random.random(input2_shape).astype(np.float32) + + def compare_fn(x, y): + assert x.numpy().shape == y + + cases = [ + {"input": [data1, output1_shape], "output": output1_shape}, + {"input": [data2, output2_shape], "output": output2_shape}, + ] + opr_test(cases, F.broadcast, compare_fn=compare_fn) + + +def test_utils_astensor1d(): + reference = tensor(0) + + # literal + x = [1, 2, 3] + for dtype in [None, "float32"]: + xx = astensor1d(x, reference, dtype=dtype) + assert type(xx) is tensor + np.testing.assert_equal(xx.numpy(), x) + + # numpy array + x = np.asarray([1, 2, 3], dtype="int32") + for dtype in 
[None, "float32"]: + xx = astensor1d(x, reference, dtype=dtype) + assert type(xx) is tensor + np.testing.assert_equal(xx.numpy(), x.astype(dtype) if dtype else x) + + # tensor + x = tensor([1, 2, 3], dtype="int32") + for dtype in [None, "float32"]: + xx = astensor1d(x, reference, dtype=dtype) + assert type(xx) is tensor + np.testing.assert_equal(xx.numpy(), x.numpy()) + + # mixed + x = [1, tensor(2), 3] + for dtype in [None, "float32"]: + xx = astensor1d(x, reference, dtype=dtype) + assert type(xx) is tensor + np.testing.assert_equal(xx.numpy(), [1, 2, 3]) + + +def test_device(): + x = tensor([1, 2, 3], dtype="float32") + + y1 = F.eye(x.shape, dtype="float32") + y2 = F.eye(x.shape, dtype="float32", device=None) + np.testing.assert_almost_equal(y1.numpy(), y2.numpy()) + + y3 = F.eye(x.shape, dtype="float32", device="xpux") + y4 = F.eye(x.shape, dtype="float32", device=x.device.to_c()) + np.testing.assert_almost_equal(y3.numpy(), y4.numpy()) + + y5 = F.full((3, 2), 4, device=x.device) + y6 = F.full((3, 2), 4, device="xpux") + np.testing.assert_almost_equal(y5.numpy(), y6.numpy()) diff --git a/imperative/python/test/unit/quantization/quantize.py b/imperative/python/test/unit/quantization/quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..236ef9e137e5c95da76855791c8759c450a75b67 --- /dev/null +++ b/imperative/python/test/unit/quantization/quantize.py @@ -0,0 +1,80 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from megengine import module as Float +from megengine.module import qat as QAT +from megengine.quantization.quantize import _get_quantable_module_names, quantize_qat + + +def test_get_quantable_module_names(): + # need to make sure names from Quantized and QAT are the same + def _get_qat_module_names(): + def is_qat(key: str): + value = getattr(QAT, key) + return ( + isinstance(value, type) + and issubclass(value, QAT.QATModule) + and value != QAT.QATModule + ) + + # source should have all quantable modules' names + quantable_module_names = [key for key in dir(QAT) if is_qat(key)] + return quantable_module_names + + qat_module_names = _get_qat_module_names() + quantized_module_names = _get_quantable_module_names() + assert set(qat_module_names) == set(quantized_module_names) + + for key in qat_module_names: + value = getattr(Float, key) + assert ( + isinstance(value, type) + and issubclass(value, Float.Module) + and value != Float.Module + ) + + +def test_disable_quantize(): + class Net(Float.Module): + def __init__(self): + super().__init__() + self.conv = Float.ConvBnRelu2d(3, 3, 3) + self.conv.disable_quantize() + + def forward(self, x): + return self.conv(x) + + net = Net() + qat_net = quantize_qat(net, inplace=False) + assert isinstance(qat_net.conv, Float.ConvBnRelu2d) + assert isinstance(qat_net.conv.conv, Float.Conv2d) + + +def test_convert_with_custom_mapping(): + class FloatExample(Float.Module): + def forward(self, x): + return x + + class QATExample(QAT.QATModule): + def forward(self, x): + return x + + @classmethod + def from_float_module(cls, float_module): + return cls() + + class Net(Float.Module): + def __init__(self): + super().__init__() + self.example = FloatExample() + + def forward(self, x): + return self.example(x) + + net = 
Net() + qat_net = quantize_qat(net, inplace=False, mapping={FloatExample: QATExample}) + assert isinstance(qat_net.example, QATExample) diff --git a/imperative/python/test/unit/quantization/test_fake_quant.py b/imperative/python/test/unit/quantization/test_fake_quant.py new file mode 100644 index 0000000000000000000000000000000000000000..ff999b75828b2eb8dbc3d4ae41779c8a198b457f --- /dev/null +++ b/imperative/python/test/unit/quantization/test_fake_quant.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np +import pytest + +import megengine as mge +from megengine import tensor +from megengine.quantization.fake_quant import TQT_Function +from megengine.quantization.internal_fake_quant import * +from megengine.test import assertTensorClose + + +class numpy_TQT_Function: + def __init__(self, lowerbound, upperbound): + super().__init__() + self.lowerbound = lowerbound + self.upperbound = upperbound + + def forward(self, inp, scale): + t = 2 ** scale + # t = F.maximum(t, 1e-4) + inp_scaled = inp / t + inp_clipped = np.maximum( + np.minimum(inp_scaled, self.upperbound), self.lowerbound + ) + inp_rounded = np.round(inp_clipped) + inp_flq = inp_rounded * t + self.saved_tensors = (inp_scaled, inp_rounded, t) + return inp_flq + + def backward(self, grad_inp_flq): + (inp_scaled, inp_rounded, t) = self.saved_tensors + mask_clip = (inp_scaled < -0.5 + self.lowerbound) + ( + inp_scaled > self.upperbound + 0.5 + ) # mask for accumulating the gradients of |data_scaled|>L + mask_quant = np.abs( + mask_clip - 1 + ) # mask for accumulating the gradients with |data_scaled|<=L + grad_quant = ( + grad_inp_flq * mask_quant * (inp_rounded - inp_scaled) + ) # gradient within |data_scaled|<=L + grad_clip = ( + grad_inp_flq * mask_clip * inp_rounded + ) # gradient with | data_scaled|>L + grad_s = grad_clip.sum() + grad_quant.sum() + # dL/ds = dL/dt * t * ln(2) + grad_s = grad_s * t * np.log(2) + grad_inp = grad_inp_flq * mask_quant + return grad_inp, grad_s + + +def test_TQT(): + f = TQT_Function(-127, 127) + nf = numpy_TQT_Function(-127, 127) + + def check_inp(a, b, c, a_np, b_np, c_np): + assertTensorClose( + f.forward(a, b).numpy(), nf.forward(a_np, b_np).astype("float32") + ) + c1, c2 = f.backward(c) + c1_np, c2_np = nf.backward(c_np) + assertTensorClose(c1.numpy(), c1_np.astype("float32")) + assertTensorClose(c2.numpy(), c2_np.astype("float32")) + + a_np = np.random.random((4, 3)).astype("float32") + b_np = np.random.random((1)).astype("float32") + a = tensor(a_np) + b = tensor(b_np) + check_inp(a, b, b, a_np, b_np, b_np) + + diff --git a/imperative/python/test/unit/test_autodiff.py b/imperative/python/test/unit/test_autodiff.py new file mode 100644 index 0000000000000000000000000000000000000000..929e967cae28069bc287dcaae5159160d4b61d2d --- /dev/null +++ b/imperative/python/test/unit/test_autodiff.py @@ -0,0 +1,227 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. 
+# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import weakref + +import numpy as np +import pytest + +import megengine as mge +import megengine.distributed as dist +from megengine.core._imperative_rt import TensorAttr, imperative +from megengine.core._imperative_rt.imperative import sync +from megengine.core.autodiff.grad import Grad +from megengine.core.ops.builtin import Elemwise +from megengine.core.tensor.raw_tensor import as_raw_tensor +from megengine.core.tensor.tensor import Tensor, apply +from megengine.core.tensor.tensor_wrapper import TensorWrapper +from megengine.functional.distributed import remote_recv, remote_send + + +def _elwise(mode): + op = Elemwise(mode=mode) + + def f(*args): + (result,) = apply(op, *args) + return result + + return f + + +add = _elwise("add") +mul = _elwise("mul") +cos = _elwise("cos") +relu = _elwise("relu") + + +def as_tensor(x): + return Tensor(as_raw_tensor(x, device=mge.device.get_default_device())) + + +def save_to(self, name="grad"): + def callback(tensor, grad): + setattr(self, name, grad) + + return callback + + +@pytest.mark.isolated_distributed +def test_dist_grad(): + world_size = 2 + x_np = np.random.rand(10).astype("float32") + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker0(): + dist.init_process_group("localhost", port, world_size, 0, 0) + mge.device.set_default_device("gpu0") + grad = Grad() + + x = as_tensor(x_np) + grad.wrt(x, callback=save_to(x)) + # need a placeholder to trace operator + send_x = remote_send(x, 1) + recv_x = remote_recv(1, x_np.shape, x_np.dtype, "gpu0") + y = recv_x * recv_x + + grad([y], [as_tensor(np.ones_like(x_np))]) + np.testing.assert_almost_equal(x.grad.numpy(), x.numpy() * 2) + + def worker1(): + dist.init_process_group("localhost", port, world_size, 1, 1) + mge.device.set_default_device("gpu1") + grad = Grad() + + recv_x = remote_recv(0, x_np.shape, x_np.dtype, "gpu1") + send_x = remote_send(recv_x, 0) + + grad([], []) + + # sync because grad has a send operator + sync() + send_x.device._cn._sync_all() + + import multiprocessing as mp + + p0 = mp.Process(target=worker0) + p1 = mp.Process(target=worker1) + p0.start() + p1.start() + p0.join(10) + p1.join(10) + assert p0.exitcode == 0 and p1.exitcode == 0 + + +def test_grad(): + x_np = np.random.rand(10).astype("float32") + x = as_tensor(x_np) + + grad = Grad().wrt(x, callback=save_to(x)) + + y = cos(x) + + grad(y, as_tensor(np.ones_like(x_np))) + np.testing.assert_almost_equal(x.grad.numpy(), -np.sin(x_np)) + + +def test_grad_2(): + x_np = np.random.rand(10).astype("float32") + x = as_tensor(x_np) + + grad = Grad().wrt(x, callback=save_to(x)) + + y = mul(x, x) + y = mul(y, y) + + grad(y, as_tensor(np.ones_like(x_np))) + np.testing.assert_almost_equal(x.grad.numpy(), 4 * x_np ** 3, decimal=6) + + +@pytest.mark.skip(reason="high order gradient was not implemented yet") +def test_2nd_grad(): + x_np = np.random.rand(10).astype("float32") + x = as_tensor(x_np) + ones = as_tensor(np.ones_like(x_np)) + + grad = Grad().wrt(x, callback=save_to(x)) + grad2 = Grad().wrt(x, callback=save_to(x)) + + y = cos(x) + + grad(y, ones) + np.testing.assert_almost_equal(x.grad.numpy(), -np.sin(x_np), decimal=5) + + grad2(x.grad, ones) + np.testing.assert_almost_equal(x.grad.numpy(), -np.cos(x_np)) + + +def test_grad_with_tensor_wrapper(): + x_np = 
np.random.rand(10).astype("float32") + x = TensorWrapper(x_np) + + grad = Grad().wrt(x, callback=save_to(x)) + + y = mul(x, x) + y = mul(y, y) + + grad(y, TensorWrapper(np.ones_like(x_np))) + np.testing.assert_almost_equal(x.grad.numpy(), 4 * x_np ** 3, decimal=6) + + +def test_grad_inplace(): + x_np = np.random.rand(10).astype("float32") + x = TensorWrapper(x_np) + + grad = Grad().wrt(x, callback=save_to(x)) + + y = mul(x, x) + y *= y + + grad(y, TensorWrapper(np.ones_like(x_np))) + np.testing.assert_almost_equal(x.grad.numpy(), 4 * x_np ** 3, decimal=6) + + +def test_elemwise_add(): + x_np = np.random.rand(10).astype("float32") + y_np = np.random.rand(10, 10).astype("float32") + dz_np = np.random.rand(10, 10).astype("float32") + x = TensorWrapper(x_np) + y = TensorWrapper(y_np) + dz = TensorWrapper(dz_np) + + refs = {} + + def f(x, y): + x = x * 2 + refs["x"] = weakref.ref(x.__wrapped__) + refs["y"] = weakref.ref(y.__wrapped__) + return x + y + + grad = Grad().wrt(x, callback=save_to(x)) + + z = f(x, y) + del y + + for k, r in refs.items(): + assert r() is None + + grad(z, dz) + np.testing.assert_almost_equal(x.grad.numpy(), dz_np.sum(0) * 2, decimal=5) + + +def test_elemwise_relu(): + x_np = [1.0, -1.0] + dz_np = [1.0] + x = TensorWrapper(x_np) + dz = TensorWrapper(dz_np) + + refs = {} + + def f(x): + x = x * 2 + refs["x"] = weakref.ref(x.__wrapped__) + return relu(x) + + grad = Grad().wrt(x, callback=save_to(x)) + + z = f(x) + + assert refs["x"]() is None + + grad(z, dz) + np.testing.assert_almost_equal(x.grad.numpy(), [2.0, 0]) + + +def test_elemwise_relu_backward_fn(): + op = Elemwise(mode="relu").to_c() + attr = TensorAttr() + attr.dtype = "float32" + attr.comp_node = "xpux" + result = imperative.make_backward_graph(op, [attr], [True], [True]) + backward_graph, save_for_backward_mask, input_has_grad = result + assert save_for_backward_mask == [False, True, True], save_for_backward_mask diff --git a/imperative/python/test/unit/test_distributed.py b/imperative/python/test/unit/test_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..70692eb9f2aaead42c9918ce9cdc15084a1502ac --- /dev/null +++ b/imperative/python/test/unit/test_distributed.py @@ -0,0 +1,193 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
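+# Each test below spawns one worker process per rank, brings up a process
+# group on localhost, and runs its assertions inside the workers.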
+import multiprocessing as mp +import platform +import queue + +import pytest + +import megengine as mge +import megengine.distributed as dist + + +def _assert_q_empty(q): + try: + res = q.get(timeout=1) + except Exception as e: + assert isinstance(e, queue.Empty) + else: + assert False, "queue is not empty" + + +def _assert_q_val(q, val): + ret = q.get() + assert ret == val + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_init_process_group(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, backend): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank, backend) + assert dist.is_distributed() == True + assert dist.get_rank() == rank + assert dist.get_world_size() == world_size + assert dist.get_backend() == backend + + py_server_addr = dist.get_py_server_addr() + assert py_server_addr[0] == "localhost" + assert py_server_addr[1] == port + + mm_server_addr = dist.get_mm_server_addr() + assert mm_server_addr[0] == "localhost" + assert mm_server_addr[1] > 0 + + assert isinstance(dist.get_client(), dist.Client) + + def check(backend): + procs = [] + for rank in range(world_size): + p = mp.Process(target=worker, args=(rank, backend)) + p.start() + procs.append(p) + + for p in procs: + p.join(20) + assert p.exitcode == 0 + + check("nccl") + check("ucx") + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_new_group(): + world_size = 3 + ranks = [2, 0] + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + if rank in ranks: + group = dist.new_group(ranks) + assert group.size == 2 + assert group.key == "2,0" + assert group.rank == ranks.index(rank) + assert group.comp_node == "gpu{}:2".format(rank) + + procs = [] + for rank in range(world_size): + p = mp.Process(target=worker, args=(rank,)) + p.start() + procs.append(p) + + for p in procs: + p.join(20) + assert p.exitcode == 0 + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_group_barrier(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + def worker(rank, q): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + dist.group_barrier() + if rank == 0: + dist.group_barrier() + q.put(0) # to be observed in rank 1 + else: + _assert_q_empty(q) # q.put(0) is not executed in rank 0 + dist.group_barrier() + _assert_q_val(q, 0) # q.put(0) executed in rank 0 + + Q = mp.Queue() + procs = [] + for rank in range(world_size): + p = mp.Process(target=worker, args=(rank, Q)) + p.start() + procs.append(p) + + for p in procs: + p.join(20) + assert p.exitcode == 0 + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( 
+ platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_synchronized(): + world_size = 2 + port = dist.get_free_ports(1)[0] + server = dist.Server(port) + + @dist.synchronized + def func(rank, q): + q.put(rank) + + def worker(rank, q): + if mge.get_device_count("gpu") < world_size: + return + dist.init_process_group("localhost", port, world_size, rank, rank) + dist.group_barrier() + if rank == 0: + func(0, q) # q.put(0) + q.put(2) + else: + _assert_q_val(q, 0) # func executed in rank 0 + _assert_q_empty(q) # q.put(2) is not executed + func(1, q) + _assert_q_val( + q, 1 + ) # func in rank 1 executed earlier than q.put(2) in rank 0 + _assert_q_val(q, 2) # q.put(2) executed in rank 0 + + Q = mp.Queue() + procs = [] + for rank in range(world_size): + p = mp.Process(target=worker, args=(rank, Q)) + p.start() + procs.append(p) + + for p in procs: + p.join(20) + assert p.exitcode == 0 diff --git a/imperative/python/test/unit/test_function.py b/imperative/python/test/unit/test_function.py new file mode 100644 index 0000000000000000000000000000000000000000..8d46e26e23142fdc235b1ef26039f037153028a6 --- /dev/null +++ b/imperative/python/test/unit/test_function.py @@ -0,0 +1,128 @@ +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np + +import megengine.optimizer as optimizer +from megengine import Parameter +from megengine import Tensor as tensor +from megengine import tensor +from megengine.core.tensor.function import Function +from megengine.module import Module + + +def test_single_input(): + data_shape = (9, 2, 6) + av = np.random.random(data_shape).astype(np.float32) + + class MulFunc(Function): + def forward(self, a): + self.a = a + return a * 10 + + def backward(self, grad_o): + return grad_o * 10 + + class Simple(Module): + def __init__(self, a): + super().__init__() + self.a = Parameter(a, dtype=np.float32) + self.layer1 = MulFunc() + + def forward(self): + x = self.layer1(self.a) + return x + + net = Simple(av) + optim = optimizer.SGD(net.parameters(), lr=1.0) + optim.zero_grad() + + with optim.record(): + loss = net() + optim.backward(loss.sum()) + optim.step() + + np.testing.assert_almost_equal(loss.numpy(), (av * 10)) + np.testing.assert_almost_equal(net.a.numpy(), (av - 10)) + + +def test_multi_input(): + data_shape = (9, 2, 6) + av = np.random.random(data_shape).astype(np.float32) + bv = np.random.random(data_shape).astype(np.float32) + + class MulFunc(Function): + def forward(self, a, b): + self.a = a + self.b = b + return a * b + + def backward(self, grad_o): + return grad_o * self.b * 2, grad_o * self.a * 3 + + class Simple(Module): + def __init__(self, a, b): + super().__init__() + self.a = Parameter(a, dtype=np.float32) + self.b = Parameter(b, dtype=np.float32) + self.layer1 = MulFunc() + + def forward(self): + x = self.layer1(self.a, self.b) + return x + + net = Simple(av, bv) + optim = optimizer.SGD(net.parameters(), lr=1.0) + optim.zero_grad() + + with optim.record(): + loss = net() + optim.backward(loss.sum()) + optim.step() + + np.testing.assert_almost_equal(loss.numpy(), (av * bv)) + np.testing.assert_almost_equal(net.a.numpy(), (av - 2 * bv)) + 
np.testing.assert_almost_equal(net.b.numpy(), (bv - 3 * av)) + + +def test_multi_output(): + data_shape = (9, 2, 6) + av = np.random.random(data_shape).astype(np.float32) + bv = np.random.random(data_shape).astype(np.float32) + + class MulFunc(Function): + def forward(self, a, b): + self.a = a + self.b = b + return a * b, a + b + + def backward(self, grad_1, grad_2): + return grad_1 * (self.b + 1), grad_2 * (self.a + 1) + + class Simple(Module): + def __init__(self, a, b): + super().__init__() + self.a = Parameter(a, dtype=np.float32) + self.b = Parameter(b, dtype=np.float32) + self.layer1 = MulFunc() + + def forward(self): + x, y = self.layer1(self.a, self.b) + return x + y + + net = Simple(av, bv) + optim = optimizer.SGD(net.parameters(), lr=1.0) + optim.zero_grad() + + with optim.record(): + loss = net() + optim.backward(loss.sum()) + optim.step() + + np.testing.assert_almost_equal(loss.numpy(), (av * bv + av + bv), decimal=6) + np.testing.assert_almost_equal(net.a.numpy(), (av - bv - 1), decimal=6) + np.testing.assert_almost_equal(net.b.numpy(), (bv - av - 1), decimal=6) diff --git a/imperative/python/test/unit/test_imperative_rt.py b/imperative/python/test/unit/test_imperative_rt.py new file mode 100644 index 0000000000000000000000000000000000000000..959a08c4adb9c9e438886fe512b7886a0cfb25ca --- /dev/null +++ b/imperative/python/test/unit/test_imperative_rt.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
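+# Smoke tests for the low-level _imperative_rt bindings: operator node
+# config, builtin op equality, raw put/get/delete, and apply() on raw tensors.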
+import numpy as np
+import pytest
+
+import megengine.core.tensor.raw_tensor
+from megengine.core.tensor.core import apply
+
+
+def elemwise(*args, mode):
+    from megengine.core.ops.builtin import Elemwise
+    from megengine.core._imperative_rt.imperative import apply_op
+
+    return apply_op(Elemwise(mode=mode).to_c(), args)
+
+
+def test_basic_interface():
+    cf = megengine.core._imperative_rt.OperatorNodeConfig()
+    cf.name = "megengine.core"
+    cf.dtype = "float32"
+    cf.comp_node_arr = ["xpux"]
+    print(cf.name)
+    print(cf.dtype)
+    print(cf.comp_node_arr)
+    print(cf.comp_node)
+    cf.comp_node_arr = ["xpux", "xpux:1"]
+    with pytest.raises(ValueError):
+        cf.comp_node
+
+
+def test_opr_attr():
+    from megengine.core.ops.builtin import Elemwise
+
+    assert Elemwise(mode="add") == Elemwise(mode="add")
+
+
+def test_simple_arith():
+    x = np.random.rand(10).astype("float32")
+    xx = megengine.core._imperative_rt.put(x)
+    (yy,) = elemwise(xx, xx, mode="mul")
+    np.testing.assert_allclose(x * x, megengine.core._imperative_rt.get_value(yy))
+    megengine.core._imperative_rt.delete(xx)
+    megengine.core._imperative_rt.delete(yy)
+
+
+def test_tensor_on_device():
+    device = megengine.core._imperative_rt.CompNode("cpu0:1")
+    x = np.random.rand(10).astype("float32")
+    xx = megengine.core._imperative_rt.put(x, device=device)
+    assert str(megengine.core._imperative_rt.get_device(xx)) == "cpu0:1"
+    np.testing.assert_equal(x, megengine.core._imperative_rt.get_value(xx))
+    megengine.core._imperative_rt.delete(xx)
+
+
+def test_raw_tensor():
+    from megengine.core.tensor.raw_tensor import as_raw_tensor
+    from megengine.core.ops.builtin import Elemwise
+
+    x = np.random.rand(10).astype("float32")
+    xx = as_raw_tensor(x)
+    (yy,) = apply(Elemwise(mode="mul"), xx, xx)
+    np.testing.assert_allclose(x * x, yy.numpy())
+    (yy,) = apply(Elemwise(mode="mul"), xx, xx)
+    np.testing.assert_allclose(x * x, yy.numpy())
diff --git a/imperative/python/test/unit/test_indexing_op.py b/imperative/python/test/unit/test_indexing_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..70b2911f046883eca5d2fbd96b44b1191034ea1f
--- /dev/null
+++ b/imperative/python/test/unit/test_indexing_op.py
@@ -0,0 +1,546 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
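+# The helpers below rebuild the old graph-mode shape-descriptor and getitem
+# canonicalization on top of raw tensors, so that each indexing op
+# (Subtensor, IndexingMultiAxisVec, MeshIndexing, ...) can be invoked
+# directly and checked against the equivalent numpy indexing expression.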
+import collections
+
+import numpy as np
+import pytest
+
+import megengine.core.ops.builtin
+import megengine.core.tensor.raw_tensor
+from megengine.core.ops._internal import all_ops
+from megengine.core.tensor import Tensor
+from megengine.core.tensor.core import apply
+from megengine.core.tensor.raw_tensor import RawTensor, as_raw_tensor
+
+
+def cvt_to_shape_desc(val, inpvar, config=None):
+    def as_tensor(val, device):
+        assert device is not None, "can not infer device"
+        # TODO: should copy to appropriate device
+        val = as_raw_tensor(val, device=device)
+        return val
+
+    device = None
+    if inpvar is not None:
+        assert isinstance(inpvar, RawTensor)
+        device = device or inpvar.device
+
+    if config is not None:
+        device = device or config.device
+
+    if isinstance(val, RawTensor):
+        return as_tensor(val, device)
+
+    if not isinstance(val, collections.abc.Iterable):
+        val = [val]
+
+    components = []
+    on_host = True
+    for i in val:
+        if isinstance(i, RawTensor):
+            on_host = False
+            device = device or i.device
+        else:
+            assert isinstance(i, int), (
+                "shape desc could contain either int or Tensor, got {}"
+                " actually".format(repr(i))
+            )
+        components.append(i)
+    assert components, "shape desc could not be empty"
+
+    if on_host:
+        shape = np.ascontiguousarray(components, dtype=np.int32)
+        assert np.all(shape == components), "failed to convert to shape: {}".format(
+            components
+        )
+        return as_tensor(shape, device)
+
+    for idx, v in enumerate(components):
+        if not isinstance(v, RawTensor):
+            vi = int(v)
+            assert vi == v, "could not convert {} to int".format(v)
+            v = vi
+            components[idx] = as_tensor(v, device)
+
+    return invoke_op(all_ops.Concat(axis=0), components)
+
+
+def canonize_reshape(inputs, *, config):
+    src, tshape = inputs
+    tshape = cvt_to_shape_desc(tshape, src, config)
+    return src, tshape
+
+
+def canonize_inputs(inputs, *, config):
+    """convert immediate numbers and SharedND to SymbolVar in inputs; at least
+    one of the inputs must be SymbolVar, so comp node and comp graph can
+    be inferred
+
+    :return: list of converted vars
+    """
+
+    if (
+        isinstance(inputs, (list, tuple))
+        and len(inputs) == 1
+        and isinstance(inputs[0], (list, tuple))
+    ):
+        # handle the case when a list is passed to a function with
+        # variable-length argument (e.g. 
concat has signature concat(*inputs) + # and is called with concat([a, b])) + inputs = inputs[0] + + if isinstance(inputs, RawTensor): + return [inputs] + + old_inputs = inputs + inputs = [] + get_comp_node = None + need_cvt = False + for i in old_inputs: + if isinstance(i, RawTensor): + get_comp_node = lambda cn=i.device.to_c(): cn + else: + need_cvt = True + inputs.append(i) + if not need_cvt: + return inputs + + if get_comp_node is None: + + def get_comp_node(): + return config.comp_node + + for idx, var in enumerate(inputs): + if not isinstance(var, RawTensor): + var = as_raw_tensor(var) + inputs[idx] = var + return inputs + + +def invoke_op(op, inputs_, cvt_inputs=canonize_inputs): + inputs = cvt_inputs( + inputs_, config=megengine.core._imperative_rt.OperatorNodeConfig() + ) + return apply(op, *inputs) + + +def unpack_getitem(inp, tuple_val, *, allow_newaxis=True): + assert isinstance(inp, RawTensor) + if not isinstance(tuple_val, tuple): + tuple_val = (tuple_val,) + + def as_tensor(v): + if not isinstance(v, RawTensor): + vi = np.ascontiguousarray(v, dtype=np.int32) + assert np.abs(vi - v).max() == 0, "bad index: {!r}".format(v) + v = as_raw_tensor(vi) + return v + + new_axes = [] + tensors = [] + items = [] + cur_axis = -1 + for i_idx, i in enumerate(tuple_val): + cur_axis += 1 + if i is np.newaxis: + if cur_axis >= 0: + new_axes.append(cur_axis) + continue + + if i is Ellipsis: + cur_axis = -1 + for j in tuple_val[:i_idx:-1]: + if j is Ellipsis: + raise IndexError("only one ellipsis is allowed") + if j is np.newaxis: + new_axes.append(cur_axis) + cur_axis -= 1 + continue + + item = [ + cur_axis, + ] + + def push(v, item, tensors): + if v is None: + item.append(False) + else: + item.append(True) + tensors.append(as_tensor(v)) + + if isinstance(i, slice): + if i.start is None and i.stop is None and i.step is None: + continue + push(i.start, item, tensors) + push(i.stop, item, tensors) + push(i.step, item, tensors) + item.append(False) # idx + else: + item += [False,] * 3 # begin, end, stop + push(i, item, tensors) + assert len(item) == 5 + items.append(item) + if new_axes: + raise IndexError("newaxis is not allowed here") + return inp, tensors, items + + +def dimshuffle(*args, **kwargs): + op = all_ops.Dimshuffle(**kwargs).to_c() + return invoke_op(op, args) + + +def broadcast(input, tshape): + op = all_ops.Broadcast().to_c() + return invoke_op(op, (input, tshape), canonize_reshape) + + +def subtensor(input, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.Subtensor(items).to_c() + return invoke_op(op, (input, *tensors)) + + +def set_subtensor(input, value, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.SetSubtensor(items).to_c() + return invoke_op(op, (input, value, *tensors)) + + +def incr_subtensor(input, value, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.IncrSubtensor(items).to_c() + return invoke_op(op, (input, value, *tensors)) + + +def advance_indexing(input, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.IndexingMultiAxisVec(items).to_c() + return invoke_op(op, (input, *tensors)) + + +def set_advance_indexing(input, value, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.IndexingSetMultiAxisVec(items).to_c() + return invoke_op(op, (input, value, *tensors)) + + +def incr_advance_indexing(input, value, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = 
all_ops.IndexingIncrMultiAxisVec(items).to_c() + return invoke_op(op, (input, value, *tensors)) + + +def mesh_indexing(input, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.MeshIndexing(items).to_c() + return invoke_op(op, (input, *tensors)) + + +def set_mesh_indexing(input, value, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.SetMeshIndexing(items).to_c() + return invoke_op(op, (input, value, *tensors)) + + +def incr_mesh_indexing(input, value, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.IncrMeshIndexing(items).to_c() + return invoke_op(op, (input, value, *tensors)) + + +def batched_mesh_indexing(input, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.BatchedMeshIndexing(items).to_c() + return invoke_op(op, (input, *tensors)) + + +def batched_set_mesh_indexing(input, value, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.BatchedSetMeshIndexing(items).to_c() + return invoke_op(op, (input, value, *tensors)) + + +def batched_incr_mesh_indexing(input, value, tuple_val): + input, tensors, items = unpack_getitem(input, tuple_val) + op = all_ops.BatchedIncrMeshIndexing(items).to_c() + return invoke_op(op, (input, value, *tensors)) + + +def test_dimshuffle(): + x = np.arange(10).reshape(2, 5).astype("int32") + xx = as_raw_tensor(x) + (yy,) = dimshuffle(xx, pattern="1x0") + np.testing.assert_equal(np.expand_dims(x.transpose(), axis=1), yy.numpy()) + + +def test_broadcast(): + x = np.arange(10).reshape(1, 10).astype("int32") + xx = as_raw_tensor(x) + (yy,) = broadcast(xx, (10, 10)) + np.testing.assert_equal(np.repeat(x, 10, 0), yy.numpy()) + + +def test_subtensor(): + x = np.arange(25).reshape(5, 5).astype("int32") + d = np.arange(2).astype("int32") + xx = as_raw_tensor(x) + (yy0,) = subtensor(xx, (slice(0, 4, 2), 3)) + (yy1,) = set_subtensor(xx, d, (slice(0, 4, 2), 3)) + (yy2,) = incr_subtensor(xx, d, (slice(0, 4, 2), 3)) + + np.testing.assert_equal(x[0:4:2, 3], yy0.numpy()) + + x_ = x.copy() + x_[0:4:2, 3] = d + np.testing.assert_equal(x_, yy1.numpy()) + + x_ = x.copy() + x_[0:4:2, 3] += d + np.testing.assert_equal(x_, yy2.numpy()) + + +def test_advance_indexing(): + x = np.arange(25).reshape(5, 5).astype("int32") + d = np.arange(15).reshape(3, 5).astype("int32") + xx = as_raw_tensor(x) + (yy0,) = advance_indexing(xx, ((0, 4, 2), slice(None, None, None))) + (yy1,) = set_advance_indexing(xx, d, ((0, 4, 2), slice(None, None, None))) + (yy2,) = incr_advance_indexing(xx, d, ((0, 4, 2), slice(None, None, None))) + + np.testing.assert_equal(x[(0, 4, 2), :], yy0.numpy()) + + x_ = x.copy() + x_[(0, 4, 2), :] = d + np.testing.assert_equal(x_, yy1.numpy()) + + x_ = x.copy() + x_[(0, 4, 2), :] += d + np.testing.assert_equal(x_, yy2.numpy()) + + +def test_mesh_indexing(): + x = np.arange(25).reshape(5, 5).astype("int32") + d = np.arange(6).reshape(3, 2).astype("int32") + xx = as_raw_tensor(x) + (yy0,) = mesh_indexing(xx, (slice(0, 5, 2), (1, 3))) + (yy1,) = set_mesh_indexing(xx, d, (slice(0, 5, 2), (1, 3))) + (yy2,) = incr_mesh_indexing(xx, d, (slice(0, 5, 2), (1, 3))) + + r = np.ndarray(shape=(3, 2), dtype="int32") + for i0, i1 in enumerate(range(0, 5, 2)): + for j0, j1 in enumerate((1, 3)): + r[i0, j0] = x[i1, j1] + np.testing.assert_equal(r, yy0.numpy()) + + r = x.copy() + for i0, i1 in enumerate(range(0, 5, 2)): + for j0, j1 in enumerate((1, 3)): + r[i1, j1] = d[i0, j0] + np.testing.assert_equal(r, yy1.numpy()) 
+ + r = x.copy() + for i0, i1 in enumerate(range(0, 5, 2)): + for j0, j1 in enumerate((1, 3)): + r[i1, j1] += d[i0, j0] + np.testing.assert_equal(r, yy2.numpy()) + + +def test_batched_mesh_indexing(): + x = np.arange(24).reshape(2, 3, 4).astype("int32") + d = np.arange(12).reshape(2, 2, 3).astype("int32") + xx = as_raw_tensor(x) + s = [(0, 1, 2), (1, 2, 3)] + (yy0,) = batched_mesh_indexing(xx, (slice(None, None, None), [(0, 2)] * 2, s)) + (yy1,) = batched_set_mesh_indexing( + xx, d, (slice(None, None, None), [(0, 2)] * 2, s) + ) + (yy2,) = batched_incr_mesh_indexing( + xx, d, (slice(None, None, None), [(0, 2)] * 2, s) + ) + + r = np.ndarray(shape=(2, 2, 3), dtype="int32") + for i in range(2): + for j0, j1 in enumerate((0, 2)): + for k0, k1 in enumerate(s[i]): + r[i, j0, k0] = x[i, j1, k1] + np.testing.assert_equal(r, yy0.numpy()) + + r = x.copy() + for i in range(2): + for j0, j1 in enumerate((0, 2)): + for k0, k1 in enumerate(s[i]): + r[i, j1, k1] = d[i, j0, k0] + np.testing.assert_equal(r, yy1.numpy()) + + r = x.copy() + for i in range(2): + for j0, j1 in enumerate((0, 2)): + for k0, k1 in enumerate(s[i]): + r[i, j1, k1] += d[i, j0, k0] + np.testing.assert_equal(r, yy2.numpy()) + + +# high level + + +def test_advance_indexing_high_level(): + x = np.arange(25).reshape(5, 5).astype("int32") + d = np.arange(15).reshape(3, 5).astype("int32") + xx = Tensor(x) + + np.testing.assert_equal(x[1, :], xx[1, :].numpy()) + np.testing.assert_equal(x[:, 1], xx[:, 1].numpy()) + np.testing.assert_equal(x[1:3, :], xx[1:3, :].numpy()) + + np.testing.assert_equal(x[:, :], xx[:, :].numpy()) + np.testing.assert_equal(x[1, 1], xx[1, 1].numpy()) + yy = xx[(0, 4, 2), :] + np.testing.assert_equal(x[(0, 4, 2), :], yy.numpy()) + + x_ = x.copy() + x_[(0, 4, 2), :] = d + xx_ = Tensor(xx) + xx_[(0, 4, 2), :] = d + np.testing.assert_equal(x_, xx_.numpy()) + + x = np.arange(27).reshape(3, 3, 3).astype("int32") + xx = Tensor(x) + + np.testing.assert_equal(x[1, :, :], xx[1, :, :].numpy()) + np.testing.assert_equal(x[1, :, 1], xx[1, :, 1].numpy()) + np.testing.assert_equal(x[1, 0:1, :], xx[1, 0:1, :].numpy()) + np.testing.assert_equal(x[0:1, 1, 1], xx[0:1, 1, 1].numpy()) + np.testing.assert_equal(x[:, 1, 1], xx[:, 1, 1].numpy()) + np.testing.assert_equal(x[:, 1], xx[:, 1].numpy()) + np.testing.assert_equal(x[1, 1:2], xx[1, 1:2].numpy()) + + x_ = x.copy() + x_[1, 1, 1] = -1 + xx[1, 1, 1] = -1 + np.testing.assert_equal(x_, xx.numpy()) + + x_[:, 1, 1] = -2 + xx[:, 1, 1] = x_[:, 1, 1] + np.testing.assert_equal(x_, xx.numpy()) + + x_[0:1, :, 1] = -3 + xx[0:1, :, 1] = x_[0:1, :, 1] + np.testing.assert_equal(x_, xx.numpy()) + + x_[0:1, :, 1] = -4 + y = Tensor(x_) + xx[0:1, :, 1] = y[0:1, :, 1] + np.testing.assert_equal(y.numpy(), xx.numpy()) + + x[:] = 1 + xx[:] = 1 + np.testing.assert_equal(x, xx.numpy()) + + x = np.arange(9).reshape(3, 3).astype("int32") + xx = Tensor(x) + y = np.array([1, 2]) + yy = Tensor(y) + np.testing.assert_equal(x[:, y[0]], xx[:, y[0]].numpy()) + # np.testing.assert_equal(x[:, y[0]], xx[:, yy[0]].numpy()) # FIXME + np.testing.assert_equal(x[:, y], xx[:, y].numpy()) + np.testing.assert_equal(x[:, y], xx[:, yy].numpy()) + + x_ = x.copy() + x_[:, y[0]] = -1 + xx_ = Tensor(x_) + xx[:, yy[0]] = xx_[:, yy[0]] + np.testing.assert_equal(x_, xx.numpy()) + + x_[:, y] = -1 + xx_ = Tensor(x_) + xx[:, yy] = xx_[:, yy] + np.testing.assert_equal(x_, xx.numpy()) + + x = np.arange(9).reshape(3, 3).astype("int32") + xx = Tensor(x) + y = np.array([1]) + yy = Tensor(y) + np.testing.assert_equal(x[:, y[0]], xx[:, 
y[0]].numpy()) + # np.testing.assert_equal(x[:, y[0]], xx[:, yy[0]].numpy()) # FIXME + np.testing.assert_equal(x[:, y], xx[:, y].numpy()) + + # XXX: no way to tell whether yy is scalar or ndim=1 array + np.testing.assert_equal(x[:, y], xx[:, yy].numpy()) + + x = np.arange(9).reshape(3, 3).astype("int32") + xx = Tensor(x) + np.testing.assert_equal(x[[0, 1], 0], xx[[0, 1], 0].numpy()) + np.testing.assert_equal(x[0:2, 0], xx[0:2, 0].numpy()) + + +def test_advance_indexing_with_bool(): + a = np.arange(9).reshape(3, 3).astype(np.float32) + b = np.array([1, 2, 3]) + c = np.array([1, 2, 3]) + aa = Tensor(a) + bb = Tensor(b) + cc = Tensor(c) + np.testing.assert_equal(a[b == 1, c == 2], aa[bb == 1, cc == 2].numpy()) + a[b == 1, c == 2] = -1.0 + aa[bb == 1, cc == 2] = -1.0 + np.testing.assert_equal(a, aa.numpy()) + + a = np.arange(9).reshape(3, 3).astype(np.float32) + b = np.array([False, True, True]) + c = np.array([2, 0]).astype(np.int32) + aa = Tensor(a) + bb = Tensor(b) + cc = Tensor(c) + np.testing.assert_equal(a[b, c], aa[bb, cc].numpy()) + a[b, c] = -1.0 + aa[bb, cc] = -1.0 + np.testing.assert_equal(a, aa.numpy()) + d = np.array([-1, -2], dtype=np.float32) + dd = Tensor(d) + a[b, c] = d + aa[bb, cc] = dd + np.testing.assert_equal(a, aa.numpy()) + + a = np.ones((2, 2)) + b = np.array([[True, False], [False, True]]) + aa = Tensor(a) + bb = Tensor(b) + np.testing.assert_equal(a[b], aa[bb].numpy()) + b[:] = True + bb[:] = True + np.testing.assert_equal(a[b], aa[bb].numpy()) + np.testing.assert_equal(a[:, [True, False]], aa[:, [True, False]].numpy()) + + a = np.ones((2, 2), dtype=np.int32) + b = np.array([[False, False], [False, False]]) + aa = Tensor(a) + bb = Tensor(b) + np.testing.assert_equal(a[b], aa[bb].numpy()) + + b = np.array([False, False]) + bb = Tensor(b) + np.testing.assert_equal(a[b], aa[bb].numpy().reshape(a[b].shape)) # FIXME + + a = np.arange(576).reshape(2, 3, 4, 3, 4, 2).astype("int32") + aa = Tensor(a) + + b = (np.random.sample((2, 3, 4)) > 0.5).astype("bool") + bb = Tensor(b) + np.testing.assert_equal(a[b, :, 0:4:2], aa[bb, :, 0:4:2].numpy()) + + b = (np.random.sample((4, 3, 4)) > 0.5).astype("bool") + bb = Tensor(b) + np.testing.assert_equal(a[..., b, 0:2], aa[..., bb, 0:2].numpy()) + + b = (np.random.sample((3, 4, 3)) > 0.5).astype("bool") + bb = Tensor(b) + np.testing.assert_equal( + a[:, b, 0:2, [True, False]], aa[:, bb, 0:2, [True, False]].numpy() + ) diff --git a/imperative/python/test/unit/test_jit.py b/imperative/python/test/unit/test_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..4bc9c2f17dc612a02480c802d289693cc71906e9 --- /dev/null +++ b/imperative/python/test/unit/test_jit.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
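+# A sketch of the behavior test_1 below asserts (hypothetical API: the
+# `function` decorator is under rewrite, hence the commented-out import and
+# the skip marker). Tracing is expected to specialize on the plain-Python
+# argument `p`, compiling one graph per distinct value, so both branches
+# stay reachable across repeated calls:
+#
+#     f(Tensor(0), 0)  # (0 + 1) * 2 == 2
+#     f(Tensor(0), 1)  # (0 + 1) * (0 + 1) == 1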
+import pytest
+
+from megengine.core import Tensor
+
+# from megengine.core.interpreter.hints import function
+
+
+@pytest.mark.skip(reason="under rewrite")
+def test_1():
+    @function
+    def f(x, p):
+        x = x + 1
+        if p:
+            return x * x
+        return x * 2
+
+    x = Tensor(0)
+
+    for _ in range(5):
+        assert f(x, 0).numpy() == 2
+        assert f(x, 1).numpy() == 1
diff --git a/imperative/python/test/unit/test_loss.py b/imperative/python/test/unit/test_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4abbd682fcc47e31f6e8a8cdb118d8b1d4ccbeb
--- /dev/null
+++ b/imperative/python/test/unit/test_loss.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+import numpy as np
+
+import megengine.functional as F
+from megengine import tensor
+
+
+# XXX need to test label_smooth
+def test_cross_entropy_with_softmax():
+    data = tensor([1, 100]).astype(np.float32).reshape((1, 2))
+    label = tensor([1]).astype(np.int32)
+    loss = F.cross_entropy_with_softmax(data, label)
+    np.testing.assert_allclose(loss.numpy(), 0.0)
diff --git a/imperative/python/test/unit/test_megbrain_graph.py b/imperative/python/test/unit/test_megbrain_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fb6a9de921947bfeceaec9dbc7c8ff993c04d4d
--- /dev/null
+++ b/imperative/python/test/unit/test_megbrain_graph.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
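+# The tests below share one pattern: feed a host value in through
+# `input_callback`, capture the result in a `concurrent.futures.Future` via
+# `output_callback`, then compile and run the graph. A minimal sketch of the
+# round trip (same calls as test_io; the helper name is illustrative):
+#
+#     def run_identity(g, x):
+#         v, _ = mgb_graph.input_callback(
+#             lambda: x, device=x.comp_node, dtype=x.dtype, graph=g)
+#         y = Future()
+#         g.compile(mgb_graph.output_callback(y.set_result, v))()
+#         return y.result()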
+from concurrent.futures import Future + +import numpy as np + +import megengine.functional as F +from megengine.core._imperative_rt import DeviceTensorND +from megengine.core.tensor import megbrain_graph as mgb_graph +from megengine.core.tensor.raw_tensor import as_raw_tensor + + +def make_dev_tensor(value, dtype=None, device=None): + return as_raw_tensor(value, dtype=dtype, device=device)._dev_tensor() + + +def test_io(): + g = mgb_graph.Graph() + x = make_dev_tensor(np.random.randn(3).astype("float32"), device="xpux") + vx, _ = mgb_graph.input_callback( + lambda: x, device=x.comp_node, dtype=x.dtype, graph=g + ) + y = Future() + v = mgb_graph.output_callback(y.set_result, vx) + f = g.compile(v) + f() + + np.testing.assert_equal(x.numpy(), y.result().numpy()) + + +def test_io2(): + g = mgb_graph.Graph() + g.options.async_exec_level = 0b100 + dtype, device = "float32", "xpux" + px = mgb_graph.InputNode(device=device, dtype=dtype, graph=g) + py = mgb_graph.OutputNode(px.outputs[0]) + f = g.compile(py.outputs[0]) + + for _ in range(3): + f.execute() + x = make_dev_tensor(np.random.randn(10).astype(dtype), device=device) + px.set_value(x) + y = py.get_value() + np.testing.assert_equal(x.numpy(), y.numpy()) + f.wait() + + +def test_attr_output(): + g = mgb_graph.Graph() + g.options.async_exec_level = 0b100 + dtype, device = "float32", "xpux" + px = mgb_graph.InputNode(device=device, dtype=dtype, graph=g) + py = mgb_graph.AttrOutputNode(px.outputs[0]) + f = g.compile(py.outputs[0]) + + for shape in [(2,), (3,), (5,)]: + f.execute() + x = make_dev_tensor(np.random.randn(*shape).astype(dtype), device=device) + px.set_value(x) + ay = py.get_value() + assert ay.shape == shape + assert ay.dtype == np.dtype(dtype) + assert ay.device == device + f.wait() + + +def test_op(): + g = mgb_graph.Graph() + x = make_dev_tensor(np.random.randn(10).astype("float32"), device="xpux") + v, _ = mgb_graph.input_callback( + lambda: x, device=x.comp_node, dtype=x.dtype, graph=g + ) + v = F.neg(v) + y = Future() + v = mgb_graph.output_callback(y.set_result, v) + f = g.compile(v) + f() + + np.testing.assert_equal(x.numpy(), -y.result().numpy()) diff --git a/imperative/python/test/unit/test_module.py b/imperative/python/test/unit/test_module.py new file mode 100644 index 0000000000000000000000000000000000000000..5de497ed873eee5830f5dfec87efafae4c4148e0 --- /dev/null +++ b/imperative/python/test/unit/test_module.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
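+# test_syncbn below builds its expected outputs in NumPy: statistics are
+# taken over the full batch (all ranks together), then each rank checks its
+# slice. The transpose/reshape in the test body is equivalent to reducing
+# over every axis except the channel axis:
+#
+#     mean = x.mean(axis=(0, 2, 3), keepdims=True)
+#     var_biased = x.var(axis=(0, 2, 3), keepdims=True)
+#     yv_expect = (x - mean) / np.sqrt(var_biased + eps)
+#     running_mean = running_mean * momentum + mean * (1 - momentum)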
+import platform + +import pytest + + +@pytest.mark.skipif( + platform.system() == "Darwin", reason="do not imp GPU mode at macos now" +) +@pytest.mark.skipif( + platform.system() == "Windows", reason="do not imp GPU mode at Windows now" +) +@pytest.mark.isolated_distributed +def test_syncbn(): + import numpy as np + import multiprocessing as mp + from megengine.distributed.group import Server + + nr_chan = 8 + nr_ranks = 4 + data_shape = (3, nr_chan, 4, nr_ranks * 8) + momentum = 0.9 + eps = 1e-5 + running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32) + running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32) + steps = 4 + server = Server(0) + port = server.py_server_port + + def worker(rank, data, yv_expect, running_mean, running_var): + import megengine as mge + import megengine.distributed as dist + from megengine import tensor + from megengine.module import SyncBatchNorm + from megengine.distributed.group import Group + from megengine.test import assertTensorClose + + if mge.get_device_count("gpu") < nr_ranks: + return + dist.init_process_group("localhost", port, nr_ranks, rank, rank) + group = Group([i for i in range(nr_ranks)]) + bn = SyncBatchNorm(nr_chan, eps=eps, momentum=momentum, group=group) + data_tensor = None + for i in range(steps): + if data_tensor is None: + data_tensor = tensor(data[i], device=f"gpu{rank}:0") + else: + data_tensor.set_value(data[i]) + yv = bn(data_tensor) + + assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) + assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6) + assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6) + + xv = [] + for i in range(steps): + xv.append(np.random.normal(loc=2.3, size=data_shape).astype(np.float32)) + xv_transposed = np.transpose(xv[i], [0, 2, 3, 1]).reshape( + (data_shape[0] * data_shape[2] * data_shape[3], nr_chan) + ) + + mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1) + + var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1)) + sd = np.sqrt(var_biased + eps) + + var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape((1, nr_chan, 1, 1)) + running_mean = running_mean * momentum + mean * (1 - momentum) + running_var = running_var * momentum + var_unbiased * (1 - momentum) + + yv_expect = (xv[i] - mean) / sd + + data = [] + for i in range(nr_ranks): + data.append([]) + for j in range(steps): + data[i].append(xv[j][:, :, :, i * 8 : i * 8 + 8]) + + procs = [] + for rank in range(nr_ranks): + p = mp.Process( + target=worker, + args=( + rank, + data[rank], + yv_expect[:, :, :, rank * 8 : rank * 8 + 8], + running_mean, + running_var, + ), + ) + p.start() + procs.append(p) + for p in procs: + p.join(10) + assert p.exitcode == 0 + + +def test_module_conv2d(): + from megengine.module.conv import Conv2d + + conv = Conv2d(2, 3, 1) diff --git a/imperative/python/test/unit/test_raw_tensor.py b/imperative/python/test/unit/test_raw_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..0f4ae7ec53a472fa86798142683c07b6fdc17a40 --- /dev/null +++ b/imperative/python/test/unit/test_raw_tensor.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
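+# `as_raw_tensor` wraps a host ndarray as an imperative runtime tensor; an
+# explicit `dtype` converts on the way in (the int64 test below relies on
+# this), and device "xpux" means "whichever device is available". The NumPy
+# equivalence both tests check, in one line (illustrative):
+#
+#     F.add(as_raw_tensor(x, dtype="float32", device="xpux"), 1).numpy()
+#     # == x.astype("float32") + 1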
+import numpy as np + +import megengine.functional as F +from megengine.core.tensor.raw_tensor import as_raw_tensor + + +def test_as_raw_tensor(): + x = np.arange(6, dtype="float32").reshape(2, 3) + xx = as_raw_tensor(x, device="xpux") + yy = F.add(xx, 1).numpy() + assert xx.dtype == np.float32 + assert xx.device == "xpux" + np.testing.assert_almost_equal(yy, x + 1) + + +def test_as_raw_tensor_from_int64(): + x = np.arange(6, dtype="int64").reshape(2, 3) + xx = as_raw_tensor(x, dtype="float32", device="xpux") + yy = F.add(xx, 1).numpy() + assert xx.dtype == np.float32 + assert xx.device == "xpux" + np.testing.assert_almost_equal(yy, x.astype("float32") + 1) diff --git a/imperative/python/test/unit/test_serialization.py b/imperative/python/test/unit/test_serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..5fa19bd4b5a3def2c89736e2c0fa5b717d32c1b9 --- /dev/null +++ b/imperative/python/test/unit/test_serialization.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import pickle +from tempfile import TemporaryFile + +import numpy as np + +import megengine as mge +from megengine import Buffer, Parameter, tensor + + +def test_tensor_serialization(): + def tensor_eq(a, b): + assert a.dtype == b.dtype + assert a.device == b.device + assert a.requires_grad == b.requires_grad + np.testing.assert_equal(a.numpy(), b.numpy()) + + with TemporaryFile() as f: + data = np.random.randint(low=0, high=7, size=[233]) + a = tensor(data, device="xpux", dtype=np.int32) + pickle.dump(a, f) + f.seek(0) + b = pickle.load(f) + np.testing.assert_equal(a.numpy(), b.numpy()) + + with TemporaryFile() as f: + a = Parameter(np.random.random(size=(233, 2)).astype(np.float32)) + pickle.dump(a, f) + f.seek(0) + b = pickle.load(f) + assert isinstance(b, Parameter) + np.testing.assert_equal(a.numpy(), b.numpy()) + + with TemporaryFile() as f: + a = Buffer(np.random.random(size=(2, 233)).astype(np.float32)) + pickle.dump(a, f) + f.seek(0) + b = pickle.load(f) + assert isinstance(b, Buffer) + np.testing.assert_equal(a.numpy(), b.numpy()) + + with TemporaryFile() as f: + a = Buffer(np.random.random(size=(2, 233)).astype(np.float32)) + mge.save(a, f) + f.seek(0) + b = mge.load(f, map_location="cpux") + assert isinstance(b, Buffer) + assert "cpu" in str(b.device) + np.testing.assert_equal(a.numpy(), b.numpy()) + + with TemporaryFile() as f: + if mge.is_cuda_available(): + device_org = mge.get_default_device() + a = Buffer(np.random.random(size=(2, 233)).astype(np.float32)) + mge.save(a, f) + f.seek(0) + mge.set_default_device("cpux") + b = mge.load(f, map_location={"gpu0": "cpu0"}) + assert isinstance(b, Buffer) + assert "cpu0" in str(b.device) + np.testing.assert_equal(a.numpy(), b.numpy()) + mge.set_default_device(device_org) diff --git a/imperative/python/test/unit/test_tensor_wrapper.py b/imperative/python/test/unit/test_tensor_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..92dc1c255fd967b9391b86dc5ab0689b244370ad --- /dev/null +++ b/imperative/python/test/unit/test_tensor_wrapper.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# 
Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +import numpy as np + +from megengine.core.tensor.tensor_wrapper import TensorWrapper + + +def test_basic(): + x_np = np.random.rand(10).astype("float32") + x = TensorWrapper(x_np) + y = x * x + y_np = y.numpy() + np.testing.assert_almost_equal(y_np, x_np * x_np) + + +def test_literal_arith(): + x_np = np.random.rand(10).astype("float32") + x = TensorWrapper(x_np) + y = x * 2 + y_np = y.numpy() + np.testing.assert_almost_equal(y_np, x_np * 2) + + +def test_matmul(): + A = TensorWrapper(np.random.rand(5, 7).astype("float32")) + B = TensorWrapper(np.random.rand(7, 10).astype("float32")) + C = A @ B + np.testing.assert_almost_equal(C.numpy(), A.numpy() @ B.numpy(), decimal=6) + + +def test_reduce(): + for m in ["sum", "prod", "min", "max", "mean"]: + x_np = np.random.rand(10).astype("float32") + x = TensorWrapper(x_np) + y = getattr(x, m)(-1) + np.testing.assert_almost_equal(y.numpy(), getattr(x_np, m)(-1), decimal=6) + + +def test_set_subtensor(): + x = TensorWrapper([1, 2, 3]) + x[:] = [1, 1, 1] + np.testing.assert_almost_equal(x.numpy(), [1, 1, 1], decimal=6) + x[[0, 2]] = [3, 2] + np.testing.assert_almost_equal(x.numpy(), [3, 1, 2], decimal=6) + x[1:3] = [4, 5] + np.testing.assert_almost_equal(x.numpy(), [3, 4, 5], decimal=6) diff --git a/imperative/python/test/unit/test_util.py b/imperative/python/test/unit/test_util.py new file mode 100644 index 0000000000000000000000000000000000000000..414dbd21137d9d6c66096fa61bd83617d0b876ad --- /dev/null +++ b/imperative/python/test/unit/test_util.py @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +from megengine.core._imperative_rt import Logger + + +def test_logger(): + orig_level = Logger().set_log_level(Logger.LogLevel.Info) + assert Logger().set_log_level(Logger.LogLevel.Info) == Logger.LogLevel.Info + Logger().set_log_level(orig_level) diff --git a/imperative/python/tools/gen_op_defs.py b/imperative/python/tools/gen_op_defs.py new file mode 100755 index 0000000000000000000000000000000000000000..e892a0f5d34c66a2e7853d3a0190d479d2389307 --- /dev/null +++ b/imperative/python/tools/gen_op_defs.py @@ -0,0 +1,504 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
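+# How this generator is driven (per main() at the bottom of this file): the
+# concatenated param-definition scripts are exec()ed with `pdef` bound to
+# ParamDef, and the collected definitions are rendered by PyWriter or
+# CPPWriter. A typical input entry looks roughly like this (illustrative
+# builder call; the real entries live in dnn/scripts/opr_param_defs.py):
+#
+#     pdef('Axis').add_fields('int32', 'axis', 0)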
+import argparse
+import collections
+import textwrap
+import os
+import hashlib
+import struct
+
+class member_defs:
+    """contains classes to define members of an opr param"""
+
+    Dtype = collections.namedtuple('Dtype', ['cname', 'pycvt', 'pyfmt',
+                                             'cppjson', 'cname_attr'])
+    Dtype.__new__.__defaults__ = ('', )
+    uint32 = Dtype('uint32_t', 'int', 'I', 'NumberInt')
+    uint64 = Dtype('uint64_t', 'int', 'Q', 'NumberInt',
+                   'alignas(sizeof(uint64_t)) ')
+    int32 = Dtype('int32_t', 'int', 'i', 'NumberInt')
+    float32 = Dtype('float', 'float', 'f', 'Number')
+    float64 = Dtype('double', 'float', 'd', 'Number')
+    dtype = Dtype('DTypeEnum', '_as_dtype_num', 'I', 'Number')
+    bool = Dtype('bool', 'bool', '?', 'Bool')
+
+    class Base:
+        pass
+
+
+    class Doc:
+        """wrap an identifier to associate documentation
+
+        note: if the doc starts with a linebreak, it would not be reformatted.
+        """
+        __slots__ = ['id', 'doc']
+
+        def __init__(self, id_, doc):
+            assert isinstance(id_, str) and isinstance(doc, str), (id_, doc)
+            self.id = id_
+            self.doc = doc
+
+        @property
+        def no_reformat(self):
+            """whether reformat is disallowed for this doc string"""
+            return self.doc.startswith('\n')
+
+        @property
+        def raw_lines(self):
+            """the doc lines when ``no_reformat`` is true"""
+            ret = self.doc.split('\n')
+            assert not ret[0]
+            return ret[1:]
+
+        @classmethod
+        def make(cls, v):
+            """make a doc object from a str or Doc"""
+            if isinstance(v, cls):
+                return v
+            assert isinstance(v, str)
+            return cls(v, '')
+
+        def __str__(self):
+            return self.id
+
+        def __eq__(self, rhs):
+            if isinstance(rhs, str):
+                return self.id == rhs
+            return (isinstance(rhs, Doc) and
+                    (self.id, self.doc) == (rhs.id, rhs.doc))
+
+
+    class Enum(Base):
+        """define an enum; the result would contain both an enum class def and its
+        corresponding data field
+
+        :param default: index of default member value
+
+        :attr name_field: name of the data field of this enum in the param
+            struct
+        :attr member_alias: list of (member, alias) pairs
+        """
+        __slots__ = ['name', 'name_field', 'members', 'default',
+                     'member_alias']
+
+        all_enums = {}
+        """(param_name, name) => enum"""
+
+        def __init__(self, param_name, name, name_field, members, default,
+                     member_alias):
+            name = member_defs.Doc.make(name)
+            assert name.id[0].isupper()
+            members = tuple(map(member_defs.Doc.make, members))
+            if isinstance(default, str):
+                if default not in members:
+                    raise ValueError(
+                        "Default value '{}' does not exist.".format(default))
+                default = members.index(default)
+            assert isinstance(default, int)
+            self.name = name
+            self.name_field = self.get_name_field(name.id, name_field)
+            self.members = members
+            self.default = default
+
+            self.all_enums[(param_name, name.id)] = self
+
+            assert isinstance(member_alias, list)
+            self.member_alias = member_alias
+
+        @classmethod
+        def get_name_field(cls, name, name_field):
+            if name_field is None:
+                name_field = name[0].lower() + name[1:]
+            assert isinstance(name_field, str)
+            return name_field
+
+    class Field(Base):
+        """define a normal data field"""
+        __slots__ = ['name', 'dtype', 'default']
+
+        def __init__(self, name, dtype, default):
+            assert isinstance(dtype, member_defs.Dtype)
+            self.name = member_defs.Doc.make(name)
+            self.dtype = dtype
+            self.default = default
+
+    class Const(Base):
+        """define a const data field"""
+        __slots__ = ['name', 'dtype', 'default']
+
+        def __init__(self, name, dtype, default):
+            assert isinstance(dtype, member_defs.Dtype)
+            self.name = member_defs.Doc.make(name)
+            self.dtype = dtype
+            self.default = default
+
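+    # A sketch of how these member classes end up instantiated by a param
+    # definition (values are hypothetical; real ones are produced by the
+    # ParamDef builder methods):
+    #
+    #     member_defs.Field('axis', member_defs.int32, 0)
+    #     member_defs.Enum('Elemwise', 'Mode', None,
+    #                      ['RELU', 'SIGMOID'], 0, [])
+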
+ class EnumAlias(Base): + """alias of enum type from another param""" + __slots__ = ['name', 'name_field', 'src_class', 'src_name', 'default'] + + def __init__(self, name, name_field, src_class, src_name, default): + self.name = name + self.name_field = member_defs.Enum.get_name_field(name, name_field) + self.src_class = src_class + if src_name is None: + src_name = name + self.src_name = src_name + self.default = default + + @property + def src_enum(self): + """source Enum class""" + return member_defs.Enum.all_enums[(self.src_class, self.src_name)] + + def get_default(self): + """get default index; fallback to src index if default is not + set""" + if self.default is None: + return self.src_enum.default + return self.default + + +class ParamDef: + """""" + __all_tags = set() + all_param_defs = [] + + __slots__ = ['name', 'members', 'tag', 'is_legacy'] + + def __init__(self, name, doc='', *, version=0, is_legacy=False): + self.members = [] + self.all_param_defs.append(self) + h = hashlib.sha256(name.encode('utf-8')) + if version: + h.update(struct.pack(' 0: + self._indent() + + +class PyWriter(IndentWriterBase): + + _static_members = None + _non_static_members = None + _enums = None + _enum_map = None + + def __call__(self, fout, defs): + super().__call__(fout) + self._enum_map = {} + self._write('// %s', self._get_header()) + self._write('#include "megbrain/imperative/opdef/all.h"') + self._write('') + self._write('using namespace mgb::imperative;') + self._write('') + self._process(defs) + + def _on_param_begin(self, p): + self._enums = [] + self._non_static_members = [] + self._static_members = [] + + def _reg_enum_single(self, cur_def, e): + alias = None + if isinstance(e, member_defs.Enum): + src = e + else: + assert isinstance(e, member_defs.EnumAlias) + src = e.src_enum + alias = e + + src_py_name = self._enum_map.get(src, None) + if src_py_name is not None: + py_name = '{}{}Enum'.format(cur_def, src.name if alias is None else alias.name) + self._write('m.attr("{}") = m.attr("{}");\n'.format(py_name, src_py_name)) + return + + if alias is None: + enum_name = str(src.name) + else: + enum_name = str(alias.name) + c_name = 'opdef::{}::{}'.format(cur_def, enum_name) + py_name = '{}{}Enum'.format(cur_def, enum_name) + self._write('py::enum_<{}>(m, "{}")'.format(c_name, py_name), indent=1) + for i in src.members: + self._write('.value("{0}", {1}::{0})'.format(i, c_name)) + self._write(';\n', indent=-1) + self._enum_map[src] = py_name + + def _on_param_end(self, p): + cur_def = '{}Def'.format(p.name) + for e in self._enums: + self._reg_enum_single(cur_def, e) + self._write('py::class_(m, "{0}")'.format(cur_def), indent=1) + # TODO: use ctor with given default value + self._write('.def(py::init<>())') + for i in self._static_members: + assert isinstance(i, member_defs.Const) + self._write('.def_property_readonly_static("{0}", []() {{ return opdef::{1}::{0}; }})'.format(i.name, cur_def)) + for i in self._non_static_members: + fname = None + if isinstance(i, member_defs.Field): + fname = i.name + else: + assert isinstance(i, (member_defs.Enum, member_defs.EnumAlias)) + fname = i.name_field + self._write('.def_readwrite("{0}", &opdef::{1}::{0})'.format(fname, cur_def)) + self._write(';\n', indent=-1) + + + def _on_member_enum(self, e,): + self._enums.append(e) + self._non_static_members.append(e) + + def _on_member_enum_alias(self, e): + self._enums.append(e) + self._non_static_members.append(e) + + def _on_member_field(self, f): + self._non_static_members.append(f) + + def 
_on_const_field(self, f): + self._static_members.append(f) + + +class CPPWriter(IndentWriterBase): + _param_namespace = 'opdef' + + _ctor_args = None + """list of (text in func param, var name); func param name must be var name + appended by an underscore""" + _non_static_members = None + + def __call__(self, fout, defs): + super().__call__(fout) + self._write('// %s', self._get_header()) + self._write('#pragma once') + self._write('#include "megdnn.h"') + # which defined in megbrain/tools/param_defs/mgb_opr_param_defs.py + self._write('#include "megbrain/opr/param_defs.h"') + self._write('#include ') + self._write('namespace mgb {') + self._write('namespace imperative {') + self._write('namespace %s {', self._param_namespace) + self._write('namespace {') + self._write('#include "megdnn/dtype.h"') + self._write('using DTypeEnum = megdnn::DTypeEnum;') + self._write('} // anonymous namespace') + self._process(defs) + self._write('} // namespace %s', self._param_namespace) + self._write('} // namespace imperative') + self._write('} // namespace mgb') + self._write('// vim: syntax=cpp.doxygen') + + def _on_param_begin(self, p): + self._write('struct %sDef {', p.name, indent=1) + self._ctor_args = [] + self._non_static_members = [] + + def _add_ctor_args(self, typename, default, varname): + self._ctor_args.append(( + '{} {}_={}'.format(typename, varname, default), + varname)) + + def _on_param_end(self, p): + ''' + MegDNN param structures are not packed and we need to initialize the structure + paddings to zero or it would break MegBrain hash system. We do memset(0) in default + ctor and use a trick, wrapping non-static members in a anonymous union which would + copy the object representation in its default copy/move ctor, for copy/move ctor. + > The implicitly-defined copy/move constructor for a non-union class X performs + > a memberwise copy/move of its bases and members. [class.copy.ctor 14] + > The implicitly-defined copy/move constructor for a union X copies the object + > representation (6.9) of X. 
[class.copy.ctor 15] + ''' + if self._non_static_members: + self._write('union { struct {') + for i in self._non_static_members: + if isinstance(i, member_defs.Field): + self._write('%s%s %s;', i.dtype.cname_attr, i.dtype.cname, i.name) + else: + assert isinstance(i, (member_defs.Enum, member_defs.EnumAlias)) + self._write('%s %s;', i.name, i.name_field) + self._write('}; };') + param_list = [] + if self._ctor_args: + pdefs, varnames = zip(*self._ctor_args) + self._write('%sDef(%s) {', p.name, ', '.join(pdefs), indent=1) + self._write('memset(this, 0, sizeof(*this));') + for var in varnames: + self._write('this->%s = %s_;', var, var) + param_list.append(str(var)) + self._write('}', indent=-1) + self._write('megdnn::param::%s param() {', self._cur_class, indent=1) + self._write('return {%s};', ','.join(param_list)) + self._write('}', indent=-1) + self._write('};\n', indent=-1) + + + def __on_member_enum(self, e, default_value): + self._write('using %s = megdnn::param::%s::%s;', e.name, self._cur_class, e.name) + self._non_static_members.append(e) + self._add_ctor_args(e.name, default_value, e.name_field) + + def _on_member_enum(self, e,): + self.__on_member_enum(e, '{}::{}'.format(e.name, e.members[e.default])) + + def _on_member_enum_alias(self, e): + self.__on_member_enum(e, '{}::{}'.format(e.name, e.src_enum.members[e.get_default()])) + + def _on_member_field(self, f): + self._non_static_members.append(f) + self._add_ctor_args(f.dtype.cname, f.default, f.name) + + def _on_const_field(self, f): + if 'int' in f.dtype.cname: + self._write('static constexpr %s%s %s = %s;', f.dtype.cname_attr, f.dtype.cname, f.name, f.default) + else: + self._write('static const %s%s %s = %s;', f.dtype.cname_attr, f.dtype.cname, f.name, f.default) + +def main(): + parser = argparse.ArgumentParser( + 'generate opr param defs from description file') + parser.add_argument('-t', '--type', choices=['c++', 'py'], default='c++', + help='output type') + parser.add_argument('input') + parser.add_argument('output') + args = parser.parse_args() + + with open(args.input) as fin: + inputs = fin.read() + exec(inputs, {'pdef': ParamDef, 'Doc': member_defs.Doc}) + input_hash = hashlib.sha256() + input_hash.update(inputs.encode(encoding='UTF-8')) + input_hash = input_hash.hexdigest() + + if args.type == 'py': + writer = PyWriter() + else: + writer = CPPWriter() + + with open(args.output, 'w') as fout: + writer.set_input_hash(input_hash)(fout, ParamDef.all_param_defs) + +if __name__ == '__main__': + main() diff --git a/imperative/python/tools/gen_ops.py b/imperative/python/tools/gen_ops.py new file mode 100755 index 0000000000000000000000000000000000000000..059efc6c925823e97fac68240ba5cf72ea771c2a --- /dev/null +++ b/imperative/python/tools/gen_ops.py @@ -0,0 +1,276 @@ +# -*- coding: utf-8 -*- +# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") +# +# Copyright (c) 2014-2020 Megvii Inc. All rights reserved. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
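+# This generator exec()s the *.oprdecl files with `decl_opr`/`decl_raw_opr`
+# bound to the Context methods below, then splices the generated classes into
+# ops.tpl.py. A declaration it consumes looks roughly like this (illustrative,
+# matching the decl_opr signature below; real declarations live in
+# src/**/*.oprdecl):
+#
+#     decl_opr('Dimshuffle',
+#              inputs=[Doc('src', 'input tensor')],
+#              params='Dimshuffle')
+#
+# Anything declared with a `body=` override is recorded in `skipped` and has
+# to be implemented by hand.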
+from io import StringIO +import re +import argparse +import subprocess +import os +import textwrap +import inspect + + +def camel2underscore( + name, *, + first_cap_re=re.compile('([A-Z])([A-Z][a-z]+)'), + all_cap_re = re.compile('([a-z])([A-Z]+)')): + if name.isupper(): + return name.lower() + s1 = first_cap_re.sub(r'\1_\2', name) + return all_cap_re.sub(r'\1_\2', s1).lower() + + +def caller_lineno(level=1): + f = inspect.stack()[level+1] + return '%s:%d' % (f.filename, f.lineno) + + +class Doc: + """wrap an identifier and doc""" + _id = None + + def __init__(self, id_, doc, typestr=None, default=None): + self._id = id_ + self.doc = doc + self.typestr = typestr + self.default = default + + def __str__(self): + return self._id + + +class Context: + fout = None + + def __init__(self): + self.fout = StringIO() + self.indent = 0 + self.generated = [] + self.skipped = [] + + def write(self, text, *fmt, indent=0): + text = textwrap.dedent(text) + text = textwrap.indent(text, ' '*4*(self.indent + indent)) + text = text % fmt + if not text.endswith('\n'): + text += '\n' + self.fout.write(text) + + def _gen_signature(self, params, *, have_config=True, + has_out_dtype=False): + sig = ['self', '*'] + + for i, _ in params: + sig.append('{}=None'.format(i)) + + if have_config: + sig.extend(['name=None', 'comp_node=None', 'config=None']) + if has_out_dtype: + sig.append('dtype=None') + + if params: + sig.append('**kwargs') + + if sig[-1] == '*': + sig.pop() + return ', '.join(sig) + + def _write_canonize_inputs(self, inputs, convert_inputs, + convert_inputs_args=None, + has_out_dtype=False): + self._write_gen_config(has_out_dtype) + inputs = list(map(str, inputs)) + if convert_inputs_args is None: + if inputs[0][0] == '*': + arg = inputs[0][1:] + else: + arg = '[{}]'.format(', '.join(inputs)) + else: + arg = convert_inputs_args + self.write('inputs = helper.%s(%s, config=config)', + convert_inputs, arg) + + def _write_gen_config(self, has_out_dtype=False): + self.write('''\ + config = config or Config() + if name: + config.name = name + if comp_node: + config.comp_node = comp_node + ''') + if has_out_dtype: + self.write('''\ + if dtype: + config.dtype = dtype + ''') + self.write('self.config = config') + + def _write_make_params(self, params): + for pname, ptype in params: + self.write('self.%s = helper.make_param(%s, param_defs.%s, kwargs)', + pname, pname, ptype) + self.write('assert not kwargs, "extra kwargs: {}".format(kwargs)') + + def _write_doc(self, inputs, params, desc): + self.write('"""') + if isinstance(desc, Doc): + assert desc._id is None + self.write(desc.doc) + elif desc: + for i in textwrap.wrap(desc, 75): + self.write(i) + + self.write('') + for i in inputs: + name = str(i) + typestr = ':class:`.Tensor`' + if name[0] == '*': + name = name[1:] + typestr = 'list of ' + typestr + if isinstance(i, Doc): + self.write(':param %s: %s', name, i.doc) + if i.typestr is not None: + typestr = i.typestr + if typestr: + if not isinstance(i, Doc): + self.write(':param %s: ', name) + self.write(':type %s: %s', name, typestr) + + for pname, ptype in params: + self.write(':param %s: ', pname) + self.write(':type %s: :class:`~megbrain.opr_param_defs.%s`', + pname, ptype) + + self.write(':param comp_node: see doc for *config*') + self.write(':param name: see doc for *config*') + self.write( + ':param config: give a :class:`.OperatorNodeConfig` object to set ' + 'operator name and comp node. 
This can also be achieved by passing ' + '*comp_node* and *name* separately.') + + self.write('"""') + + def _write_return(self, name, outputs): + self.write('opdef = helper.PodOpVisitor("%s", config, params)', name) + self.write('outputs = helper.create_op(opdef, inputs)') + if outputs: + self.write('outputs = [outputs[i] for i in %s]', + list(map(int, outputs))) + self.write('return helper.convert_outputs(outputs)') + + def decl_opr(self, name, *, inputs, params, desc=None, pyname=None, + canonize_input_vars=None, + canonize_input_vars_args=None, body=None, + outputs=None, version=0, has_out_dtype=False): + """ + :param inputs: name of variable inputs; a name starting with `*' means + a list of vars + :type inputs: list of str + :param params: (param name, param type) pairs; it can be a single + string representing the param type, and param name defaults to + 'param' + :type params: list of pair of str, or str + :param pyname: python function name + :param body: extra statements to be placed before calling _create_opr + :param outputs: the indices of output vars to be selected from raw opr + result + """ + if body: + self.skipped.append(name) + return + + body = body or [] + if isinstance(params, str): + params = [('param', params)] + assert params + + self.write('# %s', caller_lineno()) + self.write('class %s(PodOpVisitor):', name) + self.indent += 1 + + param_names, _ = zip(*params) + self.write('param_names = (%s,)', ', '.join(map('"{}"'.format, param_names))) + self.write('name = "%s"', '{}V{}'.format(name, version) if version else name) + self.write('\n') + + self.write('def __init__(%s):', + self._gen_signature(params, + has_out_dtype=has_out_dtype)) + self.indent += 1 + + self._write_gen_config(has_out_dtype=has_out_dtype) + self.write('\n') + + self._write_make_params(params) + + self.write('\n') + self.indent -= 2 + + self.generated.append(name) + + def decl_raw_opr(self, name, *, inputs, inputs_cvt=[], body=None, + desc=None, local_defs=[], have_config=True): + self.skipped.append(name) + + def get_str(self): + return self.fout.getvalue() + + def all_list(self): + buf = StringIO() + print( + '[', + *(' "%s",' % i for i in self.generated), + ']', + sep='\n', + file=buf + ) + return buf.getvalue() + + +def main(): + parser = argparse.ArgumentParser( + description='generate operator function def code from decl file') + parser.add_argument('inputs', nargs='+') + parser.add_argument('--output', '-o') + args = parser.parse_args() + + gen = Context() + exec_globals = { + 'decl_opr': gen.decl_opr, + 'decl_raw_opr': gen.decl_raw_opr, + 'Doc': Doc, + 'camel2underscore': camel2underscore, + } + for i in args.inputs: + print('generate ops from {}'.format(i)) + with open(i) as fin: + exec(compile(fin.read(), i, 'exec'), exec_globals) + + try: + git_commit = subprocess.check_output( + ['git', 'rev-parse', 'HEAD'], universal_newlines=True, + cwd=os.path.dirname(os.path.realpath(__file__))).strip() + except: + git_commit = 'NOT_A_GIT_REPO' + + def relpath(*args): + d = os.path.dirname(__file__) + return os.path.join(d, *args) + + with open(relpath('ops.tpl.py')) as fin: + with open(args.output, 'w') as fout: + fout.write(fin.read() + .replace('{%all%}', gen.all_list()) + .replace('{%body%}', gen.get_str()) + .replace('{%git_commit%}', git_commit)) + + print('Skipped:') + print(*gen.skipped, sep='\n') + +if __name__ == '__main__': + main() diff --git a/imperative/python/tools/ops.tpl.py b/imperative/python/tools/ops.tpl.py new file mode 100644 index 
0000000000000000000000000000000000000000..f91004b1f732886623952ea4629be2942af1ff8d
--- /dev/null
+++ b/imperative/python/tools/ops.tpl.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+"""This python module contains functions to apply the operators defined by
+megbrain.
+
+.. note::
+    Most of the functions are automatically generated. Their signatures
+    contain a ``param`` argument (or more than one argument, such as
+    :func:`convolution`, which has ``param`` and ``execution_polity``), and
+    they also accept keyword arguments. Such a function can be called either
+    by providing a param object of the appropriate type, or by passing the
+    arguments needed by the param object's constructor as keyword arguments.
+    Furthermore, for a param that needs an enumeration member, the enum name
+    can be used to refer to the enum object.
+
+    For example, the following statements are equivalent::
+
+        elemwise([a, b], mode='max')
+        elemwise([a, b], mode=opr_param_defs.Elemwise.Mode.MAX)
+        elemwise([a, b], param=opr_param_defs.Elemwise('max'))
+"""
+
+__git_commit__ = "{%git_commit%}"
+
+import collections
+
+from . import helper
+from .helper import PodOpVisitor
+from . import param_defs
+from ..._imperative_rt import OperatorNodeConfig as Config
+
+__all__ = {%all%}
+
+{%body%}
diff --git a/imperative/src/impl/blob_manager_impl.cpp b/imperative/src/impl/blob_manager_impl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fa3c5316c54f1bfd734cc07460ce445ba265866
--- /dev/null
+++ b/imperative/src/impl/blob_manager_impl.cpp
@@ -0,0 +1,162 @@
+/**
+ * \file imperative/src/impl/blob_manager_impl.cpp
+ *
+ * This file is part of MegBrain, a deep learning framework developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ *
+ */
+
+#include "./blob_manager_impl.h"
+#include "megbrain/utils/arith_helper.h"
+#include
+
+namespace mgb {
+namespace imperative {
+
+BlobManagerImpl::BlobData::BlobData(Blob* in_blob) {
+    blob = in_blob;
+    DeviceTensorStorage d_storage;
+    d_storage.reset(blob->m_comp_node, blob->m_size, blob->m_storage);
+
+    h_storage = HostTensorStorage(blob->m_comp_node);
+
+    h_storage.ensure_size(blob->m_size);
+
+    h_storage.copy_from(const_cast(d_storage), blob->m_size);
+}
+
+void BlobManagerImpl::register_blob(Blob* blob) {
+    // add the blob to the comp2blobs map
+    MGB_LOCK_GUARD(m_mtx);
+    mgb_assert(m_comp2blobs_map[blob->m_comp_node].insert(blob));
+}
+
+void BlobManagerImpl::unregister_blob(Blob* blob) {
+    // erase the blob from the comp2blobs map
+    MGB_LOCK_GUARD(m_mtx);
+    mgb_assert(1 == m_comp2blobs_map[blob->m_comp_node].erase(blob));
+}
+
+void BlobManagerImpl::alloc_with_defrag(Blob* blob, size_t size) {
+    if (!m_enable) {
+        alloc_direct(blob, size);
+    } else {
+        // // debug
+        // defrag(blob->m_comp_node);
+        // alloc_direct(blob, storage, size);
+
+        // try to alloc
+        MGB_TRY { alloc_direct(blob, size); }
+        // if that fails, defrag and alloc again
+        MGB_CATCH(MemAllocError&, {
+            mgb_log_warn("memory allocation failed for blob; try defragmenting");
+            defrag(blob->m_comp_node);
+            alloc_direct(blob, size);
+        });
+    }
+}
+
+
+void BlobManagerImpl::alloc_direct(Blob* blob, size_t size) {
+    DeviceTensorStorage storage(blob->m_comp_node);
+    mgb_assert(blob->m_comp_node.valid());
+    storage.ensure_size(size);
+    blob->m_storage = storage.raw_storage();
+}
+
+void BlobManagerImpl::defrag(const CompNode& cn) {
+    BlobSetWithMux* blobs_set_ptr;
+    {
+        MGB_LOCK_GUARD(m_mtx);
+        blobs_set_ptr = &m_comp2blobs_map[cn];
+    }
+    MGB_LOCK_GUARD(blobs_set_ptr->mtx);
+    std::vector blob_data_array;
+    std::set storage_set;
+
+    auto alignment = cn.get_mem_addr_alignment();
+    size_t tot_sz = 0;
+
+    // copy each blob to HostTensorStorage, then release its device storage
+    for (auto i : blobs_set_ptr->blobs_set) {
+        // skip if the blob does not have m_storage
+        if (!i->m_storage) continue;
+
+        // skip if use_count() > 1
+        if (i->m_storage.use_count() > 1) continue;
+
+        // two blobs can't share the same storage
+        mgb_assert(storage_set.insert(i->m_storage).second);
+
+        tot_sz += get_aligned_power2(i->m_size, alignment);
+        BlobData blob_data(i);
+        blob_data_array.push_back(blob_data);
+        i->m_storage.reset();
+    }
+    // clear all, to make sure m_storage gets released
+    storage_set.clear();
+
+    // skip if there is no blob to defrag
+    if (!blob_data_array.size()) return;
+
+    // wait for all other comp nodes to avoid a moved var being read; note that
+    // ExecEnv has been paused, so no new task would be dispatched
+    CompNode::sync_all();
+    CompNode::try_coalesce_all_free_memory();
+
+    // try to free all
+    MGB_TRY { cn.free_device(cn.alloc_device(tot_sz)); }
+    MGB_CATCH(MemAllocError&, {})
+
+    // allocate for each storage
+    for (auto i : blob_data_array) {
+        DeviceTensorStorage d_storage = DeviceTensorStorage(cn);
+        d_storage.ensure_size(i.blob->m_size);
+        d_storage.copy_from(i.h_storage, i.blob->m_size);
+        i.blob->m_storage = d_storage.raw_storage();
+    }
+
+    // wait for the copies to finish before destructing host values
+    cn.sync();
+}
+
+void BlobManagerImpl::set_enable(bool flag) {
+    m_enable = flag;
+}
+
+struct BlobManagerStub : BlobManager {
+    void alloc_with_defrag(Blob* blob, size_t size) {
+        mgb_assert(0, "prohibited after global variable destruction");
+    };
+    void register_blob(Blob* blob) {
+        mgb_assert(0, "prohibited after global variable destruction");
+    };
+    void
unregister_blob(Blob* blob) {}; + void set_enable(bool flag) { + mgb_assert(0, "prohibited after global variable destruction"); + }; + void defrag(const CompNode& cn) { + mgb_assert(0, "prohibited after global variable destruction"); + }; +}; + +BlobManager* BlobManager::inst() { + static std::aligned_union_t<0, BlobManagerImpl, BlobManagerStub> storage; + + struct Keeper { + Keeper() { + new(&storage) BlobManagerImpl(); + } + ~Keeper() { + reinterpret_cast(&storage)->~BlobManager(); + new(&storage) BlobManagerStub(); + } + }; + static Keeper _; + + return reinterpret_cast(&storage); +} + +} // namespace imperative +} // namespace mgb diff --git a/imperative/src/impl/blob_manager_impl.h b/imperative/src/impl/blob_manager_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..32ee2879be734a3634166e51ac53d8411d45b38d --- /dev/null +++ b/imperative/src/impl/blob_manager_impl.h @@ -0,0 +1,59 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/imperative/blob_manager.h" + +namespace mgb { +namespace imperative { + +class BlobManagerImpl final: public BlobManager { + + struct BlobSetWithMux { + std::mutex mtx; + ThinHashSet blobs_set; + bool insert(Blob* blob) { + MGB_LOCK_GUARD(mtx); + return blobs_set.insert(blob).second; + } + size_t erase(Blob* blob) { + MGB_LOCK_GUARD(mtx); + return blobs_set.erase(blob); + } + }; + + struct BlobData { + Blob* blob; + HostTensorStorage h_storage; + BlobData(Blob* in_blob); + }; + + std::mutex m_mtx; + CompNode::UnorderedMap m_comp2blobs_map; + bool m_enable; + + void defrag(const CompNode& cn) override; + + void alloc_direct(Blob* blob, size_t size); + +public: + static BlobManager* inst(); + + void alloc_with_defrag(Blob* blob, size_t size) override; + + void register_blob(Blob* blob) override; + + void unregister_blob(Blob* blob) override; + + void set_enable(bool flag) override; +}; + +} // namespace imperative +} // namespace mgb diff --git a/imperative/src/impl/dnn_op_helper.h b/imperative/src/impl/dnn_op_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..17017854d98358c59e263febb0b3c9a68a5b5638 --- /dev/null +++ b/imperative/src/impl/dnn_op_helper.h @@ -0,0 +1,54 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#include "megbrain/comp_node_env.h" +#include "megbrain/comp_node.h" + +using namespace megdnn; + +namespace mgb { +namespace imperative { + +/*! 
+ * \brief A struct for safely calling DNN oprs + * In some cases, op may be released before the complete of the execution + * This destructor will prevent this + */ +template +struct DnnOprCaller { + CompNode cn; + DeviceTensorND dev_tensor; + Workspace workspace; + std::unique_ptr op; + + DnnOprCaller(CompNode cn): cn(cn) { + auto&& handle = MegDNNHandle::get( + CompNodeEnv::from_comp_node(cn)).handle(); + op = handle->create_operator(); + } + + megdnn::Workspace create_workspace(TensorLayout layout) { + dev_tensor = Tensor::make(layout, cn)->dev_tensor(); + workspace = megdnn::Workspace(dev_tensor.raw_ptr(), + dev_tensor.storage().size()); + return workspace; + } + + ~DnnOprCaller() { + using DT = CompNode::DeviceType; + if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) { + CompNodeEnv::from_comp_node(cn).cpu_env().dispatch( + [p = op.release()] { delete p; } + ); + } + } +}; + +} // namespace imperative +} // namespace mgb \ No newline at end of file diff --git a/imperative/src/impl/interpreter_impl.cpp b/imperative/src/impl/interpreter_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..297976a8179f1fac808f141d22ea7b800bc6bd13 --- /dev/null +++ b/imperative/src/impl/interpreter_impl.cpp @@ -0,0 +1,213 @@ +#include "./interpreter_impl.h" + + +using namespace mgb; +using namespace imperative; +using namespace interpreter; +using namespace interpreter::intl; + + +std::unique_ptr InterpreterImpl::create_channel() { + return std::make_unique(); +} + +Interpreter& Interpreter::inst() { + static InterpreterImpl inst_; + return inst_; +} + +void* ChannelImpl::put(const HostTensorND& value) { + auto info = alloc(); + info->desc.layout = value.layout(); + info->desc.comp_node = value.comp_node(); + info->desc.value = value.proxy_to_default_cpu(); + m_valid_handle.insert(info); + m_worker.add_task(Put{info, value}); + return info; +} + +void ChannelImpl::del(void* handle) { + mgb_assert(m_valid_handle.erase(handle), "invalid handle: %p", handle); + m_worker.add_task(Del{reinterpret_cast(handle)}); +} + +SmallVector ChannelImpl::apply_op( + std::shared_ptr op, + const SmallVector& inputs) { + SmallVector input_descs; + input_descs.reserve(inputs.size()); + for (auto h : inputs) { + auto info = reinterpret_cast(h); + input_descs.push_back(info->desc); + } + auto output_descs = OpDef::infer_output_attrs_fallible(*op, input_descs); + ApplyOp cmd{std::move(op)}; + cmd.inputs.reserve(inputs.size()); + for (auto i : inputs) { + cmd.inputs.push_back(reinterpret_cast(i)); + } + cmd.outputs.reserve(output_descs.size()); + SmallVector outputs; + for (auto&& desc : output_descs) { + auto info = alloc(); + info->desc = desc; + m_valid_handle.insert(info); + cmd.outputs.push_back(info); + outputs.push_back(info); + } + m_worker.add_task(std::move(cmd)); + return outputs; +} + +HostTensorND ChannelImpl::get_value(void* handle) { + mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(), + "invalid handle: %p", handle); + auto info = reinterpret_cast(handle); + std::unique_lock lock(m_mutex); + mgb_assert(!m_waitee); + if (!info->value_fetched) { + m_waitee = info; + m_worker.add_task(GetValue{info}); + m_cv.wait(lock, [&]() { + check_worker_exc_unsafe(); + return info->value_fetched; + }); + m_waitee = nullptr; + } + mgb_assert(info->ptr->value_fetched()); + return info->ptr->get_value(); +} + +TensorShape ChannelImpl::get_shape(void* handle) { + mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(), + "invalid handle: %p", handle); + auto info = 
reinterpret_cast(handle); + if (info->desc.layout.ndim != 0) { + return info->desc.layout; + } + std::unique_lock lock(m_mutex); + mgb_assert(!m_waitee); + m_waitee = info; + m_cv.wait(lock, [&]() { + check_worker_exc_unsafe(); + return bool(info->ptr); + }); + m_waitee = nullptr; + TensorShape ret = info->ptr->layout(); + mgb_assert(ret.ndim != 0); + return ret; +} + +DType ChannelImpl::get_dtype(void* handle) { + mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(), + "invalid handle: %p", handle); + auto info = reinterpret_cast(handle); + auto ret = info->desc.layout.dtype; + mgb_assert(ret.valid()); + return ret; +} + +CompNode ChannelImpl::get_device(void* handle) { + mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(), + "invalid handle: %p", handle); + auto info = reinterpret_cast(handle); + auto ret = info->desc.comp_node; + mgb_assert(ret.valid()); + return ret; +} + +DeviceTensorND ChannelImpl::get_dev_tensor(void* handle) { + mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(), + "invalid handle: %p", handle); + auto info = reinterpret_cast(handle); + std::unique_lock lock(m_mutex); + mgb_assert(!m_waitee); + m_waitee = info; + m_cv.wait(lock, [&]() { + check_worker_exc_unsafe(); + return bool(info->ptr); + }); + m_waitee = nullptr; + return info->ptr->dev_tensor(); +} + +void ChannelImpl::sync() { + m_worker.wait_all_task_finish(); + MGB_LOCK_GUARD(m_mutex); + check_worker_exc_unsafe(); +} + +void ChannelImpl::close() { + sync(); +} + +void ChannelImpl::config_async_level(int level) { + mgb_assert(0); +} + +TensorInfo* ChannelImpl::alloc() { + MGB_LOCK_GUARD(m_mutex); + return m_pool.alloc(); +} + +void ChannelImpl::free(TensorInfo* ptr) { + MGB_LOCK_GUARD(m_mutex); + m_pool.free(ptr); +} + +ChannelImpl::~ChannelImpl() {} + +void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) { + MGB_LOCK_GUARD(m_mutex); + dest->value_fetched = ptr->value_fetched(); + dest->ptr = std::move(ptr); + if (m_waitee == dest) { + m_cv.notify_all(); + } +} + +void ChannelImpl::process_one_task(Command& cmd) { + std::visit([this](auto& cmd) { + using T = std::remove_reference_t; + try { + if constexpr (std::is_same_v) { + produce_tensor(cmd.dest, Tensor::make(cmd.value)); + } else if constexpr (std::is_same_v) { + SmallVector tensor_inputs; + tensor_inputs.reserve(cmd.inputs.size()); + for (auto i : cmd.inputs) { + tensor_inputs.push_back(i->ptr); + } + auto tensor_outputs = OpDef::apply_on_physical_tensor(*cmd.op, tensor_inputs); + mgb_assert(tensor_outputs.size() == cmd.outputs.size()); + for (size_t i = 0; i < tensor_outputs.size(); ++i) { + produce_tensor(cmd.outputs[i], std::move(tensor_outputs[i])); + } + } else if constexpr (std::is_same_v) { + free(cmd.dest); + } else if constexpr (std::is_same_v) { + cmd.dest->ptr->fetch_value(); + MGB_LOCK_GUARD(m_mutex); + cmd.dest->value_fetched = true; + if (m_waitee == cmd.dest) { + m_cv.notify_all(); + } + } else { + static_assert(!std::is_same_v); + } + } catch (...) 
{ + MGB_LOCK_GUARD(m_mutex); + m_worker_exc = std::current_exception(); + m_cv.notify_all(); + } + }, cmd); +} + + +void ChannelImpl::check_worker_exc_unsafe() { + if (m_worker_exc) { + std::exception_ptr exc; + std::swap(exc, m_worker_exc); + std::rethrow_exception(exc); + } +} diff --git a/imperative/src/impl/interpreter_impl.h b/imperative/src/impl/interpreter_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..fae219958d5fddc40f62eff026dce688bcad562b --- /dev/null +++ b/imperative/src/impl/interpreter_impl.h @@ -0,0 +1,95 @@ +#include +#include + +#include "megbrain/utils/mempool.h" +#include "megbrain/imperative/interpreter.h" + + +namespace mgb::imperative::interpreter::intl { + +using Handle = Interpreter::Handle; + +struct InterpreterImpl : Interpreter { + std::unique_ptr create_channel() override; +}; + +struct TensorInfo { + TensorPtr ptr; + LogicalTensorDesc desc; + bool value_fetched = false; +}; + +struct Put { + TensorInfo* dest; + HostTensorND value; +}; +struct ApplyOp { + std::shared_ptr op; + SmallVector inputs; + SmallVector outputs; +}; +struct Del { + TensorInfo* dest; +}; +struct GetValue { + TensorInfo* dest; +}; +using Command = std::variant; + +struct ChannelImpl : Interpreter::Channel { + ChannelImpl() : m_worker(this) {} + ~ChannelImpl() override; + + Handle put(const HostTensorND& value) override; + + void del(Handle) override; + + SmallVector apply_op( + std::shared_ptr op, + const SmallVector& inputs) override; + + HostTensorND get_value(Handle) override; + TensorShape get_shape(Handle) override; + DType get_dtype(Handle) override; + CompNode get_device(Handle) override; + + DeviceTensorND get_dev_tensor(Handle) override; + + void sync() override; + void close() override; + + void config_async_level(int level) override; + +private: + TensorInfo* alloc(); + void free(TensorInfo*); + + void process_one_task(Command&); + + void check_worker_exc_unsafe(); + + void produce_tensor(TensorInfo* dest, TensorPtr ptr); + + std::mutex m_mutex; + std::condition_variable m_cv; + MemPool m_pool; + std::unordered_set m_valid_handle; + TensorInfo* m_waitee = nullptr; + std::exception_ptr m_worker_exc; + + struct WorkQueue : AsyncQueueSC { + WorkQueue(ChannelImpl* owner) : m_owner(owner) {} + void process_one_task(Command& cmd) { + m_owner->process_one_task(cmd); + } + private: + ChannelImpl* m_owner; + } m_worker; + + int m_async_level = 2; +}; + +} // namespace mgb::imperative::interpreter::intl diff --git a/imperative/src/impl/op_def.cpp b/imperative/src/impl/op_def.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cd1d1c39d6bd67ea8d486fc41af143f6a977e168 --- /dev/null +++ b/imperative/src/impl/op_def.cpp @@ -0,0 +1,82 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
+ * + */ + +#include "megbrain/imperative/op_def.h" +#include "megbrain/imperative/ops/opr_attr.h" + +#include "./op_trait.h" + +namespace mgb { +namespace imperative { + +std::shared_ptr OpDef::make_from_op_node( + cg::OperatorNodeBase* node) { + OpTrait* trait; + trait = OpTrait::find_by_typeinfo(node->dyn_typeinfo()); + if (!trait) { + // TODO: register `make_from_op_node` for each OperatorNode + // instead of forwarding to OprAttr + trait = OpTrait::find_by_typeinfo(OprAttr::typeinfo()); + } + mgb_assert(trait); + return trait->make_from_op_node(node); +} + +SmallVector OpDef::apply_on_physical_tensor( + const OpDef& def, + const SmallVector& inputs) { + return def.trait()->apply_on_physical_tensor(def, inputs); +} + +void OpDef::exec( + const OpDef& def, + const SmallVector& inputs, + const SmallVector& outputs) { + def.trait()->exec(def, inputs, outputs); +} + +cg::OperatorNodeBase* OpDef::apply_on_var_node( + const OpDef& def, + const VarNodeArray& inputs) { + return def.trait()->apply_on_var_node(def, inputs); +} + +SmallVector OpDef::infer_output_attrs_fallible( + const OpDef& def, + const SmallVector& inputs) { + return def.trait()->infer_output_attrs_fallible(def, inputs); +} + +SmallVector OpDef::infer_output_attrs( + const OpDef& def, + const SmallVector& inputs) { + return def.trait()->infer_output_attrs(def, inputs); +} + +BackwardGraphResult OpDef::make_backward_graph( + const OpDef& def, + const SmallVector& inputs, + const SmallVector& input_requires_grad, + const SmallVector& output_has_grad) { + return def.trait()->make_backward_graph(def, inputs, input_requires_grad, output_has_grad); +} + +const OpTrait* OpDef::trait() const { + if (!m_trait) { + m_trait = OpTrait::find_by_typeinfo(dyn_typeinfo()); + mgb_throw_if(!m_trait, MegBrainError, + "can not find op_trait by %s", dyn_typeinfo()->name); + } + return m_trait; +} + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/op_trait.cpp b/imperative/src/impl/op_trait.cpp new file mode 100644 index 0000000000000000000000000000000000000000..06163296a0bdc24cec7a9a1dcad9421b5dd42864 --- /dev/null +++ b/imperative/src/impl/op_trait.cpp @@ -0,0 +1,160 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#include + +#include "megbrain/imperative/ops/opr_attr.h" + +#include "./op_trait.h" +#include "./proxy_graph_detail.h" + +namespace mgb { +namespace imperative { + +namespace detail { + +struct StaticData { + std::list registries; + std::unordered_map name2reg; + std::unordered_map type2reg; +}; + +// use "Construct On First Use" to prevent "static initialization order fiasco" +// (i.e., ensure global registry was initialized before calling opr registration) +StaticData& static_data() { + static StaticData data; + return data; +} + +template +struct __not_implementation__; + +template +struct __not_implementation__ { + static RType raise(Args ...) 
+        mgb_throw(MegBrainError, "Not Implemented");
+    }
+};
+
+} // namespace detail
+
+OpTrait::OpTrait(const char* name_): name(name_) {}
+
+OpTrait* OpTrait::find_by_typeinfo(Typeinfo* type) {
+    auto&& type2reg = detail::static_data().type2reg;
+    auto iter = type2reg.find(type);
+    if (iter == type2reg.end()) {
+        return nullptr;
+    }
+    return iter->second;
+}
+
+OpTrait* OpTrait::find_by_name(const char* name) {
+    auto&& name2reg = detail::static_data().name2reg;
+    auto iter = name2reg.find(name);
+    if (iter == name2reg.end()) {
+        return nullptr;
+    }
+    return iter->second;
+}
+
+void OpTrait::for_each_trait(thin_function<void(OpTrait&)> visitor){
+    for(auto& trait: detail::static_data().registries){
+        visitor(trait);
+    }
+}
+
+OpTraitRegistry& OpTraitRegistry::finalize() {
+    std::ostringstream msg;
+    #define CHECK(field) if (!trait->field) { \
+        msg << ", " #field; \
+        trait->field = \
+            detail::__not_implementation__<decltype(OpDef::field)>::raise; \
+    }
+    CHECK(make_from_op_node);
+    CHECK(apply_on_physical_tensor);
+    CHECK(exec);
+    CHECK(apply_on_var_node);
+    CHECK(infer_output_attrs_fallible);
+    CHECK(infer_output_attrs);
+    CHECK(make_backward_graph);
+    #undef CHECK
+    if (msg.tellp() > 0) {
+        mgb_log_warn(
+                "%s op trait missing: %s",
+                trait->name ? trait->name : "(anonymous)",
+                msg.str().c_str() + 2 /* skip first ", " */);
+    }
+    return *this;
+}
+
+SmallVector<TensorPtr> fallback_apply_on_physical_tensor(
+        const OpDef& def,
+        const SmallVector<TensorPtr>& inputs) {
+    auto desc = OpDef::infer_output_attrs(def, inputs);
+    SmallVector<TensorPtr> outputs;
+    for (auto&& i : desc) {
+        outputs.push_back(Tensor::make(i.layout, i.comp_node));
+    }
+    OpDef::exec(def, inputs, outputs);
+    return outputs;
+}
+
+SmallVector<LogicalTensorDesc> fallback_infer_output_attrs(const OpDef& def,
+        const SmallVector<TensorPtr>& inputs){
+    SmallVector<LogicalTensorDesc> input_descs;
+    for(auto&& input: inputs){
+        input_descs.push_back({input->layout(), input->comp_node()});
+    }
+    return input_descs;
+}
+
+OpTraitRegistry& OpTraitRegistry::fallback() {
+    if (!trait->exec && trait->apply_on_var_node) {
+        trait->exec = proxy_graph_detail::exec;
+    }
+    if (!trait->infer_output_attrs && trait->apply_on_var_node) {
+        trait->infer_output_attrs = proxy_graph_detail::infer_output_attrs;
+    }
+    if (!trait->infer_output_attrs_fallible && trait->apply_on_var_node) {
+        trait->infer_output_attrs_fallible = proxy_graph_detail::infer_output_attrs_fallible;
+    }
+    if (!trait->make_backward_graph && trait->apply_on_var_node) {
+        trait->make_backward_graph = proxy_graph_detail::make_backward_graph;
+    }
+    if (!trait->apply_on_physical_tensor && trait->infer_output_attrs && trait->exec) {
+        trait->apply_on_physical_tensor = fallback_apply_on_physical_tensor;
+    }
+    if(!trait->infer_output_attrs && trait->infer_output_attrs_fallible){
+        trait->infer_output_attrs = fallback_infer_output_attrs;
+    }
+    return *this;
+}
+
+void OpTraitRegistry::do_insert(Typeinfo* type) {
+    auto&& sd = detail::static_data();
+    mgb_assert(sd.type2reg.emplace(type, trait).second);
+}
+
+OpTraitRegistry OpTraitRegistry::do_insert(const char* name) {
+    auto&& sd = detail::static_data();
+    if (name) {
+        mgb_assert(!sd.name2reg.count(name),
+            "duplicated opr trait %s", name);
+    }
+    sd.registries.emplace_back(name);
+    auto ret = &sd.registries.back();
+    sd.name2reg.emplace(name, ret);
+    return {ret};
+}
+
+} // namespace imperative
+} // namespace mgb
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/imperative/src/impl/op_trait.h b/imperative/src/impl/op_trait.h
new file mode 100644
index 
0000000000000000000000000000000000000000..bf92bab2d2dc4e2258d7850ce4ee9eb954739973 --- /dev/null +++ b/imperative/src/impl/op_trait.h @@ -0,0 +1,119 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/imperative/op_def.h" + +namespace mgb { +namespace imperative { + +using OpDefMaker = thin_function< + decltype(OpDef::make_from_op_node)>; +using ApplyOnPhysicalTensor = thin_function< + decltype(OpDef::apply_on_physical_tensor)>; +using PhysicalTensorExecutor = thin_function< + decltype(OpDef::exec)>; +using ApplyOnVarNode = thin_function< + decltype(OpDef::apply_on_var_node)>; +using InferOutputAttrsFallible = thin_function< + decltype(OpDef::infer_output_attrs_fallible)>; +using InferOutputAttrs = thin_function< + decltype(OpDef::infer_output_attrs)>; +using GradMaker = thin_function< + decltype(OpDef::make_backward_graph)>; + +struct OpTrait { + const char* name; + OpDefMaker make_from_op_node; + ApplyOnPhysicalTensor apply_on_physical_tensor; + PhysicalTensorExecutor exec; + ApplyOnVarNode apply_on_var_node; + InferOutputAttrsFallible infer_output_attrs_fallible; + InferOutputAttrs infer_output_attrs; + GradMaker make_backward_graph; + OpTrait(const char* name); + static OpTrait* find_by_name(const char* name); + static OpTrait* find_by_typeinfo(Typeinfo* type); + static void for_each_trait(thin_function visitor); +}; + +struct OpTraitRegistry { + OpTrait* trait; + OpTraitRegistry& make_from_op_node(OpDefMaker f) { + trait->make_from_op_node = f; + return *this; + } + OpTraitRegistry& apply_on_physical_tensor(ApplyOnPhysicalTensor f) { + trait->apply_on_physical_tensor = f; + return *this; + } + OpTraitRegistry& physical_tensor_executor(PhysicalTensorExecutor f) { + trait->exec = f; + return *this; + } + OpTraitRegistry& apply_on_var_node(ApplyOnVarNode f) { + trait->apply_on_var_node = f; + return *this; + } + OpTraitRegistry& infer_output_attrs_fallible(InferOutputAttrsFallible f) { + trait->infer_output_attrs_fallible = f; + return *this; + } + OpTraitRegistry& infer_output_attrs(InferOutputAttrs f) { + trait->infer_output_attrs = f; + return *this; + } + OpTraitRegistry& grad_maker(GradMaker f) { + trait->make_backward_graph = f; + return *this; + } + OpTraitRegistry& fallback(); + OpTraitRegistry& finalize(); + + template + void insert() { + do_insert(T::typeinfo()); + } + + template + void insert() { + insert(); + insert(); + } + + template + static OpTraitRegistry insert(const char* name) { + auto&& ret = do_insert(name); + ret.insert(); + return ret; + } + + void do_insert(Typeinfo* type); + + static OpTraitRegistry do_insert(const char* name); +}; + +namespace detail { +struct _RegisterHelper { + OpTraitRegistry registry; + ~_RegisterHelper() { + registry.finalize(); + } +}; +} // namespace detail + +} // namespace imperative +} // namespace mgb + +#define OP_TRAIT_REG(name, ...) 
\ + static OpTraitRegistry __##name##_global_registry__ = \ + detail::_RegisterHelper{OpTraitRegistry::insert<__VA_ARGS__>(#name)}.registry + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/opr_utility.cpp b/imperative/src/impl/opr_utility.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a39f65669f6a80d0033c7e9419f3b8b77ba33e9c --- /dev/null +++ b/imperative/src/impl/opr_utility.cpp @@ -0,0 +1,185 @@ +/** + * \file src/core/impl/imperative/opr_utility.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#include "megbrain/imperative/opr_utility.h" + +// FIXME; setup_config_cn is copied from src/opr/impl/utility.cpp +namespace { +mgb::OperatorNodeConfig setup_config_cn(const mgb::OperatorNodeConfig& config_, + const mgb::CompNode& cn) { + auto prev_cn = config_.get_single_comp_node(); + mgb_assert(!prev_cn.valid() || cn == prev_cn); + auto config = config_; + config.comp_node(cn); + return config; +} +} // namespace +namespace mgb { +namespace opr { + +/* ================ InputCallback ================== */ + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(InputCallback); + +InputCallback::InputCallback(cg::ComputingGraph& graph, callback_t callback, + const VarNodeArray& inputs, + const OperatorNodeConfig& config) + : Super(&graph, config, "input_callback", inputs), + m_callback(callback) { + for (VarNode* i : inputs) { + add_input({i}); + } + DType dt = config.output_dtype(); + mgb_assert(dt.valid()); + add_output(None)->add_flag(VarNode::Flag::NO_SYS_MEM_ALLOC).dtype(dt); + add_output(None) + ->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE) + .add_flag(VarNode::Flag::NO_SYS_MEM_ALLOC) + .dtype(DType::from_enum(DTypeEnum::Byte)); +} + +SymbolVarArray InputCallback::make(cg::ComputingGraph& graph, + callback_t callback, CompNode comp_node, + DType dtype, const SymbolVarArray& inputs) { + mgb_assert(comp_node.valid()); + mgb_assert(dtype.valid()); + OperatorNodeConfig config; + config.comp_node(comp_node); + config.output_dtype(dtype); + auto vinputs = to_var_node_array(inputs); + auto opr = graph.insert_opr( + std::make_unique(graph, callback, vinputs, config)); + return to_symbol_var_array(opr->output()); +} + +void InputCallback::init_output_static_infer_desc() {} + +cg::OperatorNodeBase::NodeProp* InputCallback::do_make_node_prop() const { + NodeProp* prop = Super::do_make_node_prop(); + prop->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP); + SmallVector dep_types(input().size(), + NodeProp::DepType::DEV_COMP_ORDER); + prop->reset_dep_type(input(), dep_types); + return prop; +} + +void InputCallback::scn_do_execute() { + auto dev_tensor = m_callback(); + output(0)->reset_dev_tensor_from_tensor(dev_tensor); +} + +/* ================ OutputCallback ================== */ + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(OutputCallback); + +OutputCallback::OutputCallback(Param param, const VarNodeArray& inputs, + const OperatorNodeConfig& config) + : Super(inputs[0]->owner_graph(), + setup_config_cn(config, inputs[0]->comp_node()), + "output_callback", inputs), + m_param(std::move(param)) { + for (VarNode* i : inputs) { + add_input({i}); + } + if (!m_param.borrow) { + input(0)->add_flag(VarNode::Flag::NO_SYS_STATIC_MEM_ALLOC); + } + add_output(None) + ->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE) + .add_flag(VarNode::Flag::NO_SYS_MEM_ALLOC) + .dtype(DType::from_enum(DTypeEnum::Byte)); +} + +SymbolVar OutputCallback::make(Param param, const 
SymbolVarArray& inputs) { + mgb_assert(inputs.size() >= 1); + auto vinputs = to_var_node_array(inputs); + OperatorNodeConfig config; + return inputs[0].insert_single_output_opr(std::move(param), + vinputs, config); +} + +void OutputCallback::init_output_static_infer_desc() {} + +cg::OperatorNodeBase::NodeProp* OutputCallback::do_make_node_prop() const { + NodeProp* prop = Super::do_make_node_prop(); + prop->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP); + SmallVector dep_types(input().size(), + NodeProp::DepType::DEV_COMP_ORDER); + dep_types[0] = NodeProp::DepType::DEV_VALUE; + prop->reset_dep_type(input(), dep_types); + return prop; +} + +void OutputCallback::scn_do_execute() { + m_param.callback(input(0)->dev_tensor()); +} + +/* ================ NopCallback ================== */ + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(NopCallback); + +NopCallback::NopCallback(cg::ComputingGraph& graph, callback_t callback, + const VarNodeArray& inputs, + const OperatorNodeConfig& config) + : Super(&graph, config, "nop_callback", inputs), m_callback(callback) { + for (VarNode* i : inputs) { + add_input({i}); + } + add_output(None) + ->add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE) + .add_flag(VarNode::Flag::NO_SYS_MEM_ALLOC) + .dtype(DType::from_enum(DTypeEnum::Byte)); +} + +SymbolVar NopCallback::make(cg::ComputingGraph& graph, callback_t callback, + CompNode comp_node, const SymbolVarArray& inputs) { + mgb_assert(comp_node.valid()); + OperatorNodeConfig config; + config.comp_node(comp_node); + auto vinputs = to_var_node_array(inputs); + auto opr = graph.insert_opr( + std::make_unique(graph, callback, vinputs, config)); + return opr->output(0); +} + +void NopCallback::init_output_static_infer_desc() {} +void NopCallback::on_output_comp_node_stream_changed() {} + +void NopCallback::init_output_comp_node() { + auto cn = config().get_single_comp_node(); + mgb_assert(cn.valid()); + output(0)->comp_node(cn); +} + +cg::OperatorNodeBase::NodeProp* NopCallback::do_make_node_prop() const { + NodeProp* prop = Super::do_make_node_prop(); + SmallVector dep_types(input().size(), + NodeProp::DepType::DEV_COMP_ORDER); + prop->reset_dep_type(input(), dep_types); + prop->add_flag( + cg::OperatorNodeBase::NodeProp::Flag::CROSS_COMP_NODE_MEMORY); + return prop; +} + +void NopCallback::do_execute(ExecEnv& env) { + auto cn = output(0)->comp_node(); + auto runner = [this, cn] { + owner_graph()->event().signal_inplace(this, + cn); + cn.activate(); + m_callback(); + owner_graph()->event().signal_inplace(this, cn); + }; + env.dispatch_on_comp_node(cn, runner); +} + +} // namespace opr +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/ops/backward_graph.cpp b/imperative/src/impl/ops/backward_graph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d34068d8582fcf872cfea2de791c4cfa357b6ce6 --- /dev/null +++ b/imperative/src/impl/ops/backward_graph.cpp @@ -0,0 +1,113 @@ +/** + * \file src/core/impl/imperative/physical_tensor.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
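+ *
+ * BackwardGraph evaluates a recorded internal graph: apply() walks the
+ * expression list, mapping graph nodes to tensors, while infer_attrs()
+ * performs the same walk symbolically over LogicalTensorDesc.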
+ * + */ + +#include "megbrain/imperative/ops/backward_graph.h" +#include "../op_trait.h" + +namespace mgb { +namespace imperative { + +SmallVector +BackwardGraph::InternalGraph::apply( + const SmallVector& inputs) const { + ThinHashMap node2tensor; + auto&& input_nodes = this->inputs; + mgb_assert(inputs.size() == input_nodes.size()); + for (size_t i = 0; i < inputs.size(); ++ i) { + node2tensor[input_nodes[i]] = inputs[i]; + } + for (auto &&i : constants) { + node2tensor[i.first] = i.second; + } + for (size_t i = 0; i < exprs.size(); ++ i) { + auto&& expr = exprs[i]; + SmallVector inputs; + for (auto &&in : std::get<1>(expr)) { + inputs.push_back(node2tensor.at(in)); + } + auto outputs = OpDef::apply_on_physical_tensor( + *std::get<0>(expr), inputs); + auto output_nodes = std::get<2>(expr); + mgb_assert(outputs.size() == output_nodes.size()); + for (size_t i = 0; i < outputs.size(); ++ i) { + node2tensor[output_nodes[i]] = outputs[i]; + } + } + SmallVector ret; + for (auto &&i : outputs) { + ret.push_back(node2tensor.at(i)); + } + return ret; +} + +SmallVector +BackwardGraph::InternalGraph::infer_attrs( + const SmallVector& inputs) const { + using TensorAttr = LogicalTensorDesc; + ThinHashMap node2attr; + auto&& input_nodes = this->inputs; + mgb_assert(inputs.size() == input_nodes.size()); + for (size_t i = 0; i < inputs.size(); ++ i) { + node2attr[input_nodes[i]] = inputs[i]; + } + for (auto &&i : constants) { + auto* value = i.second->try_get_value(); + mgb_assert(value); + node2attr[i.first] = TensorAttr{ + i.second->layout(), i.second->comp_node(), + value->proxy_to_default_cpu()}; + } + for (size_t i = 0; i < exprs.size(); ++ i) { + auto&& expr = exprs[i]; + SmallVector inputs; + for (auto &&in : std::get<1>(expr)) { + inputs.push_back(node2attr.at(in)); + } + auto outputs = OpDef::infer_output_attrs_fallible( + *std::get<0>(expr), inputs); + auto output_nodes = std::get<2>(expr); + mgb_assert(outputs.size() == output_nodes.size()); + for (size_t i = 0; i < outputs.size(); ++ i) { + node2attr[output_nodes[i]] = outputs[i]; + } + } + SmallVector ret; + for (auto &&i : outputs) { + ret.push_back(node2attr.at(i)); + } + return ret; +} + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(BackwardGraph); + +namespace { +SmallVector backward_impl( + const OpDef& backward_graph, + const SmallVector& tensors) { + return backward_graph.cast_final_safe() + .graph().apply(tensors); +} + +SmallVector infer_tensor_attrs( + const OpDef& backward_graph, + const SmallVector inputs) { + return backward_graph.cast_final_safe() + .graph().infer_attrs(inputs); +} + +OP_TRAIT_REG(BackwardGraph, BackwardGraph) + .apply_on_physical_tensor(backward_impl) + .infer_output_attrs_fallible(infer_tensor_attrs) + .fallback(); +} // anonymous namespace + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/ops/collective_comm.cpp b/imperative/src/impl/ops/collective_comm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35eca804c4fb3bed37f55c1a5c24f8b5358cab50 --- /dev/null +++ b/imperative/src/impl/ops/collective_comm.cpp @@ -0,0 +1,59 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
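+ *
+ * CollectiveComm only registers apply_on_var_node: it builds a
+ * GroupClientProxy from "addr:port" and inserts the graph-mode
+ * opr::CollectiveComm; the remaining trait entries come from fallback().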
+ * + */ +#include "megbrain_build_config.h" + +#if MGB_ENABLE_OPR_MM +#include "../op_trait.h" +#include "../proxy_graph_detail.h" +#include "megbrain/opr/mm_handler.h" +#endif // MGB_ENABLE_OPR_MM + +#include "megbrain/imperative/ops/collective_comm.h" + +namespace mgb { +namespace imperative { + +#if MGB_ENABLE_OPR_MM +namespace { +cg::OperatorNodeBase* apply_on_var_node( + const OpDef& def, + const VarNodeArray& inputs) { + auto&& comm = def.cast_final_safe(); + auto group_client = std::make_shared( + ssprintf("%s:%d", comm.addr.data(), comm.port)); + SmallVector> dev_buffer_arr(1, nullptr); + auto disable = std::make_shared(); + disable->set(0); + + cg::OperatorNodeConfig config; + if (comm.comp_node.size() > 0) { + config.comp_node(CompNode::load(comm.comp_node)); + } + + mgb_assert(inputs.size() == 1, "exactly one input expected"); + auto&& graph = inputs[0]->owner_graph(); + + return graph->insert_opr(std::make_unique( + inputs, graph, comm.key, comm.nr_devices, comm.is_root, comm.rank, + comm.local_grad, group_client, comm.mode, comm.dtype, comm.backend, + dev_buffer_arr, config, disable)); +} + +OP_TRAIT_REG(CollectiveComm, CollectiveComm, opr::CollectiveComm) + .apply_on_var_node(apply_on_var_node) + .fallback(); +} // anonymous namespace +#endif // MGB_ENABLE_OPR_MM + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(CollectiveComm); + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/ops/cond_take.cpp b/imperative/src/impl/ops/cond_take.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de4cd0fbf5146e66aa44481e49714d4cf4535d60 --- /dev/null +++ b/imperative/src/impl/ops/cond_take.cpp @@ -0,0 +1,118 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
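+ *
+ * CondTake has data-dependent output shapes, so apply_on_physical_tensor
+ * below invokes the megdnn kernel directly, using a DynOutMallocPolicy
+ * that allocates each output once its size becomes known.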
+ * + */ + +#include "megbrain/imperative/ops/cond_take.h" +#include "megbrain/imperative/ops/opr_attr.h" +#include "megbrain/opr/misc.h" +#include "../dnn_op_helper.h" +#include "../op_trait.h" + +using namespace megdnn; + +namespace mgb::imperative { + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(CondTake); + +namespace { + +class MegDNNDynOutMallocImpl final: public megdnn::DynOutMallocPolicy { + using Output = std::array; + + CompNode m_cn; + Output m_out; + + public: + MegDNNDynOutMallocImpl(CompNode cn): m_cn{cn} {} + + megdnn::TensorND alloc_output( + size_t id, DType dtype, const TensorShape &shape, + void *user_data) override; + + void* alloc_workspace(size_t sz, void *user_data) override; + void free_workspace(void *ptr, void *user_data) override; + TensorPtr at(size_t id); +}; + +megdnn::TensorND MegDNNDynOutMallocImpl::alloc_output( + size_t id, DType dtype, const TensorShape &shape, + void * /*user_data*/) { + TensorLayout m_layout(shape, dtype); + m_out[id] = Tensor::make(m_layout, m_cn); + return m_out[id]->dev_tensor().as_megdnn(); +} + +void* MegDNNDynOutMallocImpl::alloc_workspace(size_t sz, void * /*user_data*/) { + return m_cn.alloc_device(sz); +} + +void MegDNNDynOutMallocImpl::free_workspace(void *ptr, void * /*user_data*/) { + m_cn.free_device(ptr); +} + +TensorPtr MegDNNDynOutMallocImpl::at(size_t id) { + return m_out[id]; +} + +cg::OperatorNodeBase* apply_on_var_node( + const OpDef& def, + const VarNodeArray& inputs) { + def.cast_final_safe(); + auto&& graph = inputs[0]->owner_graph(); + + opr::CondTake::Param param; + param.val = 1; + cg::OperatorNodeConfig config; + cg::OperatorNodeBase* opr = graph->insert_opr( + std::make_unique( + inputs[0], inputs[1], param, config)); + return opr; +} + +SmallVector apply_on_physical_tensor( + const OpDef& def, + const SmallVector& inputs) { + auto opr = def.cast_final_safe(); + mgb_assert(opr.same_type()); + mgb_assert(inputs.size() == 2, "CondTake take 2 inputs, got %lu", + inputs.size()); + + auto&& inp = inputs[0]; + auto&& msk = inputs[1]; + mgb_assert(inp->layout().eq_shape(msk->layout()), + "input shape does not match mask shape"); + mgb_assert(msk->get_value().dtype().enumv() == DTypeEnum::Bool, + "mask dtype must be bool"); + DnnOprCaller dnn_op(inp->comp_node()); + dnn_op.op->param().val = 1; + + TensorLayout m_layout({dnn_op.op->get_workspace_in_bytes(inp->layout())}, + dtype::Byte()); + + auto dnn_workspace = dnn_op.create_workspace(m_layout); + MegDNNDynOutMallocImpl policy{inp->comp_node()}; + + dnn_op.op->exec(inp->dev_tensor().as_megdnn(), + msk->dev_tensor().as_megdnn(), + dnn_workspace, + &policy); + + SmallVector out; + out.push_back(policy.at(0)); + out.push_back(policy.at(1)); + return out; +} + +OP_TRAIT_REG(CondTake, CondTake, opr::CondTake) + .apply_on_var_node(apply_on_var_node) + .apply_on_physical_tensor(apply_on_physical_tensor) + .fallback(); + +} // namespace + +} // namespace mgb::imperative \ No newline at end of file diff --git a/imperative/src/impl/ops/io_remote.cpp b/imperative/src/impl/ops/io_remote.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d7375737f84189b5c48b40dc69cbacf406a0023 --- /dev/null +++ b/imperative/src/impl/ops/io_remote.cpp @@ -0,0 +1,64 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
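+ *
+ * RemoteSend/RemoteRecv are thin wrappers: apply_on_var_node connects a
+ * GroupClientProxy at "addr:port" and inserts the corresponding mm
+ * operator into the owning graph; everything else goes through the
+ * proxy-graph fallback.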
+ * + */ +#include "megbrain_build_config.h" + +#if MGB_ENABLE_OPR_MM +#include "../op_trait.h" +#include "../proxy_graph_detail.h" +#include "megbrain/opr/io_remote.h" +#include "megbrain/opr/mm_handler.h" +#endif // MGB_ENABLE_OPR_MM + +#include "megbrain/imperative/ops/io_remote.h" + +namespace mgb { +namespace imperative { + +#if MGB_ENABLE_OPR_MM +namespace { +cg::OperatorNodeBase* apply_on_var_node_remote_send( + const OpDef& def, const VarNodeArray& inputs) { + auto&& send = def.cast_final_safe(); + auto group_client = std::make_shared( + ssprintf("%s:%d", send.addr.data(), send.port)); + auto&& graph = inputs[0]->owner_graph(); + + cg::OperatorNodeConfig config; + cg::OperatorNodeBase* opr = + graph->insert_opr(std::make_unique( + send.key, inputs[0], group_client, true, config)); + return opr; +} + +cg::OperatorNodeBase* apply_on_var_node_remote_recv( + const OpDef& def, const VarNodeArray& inputs) { + auto&& recv = def.cast_final_safe(); + auto group_client = std::make_shared( + ssprintf("%s:%d", recv.addr.data(), recv.port)); + auto&& graph = inputs[0]->owner_graph(); + return graph->insert_opr(std::make_unique( + recv.key, *graph, group_client, OperatorNodeConfig{recv.cn}, + recv.shape, recv.dtype)); +} + +OP_TRAIT_REG(RemoteSend, RemoteSend, mgb::opr::RemoteSend) + .apply_on_var_node(apply_on_var_node_remote_send) + .fallback(); + +OP_TRAIT_REG(RemoteRecv, RemoteRecv, mgb::opr::RemoteRecv) + .apply_on_var_node(apply_on_var_node_remote_recv) + .fallback(); +} // anonymous namespace +#endif // MGB_ENABLE_OPR_MM + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(RemoteSend); +MGB_DYN_TYPE_OBJ_FINAL_IMPL(RemoteRecv); + +} // namespace imperative +} // namespace mgb diff --git a/imperative/src/impl/ops/nms.cpp b/imperative/src/impl/ops/nms.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3c34c87afaffd58d824c7366826132941a8420bd --- /dev/null +++ b/imperative/src/impl/ops/nms.cpp @@ -0,0 +1,42 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ +#include "../op_trait.h" + +#include "megbrain/imperative/ops/nms.h" +#include "megbrain/opr/standalone/nms_opr.h" + +namespace mgb { +namespace imperative { + +using NMSKeepOpr = opr::standalone::NMSKeep; + +namespace { +cg::OperatorNodeBase* apply_on_var_node( + const OpDef& def, + const VarNodeArray& inputs) { + auto&& nms_keep = def.cast_final_safe(); + + NMSKeepOpr::Param param; + param.iou_thresh = nms_keep.iou_thresh; + param.max_output = nms_keep.max_output; + + return NMSKeepOpr::make(inputs[0], param).node()->owner_opr(); +} + +OP_TRAIT_REG(NMSKeep, NMSKeep, NMSKeepOpr) + .apply_on_var_node(apply_on_var_node) + .fallback(); +} // anonymous namespace + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(NMSKeep); + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/ops/opr_attr.cpp b/imperative/src/impl/ops/opr_attr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..feb52c76a51befd27703ad7cf3c0b68f76366c60 --- /dev/null +++ b/imperative/src/impl/ops/opr_attr.cpp @@ -0,0 +1,123 @@ +/** + * \file src/core/impl/imperative/physical_tensor.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
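+ *
+ * OprAttr carries a serialized operator: its registered type name, the
+ * raw param bytes and the node config. The load/dump contexts below
+ * replay the serialization machinery to rebuild a graph operator from
+ * those bytes and back.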
+ * + */ + +#include "megbrain/imperative/ops/opr_attr.h" +#include "megbrain/serialization/opr_load_dump.h" + +#include "../op_trait.h" +#include "../proxy_graph_detail.h" + +namespace mgb { +namespace imperative { + +namespace { +class OprParamsLoadContext final: public serialization::OprLoadContextRawPOD { + const OprAttr::Param& m_param; + size_t m_pos = 0; + ComputingGraph *m_graph; + + void read_raw(void *dest, size_t size) override final { + mgb_assert(m_pos + size <= m_param.size(), "too many bytes requested"); + memcpy(dest, m_param.data() + m_pos, size); + m_pos += size; + } + + std::shared_ptr load_tensor() override { + mgb_assert(0); + } + + std::shared_ptr load_tensor_shared() override { + mgb_assert(0); + } + + const serialization::GraphLoadConfig& config() const override { + mgb_assert(0); + } + + public: + OprParamsLoadContext(const OprAttr::Param& param, + ComputingGraph *graph): + serialization::OprLoadContextRawPOD(false), m_param(param), m_graph(graph) + {} + + ~OprParamsLoadContext() { + mgb_assert(m_pos == m_param.size(), "param not fully consumed"); + } + + ComputingGraph& graph() override { + return *m_graph; + } +}; + +class OprParamsDumpContext final: public serialization::OprDumpContextRawPOD { +public: + OprAttr::Param m_param; + OprParamsDumpContext() : serialization::OprDumpContextRawPOD(false) {} + void write_raw(const void *data, size_t size) { + const char* src = static_cast(data); + m_param.insert(m_param.end(), src, src + size); + } + void dump_tensor( + const std::string &name, + const HostTensorND &tensor, + TensorWriteMethod method) { + mgb_assert(0); + } + const serialization::GraphDumpConfig& config() const { + mgb_assert(0); + } +}; + +cg::OperatorNodeBase* apply_on_var_node( + const OpDef& def, const VarNodeArray& inputs) { + auto&& attr = def.cast_final_safe(); + mgb_assert(!inputs.empty()); + auto registry = serialization::OprRegistry::find_by_name(attr.type); + mgb_assert(registry, "operator %s not found", attr.type.c_str()); + OprParamsLoadContext ctx{attr.param, inputs[0]->owner_graph()}; + return registry->loader(ctx, inputs, attr.config); +} + +std::shared_ptr make_from_op_node(cg::OperatorNodeBase* opr) { + OprParamsDumpContext ctx; + auto registry = serialization::OprRegistry::find_by_type(opr->dyn_typeinfo()); + mgb_assert(registry, "operator %s not found", opr->dyn_typeinfo()->name); + mgb_assert(registry->dumper, "operator %s cannot be serialized", opr->dyn_typeinfo()->name); + registry->dumper(ctx, *opr); + return OprAttr::make(registry->name, std::move(ctx.m_param), opr->config()); +} + +OP_TRAIT_REG(OprAttr, OprAttr) + .make_from_op_node(make_from_op_node) + .apply_on_var_node(apply_on_var_node) + .fallback(); + +} // anonymous namespace + +bool OprAttr::is_same_st(const Hashable& rhs_) const { + auto&& rhs = static_cast(rhs_); + return type == rhs.type && param == rhs.param + && config.comp_node() == rhs.config.comp_node() + && config.output_dtype() == rhs.config.output_dtype(); +} + +size_t OprAttr::hash() const { + return hash_pair_combine( + hash_pair_combine( + mgb::hash(type), + mgb::hash(static_cast>(param))), + config.hash()); +} + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(OprAttr); + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/ops/tensor_manip.cpp b/imperative/src/impl/ops/tensor_manip.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e8a84e6911ac4fe1a8221768bc6b4e60f14c9e2e --- /dev/null +++ 
b/imperative/src/impl/ops/tensor_manip.cpp @@ -0,0 +1,145 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#include "megbrain/imperative/ops/tensor_manip.h" +#include "megbrain/imperative/ops/opr_attr.h" +#include "megbrain/opr/tensor_manip.h" +#include "../op_trait.h" + +namespace mgb::imperative { +namespace { + +cg::OperatorNodeBase* apply_on_var_node( + const OpDef& def, + const VarNodeArray& inputs) { + def.cast_final_safe(); + return opr::GetVarShape::make(inputs).node()->owner_opr(); +} + +SmallVector apply_on_physical_tensor( + const OpDef& def, + const SmallVector& inputs) { + def.cast_final_safe(); + mgb_assert(inputs.size() == 1, "GetVarShape take 1 input, got %lu", inputs.size()); + auto&& inp = inputs[0]; + auto&& shp = inp->layout(); + mgb_assert(shp.ndim != 0, "input shape invalid"); + HostTensorND hv(inp->comp_node(), {shp.ndim}, dtype::Int32()); + auto* ptr = hv.ptr(); + for (size_t i = 0; i < shp.ndim; ++i) { + ptr[i] = shp.shape[i]; + } + return {Tensor::make(std::move(hv))}; +} + +SmallVector infer_output_attrs_fallible( + const OpDef& def, + const SmallVector& inputs) { + def.cast_final_safe(); + mgb_assert(inputs.size() == 1, "GetVarShape take 1 input, got %lu", inputs.size()); + auto&& desc = inputs[0]; + if (!desc.layout.ndim) { + return {{TensorLayout(dtype::Int32()), desc.comp_node}}; + } + DeviceTensorND value(CompNode::default_cpu(), {desc.layout.ndim}, dtype::Int32()); + auto* ptr = value.ptr(); + for (size_t i = 0; i < desc.layout.ndim; ++i) { + ptr[i] = desc.layout[i]; + } + return {{value.layout(), desc.comp_node, std::move(value)}}; +} + +std::shared_ptr make_from_op_node(cg::OperatorNodeBase* node_) { + auto* node = &node_->cast_final_safe(); + if (node->config().comp_node().size() || + node->config().output_dtype().valid() || + node->param().axis != opr::GetVarShape::Param::INVALID_AXIS) { + mgb_log_warn("weird GetVarShape"); + return OpTrait::find_by_typeinfo(OprAttr::typeinfo())->make_from_op_node(node); + } + return GetVarShape::make(); +} + +OP_TRAIT_REG(GetVarShape, GetVarShape, opr::GetVarShape) + .make_from_op_node(make_from_op_node) + .infer_output_attrs_fallible(infer_output_attrs_fallible) + .apply_on_var_node(apply_on_var_node) + .apply_on_physical_tensor(apply_on_physical_tensor) + .fallback(); + +TensorShapeArray get_shapes(const std::vector>& shapes) { + TensorShapeArray ret; + for (auto&& i:shapes) { + SmallVector shape(i.begin(), i.end()); + TensorShape shp(shape); + ret.push_back(shp); + } + return ret; +} + +cg::OperatorNodeBase* param_pack_split_apply_on_var_node( + const OpDef& def, const VarNodeArray& inputs) { + auto&& param = def.cast_final_safe(); + auto&& graph = inputs[0]->owner_graph(); + + auto&& shapes = get_shapes(param.shapes); + cg::OperatorNodeConfig config; + cg::OperatorNodeBase* opr = + graph->insert_opr(std::make_unique( + inputs[0], param.offsets, shapes, config)); + return opr; +} + +SmallVector param_pack_split_apply_on_physical_tensor( + const OpDef& def, + const SmallVector& inputs) { + auto param = def.cast_final_safe(); + mgb_assert(inputs.size() == 1, "ParamPackSplit take 1 input, got %lu", inputs.size()); + auto&& inp = inputs[0]; + auto&& shp = inp->layout(); + mgb_assert(shp.ndim == 1, "ParamPackSplit input shape invalid, ndim should be 1"); + mgb_assert(param.shapes.size() * 2 == param.offsets.size()); + SmallVector ret; + 
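+    // offsets are stored as (begin, end) element pairs, one pair per
+    // shape (hence the size() * 2 assertion above); each output is a
+    // zero-copy sub-view of the flat input blob starting at byte offset
+    // begin * dtype_size.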
auto&& shapes = get_shapes(param.shapes); + size_t dtype_size = inputs[0]->layout().dtype.size(); + for (size_t i = 0; i < shapes.size(); ++i) { + ret.push_back( + inputs[0]->sub(param.offsets[i * 2] * dtype_size, shapes[i])); + } + return ret; +} + +OP_TRAIT_REG(ParamPackSplit, ParamPackSplit, mgb::opr::ParamPackSplit) + .apply_on_var_node(param_pack_split_apply_on_var_node) + .apply_on_physical_tensor(param_pack_split_apply_on_physical_tensor) + .fallback(); + +cg::OperatorNodeBase* param_pack_concat_apply_on_var_node( + const OpDef& def, const VarNodeArray& inputs) { + auto&& param = def.cast_final_safe(); + auto&& graph = inputs[0]->owner_graph(); + + VarNodeArray inps(inputs.begin(), inputs.end() - 1); + cg::OperatorNodeConfig config; + cg::OperatorNodeBase* opr = + graph->insert_opr(std::make_unique( + inps, inputs.back(), param.offsets, config)); + return opr; +} + +OP_TRAIT_REG(ParamPackConcat, ParamPackConcat, mgb::opr::ParamPackConcat) + .apply_on_var_node(param_pack_concat_apply_on_var_node) + .fallback(); +} // namespace + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(GetVarShape); +MGB_DYN_TYPE_OBJ_FINAL_IMPL(ParamPackSplit); +MGB_DYN_TYPE_OBJ_FINAL_IMPL(ParamPackConcat); + +} // namespace mgb::imperative diff --git a/imperative/src/impl/physical_tensor.cpp b/imperative/src/impl/physical_tensor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..268b6d24cf5d311e08dd489d17004eaa605bb00a --- /dev/null +++ b/imperative/src/impl/physical_tensor.cpp @@ -0,0 +1,425 @@ +/** + * \file src/core/impl/imperative/physical_tensor.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#include "megbrain/imperative.h" +#include "megbrain/imperative/blob_manager.h" +#include + +namespace mgb { +namespace imperative { + +namespace { + +class EventPool : CompNodeDepedentObject { + CompNode::UnorderedMap m_cn2pool; + Spinlock m_lock; + + EventPool() = default; +public: + static EventPool& inst() { + static Spinlock lock; + static std::unique_ptr ptr; + MGB_LOCK_GUARD(lock); + if (!ptr || ptr->is_finalized()) { + ptr.reset(new EventPool()); + } + return *ptr; + } + CompNode::Event* alloc(CompNode cn) { + CompNode::EventPool *pool; + { + MGB_LOCK_GUARD(m_lock); + auto iter = m_cn2pool.find(cn); + if (iter == m_cn2pool.end()) { + iter = m_cn2pool.emplace( + std::piecewise_construct, + std::forward_as_tuple(cn), + std::forward_as_tuple(cn)).first; + } + pool = &iter->second; + } + return pool->alloc(); + } + void free(CompNode::Event* event) { + CompNode::EventPool* pool; + { + MGB_LOCK_GUARD(m_lock); + pool = &m_cn2pool.at(event->comp_node()); + } + pool->free(event); + } + std::shared_ptr on_comp_node_finalize() override { + MGB_LOCK_GUARD(m_lock); + for (auto&& i : m_cn2pool) { + i.second.assert_all_freed(); + } + return {}; + } + ~EventPool() { + for (auto&& i : m_cn2pool) { + i.second.assert_all_freed(); + } + } +}; + +class AsyncReleaser : public CompNodeDepedentObject { + struct WaiterParam { + CompNode cn; + CompNode::Event *event; + BlobPtr blob; + HostTensorStorage::RawStorage storage; + }; + class Waiter final: public AsyncQueueSC { + AsyncReleaser *m_par_releaser; + + public: + Waiter(AsyncReleaser *releaser): + m_par_releaser(releaser) + { + } + + void process_one_task(WaiterParam ¶m) { + if (param.event->finished()) { + param.blob.reset(); + param.storage.reset(); + EventPool::inst().free(param.event); + return; + } + + using namespace std::literals; + 
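+            // the event has not fired yet: sleep briefly and re-enqueue
+            // the task, polling until the comp node reaches the recorded
+            // event and the blob/storage can be released safely.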
std::this_thread::sleep_for(1us); + add_task(std::move(param)); + } + }; + Waiter m_waiter{this}; + +protected: + std::shared_ptr on_comp_node_finalize() override { + m_waiter.wait_task_queue_empty(); + return {}; + } + +public: + static AsyncReleaser* inst() { + static AsyncReleaser releaser; + return &releaser; + } + + ~AsyncReleaser() { + m_waiter.wait_task_queue_empty(); + } + + void add(BlobPtr blob, CompNode cn) { + add(cn, std::move(blob), {}); + } + + void add(const HostTensorND& hv) { + add(hv.comp_node(), {}, hv.storage().raw_storage()); + } + + void add(CompNode cn, BlobPtr blob, HostTensorStorage::RawStorage storage = {}) { + auto event = EventPool::inst().alloc(cn); + event->record(); + m_waiter.add_task({cn, event, std::move(blob), std::move(storage)}); + } +}; + +class CompNodeSyncManager : public CompNodeDepedentObject { + ThinHashMap> m_blob2event; + std::mutex m_mtx; +private: + static CompNodeSyncManager mgr; +public: + std::shared_ptr on_comp_node_finalize() override { + MGB_LOCK_GUARD(m_mtx); + m_blob2event.clear(); + return {}; + } + + static CompNodeSyncManager* inst() { + return &mgr; + } + + CompNode::Event* get_or_create_event(Blob* blob) { + mgb_assert(!is_finalized()); + MGB_LOCK_GUARD(m_mtx); + auto&& e = m_blob2event[blob]; + if (!e) { + e = blob->comp_node().create_event(); + } + return e.get(); + } + + void remove(Blob* blob) { + MGB_LOCK_GUARD(m_mtx); + m_blob2event.erase(blob); + } +}; +CompNodeSyncManager CompNodeSyncManager::mgr; + +// Cache for small blobs +// 1. A blob has to be seen twice (within a window) to be eligible for cache +// 2. Cache eviction occurs when cache size reaches a threshold, in least frequently used order +class ConstTensorCache { +public: + struct Entry { + size_t hitcnt = 0; + std::unique_ptr data; + size_t size; + BlobPtr blob; + + Entry(const dt_byte* ptr, size_t size_, BlobPtr blob_) + : data(new dt_byte[size_]), size(size_), blob(blob_) { + memcpy(data.get(), ptr, size); + } + + // does not check input + bool match(const HostTensorND& hv) { + return 0 == memcmp(data.get(), hv.raw_ptr(), hv.layout().span().high_byte); + } + }; + + bool check(const HostTensorND& hv) { + auto&& layout = hv.layout(); + auto&& span = layout.span(); + return hv.format().is_default() && !hv.empty() && + layout.is_contiguous() && span.low_byte == 0 && + span.high_byte <= max_bytes; + } + + // hash storage; does not check input + static uint64_t hash(const HostTensorND& hv) { + auto&& span = hv.layout().span(); + return XXHash{} + .update(hv.raw_ptr(), span.high_byte) + .digest(); + } + + BlobPtr lookup(const HostTensorND& hv) { + if (!check(hv)) { + return {}; + } + auto h = hash(hv); + MGB_LOCK_GUARD(mtx); + // lookup in g1 + auto it = g1.find(h); + if (it != g1.end()) { + if (!it->second.match(hv)) { + mgb_log_warn("hash collision in const tensor cache"); + return {}; + } + it->second.hitcnt += 1; + return it->second.blob; + } + // lookup in g0 + if (!g0.extract(h) && !g0b.extract(h)) { + maybe_collect_g0(); + g0.emplace(h); + return {}; + } + // add new entry to g1 + maybe_collect_g1(); + Entry entry(hv.raw_ptr(), hv.layout().span().high_byte, Tensor(hv).blob()); + it = g1.emplace_hint(it, h, std::move(entry)); + it->second.hitcnt += 1; + return it->second.blob; + } + + void clear() { + MGB_LOCK_GUARD(mtx); + g0.clear(); + g0b.clear(); + g1.clear(); + } + + std::mutex mtx; + size_t hwm = 1024, lwm = 512, max_bytes = TensorShape::MAX_NDIM * 8, window = 65536; + +private: + void maybe_collect_g0() { + if (g0.size() > window) { + std::swap(g0, g0b); 
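+            // g0 and g0b are two hash generations: swapping then clearing
+            // retires hashes older than ~2 windows, so a blob must be seen
+            // twice within a window to be promoted into g1.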
+ g0.clear(); + } + } + void maybe_collect_g1() { + if (g1.size() <= hwm) return; + + using KV = std::pair; + std::vector tmp; + tmp.reserve(g1.size()); + for (auto&& kv : g1) { + tmp.emplace_back(kv.first, std::move(kv.second)); + } + std::nth_element(tmp.begin(), tmp.begin() + lwm, tmp.end(), [](const KV& lhs, const KV& rhs) { + return lhs.second.hitcnt > rhs.second.hitcnt; + }); + g1.clear(); + for (auto&& kv : tmp) { + kv.second.hitcnt = 0; + g1.emplace(std::move(kv)); + } + } + std::unordered_set g0, g0b; + std::unordered_map g1; +}; + +struct MultiCNConstTensorCache : CompNodeDepedentObject { + std::mutex mtx; + CompNode::UnorderedMap cn2cache; + + std::shared_ptr on_comp_node_finalize() { + MGB_LOCK_GUARD(mtx); + cn2cache.clear(); + return {}; + } + + BlobPtr lookup(const HostTensorND& hv) { + MGB_LOCK_GUARD(mtx); + return cn2cache[hv.comp_node()].lookup(hv); + } +}; + +MultiCNConstTensorCache const_tensor_cache; + +} // namespace + +void EventDeleter::operator()(CompNode::Event* event) { + EventPool::inst().free(event); +} + +Blob::Blob(const DeviceTensorStorage& s): + m_comp_node{s.comp_node()}, m_storage{s.raw_storage()}, + m_size{s.size()} { + BlobManager::inst()->register_blob(this); +} + +Blob::Blob(CompNode cn, size_t sz): + m_comp_node{cn}, m_storage{}, m_size{sz} { + BlobManager::inst()->register_blob(this); +} + +Blob::~Blob() { + BlobManager::inst()->unregister_blob(this); + CompNodeSyncManager::inst()->remove(this); +} + +const Blob::RawStorage& Blob::storage() { + if (!m_storage) { + BlobManager::inst()->alloc_with_defrag(this, m_size); + } + return m_storage; +} + +Tensor::Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset, const HostTensorND& hv) + : m_layout(layout), m_blob(std::move(blob)), m_offset(offset), m_value(hv) { +} + +Tensor::Tensor(const HostTensorND &hv) + : Tensor(hv.layout(), hv.comp_node()) { + m_value = hv; + dev_tensor().copy_from_fixlayout(hv); + // even though hv is saved in m_value, Tensor itself could be + // released before copy completes + AsyncReleaser::inst()->add(hv); +} + +Tensor::Tensor(const DeviceTensorND &dv, const HostTensorND& hv) { + if (!hv.empty()) { + mgb_assert(dv.comp_node() == hv.comp_node()); + mgb_assert(dv.dtype() == hv.dtype()); + mgb_assert(dv.shape().eq_shape(hv.shape())); + m_value = hv; + } + m_layout = dv.layout(); + m_blob = Blob::make(dv.storage()); + m_offset = 0; +} + +Tensor::Tensor(const TensorLayout& layout, const CompNode& cn) + : m_layout{layout}, m_blob{Blob::make(cn, layout.dtype.size(layout.total_nr_elems()))}, + m_offset{0} {} + +Tensor::Tensor(const BlobPtr blob, const size_t offset, const TensorLayout& layout) + : m_layout{layout}, m_blob{blob}, m_offset{offset} {} + +TensorPtr Tensor::make(const HostTensorND& hv) { + auto&& blob = const_tensor_cache.lookup(hv); + if (blob) { + return make(std::forward(blob), hv.layout(), hv); + } + return std::make_shared(hv); +} + +DeviceTensorND Tensor::dev_tensor() { + mgb_assert(m_blob, "uninitialized tensor."); + DeviceTensorStorage storage; + storage.reset(m_blob->comp_node(), m_blob->size(), m_blob->storage()); + storage = storage.sub(m_offset); + DeviceTensorND ret; + ret.reset(storage, m_layout); + return ret; +} + +void Tensor::fetch_value() { + MGB_LOCK_GUARD(m_mtx); + if (m_value.empty()) { + m_value.copy_from(dev_tensor()); + m_value_ready.reset(EventPool::inst().alloc(comp_node())); + m_value_ready->record(); + } +} + +bool Tensor::value_fetched() { + MGB_LOCK_GUARD(m_mtx); + return m_value.layout().ndim != 0; +} + +const HostTensorND& 
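+// blocking accessor: fetch_value() starts the device-to-host copy if
+// needed, then host_wait() blocks until the ready event fires (see the
+// non-blocking try_get_value() below).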
Tensor::get_value() { + fetch_value(); + if (m_value_ready) { + m_value_ready->host_wait(); + } + return m_value; +} + +const HostTensorND* Tensor::try_get_value() { + MGB_LOCK_GUARD(m_mtx); + if (!m_value.empty() && (!m_value_ready || m_value_ready->finished())) { + return &m_value; + } + return nullptr; +} + +TensorPtr Tensor::make_scalar(DTypeScalar value, CompNode cn) { + HostTensorND hv{cn, value.dtype()}; + hv.resize({1}); + memcpy(hv.raw_ptr(), value.storage(), value.dtype().size(1)); + return make(hv); +} + +TensorPtr Tensor::sub(size_t offset, TensorShape shape) { + TensorLayout layout(shape, m_layout.dtype); + return Tensor::make(m_blob, offset + m_offset, layout); +} + +void Tensor::add_release_callback(CompNode cn) { + AsyncReleaser::inst()->add(m_blob, cn); +} + +CompNode::Event* Tensor::get_or_create_event() { + auto e = CompNodeSyncManager::inst()->get_or_create_event(m_blob.get()); + e->record(); + return e; +} + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/profiler.cpp b/imperative/src/impl/profiler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f35f5b3c9232c276ac3b0db93308e66ea5833ba0 --- /dev/null +++ b/imperative/src/impl/profiler.cpp @@ -0,0 +1,214 @@ +/** + * \file src/core/impl/imperative/profiler.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#include "megbrain/imperative/profiler.h" + +#include +#include + +#include "megbrain/imperative/ops/opr_attr.h" +#include "megbrain/imperative/physical_tensor.h" + +#include "./op_trait.h" + +namespace mgb { + +namespace imperative { + +class OpDefInfo{ +public: + size_t id; + std::string name; +}; + +class ProfilerEntry { +public: + ProfilerEntry(size_t index, Profiler::EventKind type, std::unique_ptr device) + : index{index}, type{type}, device{std::move(device)}{ + } + ProfilerEntry(size_t index, Profiler::EventKind type, double host): index{index}, type{type}, host{host}{ + } + size_t index; + Profiler::EventKind type; + std::unique_ptr device = nullptr; + double host = 0; +}; + +class ProfilerPrivate { +public: + std::vector op_list; + std::vector entry_list; + std::vector> event_list; + std::vector>> + hook_list; + ThinHashMap> + comp_node_begin_map; + ThinHashMap comp_node_end_map; + RealTimer timer; + size_t dump_count = 0; + bool enabled = false; + std::string path; +}; + +namespace { +CompNode::UnorderedSet collect_comp_nodes( + const OpDef& def, const SmallVector& inputs) { + CompNode::UnorderedSet comp_nodes; + for (auto&& input : inputs) { + comp_nodes.insert(input->comp_node()); + } + for (auto&& output_attr : def.infer_output_attrs(def, inputs)) { + comp_nodes.insert(output_attr.comp_node); + } + return comp_nodes; +} +} // namespace + +std::unique_ptr Profiler::create_event(CompNode comp_node){ + auto event = comp_node.create_event(CompNode::Event::NEED_TIMER); + event->record(); + auto& [begin, time] = m_private->comp_node_begin_map[comp_node]; + if (begin == nullptr) { + begin = event.get(); + time = m_private->timer.get_msecs(); + } + return event; +} + +double Profiler::get_host_time_now(){ + return m_private->timer.get_msecs(); +} + +double Profiler::get_device_time(CompNode::Event& event) { + auto [base_event, host_time] = + m_private->comp_node_begin_map[event.comp_node()]; + if (base_event == &event) { + return host_time; + } else { + return 
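+        // elapsed_time_until() is in seconds; scale to milliseconds and
+        // offset by the host timestamp recorded with the base event so
+        // device times align with RealTimer::get_msecs() values.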
host_time + base_event->elapsed_time_until(event) * 1000; + } +} + +size_t Profiler::get_dump_count(){ + return m_private->dump_count; +} + +Profiler::Profiler() { + m_private = std::make_unique(); +} + +Profiler::Profiler(const std::string& path): Profiler() { + m_private->path = path; +} + +void Profiler::enable() { + m_private->enabled = true; + CompNode::sync_all(); + OpTrait::for_each_trait([this](OpTrait& trait) { + auto backup = std::make_unique( + std::move(trait.apply_on_physical_tensor)); + trait.apply_on_physical_tensor = + [this, backup = backup.get()] ( + const OpDef& def, + const SmallVector& inputs){ + size_t index = m_private->op_list.size(); + std::string name = "[" + std::to_string(index) + "]" + print_op(def); + m_private->op_list.push_back({reinterpret_cast(&def), name}); + m_private->entry_list.emplace_back(index, OprBegin, get_host_time_now()); + auto&& comp_nodes = collect_comp_nodes(def, inputs); + for (auto&& comp_node : comp_nodes) { + m_private->entry_list.emplace_back(index, OprBegin, create_event(comp_node)); + } + auto output = (*backup)(def, inputs); + for (auto&& comp_node : comp_nodes) { + m_private->entry_list.emplace_back(index, OprEnd, create_event(comp_node)); + } + m_private->entry_list.emplace_back(index, OprEnd, get_host_time_now()); + return output; + }; + m_private->hook_list.push_back({&trait, std::move(backup)}); + }); +} + +void Profiler::disable() { + for (auto&& hook : m_private->hook_list) { + std::get<0>(hook)->apply_on_physical_tensor = + std::move(*std::get<1>(hook)); + } + m_private->hook_list.clear(); + m_private->enabled = false; +} + +Profiler::~Profiler() { +} + +void Profiler::dump(){ + dump(m_private->path); +} + +void Profiler::dump(const std::string& path) { + using namespace json; + auto obj = json::Object::make(); + if (!(*obj)["traceEvents"]) { + (*obj)["traceEvents"] = Array::make(); + } + auto& trace_events = (*obj)["traceEvents"]->cast_final(); + for (auto&& entry : m_private->entry_list) { + auto trace_event_ptr = Object::make(); + auto& trace_event = *trace_event_ptr; + std::string name; + size_t id; + int pid; + std::string tid; + double ts; + const char* ph; + name = m_private->op_list[entry.index].name; + id = entry.index; + pid = getpid(); + if (entry.device) { + entry.device->host_wait(); + ts = get_device_time(*entry.device); + tid = entry.device->comp_node().to_string(); + } else { + ts = entry.host; + tid = "host"; + } + switch (entry.type) { + case OprBegin: { + ph = "B"; + break; + } + case OprEnd: { + ph = "E"; + break; + } + } + trace_event["name"] = String::make(name); + trace_event["id"] = Number::make(id); + trace_event["pid"] = Number::make(pid); + trace_event["tid"] = String::make(tid); + trace_event["ts"] = Number::make(ts * 1000); + trace_event["ph"] = String::make(ph); + trace_events.add(std::move(trace_event_ptr)); + } + obj->writeto_fpath(path.empty() ? 
m_private->path : path);
+    m_private->dump_count++;
+}
+
+std::string Profiler::print_op(const OpDef& def) {
+    auto* opr_attr = def.try_cast_final<OprAttr>();
+    if (opr_attr) {
+        return std::string("OprAttr:") + opr_attr->type;
+    }
+    return def.dyn_typeinfo()->name;
+}
+
+} // namespace imperative
+
+} // namespace mgb
diff --git a/imperative/src/impl/proxy_graph.cpp b/imperative/src/impl/proxy_graph.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b750749ff921e5293b838a0b31b0021f53892bd4
--- /dev/null
+++ b/imperative/src/impl/proxy_graph.cpp
@@ -0,0 +1,850 @@
+/**
+ * \file imperative/src/impl/proxy_graph.cpp
+ *
+ * This file is part of MegBrain, a deep learning framework developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ *
+ */
+
+#include "./blob_manager_impl.h"
+#include "./proxy_graph.h"
+#include "megbrain/graph/static_infer.h"
+#include "megbrain/graph/operator_node.h"
+#include "megbrain/opr/io.h"
+#include "megbrain/opr/utility.h"
+#include "megbrain/imperative/ops/opr_attr.h"
+#include "megbrain/imperative/ops/backward_graph.h"
+
+namespace mgb {
+namespace imperative {
+
+using cg::OperatorNodeBase;
+
+template<bool p, typename T, typename F>
+constexpr auto&& select(T&& t, F&& f) {
+    if constexpr (p) {
+        return std::forward<T>(t);
+    } else {
+        return std::forward<F>(f);
+    }
+}
+
+MGB_DEFINE_OPR_CLASS(
+        ProxyGraph::InputPlaceholder,
+        cg::OperatorNodeBase) // {
+
+    void on_output_comp_node_stream_changed() override {
+        mgb_assert(0);
+    }
+    // TODO: consider implementing the following initialization methods,
+    // so InputPlaceholder can be initialized correctly during
+    // operator insertion
+    void init_output_comp_node() override {
+    }
+    void init_output_format() override {
+    }
+    void init_output_dtype() override {
+    }
+    void init_output_static_infer_desc() override {
+    }
+    void init_output_mem_plan(bool dynamic) override {
+        MGB_MARK_USED_VAR(dynamic);
+        mgb_assert(0);
+    }
+    void do_execute(ExecEnv &env) override {
+        mgb_assert(0);
+    }
+
+public:
+    Tensor* m_tensor;
+
+    InputPlaceholder(ComputingGraph& graph, Tensor* tensor = nullptr,
+                     const DeviceTensorND& static_infer_value = {})
+            : Super(&graph, {}, "device_value", {}), m_tensor(tensor),
+              m_static_infer_value(static_infer_value) {
+        mgb_assert(m_static_infer_value.empty() ||
+                   m_static_infer_value.comp_node() == CompNode::default_cpu());
+        add_output(None)->add_flag(VarNode::Flag::NO_SYS_MEM_ALLOC);
+        // never dedup
+        add_equivalence_component<ScalarHash<void*>>(this);
+    }
+
+    static SymbolVar make(ComputingGraph& graph, Tensor& tensor) {
+        auto opr = graph.insert_opr(
+                std::make_unique<InputPlaceholder>(graph, &tensor));
+        auto var = opr->output(0);
+        auto&& dev_tensor = tensor.dev_tensor();
+        var->m_comp_node = dev_tensor.comp_node();
+        var->m_shape = dev_tensor.shape();
+        var->m_dev_tensor = dev_tensor;
+        var->reset_dev_tensor_from_tensor(dev_tensor);
+        return var;
+    }
+
+    static SymbolVar make(ComputingGraph& graph, const LogicalTensorDesc& desc) {
+        auto opr = graph.insert_opr(
+                std::make_unique<InputPlaceholder>(graph, nullptr, desc.value));
+        auto var = opr->output(0);
+        var->m_comp_node = desc.comp_node;
+        var->m_shape = desc.layout;
+        var->m_dev_tensor.reset({}, TensorLayout(desc.layout.dtype));
+        return var;
+    }
+
+    const DeviceTensorND* get_static_infer_value(bool may_sync) {
+        if (!m_static_infer_value.empty()) {
+            return &m_static_infer_value;
+        }
+        if (m_tensor && (may_sync || m_tensor->try_get_value())) {
+            auto&& hv = m_tensor->get_value();
+            mgb_assert(!hv.empty());
+            m_static_infer_value = hv.proxy_to_default_cpu();
+            // steal 
ownership from shared_ptr + using SP = std::shared_ptr; + auto& sp = const_cast(m_static_infer_value.storage().raw_storage()); + static auto dummy = std::make_shared(); + sp = SP(dummy, sp.get()); + return &m_static_infer_value; + } + return nullptr; + } + +private: + DeviceTensorND m_static_infer_value; +}; +MGB_DYN_TYPE_OBJ_FINAL_IMPL( + ProxyGraph::InputPlaceholder); + +class ProxyGraph::ExecEnv final : public cg::GraphExecutable::ExecEnv { + +public: + void dispatch_on_comp_node(CompNode, Task&& task) override { + task(); + } + + void dispatch_on_comp_node_with_mask(CompNode, Task&& task, + cg::ExecutionMask* mask) override { + mgb_throw_if(mask, GraphError, + "ExecutionMask not supported in imperative mode"); + task(); + } + + void pause_exec() override {} + + void resume_exec() override {} +}; + +class ProxyGraph::StaticInferManager : public cg::static_infer::StaticInferManager { +public: + using Tag = cg::static_infer::Tag; + using ShapeInferDesc = cg::static_infer::ShapeInferDesc; + using ValueInferDesc = cg::static_infer::ValueInferDesc; + using InferType = cg::static_infer::InferType; + using DepVal = cg::static_infer::DepVal; + using DepElement = cg::static_infer::DepElement; + using DepType = cg::static_infer::DepType; + using InpElement = cg::static_infer::InpElement; + + struct Result { + TensorShape shape; + DeviceTensorND value; + }; + + ProxyGraph* owner; + cg::OperatorNodeBase* cur_opr = nullptr; + std::vector> shape_descs; + std::vector> value_descs; + std::vector inferred_outputs; + + StaticInferManager(ProxyGraph* owner_) : owner(owner_) {} + + size_t locate_output(VarNode* var) { + mgb_assert(cur_opr); + auto&& output_vars = cur_opr->output(); + mgb_assert(shape_descs.size() == output_vars.size()); + auto&& it = std::find(output_vars.begin(), output_vars.end(), var); + mgb_assert(it != output_vars.end()); + return it - output_vars.begin(); + } + + void register_shape_infer(Tag dest, const ShapeInferDesc &desc) override { + auto i = locate_output(dest); + mgb_assert(!shape_descs[i]); + shape_descs[i].emplace(desc); + } + + void register_value_infer(Tag dest, const ValueInferDesc &desc) override { + auto i = locate_output(dest); + mgb_assert(!value_descs[i]); + value_descs[i].emplace(desc); + } + + InferType get_infer_type(Tag var) override { + // may be called during get_proxy_opr or make_backward_graph + + // don't let opr apply any immediate optimization + return {InferType::MISSING_INP, InferType::MISSING_INP}; + + if (auto opr = var->owner_opr()->try_cast_final()) { + return {var->shape().ndim ? InferType::CONST : InferType::MISSING_INP, + opr->m_tensor ? InferType::CONST : InferType::MISSING_INP}; + } + if (cur_opr) { + auto&& outputs = cur_opr->output(); + auto&& it = std::find(outputs.begin(), outputs.end(), var); + if (it != outputs.end()) { + return {infer_shape_fallible(var) ? 
InferType::CONST : InferType::MISSING_INP, + // value inference could be expensive + InferType::MISSING_INP}; + } + } + return {InferType::MISSING_INP, InferType::MISSING_INP}; + } + + void update() { + if (cur_opr != owner->m_cur_opr) { + clear(); + cur_opr = owner->m_cur_opr; + if (cur_opr) { + auto nout = cur_opr->output().size(); + shape_descs.resize(nout); + value_descs.resize(nout); + inferred_outputs.resize(nout); + cur_opr->init_output_static_infer_desc(); + } + } + } + + void clear() { + cur_opr = nullptr; + shape_descs.clear(); + value_descs.clear(); + inferred_outputs.clear(); + } + + template + auto do_infer(Tag dest, bool may_sync) + -> const std::conditional_t* { + // Some infer_func does not use InpVal passed to them, but + // call infer_* on their inputs instead, so dest could be an input. + // It is also possible that an opr call infer_* on its inputs before it + // is inserted + if (auto opr = dest->owner_opr()->try_cast_final()) { + if constexpr (is_shape) { + auto* shp = &dest->shape(); + return shp->ndim ? shp : nullptr; + } else { + return opr->get_static_infer_value(may_sync); + } + } + + mgb_assert(cur_opr); + mgb_assert(cur_opr->output().size() == shape_descs.size()); + + // dest must be an output now + auto i = locate_output(dest); + auto& result = inferred_outputs[i]; + auto& desc = select(shape_descs[i], value_descs[i]); + + // return if no need to call infer_func + if constexpr (is_shape) { + if (result.shape.ndim != 0) { + return &result.shape; + } + } else { + if (!result.value.empty()) { + return &result.value; + } + } + if (!desc) { + return nullptr; + } + + // fill args for infer_func + cg::static_infer::InpVal args{1}; + args.val.reserve(desc->deps.size()); + auto push_shape = [&args](const TensorShape* shape) { + args.val.emplace_back(); + args.val.back().m_shape = shape; + }; + auto push_value = [&args](const DeviceTensorND* value) { + args.val.emplace_back(); + args.val.back().m_value = value; + }; + + for (auto&& dep : desc->deps) { + if (auto opr = dep.dest->owner_opr()->template try_cast_final()) { + if (dep.type == DepType::SHAPE) { + if (dep.dest->shape().ndim) { + push_shape(&dep.dest->shape()); + } else { + return nullptr; + } + } else { + if (auto* p = opr->get_static_infer_value(may_sync)) { + push_value(p); + } else { + return nullptr; + } + } + continue; + } + + // dep must be an output + if (dep.type == DepType::SHAPE) { + if (auto* p = do_infer(dep.dest, may_sync)) { + push_shape(p); + } else { + return nullptr; + } + } else { + if (auto* p = do_infer(dep.dest, may_sync)) { + push_value(p); + } else { + return nullptr; + } + } + } + + // call infer_func + if constexpr (is_shape) { + if (!desc->infer_func(result.shape, args)) { + mgb_log_warn("something is missing for shape inference of %s", + cur_opr->dyn_typeinfo()->name); + return nullptr; + } + return &result.shape; + } else { + if (!desc->infer_func(result.value, args)) { + mgb_log_warn("something is missing for value inference of %s", + cur_opr->dyn_typeinfo()->name); + return nullptr; + } + return &result.value; + } + } + + const TensorShape& infer_shape(Tag var) override { + auto* p = do_infer(var, true); + mgb_assert(p, "failed to infer shape for %s", var->name().c_str()); + return *p; + } + const TensorShape* infer_shape_fallible(Tag var) override { + return do_infer(var, false); + } + const DeviceTensorND& infer_value(Tag var) override { + auto* p = do_infer(var, true); + mgb_assert(p, "failed to infer value for %s", var->name().c_str()); + return *p; + } + const DeviceTensorND* 
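+    // the *_fallible variants run do_infer with may_sync=false: they never
+    // block on a device-to-host copy and return nullptr instead.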
+class ProxyGraph::SeqCompNodeOptimizer : public cg::SeqCompNodeOptimizer { + void register_stream_var(VarNode*, StreamPropType) override {} + void register_propagate_function(VarNode*, PropFunction) override {} + StreamPropType stream_prop_type(VarNode*) override {mgb_assert(0);} +}; + +class ProxyGraph::ProxyGraphImpl : public cg::ComputingGraph { + static std::atomic<size_t> m_node_id; + ProxyGraph* m_owner; + MemPool<VarNode> m_var_node_pool; + std::vector<std::unique_ptr<OperatorNodeBase>> m_opr_refkeeper; + CompNode::UnorderedSet m_used_comp_node; + VarReceiverInfo m_var_receiver_info; +public: + ~ProxyGraphImpl() { + mgb_assert(!m_owner->m_cur_opr); + if (is_finalized()) return; + for (auto&& i : m_used_comp_node) { + if (i.device_type() == CompNode::DeviceType::CUDA) continue; + i.sync(); + } + } + + ProxyGraphImpl(ProxyGraph* owner) : m_owner(owner) { + options().imperative_proxy_graph = true; + options().log_level = 0; + m_var_receiver_info.dev_value = 1; + m_var_receiver_info.allow_empty_value = 1; + } + + static std::unique_ptr<ProxyGraphImpl> make(ProxyGraph* owner) { + return std::make_unique<ProxyGraphImpl>(owner); + } + + void add_used_comp_node(CompNode cn) { + m_used_comp_node.insert(cn); + } + + bool invalid() const { + return is_finalized() || nr_oprs_in_graph() > m_owner->m_max_op_cnt; + } + + size_t next_node_id() override { + return m_node_id.fetch_add(1); + } + + void* alloc_varnode_storage() override { + return m_var_node_pool.alloc_raw(); + } + + void free_varnode_storage(void* ptr) override { + m_var_node_pool.free_raw(ptr); + } + + OperatorNodeBase* insert_opr(std::unique_ptr<OperatorNodeBase> opr_uniqp) override { + mgb_assert(!is_finalized()); + auto opr = opr_uniqp.get(); + + if (!opr->inserted_in_graph()) { + m_opr_refkeeper.emplace_back(std::move(opr_uniqp)); + opr->set_inserted_in_graph(); + opr->init_output_comp_node(); + opr->init_output_dtype(); + opr->init_output_format(); + } + return opr; + } + + cg::static_infer::StaticInferManager& static_infer_manager() override { + return *m_owner->m_static_infer_manager; + } + + cg::SeqCompNodeOptimizer& seq_comp_node_optimizer() override { + return *m_owner->m_seq_comp_node_optimizer; + } + + std::shared_ptr<void> on_comp_node_finalize() override { + // FIXME: mutex + mgb_assert(!m_owner->m_cur_opr); + // finalize would do sync first + m_opr_refkeeper.clear(); + return {}; + } + + const VarReceiverInfo& var_receiver_in_current_comp_seq( + const VarNode *var) const override { + return m_var_receiver_info; + } + + size_t nr_oprs_in_graph() const override {return m_opr_refkeeper.size();} + + std::unique_ptr<cg::AsyncExecutable> compile(const OutputSpec &out_spec) override {mgb_assert(0);} + SmallVector<std::unique_ptr<cg::AsyncExecutable>> compile_multi_part( + const SmallVector<OutputSpec>& out_specs) override {mgb_assert(0);} + cg::AsyncExecutable* current_comp_seq() override {mgb_assert(0);} + std::string get_mem_allocation_info() const override {mgb_assert(0);} + VarNode* find_var_by_id(size_t id) const override {mgb_assert(0);} + void share_device_memory_with(ComputingGraph &other) override {mgb_assert(0);} + void set_device_memory_allocator( + std::shared_ptr<DeviceMemoryAllocator> allocator) override {mgb_assert(0);} + size_t get_device_memory_size(CompNode cn) override {mgb_assert(0);} + size_t clear_device_memory() override {mgb_assert(0);} + void set_as_subgraph(ComputingGraph &par_graph) override {mgb_assert(0);} + void record_async_error(std::unique_ptr<MegBrainError> async_exc) override {mgb_assert(0);} +};
+std::atomic<size_t> ProxyGraph::ProxyGraphImpl::m_node_id = 0; + +ProxyGraph::ProxyGraph() : + m_graph(ProxyGraphImpl::make(this)), + m_env{new ExecEnv}, + m_static_infer_manager(new StaticInferManager(this)), + m_seq_comp_node_optimizer(new SeqCompNodeOptimizer()) { +} + +void ProxyGraph::reset() { + mgb_assert(!m_cur_opr); + m_graph = ProxyGraphImpl::make(this); +} + +ProxyGraph* ProxyGraph::get_default_graph() { + static thread_local ProxyGraph inst; + if (inst.m_graph->invalid()) { + inst.reset(); + } + return &inst; +} + +class ProxyGraph::CurOprGuard { +public: + CurOprGuard(ProxyGraph* owner, OperatorNodeBase* opr) : m_owner(owner) { + mgb_assert(!owner->m_cur_opr); + owner->m_cur_opr = opr; + } + CurOprGuard(const CurOprGuard&) = delete; + ~CurOprGuard() { + m_owner->cleanup(); + } +private: + ProxyGraph* m_owner; +}; + +#define CUR_OPR_GUARD(opr) CurOprGuard MGB_TOKENPASTE2(__cur_opr_guard_, __LINE__)(this, opr) + +/*********************** Physical Tensor Impl ***********************/ + +SmallVector<LogicalTensorDesc> ProxyGraph::infer_output_attrs( + const OpDef& opdef, + const SmallVector<Tensor*>& inputs) { + SmallVector<LogicalTensorDesc> ret; + CUR_OPR_GUARD(get_proxy_opr(opdef, inputs)); + do_shape_infer(true); + for (auto&& i: m_cur_opr->usable_output()) { + mgb_assert(i->dtype().valid() && i->comp_node().valid()); + mgb_assert(i->shape().ndim || i->contain_flag(VarNode::Flag::NO_SYS_MEM_ALLOC)); + ret.push_back({{i->shape(), i->dtype()}, i->comp_node()}); + } + return ret; +} + +void ProxyGraph::invoke_op(const OpDef& opdef, + const SmallVector<Tensor*>& inputs, + const SmallVector<Tensor*>& outputs) { + CUR_OPR_GUARD(get_proxy_opr(opdef, inputs)); + init_output_tensor(outputs); + for (auto oup : m_cur_opr->output()) { + m_graph->add_used_comp_node(oup->comp_node()); + } + m_cur_opr->execute(*m_env); +} + +void ProxyGraph::cleanup() { + if (m_cur_opr) { + for (auto&& i : m_cur_opr->input()) { + i->m_dev_tensor.storage({}); + } + for (auto&& i : m_cur_opr->output()) { + i->m_dev_tensor.storage({}); + } + m_static_infer_manager->clear(); + } + m_cur_opr = nullptr; +} + +void ProxyGraph::init_output_tensor(const SmallVector<Tensor*>& outputs) { + // get proxy opr + auto proxy = m_cur_opr; + + do_shape_infer(true); + + size_t j = 0; + for (auto&& var : proxy->output()) { + auto &&chk = var->m_mem_plan.reset_from_owner_var().chunk(); + if (var->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) { + // alloc workspace + TensorLayout layout{var->shape(), var->dtype(), var->format()}; + DeviceTensorStorage storage; + storage.comp_node(var->comp_node()) + .ensure_size(layout.dtype.size(layout.total_nr_elems())); + var->m_dev_tensor.reset(storage, layout); + } else { + mgb_assert(j < outputs.size()); + auto &&tensor = outputs[j]; + auto &&layout = tensor->layout(); + mgb_assert(var->comp_node() == tensor->comp_node() && + var->shape().eq_shape(layout) && + var->dtype() == layout.dtype); + var->assign_dev_tensor_from_tensor(tensor->dev_tensor()); + ++ j; + } + chk.mem_alloc_status.set_from_owner_var(); + } + mgb_assert(j == outputs.size()); + + // Memory forwarding is bypassed in megbrain when the graph option + // imperative_proxy_graph is on; here we call mem_plan_fwd_in2out_readonly + // to initialize the internal state of some oprs (e.g. Subtensor) + // TODO: implement memory forwarding + proxy->mem_plan_fwd_in2out_readonly(); + { + // some oprs (e.g. Reduce) rely on on_mem_status_changed to set + // input/output tensors correctly; since we bypass var_node_mem_mgr, + // on_mem_status_changed should be called here + auto&& cb = proxy->get_opr_event_callback().on_mem_status_changed; + if (cb.valid()) { + cb.val()(); + } + } +}
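// infer_output_attrs and invoke_op above together implement one eager op
// application. A minimal sketch of a hypothetical caller (the real entry
// point, proxy_graph_detail::exec, appears later in this patch):
namespace {
[[maybe_unused]] SmallVector<TensorPtr> example_eager_apply(
        const OpDef& def, const SmallVector<Tensor*>& inputs) {
    auto* graph = ProxyGraph::get_default_graph();
    SmallVector<TensorPtr> outputs;
    SmallVector<Tensor*> raw_outputs;
    for (auto&& desc : graph->infer_output_attrs(def, inputs)) {
        // allocate each output on the comp node the proxy opr selected
        outputs.push_back(Tensor::make(desc.layout, desc.comp_node));
        raw_outputs.push_back(outputs.back().get());
    }
    graph->invoke_op(def, inputs, raw_outputs);
    return outputs;
}
} // anonymous namespace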
+cg::OperatorNodeBase* ProxyGraph::get_proxy_opr( + const OpDef& opdef, + const SmallVector<Tensor*>& inputs) { + VarNodeArray vinputs(inputs.size()); + for (size_t i = 0; i < inputs.size(); ++ i) { + vinputs[i] = InputPlaceholder::make(*m_graph, *inputs[i]).node(); + } + auto opr = OpDef::apply_on_var_node(opdef, vinputs); + mgb_assert(opr->dyn_typeinfo() != InputPlaceholder::typeinfo()); + for (auto &&i : opr->input()) { + mgb_assert(i->owner_opr()->dyn_typeinfo() == + InputPlaceholder::typeinfo()); + } + return opr; +} + +/*********************** Logical Tensor Impl ***********************/ + +size_t ProxyGraph::get_opr_output_size(const OpDef& opdef, + const SmallVector<LogicalTensorDesc>& inputs) { + return get_proxy_opr(opdef, inputs)->usable_output().size(); +} + +SmallVector<LogicalTensorDesc> ProxyGraph::infer_output_attrs_fallible( + const OpDef& opdef, + const SmallVector<LogicalTensorDesc>& inputs) { + auto opr = get_proxy_opr(opdef, inputs); + CUR_OPR_GUARD(opr); + do_shape_infer(false); + SmallVector<LogicalTensorDesc> ret; + for (auto&& i : opr->usable_output()) { + ret.push_back({{i->shape(), i->dtype()}, i->comp_node()}); + } + return ret; +} + +struct ProxyGraph::GradGraph { + cg::VarNodeArray inputs; + cg::VarNodeArray outputs; + cg::VarNodeArray output_grads; + cg::VarNode* grad; +}; + +BackwardGraphResult +ProxyGraph::make_backward_graph( + const OpDef& opdef, + const SmallVector<LogicalTensorDesc>& input_descs, + const SmallVector<bool>& input_requires_grad, + const SmallVector<bool>& output_has_grad) { + ThinHashMap<VarNode*, size_t> var2idx; + auto push = [&var2idx, cnt=0](VarNode* var) mutable { + auto&& ret = var2idx.emplace(var, cnt ++); + mgb_assert(ret.second, "var %s has already been inserted", var->cname()); + return ret.first->second; + }; + auto inputs = make_input_place_holders(input_descs); + auto fwd = OpDef::apply_on_var_node(opdef, inputs); + auto&& outputs = fwd->usable_output(); + SmallVector<LogicalTensorDesc> output_descs; + for (auto&& i : outputs) { + output_descs.push_back({TensorLayout{i->dtype()}, i->comp_node()}); + } + auto output_grads = make_input_place_holders(output_descs); + mgb_assert(output_grads.size() == output_has_grad.size()); + bool any_output_has_grad = false; + for (size_t i = 0; i < output_grads.size(); ++ i) { + if (!output_has_grad[i]) { + output_grads[i] = nullptr; + } else { + any_output_has_grad = true; + } + } + if (!any_output_has_grad) { + return {}; + } + auto* gfunc = cg::lookup_grad_func(fwd->dyn_typeinfo()); + + BackwardGraphResult result; + auto&& backward = BackwardGraph::make(); + auto&& igraph = backward->cast_final_safe<BackwardGraph>().graph(); + + size_t nr_backward_graph_inputs = 0; + auto gen_expr = [this, &var2idx, &igraph, &push, &fwd, + &nr_backward_graph_inputs](cg::OperatorNodeBase* op) { + if (auto t = as_tensor(op)) { + mgb_assert(op->output().size() == 1); + igraph.constants.emplace_back(push(op->output(0)), std::move(t)); + } else if (op->same_type<InputPlaceholder>()) { + ++ nr_backward_graph_inputs; + push(op->output(0)); + } else { + std::vector<size_t> inputs, outputs; + for (auto &&i : op->input()) { + if (i->owner_opr() == fwd) { + if (var2idx.find(i) == var2idx.end()) { + ++ nr_backward_graph_inputs; + push(i); + } + } + inputs.push_back(var2idx.at(i)); + } + for (auto &&i : op->usable_output()) { + outputs.push_back(push(i)); + } + igraph.exprs.emplace_back(OpDef::make_from_op_node(op), inputs, outputs); + }
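        // Worked example of this encoding (indices are hypothetical): for an
        // Elemwise MUL forward z = x * y with placeholders pushed as x->0,
        // y->1 and the output grad pushed as dy->2, the grad expression
        // dx = dy * y is recorded as
        //     exprs == {(Elemwise{MUL}, inputs={2, 1}, outputs={3})}
        // and igraph.outputs will later reference node 3.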
}; + + // set backward graph outputs + cg::DepOprIter iter{gen_expr}; + iter.set_visited(fwd); + result.input_has_grad.resize(inputs.size()); + + VarNodeArray output_grads_with_unused_var; + { + auto iter = output_grads.begin(); + for (auto&& i : fwd->output()) { + if (i->contain_flag(VarNode::Flag::VOLATILE_CONTENT)) { + // the var node with VOLATILE_CONTENT(e.g. workspace + // or an empty var) would not be considered as a normal + // output, so its grad is always NULL + output_grads_with_unused_var.push_back(nullptr); + } else { + output_grads_with_unused_var.push_back(*iter); + ++ iter; + } + } + mgb_assert(iter == output_grads.end()); + } + + Maybe grad_results; + for (size_t i = 0; i < inputs.size(); ++ i) { + VarNode* grad; + if (grad_results.valid()) { + grad = grad_results.val()[i]; + } else { + auto res = (*gfunc)(fwd, i, output_grads_with_unused_var); + if (res.from_single()) { + grad = res.single(); + } else { + grad_results.emplace(res.all(fwd)); + grad = grad_results.val()[i]; + } + } + if (grad && !grad->owner_opr()->same_type() + && input_requires_grad[i]) { + mgb_assert(!grad->owner_opr()->same_type(), + "gradient of operator %s w.r.t. input #%lu is " + "either not well defined or not implemented", + fwd->dyn_typeinfo()->name, i); + iter.add(grad); + igraph.outputs.push_back(var2idx.at(grad)); + result.input_has_grad[i] = true; + } else { + result.input_has_grad[i] = false; + } + } + if (igraph.outputs.empty()) { + return {}; + } + + // set backward graph inputs + igraph.inputs.reserve(nr_backward_graph_inputs); + result.save_for_backward.reserve(nr_backward_graph_inputs); + auto write_inputs = [&igraph, &var2idx, &result](const VarNodeArray& vars) { + for (auto&& i: vars) { + auto&& iter = var2idx.find(i); + if (iter != var2idx.end()) { + igraph.inputs.push_back(iter->second); + result.save_for_backward.push_back(true); + } else { + result.save_for_backward.push_back(false); + } + } + }; + write_inputs(inputs); + write_inputs(outputs); + write_inputs(output_grads); + mgb_assert(igraph.inputs.size() == nr_backward_graph_inputs); + + auto treat_as_single = [](auto&& igraph) { + if (igraph.exprs.size() != 1) + return false; + auto&& expr = igraph.exprs[0]; + auto&& expr_inputs = std::get<1>(expr); + if (expr_inputs.size() != igraph.inputs.size()) { + return false; + } + for (size_t i = 0; i < expr_inputs.size(); ++ i) { + if (igraph.inputs[i] != expr_inputs[i]) { + return false; + } + } + auto&& expr_outputs = std::get<2>(expr); + if (expr_outputs.size() != igraph.outputs.size()) { + return false; + } + for (size_t i = 0; i < expr_outputs.size(); ++ i) { + if (igraph.outputs[i] != expr_outputs[i]) { + return false; + } + } + return true; + }; + if (treat_as_single(igraph)) { + result.backward = std::get<0>(igraph.exprs[0]); + } else { + result.backward = backward; + } + return result; +} + +cg::OperatorNodeBase* ProxyGraph::get_proxy_opr(const OpDef& opdef, + const SmallVector& inputs) { + mgb_assert(!m_cur_opr); + auto vinputs = make_input_place_holders(inputs); + return OpDef::apply_on_var_node(opdef, vinputs); +} + +VarNodeArray ProxyGraph::make_input_place_holders(const SmallVector& inputs) { + VarNodeArray vinputs(inputs.size()); + for (size_t i = 0; i < inputs.size(); ++ i) { + vinputs[i] = InputPlaceholder::make(*m_graph, inputs[i]).node(); + } + return vinputs; +} + +/*********************** Common Impl ***********************/ + +void ProxyGraph::do_shape_infer(bool sync_value) { + m_static_infer_manager->update(); + + for (auto* var : m_cur_opr->output()) { + if 
(sync_value) { + var->shape(m_static_infer_manager->infer_shape(var)); + } else if (auto* shape = m_static_infer_manager->infer_shape_fallible(var)) { + var->shape(*shape); + } + } +} + +TensorPtr ProxyGraph::as_tensor(cg::OperatorNodeBase* opr, bool share) { + // TODO : maybe some tensor should copy value from origin opr rather than + // share the RawStorage + mgb_assert(share, "can't share memory with opr %s", opr->cname()); + if (opr->same_type()) { + auto&& dv = opr->cast_final_safe().value(); + HostTensorND hv(dv.comp_node(), dv.shape(), dv.dtype()); + const DeviceTensorND* cpu_value; + // get host value + if (opr->owner_graph() == m_graph.get()) { + CUR_OPR_GUARD(opr); + m_static_infer_manager->update(); + cpu_value = m_static_infer_manager->infer_value_fallible(opr->output(0)); + } else { + cpu_value = opr->owner_graph()->static_infer_manager().infer_value_fallible(opr->output(0)); + } + mgb_assert(cpu_value); + mgb_assert(cpu_value->comp_node() == CompNode::default_cpu()); + // default_cpu is synchronous with respect to caller + hv.proxy_to_default_cpu().copy_from_fixlayout(*cpu_value); + return Tensor::make(dv, hv); + } else if (opr->same_type()) { + return Tensor::make(opr->cast_final_safe().get_dev_tensor()); + } else { + return {}; + } +} + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/proxy_graph.h b/imperative/src/impl/proxy_graph.h new file mode 100644 index 0000000000000000000000000000000000000000..c26cc32fb1c31ce199e1d32fde47c9ac8ba53b04 --- /dev/null +++ b/imperative/src/impl/proxy_graph.h @@ -0,0 +1,104 @@ +/** + * \file src/core/impl/imperative/proxy_graph.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
+ * + */ + +#pragma once + +#include "megbrain/imperative.h" +#include "megbrain/graph/cg.h" +#include "megbrain/graph/grad_impl.h" +#include "megbrain/comp_node.h" + +#include "megbrain/imperative/ops/backward_graph.h" + +namespace mgb { +namespace imperative { + +class ProxyGraph : public NonCopyableObj { +public: + static ProxyGraph* get_default_graph(); + + /********************** Physical Tensor API **********************/ + + SmallVector infer_output_attrs( + const OpDef& opdef, + const SmallVector& inputs); + + void invoke_op( + const OpDef& opdef, + const SmallVector& inputs, + const SmallVector& outputs); + + BackwardGraphResult make_backward_graph( + const OpDef& opdef, + const SmallVector& input_descs, + const SmallVector& input_requires_grad, + const SmallVector& output_has_grad); + + /********************** Logical Tensor API **********************/ + + size_t get_opr_output_size( + const OpDef& opdef, + const SmallVector& inputs); + + SmallVector infer_output_attrs_fallible( + const OpDef& opdef, + const SmallVector& inputs); + +private: + ProxyGraph(); + + class ProxyGraphImpl; + class ExecEnv; + class StaticInferManager; + class SeqCompNodeOptimizer; + class InputPlaceholder; + struct ProxyGraphInst; + struct GradGraph; + struct CurOprGuard; + + void reset(); + + /********************** Physical Tensor Helper **********************/ + + void cleanup(); + + void init_output_tensor( + const SmallVector& outputs); + + cg::OperatorNodeBase* get_proxy_opr( + const OpDef& opdef, + const SmallVector& inputs); + + /********************** Logical Tensor Helper **********************/ + + cg::OperatorNodeBase* get_proxy_opr( + const OpDef& opdef, + const SmallVector& inputs); + + cg::VarNodeArray make_input_place_holders( + const SmallVector& inputs); + + /********************** Common Helper **********************/ + + void do_shape_infer(bool sync_value); + + TensorPtr as_tensor(cg::OperatorNodeBase* opr, bool share=true); + + cg::OperatorNodeBase* m_cur_opr = nullptr; + std::unique_ptr m_graph; + size_t m_max_op_cnt = 1000; + std::unique_ptr m_env; + std::unique_ptr m_static_infer_manager; + std::unique_ptr m_seq_comp_node_optimizer; +}; + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/impl/proxy_graph_detail.cpp b/imperative/src/impl/proxy_graph_detail.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3fa962cf6c33d480e496b25f35acbe0fe60419e4 --- /dev/null +++ b/imperative/src/impl/proxy_graph_detail.cpp @@ -0,0 +1,124 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
+ * + */ + +#include "./proxy_graph.h" +#include "./proxy_graph_detail.h" + +namespace mgb { +namespace imperative { +namespace proxy_graph_detail { + +namespace { +SmallVector<Tensor*> to_raw_ptr_array( + const SmallVector<TensorPtr>& inputs, + bool ensure_storage=true) { + SmallVector<Tensor*> ret; + for (auto&& i : inputs) { + mgb_assert(i); + ret.push_back(i.get()); + if (ensure_storage) { + // apply lazy allocation + i->blob()->storage(); + } + } + return ret; +} +} // anonymous namespace + +void exec(const OpDef& def, + const SmallVector<TensorPtr>& inputs_, + const SmallVector<TensorPtr>& outputs_) { + auto&& graph = ProxyGraph::get_default_graph(); + auto inputs = to_raw_ptr_array(inputs_), + outputs = to_raw_ptr_array(outputs_); + CompNode::UnorderedSet used_cns; + for (auto&& out: outputs) { + auto cn = out->comp_node(); + if (used_cns.insert(cn).second) { + for (auto&& in: inputs) { + if (in->comp_node() != cn) { + auto&& e = in->get_or_create_event(); + e->device_wait_by(cn); + } + } + } + } + graph->invoke_op(def, inputs, outputs); + for (auto&& cn: used_cns) { + for (auto&& in: inputs) { + if (in->comp_node() != cn) { + in->add_release_callback(cn); + } + } + } +} + +SmallVector<LogicalTensorDesc> infer_output_attrs(const OpDef& def, + const SmallVector<TensorPtr>& inputs) { + auto&& graph = ProxyGraph::get_default_graph(); + return graph->infer_output_attrs(def, to_raw_ptr_array(inputs)); +} + +SmallVector<LogicalTensorDesc> +infer_output_attrs_fallible(const OpDef& def, + const SmallVector<LogicalTensorDesc>& inputs) { + auto&& graph = ProxyGraph::get_default_graph(); + return graph->infer_output_attrs_fallible(def, inputs); +} + +namespace { + +size_t get_backward_graph_hash_key(const OpDef& def, + const SmallVector<LogicalTensorDesc>& inputs, + const SmallVector<bool>& input_requires_grad, + const SmallVector<bool>& output_has_grad) { + XXHash state; + size_t length = 0, data[3 + 2 * inputs.size()]; + data[length ++] = def.hash(); + for (auto &&i : inputs) { + data[length ++] = mgb::hash(i.layout.dtype.handle()); + data[length ++] = mgb::hash(i.comp_node); + } + data[length ++] = mgb::hash(input_requires_grad); + data[length ++] = mgb::hash(output_has_grad); + mgb_assert(length == 3 + 2 * inputs.size()); + state.update(data, length * sizeof(size_t)); + return state.digest(); +} + +struct BackwardGraphCache : std::unordered_map<size_t, BackwardGraphResult>, CompNodeDepedentObject { + std::shared_ptr<void> on_comp_node_finalize() override { + clear(); + return {}; + } +} backward_graph_cache; + +} // anonymous namespace + +BackwardGraphResult +make_backward_graph(const OpDef& def, + const SmallVector<LogicalTensorDesc>& inputs, + const SmallVector<bool>& input_requires_grad, + const SmallVector<bool>& output_has_grad) { + auto&& graph = ProxyGraph::get_default_graph(); + auto hash_key = get_backward_graph_hash_key(def, inputs, input_requires_grad, output_has_grad); + auto&& iter = backward_graph_cache.find(hash_key); + if (iter != backward_graph_cache.end()) { + return iter->second; + } + auto res = graph->make_backward_graph(def, inputs, input_requires_grad, output_has_grad); + backward_graph_cache.emplace(hash_key, res); + return res; +} + +} // namespace proxy_graph_detail +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} \ No newline at end of file
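get_backward_graph_hash_key and BackwardGraphCache above memoize backward-graph construction per (op hash, input dtypes and comp nodes, grad masks). A minimal sketch of a hypothetical caller, assuming an OpDef `def` and input descriptors `descs` obtained from infer_output_attrs_fallible:

    // hypothetical usage, not part of this patch
    auto bg = proxy_graph_detail::make_backward_graph(
            def, descs,
            /*input_requires_grad=*/{true, true},
            /*output_has_grad=*/{true});
    if (bg.backward) {
        // bg.save_for_backward flags which of (inputs, outputs, output_grads)
        // must be kept alive for the backward pass; a repeated call with the
        // same signature returns the cached BackwardGraphResult
    }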
diff --git a/imperative/src/impl/proxy_graph_detail.h b/imperative/src/impl/proxy_graph_detail.h new file mode 100644 index 0000000000000000000000000000000000000000..16c05a6e339ac121d289be00cdc08e3742087392 --- /dev/null +++ b/imperative/src/impl/proxy_graph_detail.h @@ -0,0 +1,39 @@ +/** + * \file imperative/src/impl/proxy_graph_detail.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/imperative/op_def.h" + +namespace mgb { +namespace imperative { +namespace proxy_graph_detail { + +void exec(const OpDef& def, + const SmallVector<TensorPtr>& inputs_, + const SmallVector<TensorPtr>& outputs_); + +SmallVector<LogicalTensorDesc> infer_output_attrs(const OpDef& def, + const SmallVector<TensorPtr>& inputs); + +SmallVector<LogicalTensorDesc> +infer_output_attrs_fallible(const OpDef& def, + const SmallVector<LogicalTensorDesc>& inputs); + +BackwardGraphResult +make_backward_graph(const OpDef& def, + const SmallVector<LogicalTensorDesc>& inputs, + const SmallVector<bool>& input_requires_grad, + const SmallVector<bool>& output_has_grad); + +} // namespace proxy_graph_detail +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} \ No newline at end of file diff --git a/imperative/src/include/megbrain/imperative.h b/imperative/src/include/megbrain/imperative.h new file mode 100644 index 0000000000000000000000000000000000000000..ac3dceda7d5be597497f794b9fe4342a71f55b3a --- /dev/null +++ b/imperative/src/include/megbrain/imperative.h @@ -0,0 +1,16 @@ +/** + * \file imperative/src/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/imperative/physical_tensor.h" +#include "megbrain/imperative/op_def.h" +#include "megbrain/imperative/opdef/all.h" + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/include/megbrain/imperative/blob_manager.h b/imperative/src/include/megbrain/imperative/blob_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..61dbd540d9c57bcf0d857cb39692104a4839aeb2 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/blob_manager.h @@ -0,0 +1,35 @@ +/** + * \file imperative/src/include/megbrain/imperative/blob_manager.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ * + */ + +#pragma once + +#include "megbrain/imperative/physical_tensor.h" + +namespace mgb { +namespace imperative { + +class BlobManager : public NonCopyableObj { +public: + virtual ~BlobManager() = default; + + static BlobManager* inst(); + + virtual void alloc_with_defrag(Blob* blob, size_t size) = 0; + + virtual void register_blob(Blob* blob) = 0; + + virtual void unregister_blob(Blob* blob) = 0; + + virtual void set_enable(bool flag) = 0; + + virtual void defrag(const CompNode& cn) = 0; +}; + +} // namespace imperative +} // namespace mgb diff --git a/imperative/src/include/megbrain/imperative/interpreter.h b/imperative/src/include/megbrain/imperative/interpreter.h new file mode 100644 index 0000000000000000000000000000000000000000..c9124a0c5a0d571844b3473e76849e8944e7b51a --- /dev/null +++ b/imperative/src/include/megbrain/imperative/interpreter.h @@ -0,0 +1,39 @@ +#include + +#include "megbrain/imperative/op_def.h" + +namespace mgb::imperative::interpreter { + +struct Interpreter { + using Handle = void*; + + struct Channel { + virtual ~Channel() = default; + + virtual Handle put(const HostTensorND& value) = 0; + + virtual void del(Handle) = 0; + + virtual SmallVector apply_op( + std::shared_ptr op, + const SmallVector& inputs) = 0; + + virtual HostTensorND get_value(Handle) = 0; + virtual TensorShape get_shape(Handle) = 0; + virtual DType get_dtype(Handle) = 0; + virtual CompNode get_device(Handle) = 0; + + virtual DeviceTensorND get_dev_tensor(Handle) = 0; + + virtual void sync() = 0; + virtual void close() = 0; + + virtual void config_async_level(int level) = 0; + }; + + virtual std::unique_ptr create_channel() = 0; + + static Interpreter& inst(); +}; + +} // namespace mgb::imperative::interpreter diff --git a/imperative/src/include/megbrain/imperative/op_def.h b/imperative/src/include/megbrain/imperative/op_def.h new file mode 100644 index 0000000000000000000000000000000000000000..f742fc6bfa25a54dc443e0e2df20a835745eb067 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/op_def.h @@ -0,0 +1,91 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
+ * + */ + +#pragma once + +#include "megbrain/graph.h" +#include "megbrain/imperative/physical_tensor.h" + +namespace mgb { +namespace imperative { + +class OpDef; +struct OpTrait; + +struct BackwardGraphResult { + std::shared_ptr<OpDef> backward; + std::vector<bool> save_for_backward; + std::vector<bool> input_has_grad; +}; + +class OpDef : public Hashable { + mutable const OpTrait* m_trait = nullptr; +public: + virtual ~OpDef() = default; + + virtual std::shared_ptr<OpDef> copy() const = 0; + + static std::shared_ptr<OpDef> make_from_op_node( + cg::OperatorNodeBase* node); + + static SmallVector<TensorPtr> apply_on_physical_tensor( + const OpDef& def, + const SmallVector<TensorPtr>& inputs); + + static void exec( + const OpDef& def, + const SmallVector<TensorPtr>& inputs, + const SmallVector<TensorPtr>& outputs); + + static cg::OperatorNodeBase* apply_on_var_node( + const OpDef& def, + const VarNodeArray& inputs); + + static SmallVector<LogicalTensorDesc> infer_output_attrs_fallible( + const OpDef& def, + const SmallVector<LogicalTensorDesc>& inputs); + + static SmallVector<LogicalTensorDesc> infer_output_attrs( + const OpDef& def, + const SmallVector<TensorPtr>& inputs); + + static BackwardGraphResult make_backward_graph( + const OpDef& def, + const SmallVector<LogicalTensorDesc>& inputs, + const SmallVector<bool>& input_requires_grad, + const SmallVector<bool>& output_has_grad); + + const OpTrait* trait() const; + + virtual size_t hash() const { + mgb_throw(MegBrainError, "not implemented"); + } + + virtual bool is_same_st(const Hashable&) const { + mgb_throw(MegBrainError, "not implemented"); + } +}; + +template <typename T> +class OpDefImplBase : public OpDef { +public: + virtual std::shared_ptr<OpDef> copy() const override { + return std::shared_ptr<OpDef>(new T(this->cast_final_safe<T>())); + } + + template <typename ...Args> + static std::shared_ptr<T> make(const Args& ...args) { + return std::shared_ptr<T>(new T(args...)); + } +}; + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/include/megbrain/imperative/opr_utility.h b/imperative/src/include/megbrain/imperative/opr_utility.h new file mode 100644 index 0000000000000000000000000000000000000000..f2c94dfe73df03968c47c914c8f236b31ce5abc4 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/opr_utility.h @@ -0,0 +1,95 @@ +/** + * \file imperative/src/include/megbrain/imperative/opr_utility.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/graph.h" +#include "megbrain/graph/event.h" +#include "megbrain/opr/internal/identical_fwd.h" +#include "megbrain/opr/internal/param_tag_defs.h" +#include "megbrain/opr/internal/megdnn_opr_wrapper.h" +#include "megbrain/opr/param_defs.h" + +#include "megdnn/oprs/utils.h" + +namespace mgb { +namespace opr { +/* + * InputCallback, OutputCallback, NopCallback + * Intended for runtime data exchange with Python.
+ */ + +MGB_DEFINE_OPR_CLASS(InputCallback, cg::SingleCNOperatorNodeBase) // { +public: + using callback_t = thin_function; + InputCallback(cg::ComputingGraph& graph, + callback_t callback, + const VarNodeArray& inputs, + const OperatorNodeConfig &config); + static SymbolVarArray make(cg::ComputingGraph& graph, + callback_t callback, + CompNode comp_node, + DType dtype, + const SymbolVarArray& inputs = {}); +protected: + void scn_do_execute() override; + void init_output_static_infer_desc() override; + NodeProp* do_make_node_prop() const override; +private: + callback_t m_callback; +}; + +MGB_DEFINE_OPR_CLASS(OutputCallback, cg::SingleCNOperatorNodeBase) // { +public: + using callback_t = thin_function; + struct Param { + callback_t callback; + bool borrow = false; + }; + OutputCallback(Param param, + const VarNodeArray& inputs, + const OperatorNodeConfig &config); + static SymbolVar make(Param param, + const SymbolVarArray& inputs); + static SymbolVar make(Param param, + SymbolVar input) { + return make(std::move(param), SymbolVarArray{input}); + } +protected: + void scn_do_execute() override; + void init_output_static_infer_desc() override; + NodeProp* do_make_node_prop() const override; +private: + Param m_param; +}; + +MGB_DEFINE_OPR_CLASS(NopCallback, cg::OperatorNodeBase) // { +public: + using callback_t = thin_function; + NopCallback(cg::ComputingGraph& graph, + callback_t callback, + const VarNodeArray& inputs, + const OperatorNodeConfig &config); + static SymbolVar make(cg::ComputingGraph& graph, + callback_t callback, + CompNode comp_node, + const SymbolVarArray& inputs = {}); +protected: + void do_execute(ExecEnv &env) override; + void init_output_static_infer_desc() override; + void init_output_comp_node() override; + void on_output_comp_node_stream_changed() override; + NodeProp* do_make_node_prop() const override; +private: + callback_t m_callback; +}; +} // namespace opr +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/include/megbrain/imperative/ops/backward_graph.h b/imperative/src/include/megbrain/imperative/ops/backward_graph.h new file mode 100644 index 0000000000000000000000000000000000000000..4f5124f180e683a0582b85a8e3de864738cca493 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/ops/backward_graph.h @@ -0,0 +1,58 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
+ * + */ + +#pragma once + +#include "megbrain/imperative/op_def.h" + +namespace mgb { +namespace imperative { + +// a special OpDef used for taking gradients on physical tensors +struct BackwardGraph final : public OpDefImplBase<BackwardGraph> { + MGB_DYN_TYPE_OBJ_FINAL_DECL; +public: + struct InternalGraph { + // op, inputs, outputs + using Expr = std::tuple<std::shared_ptr<OpDef>, + std::vector<size_t>, std::vector<size_t>>; + std::vector<Expr> exprs; + + // index array of input nodes + std::vector<size_t> inputs; + + // index array of output nodes + std::vector<size_t> outputs; + + // pair of (node index, corresponding constant) + std::vector<std::pair<size_t, TensorPtr>> constants; + + SmallVector<TensorPtr> + apply(const SmallVector<TensorPtr>& inputs) const; + + SmallVector<LogicalTensorDesc> + infer_attrs(const SmallVector<LogicalTensorDesc>& inputs) const; + }; + + const InternalGraph& graph() const { + return m_graph; + } + + InternalGraph& graph() { + return m_graph; + } + +private: + InternalGraph m_graph; +}; + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/include/megbrain/imperative/ops/collective_comm.h b/imperative/src/include/megbrain/imperative/ops/collective_comm.h new file mode 100644 index 0000000000000000000000000000000000000000..4d6c515b7cfff5474e4c21a26540df0826bd64f8 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/ops/collective_comm.h @@ -0,0 +1,56 @@ +/** + * \file imperative/src/include/megbrain/imperative/ops/collective_comm.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/imperative/op_def.h" +#include "megbrain/opr/param_defs.h" + +namespace mgb { +namespace imperative { + +class CollectiveComm : public OpDefImplBase<CollectiveComm> { + MGB_DYN_TYPE_OBJ_FINAL_DECL; + +public: + CollectiveComm() = default; + CollectiveComm(const std::string& key_, size_t nr_devices_, + uint32_t rank_, bool is_root_, bool local_grad_, + const std::string& addr_, uint32_t port_, + const megdnn::param::CollectiveComm::Mode& mode_, + const DType& dtype_, const std::string& backend_, + const std::string& comp_node_) + : key(key_), + nr_devices(nr_devices_), + rank(rank_), + is_root(is_root_), + local_grad(local_grad_), + addr(addr_), + port(port_), + mode(mode_), + dtype(dtype_), + backend(backend_), + comp_node(comp_node_) {} + std::string key; + size_t nr_devices; + uint32_t rank; + bool is_root; + bool local_grad; + std::string addr; + uint32_t port; + megdnn::param::CollectiveComm::Mode mode; + DType dtype; + std::string backend; + std::string comp_node; +}; + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
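CollectiveComm above is a plain parameter pack; OpDefImplBase::make simply forwards these constructor arguments. A minimal sketch of building an all-reduce op for rank 0 of a two-GPU group (all values are illustrative; the test in imperative/src/test/collective_comm.cpp later in this patch drives the same constructor):

    auto op = CollectiveComm::make(
            "all_reduce", /*nr_devices=*/2, /*rank=*/0, /*is_root=*/true,
            /*local_grad=*/false, /*addr=*/"127.0.0.1", /*port=*/3456,
            megdnn::param::CollectiveComm::Mode::ALL_REDUCE_SUM,
            dtype::Float32(), /*backend=*/"nccl", /*comp_node=*/"");
    // the result is an ordinary OpDef and can be passed to
    // OpDef::apply_on_physical_tensor like any other op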
diff --git a/imperative/src/include/megbrain/imperative/ops/cond_take.h b/imperative/src/include/megbrain/imperative/ops/cond_take.h new file mode 100644 index 0000000000000000000000000000000000000000..64cdce0e8b044d2956dfd0ad88135f5448ac06a7 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/ops/cond_take.h @@ -0,0 +1,22 @@ +/** + * \file imperative/src/include/megbrain/imperative/ops/cond_take.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/imperative/op_def.h" + +namespace mgb::imperative { + +class CondTake : public OpDefImplBase<CondTake> { + MGB_DYN_TYPE_OBJ_FINAL_DECL; +public: + CondTake() = default; +}; + +} // namespace mgb::imperative diff --git a/imperative/src/include/megbrain/imperative/ops/io_remote.h b/imperative/src/include/megbrain/imperative/ops/io_remote.h new file mode 100644 index 0000000000000000000000000000000000000000..83e5867338a4486f29a4a77cc7e847da73bca303 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/ops/io_remote.h @@ -0,0 +1,61 @@ +/** + * \file imperative/src/include/megbrain/imperative/ops/io_remote.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/imperative/op_def.h" + +namespace mgb { +namespace imperative { + +class RemoteSend : public OpDefImplBase<RemoteSend> { + MGB_DYN_TYPE_OBJ_FINAL_DECL; + +public: + RemoteSend() = default; + RemoteSend(const std::string& key_, const std::string& addr_, + uint32_t port_, uint32_t rank_to_) + : key(key_), + addr(addr_), + port(port_), + rank_to(rank_to_) {} + std::string key; + std::string addr; + uint32_t port; + uint32_t rank_to; +}; + +class RemoteRecv : public OpDefImplBase<RemoteRecv> { + MGB_DYN_TYPE_OBJ_FINAL_DECL; + +public: + RemoteRecv() = default; + RemoteRecv(const std::string& key_, const std::string& addr_, + uint32_t port_, uint32_t rank_from_, TensorShape shape_, + CompNode cn_, const DType& dtype_) + : key(key_), + addr(addr_), + port(port_), + rank_from(rank_from_), + cn(cn_), + shape(shape_), + dtype(dtype_) {} + std::string key; + std::string addr; + uint32_t port; + uint32_t rank_from; + CompNode cn; + TensorShape shape; + DType dtype; +}; + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/include/megbrain/imperative/ops/nms.h b/imperative/src/include/megbrain/imperative/ops/nms.h new file mode 100644 index 0000000000000000000000000000000000000000..80fcc642ef1478b4d270f9bcfcb3bb5b18b74c14 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/ops/nms.h @@ -0,0 +1,26 @@ +/** + * \file imperative/src/include/megbrain/imperative/ops/nms.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/imperative/op_def.h" + +namespace mgb::imperative { + +class NMSKeep : public OpDefImplBase<NMSKeep> { + MGB_DYN_TYPE_OBJ_FINAL_DECL; +public: + float iou_thresh; //!< IoU threshold for overlapping + uint32_t max_output; //!< max number of output boxes per batch + NMSKeep() = default; + NMSKeep(float iou_thresh_, uint32_t max_output_): + iou_thresh(iou_thresh_), max_output(max_output_) {} +}; + +} // namespace mgb::imperative diff --git a/imperative/src/include/megbrain/imperative/ops/opr_attr.h b/imperative/src/include/megbrain/imperative/ops/opr_attr.h new file mode 100644 index 0000000000000000000000000000000000000000..5c8aa03a3803d893d498ed85d49368f6a2c5876a --- /dev/null +++ b/imperative/src/include/megbrain/imperative/ops/opr_attr.h @@ -0,0 +1,53 @@ +/** + * \file imperative/src/include/megbrain/imperative/ops/opr_attr.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ * + */ + +#pragma once + +#include "megbrain/imperative/op_def.h" + +namespace mgb { +namespace imperative { + +struct OprAttr : public OpDefImplBase { + MGB_DYN_TYPE_OBJ_FINAL_DECL; +public: + using Type = std::string; + struct Param : public std::vector { + template + void write_pod(const T& data) { + static_assert(!std::is_pointer::value && is_location_invariant::value); + const char* ptr = static_cast(static_cast(&data)); + insert(end(), ptr, ptr + sizeof(T)); + } + template + void write_pod(const T& data, const Args& ...args) { + write_pod(data); + write_pod(args...); + } + }; + + Type type; + Param param; + cg::OperatorNodeConfig config; + + OprAttr() = default; + OprAttr(const Type& t): type(t){} + OprAttr(const Type& t, const Param& p, const cg::OperatorNodeConfig& c): + type(t), param(p), config(c) {} + + std::string repr() const; + + bool is_same_st(const Hashable& rhs) const; + size_t hash() const; +}; + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/include/megbrain/imperative/ops/tensor_manip.h b/imperative/src/include/megbrain/imperative/ops/tensor_manip.h new file mode 100644 index 0000000000000000000000000000000000000000..c559df1cfa41076959e650484e54926362c74644 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/ops/tensor_manip.h @@ -0,0 +1,56 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/imperative/op_def.h" + +namespace mgb::imperative { + +class GetVarShape : public OpDefImplBase { + MGB_DYN_TYPE_OBJ_FINAL_DECL; +public: + GetVarShape() = default; + + size_t hash() const override { + return reinterpret_cast(dyn_typeinfo()); + } + + bool is_same_st(const Hashable& rhs) const override { + return rhs.dyn_typeinfo() == dyn_typeinfo(); + } +}; + +class ParamPackSplit : public OpDefImplBase { + MGB_DYN_TYPE_OBJ_FINAL_DECL; + +public: + ParamPackSplit() = default; + + ParamPackSplit(std::vector& offsets_, + std::vector>& shapes_) + : offsets(offsets_), shapes(shapes_) {} + + std::vector offsets; + std::vector> shapes; +}; + +class ParamPackConcat : public OpDefImplBase { + MGB_DYN_TYPE_OBJ_FINAL_DECL; + +public: + ParamPackConcat() = default; + + ParamPackConcat(std::vector& offsets_) + : offsets(offsets_) {} + + std::vector offsets; +}; + +} // namespace mgb::imperative diff --git a/imperative/src/include/megbrain/imperative/physical_tensor.h b/imperative/src/include/megbrain/imperative/physical_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..757b84555aeea7d82621f477acd62e90fa594310 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/physical_tensor.h @@ -0,0 +1,138 @@ +/** + * \file src/core/include/megbrain/imperative.h + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
+ * + */ + +#pragma once + +#include +#include + +#include "megbrain/tensor.h" + +namespace mgb { +namespace imperative { + +/************************** Tensor *****************************/ +class Blob; +using BlobPtr = std::shared_ptr; + +class BlobManagerImpl; + +class Blob : public NonCopyableObj { +public: + Blob(const DeviceTensorStorage& s); + Blob(CompNode cn, size_t sz); + ~Blob(); + + template + static BlobPtr make(Args&& ...args) { + return std::make_shared(std::forward(args)...); + } + + using RawStorage = DeviceTensorStorage::RawStorage; + const RawStorage& storage(); + + const CompNode& comp_node() const { + return m_comp_node; + } + + size_t size() const { + return m_size; + } +private: + friend class BlobManagerImpl; + CompNode m_comp_node; + mutable RawStorage m_storage; + size_t m_size = 0; +}; + +struct EventDeleter { + void operator()(CompNode::Event*); +}; +using EventPtr = std::unique_ptr; + +class Tensor; +using TensorPtr = std::shared_ptr; +class Tensor : public NonCopyableObj { +public: + Tensor() = default; + Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset = 0, const HostTensorND& hv = {}); + Tensor(BlobPtr blob, const TensorLayout& layout, const HostTensorND& hv = {}) + : Tensor(std::move(blob), layout, 0, hv) {}; + Tensor(const HostTensorND &hv); + Tensor(const DeviceTensorND &dv, const HostTensorND& hv = {}); + Tensor(const TensorLayout& layout, const CompNode& cn); + Tensor(const BlobPtr blob, const size_t offset, const TensorLayout& layout); + + static TensorPtr make(const HostTensorND& hv); + + template, HostTensorND>>> + static TensorPtr make(T&& hv) { + TensorPtr (*f)(const HostTensorND&) = &make; + return f(std::forward(hv)); + }; + + template + static TensorPtr make(Args&& ...args) { + return std::make_shared(std::forward(args)...); + } + + CompNode comp_node() const { + mgb_assert(m_blob, "uninitialized tensor."); + return m_blob->comp_node(); + } + + TensorLayout layout() const { + return m_layout; + } + + DeviceTensorND dev_tensor(); + + static TensorPtr make_scalar(DTypeScalar value, CompNode cn); + + TensorPtr make_scalar(DTypeScalar value) const { + mgb_assert(m_blob, "uninitialized tensor."); + return make_scalar(value, m_blob->comp_node()); + } + + BlobPtr& blob() { + return m_blob; + } + + void fetch_value(); + bool value_fetched(); + TensorPtr sub(size_t offset, TensorShape shape); + + // m_value is set once readonly afterwards + // so the return value is thread safe + const HostTensorND& get_value(); + // return a pointer instead of a reference to ensure thread safety + const HostTensorND* try_get_value(); + + void add_release_callback(CompNode cn); + CompNode::Event* get_or_create_event(); +private: + + TensorLayout m_layout; + BlobPtr m_blob; + size_t m_offset; + std::mutex m_mtx; + HostTensorND m_value; + EventPtr m_value_ready = nullptr; +}; + +struct LogicalTensorDesc { + TensorLayout layout; + CompNode comp_node; + DeviceTensorND value; // cpu:default +}; + +} // namespace imperative +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/include/megbrain/imperative/profiler.h b/imperative/src/include/megbrain/imperative/profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..a223ab37495060b2241f232974e37759fe9c9242 --- /dev/null +++ b/imperative/src/include/megbrain/imperative/profiler.h @@ -0,0 +1,52 @@ +/** + * \file src/core/include/megbrain/profiler.h + * + * This file is part of MegBrain, a deep learning framework developed by 
Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#pragma once + +#include "megbrain/comp_node.h" +#include "megbrain/graph/event.h" +#include "megbrain/utils/json.h" +#include "megbrain/utils/timer.h" + +#include "megbrain/imperative/op_def.h" + +namespace mgb { +namespace imperative { + +class ProfilerPrivate; + +using OpDefPrinter = thin_function; + +class Profiler { +private: + std::unique_ptr m_private; + +public: + enum EventKind { OprBegin, OprEnd }; + +public: + Profiler(); + Profiler(const std::string& path); + ~Profiler(); + void enable(); + void disable(); + void dump(); + void dump(const std::string& path); + void record_host(size_t id, std::string name, EventKind type, + double host_time); + void record_device(size_t id, std::string name, EventKind type, + double host_time, CompNode comp_node); + double get_device_time(CompNode::Event& event); + size_t get_dump_count(); + std::unique_ptr create_event(CompNode comp_node); + double get_host_time_now(); + std::string print_op(const OpDef& def); +}; +} // namespace imperative +} // namespace mgb diff --git a/imperative/src/test/backward_graph.cpp b/imperative/src/test/backward_graph.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f83058fc2c6a8c370648e340dc63886f7c93fc58 --- /dev/null +++ b/imperative/src/test/backward_graph.cpp @@ -0,0 +1,145 @@ +/** + * \file imperative/src/test/backward_graph.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#include "./helper.h" +#include "megbrain/opr/basic_arith.h" +#include "megbrain/opr/dnn/batch_norm.h" +#include "megbrain/imperative/ops/opr_attr.h" + +using namespace mgb; +using namespace cg; +using namespace imperative; + +TEST(TestImperative, BackwardGraphBasic) { + HostTensorGenerator<> gen; + SmallVector hvs; + SmallVector inputs; + for(size_t i = 0; i < 2; ++ i) { + hvs.push_back(*gen({42})); + inputs.push_back(Tensor::make(hvs.back())); + } + + using Param = opr::Elemwise::Param; + Param param{Param::Mode::MUL}; + OprAttr attr{"Elemwise", {}, {}}; + attr.param.write_pod(param); + + SmallVector input_descs; + for (auto&& i : inputs) { + input_descs.push_back({i->layout(), i->comp_node()}); + } + auto result = OpDef::make_backward_graph(attr, input_descs, {true, true}, {true}); + auto&& save_for_backward = result.save_for_backward; + auto&& input_has_grad = result.input_has_grad; + + auto outputs = OpDef::apply_on_physical_tensor(attr, inputs); + inputs.push_back(outputs[0]); + hvs.push_back(*gen({42})); + inputs.push_back(Tensor::make(hvs.back())); + mgb_assert(save_for_backward.size() == inputs.size()); + for (size_t i = 0; i < inputs.size(); ++ i) { + if (!save_for_backward[i]) { + inputs[i].reset(); // drop unused tensor + } + } + SmallVector backward_graph_inputs; + for (auto&& i : inputs) { + if (i) { + backward_graph_inputs.push_back(i); + } + } + inputs.clear(); + auto input_grads = OpDef::apply_on_physical_tensor(*(result.backward), backward_graph_inputs); + mgb_assert(input_grads.size() == input_has_grad.size()); + for (size_t i = 0; i < input_has_grad.size(); ++ i) { + mgb_assert(input_has_grad[i] == static_cast(input_grads[i])); + } + + SmallVector res; + for (auto&& i : input_grads) { + res.emplace_back(); + res.back().copy_from(i->dev_tensor()).sync(); + } + for (size_t i = 0; i < 42; ++ i) { + for (size_t j = 0; j < 1; ++ j) { + ASSERT_EQ(hvs[2].ptr()[i] * hvs[j].ptr()[i], 
res[j ^ 1].ptr()[i]); + } + } +} + +TEST(TestImperative, BackwardGraphIdentity) { + HostTensorGenerator<> gen; + auto host_a = gen({42}), host_dc = gen({42}); + auto a = Tensor::make(*host_a), dc = Tensor::make(*host_dc); + SmallVector inputs; + inputs.push_back(a); + + OprAttr attr{"Identity", {}, {}}; + attr.param.write_pod({}); + + SmallVector input_descs; + input_descs.push_back({a->layout(), a->comp_node()}); + auto result = OpDef::make_backward_graph(attr, input_descs, {true}, {true}); + auto&& save_for_backward = result.save_for_backward; + auto&& input_has_grad = result.input_has_grad; + + auto outputs = OpDef::apply_on_physical_tensor(attr, inputs); + inputs.push_back(outputs[0]); + inputs.push_back(dc); + mgb_assert(save_for_backward.size() == inputs.size()); + for (size_t i = 0; i < inputs.size(); ++ i) { + if (!save_for_backward[i]) { + inputs[i].reset(); // drop unused tensor + } + } + SmallVector backward_graph_inputs; + for (auto&& i : inputs) { + if (i) { + backward_graph_inputs.push_back(i); + } + } + inputs.clear(); + auto input_grads = OpDef::apply_on_physical_tensor(*(result.backward), backward_graph_inputs); + mgb_assert(input_grads.size() == input_has_grad.size()); + for (size_t i = 0; i < input_has_grad.size(); ++ i) { + mgb_assert(input_has_grad[i] == static_cast(input_grads[i])); + } + + HostTensorND hv; + hv.copy_from(input_grads[0]->dev_tensor()).sync(); + for (size_t i = 0; i < 42; ++ i) { + ASSERT_EQ(host_dc->ptr()[i], hv.ptr()[i]); + } +} + +TEST(TestImperative, BatchNormGrad) { + auto cn = CompNode::load("xpux"); + using Param = opr::BatchNorm::Param; + size_t N=2, C=3, H=5, W=5; + LogicalTensorDesc inp{TensorLayout{{N, C, H, W}, dtype::Float32()}, cn}; + LogicalTensorDesc stat{TensorLayout{{C}, dtype::Float32()}, cn}; + { + auto op = OprAttr::make("BatchNorm"); + auto&& attr = op->cast_final_safe(); + Param param; + param.fwd_mode = Param::FwdMode::TRAINING; + attr.param.write_pod(param); + OpDef::make_backward_graph(attr, {inp, stat, stat, stat, stat}, + {true, true ,true, false, false}, {false, false, false, false, true}); + } + { + auto op = OprAttr::make("BatchNorm"); + auto&& attr = op->cast_final_safe(); + Param param; + param.fwd_mode = Param::FwdMode::TRAINING; + attr.param.write_pod(param); + OpDef::make_backward_graph(attr, {inp, stat, stat}, + {true, true ,true}, {false, false, true}); + } +} diff --git a/imperative/src/test/collective_comm.cpp b/imperative/src/test/collective_comm.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b1a1c9ad76fbfebfe91b3935f042715287446ba6 --- /dev/null +++ b/imperative/src/test/collective_comm.cpp @@ -0,0 +1,51 @@ +/** + * \file imperative/src/test/imperative.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
+ * + */ + +#include "./helper.h" +#include "megbrain/imperative/ops/collective_comm.h" +#include "megbrain/opr/mm_handler.h" + +using namespace mgb; +using namespace imperative; + +TEST(TestImperative, AllReduceBasic) { + REQUIRE_GPU(2); + const char* server_addr = "127.0.0.1"; + uint32_t port = 3456; + mgb_assert(create_zmqrpc_server(server_addr, port) > 0); + HostTensorGenerator<> gen; + CompNode cn0 = CompNode::load("gpu0"), + cn1 = CompNode::load("gpu1"); + + auto host_x = gen({233}, cn0), host_y = gen({233}, cn1); + auto expect = gen({233}); + for (size_t i = 0; i < 233; ++ i) { + expect->ptr()[i] = host_x->ptr()[i] + host_y->ptr()[i]; + } + + auto run = [&](std::shared_ptr hnd, uint32_t idx) { + imperative::CollectiveComm + def{"all_reduce", 2, idx, idx==0, false, server_addr, port, + megdnn::param::CollectiveComm::Mode::ALL_REDUCE_SUM, + dtype::Float32(), "nccl", ""}; + auto inp = Tensor::make(*hnd); + auto oup = OpDef::apply_on_physical_tensor(def, {inp}); + HostTensorND host_v; + host_v.copy_from(oup[0]->dev_tensor()).sync(); + MGB_ASSERT_TENSOR_NEAR(*expect, host_v, 1e-6); + }; + + std::thread t0(std::bind(run, host_x, 0)); + std::thread t1(std::bind(run, host_y, 1)); + + t0.join(); + t1.join(); +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/test/cond_take.cpp b/imperative/src/test/cond_take.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dad18671da4f044a1e783532fcad0d569a8785c4 --- /dev/null +++ b/imperative/src/test/cond_take.cpp @@ -0,0 +1,22 @@ +/** + * \file imperative/src/test/imperative.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. + * + */ + +#include "./helper.h" +#include "megbrain/imperative/ops/cond_take.h" + +using namespace mgb; +using namespace imperative; + +TEST(TestImperative, CondTake) { + auto op = imperative::CondTake::make(); + auto msk = HostTensorGenerator()({42}); + OprChecker(op).run({TensorShape{42}, *msk}); +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/imperative/src/test/helper.cpp b/imperative/src/test/helper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5fb3119b88dcf7bc1d29c5f80931d5ccce57e42b --- /dev/null +++ b/imperative/src/test/helper.cpp @@ -0,0 +1,164 @@ +/** + * \file imperative/src/test/helper.cpp + * + * This file is part of MegBrain, a deep learning framework developed by Megvii. + * + * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved. 
+ * + */ + +#include "helper.h" +#include "megbrain/graph.h" +#include "megbrain/opr/io.h" + +#include +#include +#include + +namespace py = pybind11; + +namespace mgb { +namespace imperative { + +namespace { + +#define XSTR(s) STR(s) +#define STR(s) #s +#define CONCAT(a, b) a##b +#define PYINIT(name) CONCAT(PyInit_, name) +#define pyinit PYINIT(MODULE_NAME) + +#define UNUSED __attribute__((unused)) + +extern "C" PyObject* pyinit(); + +class PyEnv { + static std::unique_ptr m_instance; + std::unique_ptr m_interpreter; + PyEnv(); +public: + static PyEnv& instance(); + static py::module get(); +}; + +std::unique_ptr PyEnv::m_instance = nullptr; + +PyEnv::PyEnv() { + mgb_assert(!m_instance); + auto err = PyImport_AppendInittab(XSTR(MODULE_NAME), &pyinit); + mgb_assert(!err); + m_interpreter.reset(new py::scoped_interpreter()); +} + +PyEnv& PyEnv::instance() { + if (!m_instance) { + m_instance.reset(new PyEnv()); + } + return *m_instance; +} + +py::module PyEnv::get() { + instance(); + return py::module::import(XSTR(MODULE_NAME)); +} + +py::array array(const Tensor& x) { + PyEnv::get(); + return py::cast(x).attr("numpy")(); +} + +py::array array(const HostTensorND& x) { + return array(*Tensor::make(x)); +} + +py::array array(const DeviceTensorND& x) { + return array(*Tensor::make(x)); +} + +UNUSED void print(const Tensor& x) { + return print(array(x)); +} + +UNUSED void print(const HostTensorND& x) { + return print(array(x)); +} + +UNUSED void print(const DeviceTensorND& x) { + return print(array(x)); +} + +UNUSED void print(const char* s) { + PyEnv::instance(); + py::print(s); +} + +} // anonymous namespace + +OprChecker::OprChecker(std::shared_ptr opdef) + : m_op(opdef) {} + +void OprChecker::run(std::vector inp_keys) { + HostTensorGenerator<> gen; + size_t nr_inps = inp_keys.size(); + SmallVector host_inp(nr_inps); + VarNodeArray sym_inp(nr_inps); + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + for (size_t i = 0; i < nr_inps; ++ i) { + host_inp[i] = std::visit([&gen](auto&& arg) -> HostTensorND { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return *gen(arg); + } else { + static_assert(std::is_same_v); + return arg; + } + }, inp_keys[i]); + sym_inp[i] = opr::SharedDeviceTensor::make(*graph, host_inp[i]).node(); + } + auto sym_oup = OpDef::apply_on_var_node(*m_op, sym_inp)->usable_output(); + size_t nr_oups = sym_oup.size(); + ComputingGraph::OutputSpec oup_spec(nr_oups); + SmallVector host_sym_oup(nr_oups); + for (size_t i = 0; i < nr_oups; ++ i) { + oup_spec[i] = make_callback_copy(sym_oup[i], host_sym_oup[i]); + } + auto func = graph->compile(oup_spec); + + SmallVector imp_physical_inp(nr_inps); + for (size_t i = 0; i < nr_inps; ++ i) { + imp_physical_inp[i] = Tensor::make(host_inp[i]); + } + + auto imp_oup = OpDef::apply_on_physical_tensor(*m_op, imp_physical_inp); + mgb_assert(imp_oup.size() == nr_oups); + + // check input not modified + for (size_t i = 0; i < imp_physical_inp.size(); ++i) { + HostTensorND hv; + hv.copy_from(imp_physical_inp[i]->dev_tensor()).sync(); + MGB_ASSERT_TENSOR_EQ(hv, host_inp[i]); + } + + SmallVector host_imp_oup(nr_oups); + for (size_t i = 0; i < nr_oups; ++ i) { + host_imp_oup[i].copy_from(imp_oup[i]->dev_tensor()).sync(); + } + + func->execute().wait(); // run last because it may contain inplace operations + + for(size_t i = 0; i < nr_oups; ++ i) { + MGB_ASSERT_TENSOR_EQ(host_sym_oup[i], host_imp_oup[i]); + } +} + +TEST(TestHelper, PyModule) { + py::module m = PyEnv::get(); + py::print(m); + 
diff --git a/imperative/src/test/helper.h b/imperative/src/test/helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad172f21086d27aec88366652dedf56f25e95d30
--- /dev/null
+++ b/imperative/src/test/helper.h
@@ -0,0 +1,32 @@
+/**
+ * \file imperative/src/test/helper.h
+ *
+ * This file is part of MegBrain, a deep learning framework developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ *
+ */
+
+#pragma once
+
+#include <variant>
+
+#include "megbrain/imperative.h"
+#include "megbrain/test/helper.h"
+
+namespace mgb {
+namespace imperative {
+
+class OprChecker {
+public:
+    using InputSpec = std::variant<TensorShape, HostTensorND>;
+    OprChecker(std::shared_ptr<OpDef> opdef);
+    void run(std::vector<InputSpec> inp_keys);
+private:
+    std::shared_ptr<OpDef> m_op;
+};
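+
+// Usage sketch (see imperative.cpp for real cases): construct the checker
+// from an OpDef and pass one InputSpec per input; a TensorShape is filled
+// with random float data while a HostTensorND is used verbatim, e.g.
+//
+//     auto op = OprAttr::make("Elemwise");  // set attr.param before running
+//     OprChecker(op).run({TensorShape{42}, TensorShape{42}});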
+
+} // namespace imperative
+} // namespace mgb
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/imperative/src/test/imperative.cpp b/imperative/src/test/imperative.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..84072ae9bd5dc2515820947e2eb736bd563ddf8e
--- /dev/null
+++ b/imperative/src/test/imperative.cpp
@@ -0,0 +1,181 @@
+/**
+ * \file imperative/src/test/imperative.cpp
+ *
+ * This file is part of MegBrain, a deep learning framework developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ *
+ */
+
+#include "./helper.h"
+#include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/basic_arith_wrapper.h"
+#include "megbrain/opr/dnn/convolution.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/opr/dnn/batch_norm.h"
+#include "megbrain/opr/utility.h"
+#include "megbrain/imperative/blob_manager.h"
+#include "megbrain/imperative/ops/opr_attr.h"
+#include "megbrain/comp_node_env.h"
+
+using namespace mgb;
+using namespace cg;
+using namespace imperative;
+
+TEST(TestImperative, APlusB) {
+    auto op = OprAttr::make("Elemwise");
+    auto&& attr = op->cast_final_safe<OprAttr>();
+    using Param = opr::Elemwise::Param;
+    Param param{Param::Mode::ADD};
+    attr.param.write_pod(param);
+    OprChecker(op).run({TensorShape{42}, TensorShape{42}});
+}
+
+TEST(TestImperative, Convolution) {
+    auto op = OprAttr::make("ConvolutionV1");
+    auto&& attr = op->cast_final_safe<OprAttr>();
+    using Param = opr::Convolution::Param;
+    using Policy = opr::Convolution::ExecutionPolicy;
+    Param param{Param::Mode::CONVOLUTION};
+    Policy policy{Policy::Strategy::HEURISTIC};
+    attr.param.write_pod(param);
+    attr.param.write_pod(policy);
+    size_t N = 4, IC = 3, OC = 8, FH = 3, FW = 3, IH = 16, IW = 16;
+    OprChecker(op).run({TensorShape{N, IC, IH, IW}, TensorShape{OC, IC, FH, FW}});
+}
+
+TEST(TestImperative, Reduce) {
+    auto op = OprAttr::make("ReduceV2");
+    auto&& attr = op->cast_final_safe<OprAttr>();
+    using Param = opr::Reduce::Param;
+    Param param{Param::Mode::SUM_SQR};
+    attr.param.write_pod(param);
+    HostTensorND one{CompNode::load("xpu0"), {{1}, dtype::Int32()}};
+    one.ptr<int>()[0] = 1;
+    OprChecker(op).run({TensorShape{2, 3, 4}, one});
+}
+
+TEST(TestImperative, BatchNorm) {
+    auto op = OprAttr::make("BatchNorm");
+    auto&& attr = op->cast_final_safe<OprAttr>();
+    using Param = opr::BatchNorm::Param;
+    Param param;
+    param.param_dim = Param::ParamDim::DIM_1C11;
+    param.avg_factor = 0.999;
+    attr.param.write_pod(param);
+    size_t N = 2, C = 3, H = 5, W = 5;
+    OprChecker(op).run({
+        TensorShape{N, C, H, W},
+        TensorShape{1, C, 1, 1},
+        TensorShape{1, C, 1, 1},
+        TensorShape{1, C, 1, 1},
+        TensorShape{1, C, 1, 1}
+    });
+}
+
+TEST(TestImperative, Concat) {
+    OprAttr::Param param;
+    param.write_pod(megdnn::param::Axis(0));
+    OperatorNodeConfig config{CompNode::load("xpu1")};
+    OprChecker(OprAttr::make("Concat", param, config))
+            .run({TensorShape{200, 300}, TensorShape{300, 300}});
+}
+
+TEST(TestImperative, Split) {
+    OprAttr::Param param;
+    param.write_pod(megdnn::param::Axis(0));
+    auto op = OprAttr::make("Split", param, OperatorNodeConfig{});
+    auto cn = CompNode::load("xpu0");
+    HostTensorND s1{cn, {{1}, dtype::Int32()}};
+    s1.ptr<int>()[0] = 20;
+    HostTensorND s2{cn, {{1}, dtype::Int32()}};
+    s2.ptr<int>()[0] = 80;
+    OprChecker(op).run({TensorShape{100}, s1, s2});
+}
+
+#if MGB_CUDA && MGB_ENABLE_EXCEPTION
+void run_graph(size_t mem_reserved, bool enable_defrag) {
+    CompNode::try_coalesce_all_free_memory();
+    CompNode::finalize();
+
+    auto cn = CompNode::load("gpux");
+    cn.sync(); // wait for async init to finish
+
+    BlobManager::inst()->set_enable(enable_defrag);
+
+    HostTensorGenerator<> gen;
+    using TensorPtr = std::shared_ptr<Tensor>;
+    TensorPtr ptr_a[100];
+
+    size_t unit_size = mem_reserved / (100.5 * 4);
+    auto host_a = gen({unit_size});
+    for (int i = 0; i < 100; ++i) {
+        ptr_a[i] = Tensor::make(*host_a);
+    }
+
+    // free half
+    for (int i = 0; i < 100; i += 2) {
+        ptr_a[i].reset();
+    }
+
+    auto op = OprAttr::make("Elemwise");
+    auto&& attr = op->cast_final_safe<OprAttr>();
+    using Param = opr::Elemwise::Param;
+    Param param{Param::Mode::MUL};
+    attr.param.write_pod(param);
+
+    auto out = OpDef::apply_on_physical_tensor(*op, {ptr_a[1], ptr_a[99]}).at(0);
+
+    // value before defrag
+    HostTensorND host_out_before;
+    host_out_before.copy_from(out->dev_tensor()).sync();
+
+    // make defrag work
+    auto e = Tensor::make(*gen({unit_size * 10}));
+
+    // value after defrag
+    HostTensorND host_out_after;
+    host_out_after.copy_from(out->dev_tensor()).sync();
+
+    // make sure defragmentation does not change the value
+    for (size_t i = 0; i < unit_size; ++i) {
+        ASSERT_EQ(host_out_before.ptr<float>()[i], host_out_after.ptr<float>()[i]);
+    }
+}
+
+TEST(TestImperative, Defragment) {
+    REQUIRE_GPU(1);
+    CompNode::load("gpux").activate();
+    size_t reserve;
+    {
+        size_t free, tot;
+        MGB_CUDA_CHECK(cudaMemGetInfo(&free, &tot));
+        reserve = free * 0.92;
+    }
+    auto reserve_setting = ssprintf("b:%zu", reserve);
+
+    auto do_run = [reserve]() {
+        ASSERT_THROW(run_graph(reserve, false), MemAllocError);
+        run_graph(reserve, true);
+    };
+
+    // reserve memory explicitly to avoid uncontrollable factors
+    constexpr const char* KEY = "MGB_CUDA_RESERVE_MEMORY";
+    auto old_value = getenv(KEY);
+    setenv(KEY, reserve_setting.c_str(), 1);
+    MGB_TRY {
+        do_run();
+    } MGB_FINALLY(
+        if (old_value) {
+            setenv(KEY, old_value, 1);
+        } else {
+            unsetenv(KEY);
+        }
+        CompNode::try_coalesce_all_free_memory();
+        CompNode::finalize();
+    );
+}
+#endif // MGB_CUDA && MGB_ENABLE_EXCEPTION
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/imperative/src/test/io_remote.cpp b/imperative/src/test/io_remote.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a4d914139de29365e31f1ec3fc7501d9c1833eea
--- /dev/null
+++ b/imperative/src/test/io_remote.cpp
@@ -0,0 +1,66 @@
+/**
+ * \file imperative/src/test/io_remote.cpp
+ *
+ * This file is part of MegBrain, a deep learning framework developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ *
+ */
+
+#include "./helper.h"
+#include "megbrain/imperative/ops/io_remote.h"
+#include "megbrain/opr/mm_handler.h"
+
+using namespace mgb;
+using namespace imperative;
+
+TEST(TestImperative, IORemote) {
+    REQUIRE_GPU(2);
+    const char* server_addr = "127.0.0.1";
+    uint32_t port = 4567;
+    mgb_assert(create_zmqrpc_server(server_addr, port) > 0);
+    HostTensorGenerator<> gen;
+    CompNode cn0 = CompNode::load("gpu0"), cn1 = CompNode::load("gpu1");
+
+    size_t vector_size = 233;
+    auto host_x = gen({vector_size}, cn0), host_y = gen({vector_size}, cn1);
+
+    auto expect = gen({vector_size});
+    for (size_t i = 0; i < vector_size; ++i) {
+        expect->ptr<float>()[i] = host_x->ptr<float>()[i];
+    }
+
+    auto run_send = [&](std::shared_ptr<HostTensorND> hnd) {
+        imperative::RemoteSend def{"io_remote_test", server_addr, port, 1};
+        auto inp = Tensor::make(*hnd);
+        auto oup = OpDef::apply_on_physical_tensor(def, {inp});
+    };
+
+    auto run_recv = [&](std::shared_ptr<HostTensorND> hnd) {
+        imperative::RemoteRecv def{"io_remote_test",
+                                   server_addr,
+                                   port,
+                                   0,
+                                   {
+                                           vector_size,
+                                   },
+                                   CompNode::load("gpu1"),
+                                   dtype::Float32()};
+        auto inp = Tensor::make(*hnd);
+        auto oup = OpDef::apply_on_physical_tensor(def, {inp});
+        HostTensorND host_v;
+        host_v.copy_from(oup[0]->dev_tensor()).sync();
+        MGB_ASSERT_TENSOR_NEAR(*expect, host_v, 1e-6);
+    };
+
+    std::thread t0(std::bind(run_send, host_x));
+    std::thread t1(std::bind(run_recv, host_y));
+
+    t0.join();
+    t1.join();
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
+
+// run this test alone with:
+//     ./imperative_test --gtest_filter='TestImperative.IORemote'
diff --git a/imperative/src/test/opr_utility.cpp b/imperative/src/test/opr_utility.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fcf22c1dcb1b5cb75d0e4a4f7ca5d4be2f7f0ad
--- /dev/null
+++ b/imperative/src/test/opr_utility.cpp
@@ -0,0 +1,138 @@
+/**
+ * \file imperative/src/test/opr_utility.cpp
+ *
+ * This file is part of MegBrain, a deep learning framework developed by Megvii.
+ *
+ * \copyright Copyright (c) 2014-2019 Megvii Inc. All rights reserved.
+ *
+ */
+
+#include "megbrain/imperative/opr_utility.h"
+#include "megbrain/opr/io.h"
+#include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/utility.h"
+#include "megbrain/test/helper.h"
+
+using namespace mgb;
+using namespace opr;
+
+TEST(TestOprUtility, InputCallback) {
+    HostTensorGenerator<> gen;
+    DeviceTensorND dv;
+    auto hv = gen({2, 3});
+    dv.copy_from(*hv).sync();
+    auto graph = ComputingGraph::make();
+    auto callback = [dv]() { return dv; };
+    auto outputs = opr::InputCallback::make(*graph, callback, dv.comp_node(), dv.dtype());
+
+    HostTensorND hout;
+    ComputingGraph::OutputSpec outspec{make_callback_copy(outputs[0], hout)};
+    auto func = graph->compile(outspec);
+    func->execute();
+    MGB_ASSERT_TENSOR_EQ(hout, *hv);
+}
+
+TEST(TestOprUtility, OutputCallback) {
+    HostTensorGenerator<> gen;
+    auto hx = gen({2, 3});
+    auto graph = ComputingGraph::make();
+    auto x = opr::Host2DeviceCopy::make(*graph, hx);
+    HostTensorND hy;
+    auto callback = [&hy](DeviceTensorND dv) { hy.copy_from(dv); };
+    auto dummy = opr::OutputCallback::make({callback}, x);
+    auto y = opr::VirtualDep::make({x, dummy});
+
+    ComputingGraph::OutputSpec outspec{{y, [](DeviceTensorND&) {}}};
+    auto func = graph->compile(outspec);
+    func->execute();
+    MGB_ASSERT_TENSOR_EQ(hy, *hx);
+}
+
+TEST(TestOprUtility, NopCallback) {
+    HostTensorGenerator<> gen;
+    auto hx = gen({2, 3});
+    auto graph = ComputingGraph::make();
+    auto x = opr::Host2DeviceCopy::make(*graph, hx);
+    bool fired = false;
+    auto callback = [&fired]() { fired = true; };
+    auto dummy = opr::NopCallback::make(*graph, callback, x.node()->comp_node(), {x});
+    auto y = opr::VirtualDep::make({x, dummy});
+
+    ComputingGraph::OutputSpec outspec{{y, [](DeviceTensorND&) {}}};
+    auto func = graph->compile(outspec);
+    func->execute();
+    ASSERT_TRUE(fired);
+}
+
+TEST(TestOprUtility, NopCallbackMixedInput) {
+    auto graph = ComputingGraph::make();
+    auto x0 = opr::Host2DeviceCopy::make(*graph, HostTensorGenerator<>()({2, 3}),
+                                         OperatorNodeConfig(CompNode::load("xpu0")));
+    auto x1 = opr::Host2DeviceCopy::make(*graph, HostTensorGenerator<>()({2, 3}),
+                                         OperatorNodeConfig(CompNode::load("xpu1")));
+
+    bool fired = false;
+    auto callback = [&fired]() { fired = true; };
+    auto dummy = opr::NopCallback::make(*graph, callback, CompNode::load("xpux"), {x0, x1});
+    auto y = opr::VirtualDep::make({x0, dummy});
+
+    ComputingGraph::OutputSpec outspec{{y, [](DeviceTensorND&) {}}};
+    auto func = graph->compile(outspec);
+    func->execute();
+    ASSERT_TRUE(fired);
+}
+
+TEST(TestOprUtility, CallbackChain) {
+    auto graph = ComputingGraph::make();
+    graph->options().graph_opt_level = 0;
+    HostTensorGenerator<> gen;
+    SymbolVar x, dummy;
+    DeviceTensorND dev_x, dev_y;
+    auto host_x = gen({2, 3});
+    dev_x.copy_from(*host_x).sync();
+    auto cn = dev_x.comp_node();
+    auto dev_x_weakptr = std::weak_ptr(dev_x.storage().raw_storage());
+
+    {
+        auto callback = [&dev_x]() {
+            DeviceTensorND ret = dev_x;
+            dev_x.storage({});
+            return ret;
+        };
+        auto out = opr::InputCallback::make(*graph, callback, cn, dev_x.dtype());
+        x = out[0];
+        dummy = out[1];
+    }
+
+    {
+        x = opr::TypeCvt::make(x, dtype::Int32());
+        x = opr::TypeCvt::make(x, dtype::Int16());
+        auto callback = [&](DeviceTensorND y) {
+            // dev_x.storage has been reset in InputCallback
+            mgb_assert(!dev_x.storage().comp_node_valid());
+            dev_y = y;
+        };
+        dummy = opr::OutputCallback::make({callback}, {x, dummy});
+    }
+
+    bool fired = false;
+    {
+        auto callback = [&]() {
+            fired = true;
+            ASSERT_FALSE(dev_x_weakptr.lock());
+        };
+        dummy = opr::NopCallback::make(*graph, callback, cn, {dummy});
+    }
+
+    {
+        auto out = opr::VirtualDep::make({x.make_scalar(0), dummy});
+        ComputingGraph::OutputSpec outspec{{out, [](DeviceTensorND&) {}}};
+        auto func = graph->compile(outspec);
+        func->execute();
+    }
+
+    ASSERT_TRUE(fired);
+    HostTensorND host_y;
+    host_y.copy_from(dev_y).sync();
+    MGB_ASSERT_TENSOR_EQ(host_y, *host_x);
+}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/imperative/src/version.ld b/imperative/src/version.ld
new file mode 100644
index 0000000000000000000000000000000000000000..19c7cde52ae01019004a1e076c5ee4b387da7892
--- /dev/null
+++ b/imperative/src/version.ld
@@ -0,0 +1,17 @@
+{
+global:
+    MGB_VSYM_*;
+    MEGDNN_VSYM_*;
+    mgb_get_extern_c_opr_api_versioned;
+    PyInit__imperative_rt;
+    extern "C++" {
+        *mgb::*;
+        *megdnn::*;
+        *megcore::*;
+        megcore*;
+    };
+    megcore*;
+
+local:
+    *;
+};
diff --git a/imperative/test/CMakeLists.txt b/imperative/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6b766cddce55b0893d49907bfa010d159d662fb7
--- /dev/null
+++ b/imperative/test/CMakeLists.txt
@@ -0,0 +1,45 @@
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
+set(MGB_TEST_DIR ${PROJECT_SOURCE_DIR}/test/src)
+
+file(GLOB_RECURSE SOURCES ../src/test/*.cpp ../src/impl/*.cpp ${MGB_TEST_DIR}/*.cpp)
+
+# disable distributed tests
+if(NOT MGE_WITH_DISTRIBUTED)
+    list(FILTER SOURCES EXCLUDE REGEX ".*test/collective_comm.cpp")
+    list(FILTER SOURCES EXCLUDE REGEX ".*test/io_remote.cpp")
+endif()
+
+# TODO: turn python binding into a static/object library
+add_executable(imperative_test ${SOURCES} ${SRCS})
+target_include_directories(imperative_test PRIVATE ${MGB_TEST_DIR}/include ../src/include)
+
+# Python binding
+target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR})
+target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C)
+target_compile_options(imperative_test PRIVATE -Wno-unused-parameter)
+
+set(LINK_LIBS megbrain megdnn gtest pybind11::embed gen_op_def)
+if(MGE_WITH_CUDA)
+    list(APPEND LINK_LIBS cudart)
+endif()
+
+if(MGE_WITH_DISTRIBUTED)
+    list(APPEND LINK_LIBS megray)
+endif()
+
+target_link_libraries(imperative_test ${LINK_LIBS})
+if(CXX_SUPPORT_WCLASS_MEMACCESS)
+    if(MGE_WITH_CUDA)
+        target_compile_options(imperative_test PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-class-memaccess>"
+                               "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wno-class-memaccess>")
+    else()
+        target_compile_options(imperative_test PRIVATE "-Wno-class-memaccess")
+    endif()
+endif()
+
+if(UNIX)
+    target_link_libraries(imperative_test dl rt)
+endif()
+
+install(TARGETS imperative_test RUNTIME DESTINATION test)
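+
+# After a build, the test binary can be run straight from the build tree, e.g.
+# (the path follows the add_executable() above; adjust for your build dir):
+#   imperative/test/imperative_test --gtest_filter='TestImperative.*'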
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9b3eeba42b0fa9907c34984fe96aad7540eaa58b..895919a49aad7233311c58720ffbefe8e0b0e205 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -168,25 +168,28 @@ if(ANDROID)
     target_link_libraries(megbrain PUBLIC log)
 endif()
 
-if(NOT MGE_BUILD_IMPERATIVE_RT)
-    # Build as SHARED or STATIC depending on BUILD_SHARED_LIBS=ON/OFF
-    add_library(megengine)
-    target_link_libraries(megengine PUBLIC megbrain megdnn)
-    if (UNIX AND NOT APPLE)
-        # TODO: Use target_link_options after upgrading to CMake 3.13
-        # FIXME; Please use right directory for mgb or imperative
-        target_link_options(megengine PRIVATE -Wl,--no-undefined -Wl,--version-script=${PROJECT_SOURCE_DIR}/python_module/src/version.ld)
-    endif()
-    set_target_properties(megengine PROPERTIES CXX_VISIBILITY_PRESET default)
-    set_target_properties(megengine PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
-    # Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready
-    # for this.
-    install(TARGETS megengine
-        EXPORT ${MGE_EXPORT_TARGETS}
-        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+# pick the version script matching the python binding being built
+if(MGE_BUILD_IMPERATIVE_RT)
+    set(_VER_FILE ${PROJECT_SOURCE_DIR}/imperative/src/version.ld)
+else()
+    set(_VER_FILE ${PROJECT_SOURCE_DIR}/python_module/src/version.ld)
 endif()
+
+# Build as SHARED or STATIC depending on BUILD_SHARED_LIBS=ON/OFF
+add_library(megengine)
+target_link_libraries(megengine PUBLIC megbrain megdnn)
+if (UNIX AND NOT APPLE)
+    # TODO: Use target_link_options after upgrading to CMake 3.13
+    target_link_options(megengine PRIVATE -Wl,--no-undefined -Wl,--version-script=${_VER_FILE})
+endif()
+set_target_properties(megengine PROPERTIES CXX_VISIBILITY_PRESET default)
+set_target_properties(megengine PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
+# Do not export targets if MGE_WITH_DISTRIBUTED is on. MegRay is not ready
+# for this.
+install(TARGETS megengine
+    EXPORT ${MGE_EXPORT_TARGETS}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
 if (NOT MGE_WITH_DISTRIBUTED)
     install(TARGETS megbrain
         EXPORT ${MGE_EXPORT_TARGETS}
diff --git a/src/opr/impl/standalone/nms_cpu.cpp b/src/opr/impl/standalone/nms_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..21dc9c9eb4528b1b9845461fba0a7d0b59935aeb
--- /dev/null
+++ b/src/opr/impl/standalone/nms_cpu.cpp
@@ -0,0 +1,60 @@
+#include "./nms_cpu.h"
+
+#include <algorithm>
+#include <cstring>
+
+namespace {
+struct Box {
+    float x0, y0, x1, y1;
+};
+
+//! return whether IoU(a, b) > thresh
+bool box_iou(Box a, Box b, float thresh) {
+    using std::max;
+    using std::min;
+    float left = max(a.x0, b.x0), right = min(a.x1, b.x1);
+    float top = max(a.y0, b.y0), bottom = min(a.y1, b.y1);
+    float width = max(right - left, 0.f),
+          height = max(bottom - top, 0.f);
+    float interS = width * height;
+    float Sa = (a.x1 - a.x0) * (a.y1 - a.y0);
+    float Sb = (b.x1 - b.x0) * (b.y1 - b.y0);
+    return interS > (Sa + Sb - interS) * thresh;
+}
+} // anonymous namespace
+
+size_t mgb::opr::standalone::nms::cpu_kern_workspace(size_t nr_boxes) {
+    return (((nr_boxes - 1) / sizeof(size_t)) + 1) * sizeof(size_t);
+}
+
+void mgb::opr::standalone::nms::cpu_kern(size_t nr_boxes, size_t max_output,
+                                         float overlap_thresh,
+                                         const float* boxes, uint32_t* out_idx,
+                                         uint32_t* out_size, void* workspace) {
+    size_t out_pos = 0, last_out = 0;
+    auto boxes_bptr = reinterpret_cast<const Box*>(boxes);
+    // one kept-flag per bit; each size_t word stores sizeof(size_t) flags
+    auto kept_mask = static_cast<size_t*>(workspace);
+    memset(kept_mask, 0, cpu_kern_workspace(nr_boxes));
+    for (size_t i = 0; i < nr_boxes; ++i) {
+        bool suppressed = false;
+        auto ibox = boxes_bptr[i];
+        for (size_t j = 0; j < i; ++j) {
+            bool j_kept =
+                (kept_mask[j / sizeof(size_t)] >> (j % sizeof(size_t))) & 1;
+            if (j_kept && box_iou(ibox, boxes_bptr[j], overlap_thresh)) {
+                suppressed = true;
+                break;
+            }
+        }
+        if (!suppressed) {
+            kept_mask[i / sizeof(size_t)] |= size_t(1) << (i % sizeof(size_t));
+            last_out = i;
+            out_idx[out_pos++] = i;
+            if (out_pos == max_output)
+                break;
+        }
+    }
+    *out_size = out_pos;
+    while (out_pos < max_output) {
+        out_idx[out_pos++] = last_out;
+    }
+}
diff --git a/src/opr/impl/standalone/nms_cpu.h b/src/opr/impl/standalone/nms_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..918a2330b1f7186713a2671ec9c2c54ce30c0f59
--- /dev/null
+++ b/src/opr/impl/standalone/nms_cpu.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace mgb {
+namespace opr {
+namespace standalone {
+namespace nms {
+
+/*!
+ * \brief CPU single-batch nms kernel
+ *
+ * See nms_kern.cuh for explanation on the parameters.
+ */
+void cpu_kern(size_t nr_boxes, size_t max_output, float overlap_thresh,
+              const float* boxes, uint32_t* out_idx, uint32_t* out_size,
+              void* workspace);
+
+size_t cpu_kern_workspace(size_t nr_boxes);
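+
+// Usage sketch (buffer names illustrative; see NMSKeep::CPUKern in
+// nms_opr.cpp for the real call site): the caller provides all buffers:
+//
+//     std::vector<uint8_t> ws(cpu_kern_workspace(nr_boxes));
+//     std::vector<uint32_t> idx(max_output);
+//     uint32_t size;
+//     cpu_kern(nr_boxes, max_output, 0.5f, boxes, idx.data(), &size, ws.data());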
+
+} // namespace nms
+} // namespace standalone
+} // namespace opr
+} // namespace mgb
diff --git a/src/opr/impl/standalone/nms_kern.cu b/src/opr/impl/standalone/nms_kern.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1c5e70fea2cee8bc288baace5a0fa4a143a26b12
--- /dev/null
+++ b/src/opr/impl/standalone/nms_kern.cu
@@ -0,0 +1,216 @@
+#include "nms_kern.cuh"
+
+#include <algorithm>
+#include <cstdint>
+
+namespace {
+
+// each thread computes one bit
+const int THREADS_PER_BLOCK = 64;
+
+const int WARP_SIZE = 32;
+
+// use aligned structure for large memory transaction
+struct __align__(16) Box {
+    float x0, y0, x1, y1;
+};
+
+//! return whether IoU(a, b) > thresh
+__device__ __forceinline__ bool box_iou(Box a, Box b, float thresh) {
+    float left = max(a.x0, b.x0), right = min(a.x1, b.x1);
+    float top = max(a.y0, b.y0), bottom = min(a.y1, b.y1);
+    float width = max(right - left, 0.f),
+          height = max(bottom - top, 0.f);
+    float interS = width * height;
+    float Sa = (a.x1 - a.x0) * (a.y1 - a.y0);
+    float Sb = (b.x1 - b.x0) * (b.y1 - b.y0);
+    return interS > (Sa + Sb - interS) * thresh;
+}
+
+//! store uint64_t with cache streaming
+__device__ __forceinline__ void store_u64_cs(uint64_t* ptr, uint64_t val) {
+    asm volatile("st.cs.u64 [%0], %1;" : : "l"(ptr), "l"(val));
+}
+
+//! load uint64_t with cache streaming
+__device__ __forceinline__ uint64_t load_u64_cs(const uint64_t* ptr) {
+    uint64_t val;
+    asm volatile("ld.cs.u64 %0, [%1];" : "=l"(val) : "l"(ptr));
+    return val;
+}
+
+__global__ void kern_gen_mask(
+        const int nr_boxes, const float nms_overlap_thresh,
+        const Box* dev_boxes, const int dev_mask_width, uint64_t* dev_mask) {
+    const int
+        box_group_row = blockIdx.y,
+        box_group_col = blockIdx.x;
+
+    if (box_group_row > box_group_col)
+        return;
+
+    const int
+        row_nr_boxes = min(
+                nr_boxes - box_group_row * THREADS_PER_BLOCK,
+                THREADS_PER_BLOCK),
+        col_nr_boxes = min(
+                nr_boxes - box_group_col * THREADS_PER_BLOCK,
+                THREADS_PER_BLOCK);
+
+    __shared__ Box block_boxes[THREADS_PER_BLOCK];
+
+    if (threadIdx.x < col_nr_boxes) {
+        block_boxes[threadIdx.x] = dev_boxes[
+            THREADS_PER_BLOCK * box_group_col + threadIdx.x];
+    }
+    __syncthreads();
+
+    if (threadIdx.x < row_nr_boxes) {
+        const int cur_box_idx = THREADS_PER_BLOCK * box_group_row + threadIdx.x;
+        Box cur_box = dev_boxes[cur_box_idx];
+
+        uint64_t result = 0;
+        const int start = (box_group_row == box_group_col) ?
+            threadIdx.x + 1 :  // blocks on the diagonal
+            0;
+        for (int i = start; i < col_nr_boxes; ++i) {
+            result |= static_cast<uint64_t>(
+                    box_iou(cur_box, block_boxes[i],
+                            nms_overlap_thresh)) << i;
+        }
+        store_u64_cs(
+                &dev_mask[cur_box_idx * dev_mask_width + box_group_col],
+                result);
+    }
+}
+
+//! true -> ~0, false -> 0
+__device__ __forceinline__ uint32_t bool_as_u32_mask(bool v) {
+    return (!v) - 1;
+}
+
+//! return min value of val in current warp, broadcast to all lanes
+__device__ __forceinline__ uint32_t warp_reduce_min_brdcst(uint32_t val) {
+    __shared__ uint32_t ans;
+    static_assert(WARP_SIZE == 32, "warp size != 32");
+#pragma unroll
+    for (uint32_t offset = WARP_SIZE / 2; offset; offset /= 2)
+        val = min(val, __shfl_down_sync(0xFFFFFFFF, val, offset));
+
+    if (!threadIdx.x)
+        ans = val;
+    __syncthreads();
+    return ans;
+}
+
+struct BitwiseOrArgs {
+    uint64_t* dst;
+    const uint64_t* src;
+    uint32_t size;
+};
+
+__device__ __forceinline__ void bitwise_or_single_warp(BitwiseOrArgs args) {
+    uint64_t* __restrict__ dst = args.dst;
+    const uint64_t* __restrict__ src = args.src;
+    uint32_t size = args.size;
+    for (uint32_t i = threadIdx.x; i < size; i += WARP_SIZE) {
+        dst[i] |= load_u64_cs(&src[i]);
+    }
+}
+
+__global__ void kern_gen_indices(
+        uint32_t nr_boxes, uint32_t max_output, uint32_t overlap_mask_width,
+        const uint64_t* __restrict__ overlap_mask, uint64_t* __restrict__ rm_mask,
+        uint32_t* __restrict__ out_idx, uint32_t* __restrict__ out_size) {
+    __shared__ uint32_t out_pos;
+    __shared__ BitwiseOrArgs bitwise_or_args;
+
+    const uint32_t nr_box_blocks = DIVUP(nr_boxes, 64);
+
+    if (!threadIdx.x) {
+        uint32_t cnt = nr_box_blocks * 64 - nr_boxes;
+        // mark the padded boxes as having been removed
+        rm_mask[nr_box_blocks - 1] = ((1ull << cnt) - 1) << (64 - cnt);
+        out_pos = 0;
+    }
+    __syncthreads();
+
+    uint32_t
+        box_block_id = threadIdx.x,
+        th0_box_block_id = 0;
+
+    while (th0_box_block_id < nr_box_blocks) {
+        bool in_range = box_block_id < nr_box_blocks;
+        uint64_t cur_mask = ~rm_mask[box_block_id & bool_as_u32_mask(in_range)];
+        uint32_t min_box_block_id = warp_reduce_min_brdcst(
+                box_block_id | bool_as_u32_mask(!(in_range && cur_mask)));
+
+        if (min_box_block_id + 1) {
+            // min_box_block_id != UINT32_MAX, so at least one thread finds an
+            // un-removed box
+            if (min_box_block_id == box_block_id) {
+                // exactly one thread can take this path
+                uint32_t box_id_in_block = __ffsll(cur_mask) - 1,
+                         box_id = box_block_id * 64 + box_id_in_block;
+
+                // so this box would not be processed again
+                rm_mask[box_block_id] |= 1ull << box_id_in_block;
+
+                bitwise_or_args.dst = &rm_mask[box_block_id];
+                bitwise_or_args.src =
+                    &overlap_mask[box_id * overlap_mask_width + box_block_id];
+                bitwise_or_args.size = nr_box_blocks - box_block_id;
+                out_idx[out_pos++] = box_id;
+            }
+            __syncthreads();
+            if (out_pos == max_output)
+                break;
+            bitwise_or_single_warp(bitwise_or_args);
+
+            // skip the blocks before min_box_block_id
+            th0_box_block_id = min_box_block_id;
+            box_block_id = min_box_block_id + threadIdx.x;
+        } else {
+            th0_box_block_id += WARP_SIZE;
+            box_block_id += WARP_SIZE;
+        }
+    }
+
+    if (out_pos < max_output) {
+        // fill the values after out_pos
+        uint32_t val = out_idx[out_pos - 1];
+        for (uint32_t i = out_pos + threadIdx.x; i < max_output; i += WARP_SIZE) {
+            out_idx[i] = val;
+        }
+    }
+    if (!threadIdx.x) {
+        *out_size = out_pos;
+    }
+}
+
+} // anonymous namespace
+
+void mgb::opr::standalone::nms::launch_gen_mask(
+        const int nr_boxes, const float nms_overlap_thresh,
+        const float* dev_boxes, const int dev_mask_width,
+        uint64_t* dev_mask, cudaStream_t stream) {
+    dim3 blocks(DIVUP(nr_boxes, THREADS_PER_BLOCK),
+                DIVUP(nr_boxes, THREADS_PER_BLOCK));
+    dim3 threads(THREADS_PER_BLOCK);
+    kern_gen_mask<<<blocks, threads, 0, stream>>>(
+            nr_boxes, nms_overlap_thresh,
+            reinterpret_cast<const Box*>(dev_boxes), dev_mask_width, dev_mask);
+}
+
+void mgb::opr::standalone::nms::launch_gen_indices(
+        int nr_boxes, int max_output, int overlap_mask_width,
+        const uint64_t* overlap_mask, uint64_t* rm_mask,
+        uint32_t* out_idx, uint32_t* out_size,
+        cudaStream_t stream) {
+    kern_gen_indices<<<1, WARP_SIZE, 0, stream>>>(
+            nr_boxes, max_output, overlap_mask_width,
+            overlap_mask, rm_mask,
+            out_idx, out_size);
+}
+
+// vim: ft=cuda syntax=cuda.doxygen
diff --git a/src/opr/impl/standalone/nms_kern.cuh b/src/opr/impl/standalone/nms_kern.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..aad49b2e1665995eb0e9bbf486bd594f7752a1d4
--- /dev/null
+++ b/src/opr/impl/standalone/nms_kern.cuh
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cstddef>
+#include <cstdint>
+
+#define DIVUP(m, n) (((m)-1) / (n) + 1)
+
+namespace mgb {
+namespace opr {
+namespace standalone {
+namespace nms {
+
+/*!
+ * \brief launch the kernel to generate nms mask
+ * \param nr_boxes number of input boxes
+ * \param nms_overlap_thresh overlapping threshold for IoU
+ * \param[in] dev_boxes boxes in [n, 4] layout,
+ *      each row containing (x0, y0, x1, y1)
+ * \param dev_mask_width width in number of uint64_t elements of the dev_mask
+ *      matrix; must be at least DIVUP(n, 64)
+ * \param[out] dev_mask [n, dev_mask_width] matrix; dev_mask[i] is a
+ *      bitmask of length n indicating whether box i overlaps with each other
+ *      box. Only the upper triangle (row < col) is filled.
+ */
+void launch_gen_mask(const int nr_boxes, const float nms_overlap_thresh,
+                     const float* dev_boxes, const int dev_mask_width,
+                     uint64_t* dev_mask, cudaStream_t stream);
+
+/*!
+ * \brief launch the kernel to generate indices of kept boxes
+ * \param max_output max number of entries to be written to out_idx
+ * \param overlap_mask the mask generated by launch_gen_mask
+ * \param[in,out] rm_mask mask of removed boxes; must be initialized as 0
+ * \param[out] out_idx indices of boxes to be kept
+ * \param[out] out_size number of items written to out_idx; the remaining items
+ *      would be filled with the last valid item
+ */
+void launch_gen_indices(int nr_boxes, int max_output, int overlap_mask_width,
+                        const uint64_t* overlap_mask, uint64_t* rm_mask,
+                        uint32_t* out_idx, uint32_t* out_size,
+                        cudaStream_t stream);
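+
+// Typical launch sequence (condensed from NMSKeep::CUDAKern::exec in
+// nms_opr.cpp; mask_w = DIVUP(nr_boxes, 64), all buffers device-side):
+//
+//     cudaMemsetAsync(overlap_mask, 0, nr_boxes * mask_w * 8, stream);
+//     launch_gen_mask(nr_boxes, iou_thresh, boxes, mask_w, overlap_mask, stream);
+//     cudaMemsetAsync(rm_mask, 0, mask_w * 8, stream);
+//     launch_gen_indices(nr_boxes, max_output, mask_w, overlap_mask, rm_mask,
+//                        out_idx, out_size, stream);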
+
+} // namespace nms
+} // namespace standalone
+} // namespace opr
+} // namespace mgb
+
+// vim: ft=cuda syntax=cuda.doxygen
diff --git a/src/opr/impl/standalone/nms_opr.cpp b/src/opr/impl/standalone/nms_opr.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3a88a9a8669dba1c21c3cafff965c00c6b13ec89
--- /dev/null
+++ b/src/opr/impl/standalone/nms_opr.cpp
@@ -0,0 +1,272 @@
+#include "megbrain/opr/standalone/nms_opr.h"
+
+#if MGB_CUDA
+#include "./nms_kern.cuh"
+#endif
+#include "./nms_cpu.h"
+
+#include "megbrain/comp_node_env.h"
+#include "megbrain/serialization/sereg.h"
+#include "megbrain/utils/arith_helper.h" // for get_aligned_power2
+
+#if MGB_ENABLE_FBS_SERIALIZATION
+#include "megbrain/serialization/internal/mgb_cpp_opr_generated.h"
+#include "megbrain/serialization/internal/schema_generated.h"
+#endif
+
+using namespace mgb::opr::standalone;
+
+MGB_DYN_TYPE_OBJ_FINAL_IMPL(NMSKeep);
+
+class NMSKeep::Kern {
+public:
+    virtual ~Kern() = default;
+
+    //! get workspace size in bytes
+    virtual size_t get_workspace_size(const NMSKeep* opr,
+                                      const TensorShape& boxes) = 0;
+    virtual void exec(const NMSKeep* opr, const DeviceTensorND& inp,
+                      const DeviceTensorND& out_idx,
+                      const DeviceTensorND& out_size,
+                      const DeviceTensorND& workspace) = 0;
+};
+
+// f{{{ cuda kernel begins
+#if MGB_CUDA
+class NMSKeep::CUDAKern final : public Kern {
+    size_t m_workspace_overlap_mask_bytes, m_workspace_overlap_mask_bytes_align,
+           m_workspace_rm_mask_bytes;
+
+    void init(const NMSKeep* opr, const TensorShape& boxes) {
+        auto align = opr->comp_node().get_mem_addr_alignment();
+        size_t nr_boxes = boxes[1];
+        m_workspace_overlap_mask_bytes =
+                nr_boxes * DIVUP(nr_boxes, 64) * sizeof(uint64_t);
+        m_workspace_overlap_mask_bytes_align =
+                get_aligned_power2(m_workspace_overlap_mask_bytes, align);
+        m_workspace_rm_mask_bytes = DIVUP(nr_boxes, 64) * sizeof(uint64_t);
+    }
+
+public:
+    size_t get_workspace_size(const NMSKeep* opr,
+                              const TensorShape& boxes) override {
+        init(opr, boxes);
+        return m_workspace_overlap_mask_bytes_align + m_workspace_rm_mask_bytes;
+    }
+
+    void exec(const NMSKeep* opr, const DeviceTensorND& inp,
+              const DeviceTensorND& out_idx, const DeviceTensorND& out_size,
+              const DeviceTensorND& workspace) override;
+};
+
+void NMSKeep::CUDAKern::exec(const NMSKeep* opr, const DeviceTensorND& inp,
+                             const DeviceTensorND& out_idx,
+                             const DeviceTensorND& out_size,
+                             const DeviceTensorND& workspace) {
+    // NOTE: input comp node might be different from output comp node (for
+    // example, CUDA stream may be modified to overlap computations); a
+    // SingleCNOperatorNodeBase is expected to execute on a single comp node,
+    // and the comp node is defined as the output comp node
+    CompNode comp_node = out_idx.comp_node();
+
+    // comp node is also accessible from SingleCNOperatorNode
+    mgb_assert(comp_node == opr->comp_node());
+
+    // CompNodeEnv contains platform-specific properties of a CompNode
+    auto&& cuda_env = CompNodeEnv::from_comp_node(comp_node).cuda_env();
+    mgb_assert(cuda_env.device_prop.warpSize == 32, "invalid warp size: %d",
+               cuda_env.device_prop.warpSize);
+    auto stream = cuda_env.stream;
+
+    init(opr, inp.shape());
+
+    auto inp_ptr = inp.ptr<float>();
+    auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace.raw_ptr()),
+         dev_rm_mask = reinterpret_cast<uint64_t*>(
+                 workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align);
+    auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int>()),
+         out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int>());
+    size_t batch = inp.shape(0), nr_boxes = inp.shape(1);
+
+    MGB_CUDA_CHECK(cudaMemsetAsync(dev_overlap_mask, 0,
+                                   m_workspace_overlap_mask_bytes, stream));
+
+    auto max_output = opr->param().max_output;
+
+    for (size_t i = 0; i < batch; ++i) {
+        nms::launch_gen_mask(nr_boxes, opr->param().iou_thresh,
+                             inp_ptr + i * nr_boxes * 4, DIVUP(nr_boxes, 64),
+                             dev_overlap_mask, stream);
+
+        MGB_CUDA_CHECK(cudaMemsetAsync(dev_rm_mask, 0,
+                                       m_workspace_rm_mask_bytes, stream));
+        nms::launch_gen_indices(nr_boxes, max_output, DIVUP(nr_boxes, 64),
+                                dev_overlap_mask, dev_rm_mask,
+                                out_idx_ptr + i * max_output, out_size_ptr + i,
+                                stream);
+    }
+}
+
+#endif // MGB_CUDA for CUDAKern
+// f}}} cuda kernel ends
+
+// f{{{ cpu kernel begins
+class NMSKeep::CPUKern final : public Kern {
+public:
+    ~CPUKern() = default;
+
+    size_t get_workspace_size(const NMSKeep*,
+                              const TensorShape& boxes) override {
+        return nms::cpu_kern_workspace(boxes.shape[1]);
+    }
+
+    void exec(const NMSKeep* opr, const DeviceTensorND& inp,
+              const DeviceTensorND& out_idx, const DeviceTensorND& out_size,
+              const DeviceTensorND& workspace) override;
+};
+
+void NMSKeep::CPUKern::exec(const NMSKeep* opr, const DeviceTensorND& inp,
+                            const DeviceTensorND& out_idx,
+                            const DeviceTensorND& out_size,
+                            const DeviceTensorND& workspace) {
+    // See CUDAKern::exec for more explanation on output comp nodes.
+    CompNode comp_node = out_idx.comp_node();
+
+    auto inp_ptr = inp.ptr<float>();
+    auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int>()),
+         out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int>());
+    size_t batch = inp.shape(0), nr_boxes = inp.shape(1);
+    auto param = opr->param();
+
+    auto workspace_ptr = workspace.raw_ptr();
+
+    // NOTE: we must copy all the params into the kernel closure since it would
+    // be dispatched on a different thread
+    auto kern = [=]() {
+        for (size_t i = 0; i < batch; ++i) {
+            nms::cpu_kern(nr_boxes, param.max_output, param.iou_thresh,
+                          inp_ptr + i * nr_boxes * 4,
+                          out_idx_ptr + i * param.max_output, out_size_ptr + i,
+                          workspace_ptr);
+        }
+    };
+
+    // the kernel must not be invoked synchronously here: CPU comp nodes run
+    // their ops on a dispatcher thread, so enqueue it there instead
+    CompNodeEnv::from_comp_node(comp_node).cpu_env().dispatch(kern);
+}
+
+// f}}} cpu kernel ends
+
+NMSKeep::NMSKeep(VarNode* boxes, const Param& param,
+                 const OperatorNodeConfig& config)
+        : Super(boxes->owner_graph(),  // owner graph
+                config,                // OperatorNodeConfig
+                "nms_keep",  // opr type name (used for generating opr name)
+                {boxes}      // input vars for generating opr name
+                ),
+          m_param{param} {
+    mgb_assert(boxes->dtype() == dtype::Float32(),
+               "input should be float32; got %s", boxes->dtype().name());
+    // setup m_kern according to device type
+    switch (boxes->comp_node().device_type()) {
+#if MGB_CUDA
+        case CompNode::DeviceType::CUDA:
+            m_kern = std::make_unique<CUDAKern>();
+            break;
+#endif
+        case CompNode::DeviceType::CPU:
+            m_kern = std::make_unique<CPUKern>();
+            break;
+        default:
+            mgb_throw(MegBrainError, "NMSKeep: unsupported device type: %s",
+                      boxes->comp_node().to_string().c_str());
+    }
+
+    add_input({boxes});
+    add_output("indices")->dtype(dtype::Int32());
+    add_output("sizes")->dtype(dtype::Int32());
+    cg::add_workspace_output(this); // workspace is also an output var
+
+    // make the graph deduplication system consider m_param (so two oprs with
+    // same input vars but different param values would not be deduplicated)
+    add_equivalence_component<PODHash<Param>>(&m_param);
+}
+
+// impl dtor after Kern is defined
+NMSKeep::~NMSKeep() noexcept = default;
+
+mgb::SymbolVar NMSKeep::make(SymbolVar boxes, const Param& param,
+                             const OperatorNodeConfig& config) {
+    // SymbolVar is just a wrapper of VarNode*, with overloaded methods such as
+    // operator+()
+    auto bvar = boxes.node();
+    // insert opr into the owner graph of boxes
+    return boxes.insert_single_output_opr<NMSKeep>(bvar, param, config);
+}
+
+void NMSKeep::get_output_var_shape(const TensorShapeArray& inp_shape,
+                                   TensorShapeArray& out_shape) const {
+    auto boxes = inp_shape.at(0);
+    mgb_assert(boxes.ndim == 3 && boxes.shape[2] == 4, "invalid box shape: %s",
+               boxes.to_string().c_str());
+
+    // out_shape should match the outputs added in the constructor
+    mgb_assert(out_shape.size() == 3);
+
+    auto batch = boxes[0];
+    out_shape[0] = {batch, m_param.max_output};                // indices
+    out_shape[1] = {batch};                                    // sizes
+    out_shape[2] = {m_kern->get_workspace_size(this, boxes)};  // workspace
+}
+
+void NMSKeep::add_input_layout_constraint() {
+    input(0)->add_layout_constraint_contiguous();
+}
+
+void NMSKeep::scn_do_execute() {
+    DeviceTensorND empty_workspace;
+    m_kern->exec(this, input(0)->dev_tensor(), output(0)->dev_tensor(),
+                 output(1)->dev_tensor(),
+                 // if workspace size is 0, output(2) would be invalid and its
+                 // dev_tensor() can not be accessed
+                 output(2)->dev_tensor_valid() ? output(2)->dev_tensor()
+                                               : empty_workspace);
+}
+
+#if MGB_ENABLE_FBS_SERIALIZATION
+
+namespace mgb {
+namespace serialization {
+namespace fbs {
+
+template <>
+struct ParamConverter<opr::standalone::NMSKeep::Param> {
+    using FlatBufferType = param::NMSKeep;
+    static opr::standalone::NMSKeep::Param to_param(const FlatBufferType* fb) {
+        return {fb->iou_thresh(), fb->max_output()};
+    }
+    static flatbuffers::Offset<FlatBufferType> to_flatbuffer(
+            flatbuffers::FlatBufferBuilder& builder,
+            const opr::standalone::NMSKeep::Param& p) {
+        return param::CreateNMSKeep(builder, p.iou_thresh, p.max_output);
+    }
+};
+
+} // namespace fbs
+} // namespace serialization
+} // namespace mgb
+
+#endif
+
+namespace mgb {
+
+void _hack_pull_in_nms_opr_object() {}
+
+} // namespace mgb
+
+// register serialization: the default implementation uses Opr::Param; it
+// requires Param::TAG, Opr::param() and Opr::make(..., param) to exist.
+// Note: the second parameter (1) here means that this operator has one input
+using NMSKeepMGB = NMSKeep;
+MGB_SEREG_OPR(NMSKeepMGB, 1);
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/include/megbrain/opr/standalone/nms_opr.h b/src/opr/include/megbrain/opr/standalone/nms_opr.h
new file mode 100644
index 0000000000000000000000000000000000000000..a15e9f0ce812b0431eef6d1cecc9101132fd20e1
--- /dev/null
+++ b/src/opr/include/megbrain/opr/standalone/nms_opr.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include "megbrain/graph.h"
+#include "megbrain_build_config.h"
+
+namespace mgb {
+namespace opr {
+namespace standalone {
+
+/*!
+ * \brief generate indices of boxes to be kept after NMS
+ *
+ * See the docs in the python operator
+ */
+MGB_DEFINE_OPR_CLASS(NMSKeep,
+                     cg::SingleCNOutshapePureByInshapeOprBase) // {
+public:
+    struct Param {
+        //! TAG is used by the serializer to check Param type; here we
+        //! just use a random number. To generate such a random number,
+        //! run `xxd -l4 -p /dev/urandom`
+        static constexpr uint32_t TAG = 0x988a7630u;
+
+        float iou_thresh;     //!< IoU threshold for overlapping
+        uint32_t max_output;  //!< max number of output boxes per batch
+    };
+
+    NMSKeep(VarNode* boxes, const Param& param,
+            const OperatorNodeConfig& config);
+    ~NMSKeep() noexcept;
+
+    //! factory method to insert the operator into a graph
+    static SymbolVar make(SymbolVar boxes, const Param& param,
+                          const OperatorNodeConfig& config = {});
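+
+    // usage sketch (adapted from src/opr/test/standalone/nms.cpp): keep at
+    // most 16 boxes at IoU threshold 0.2; the second output var of the
+    // inserted opr holds the number of valid indices:
+    //
+    //     auto idx = NMSKeep::make(boxes, {0.2f, 16});
+    //     auto size = idx.node()->owner_opr()->output(1);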
+
+    const Param& param() const { return m_param; }
+
+private:
+    const Param m_param;
+
+    class Kern;
+    class CUDAKern;
+    class CPUKern;
+
+    std::unique_ptr<Kern> m_kern;
+
+    //! override output shape infer func provided by
+    //! SingleCNOutshapePureByInshapeOprBase
+    void get_output_var_shape(const TensorShapeArray& inp_shape,
+                              TensorShapeArray& out_shape) const override;
+
+    //! this opr requires inputs to be contiguous
+    void add_input_layout_constraint() override;
+
+    //! execute the operator
+    void scn_do_execute() override;
+};
+
+} // namespace standalone
+} // namespace opr
+} // namespace mgb
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/opr/test/standalone/nms.cpp b/src/opr/test/standalone/nms.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f53b05936cd8e718440a57d931effc4e442d4c2a
--- /dev/null
+++ b/src/opr/test/standalone/nms.cpp
@@ -0,0 +1,77 @@
+/**
+ * \file src/opr/test/standalone/nms.cpp
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ */
+
+#include "megbrain/opr/standalone/nms_opr.h"
+#include "megbrain/test/helper.h"
+#include "megbrain/opr/io.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/opr/tensor_gen.h"
+#include <memory>
+
+using namespace mgb;
+
+namespace {
+
+void run_on_comp_node(const char* cn_name) {
+    auto cn = CompNode::load(cn_name);
+    auto graph = ComputingGraph::make();
+    auto host_x = std::make_shared<HostTensorND>(cn, TensorShape{1, 2, 4},
+                                                 dtype::Float32{});
+    auto ptr = host_x->ptr<float>();
+    ptr[0] = 0.;  ptr[1] = 0.;
+    ptr[2] = 2.;  ptr[3] = 2.;
+    ptr[4] = 0.5; ptr[5] = 0.5;
+    ptr[6] = 1.5; ptr[7] = 1.5;
+
+    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
+
+    {
+        auto idx = opr::standalone::NMSKeep::make(x, {0.2, 16});
+        auto size = idx.node()->owner_opr()->output(1);
+        HostTensorND host_idx, host_size;
+        auto func = graph->compile({make_callback_copy(idx, host_idx),
+                                    make_callback_copy(size, host_size)});
+        func->execute().wait();
+        auto idx_ptr = host_idx.ptr<int>();
+        auto size_ptr = host_size.ptr<int>();
+        ASSERT_EQ(size_ptr[0], 1);
+        ASSERT_EQ(idx_ptr[0], 0);
+    }
+    {
+        auto idx = opr::standalone::NMSKeep::make(x, {0.5, 16});
+        auto size = idx.node()->owner_opr()->output(1);
+        HostTensorND host_idx, host_size;
+        auto func = graph->compile({make_callback_copy(idx, host_idx),
+                                    make_callback_copy(size, host_size)});
+        func->execute().wait();
+        auto idx_ptr = host_idx.ptr<int>();
+        auto size_ptr = host_size.ptr<int>();
+        ASSERT_EQ(size_ptr[0], 2);
+        ASSERT_EQ(idx_ptr[0], 0);
+        ASSERT_EQ(idx_ptr[1], 1);
+    }
+}
+
+} // anonymous namespace
+
+TEST(TestOprNMS, CPU) {
+    run_on_comp_node("cpu0");
+}
+
+TEST(TestOprNMS, GPU) {
+    REQUIRE_GPU(1);
+    run_on_comp_node("gpu0");
+}
+
+#if MGB_ENABLE_EXCEPTION
+TEST(TestOprNMS, InvalidInput) {
+    HostTensorGenerator<> gen;
+    auto graph = ComputingGraph::make();
+    auto host_x = gen({1, 9, 5});
+    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
+    ASSERT_ANY_THROW(opr::standalone::NMSKeep::make(x, {1., 1}));
+}
+#endif // MGB_ENABLE_EXCEPTION