Commit 0a0e4b60 authored by Megvii Engine Team

Merge branch 'master' into release-1.0

......@@ -53,9 +53,11 @@ option(MGE_WITH_DISTRIBUTED "Build with distributed support" ON)
option(MGE_BUILD_IMPERATIVE_RT "Build _imperative_rt Python Module " ON)
option(MGE_BUILD_SDK "Build load_and_run" ON)
option(MGE_INFERENCE_ONLY "Build inference only library." OFF)
option(MGE_WITH_PYTHON_MODULE "Build MegEngine legacy Python Module." OFF)
option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support" ON)
option(MGE_WITH_ROCM "Enable ROCM support" OFF)
if(NOT ${MGE_BIN_REDUCE} STREQUAL "")
message("build with BIN REDUCE")
if(MGE_WITH_MINIMUM_SIZE)
......@@ -152,6 +154,14 @@ if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386" OR ${MGE_ARCH} S
endif()
if(MSVC OR WIN32)
# for cmake after 3.15.2
cmake_policy(SET CMP0091 NEW)
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebug")
else()
set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded")
endif()
add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1)
message("-- into windows build...")
message("-- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}")
......@@ -285,7 +295,6 @@ if(MGE_WITH_TEST)
endif()
if(MGE_BUILD_IMPERATIVE_RT)
add_compile_definitions(MGB_ENABLE_IMPERATIVE_RUNTIME)
set(CMAKE_CXX_STANDARD 17)
endif()
......@@ -701,7 +710,8 @@ endif()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MARCH}")
set(MGB_ENABLE_IMPERATIVE ${MGE_BUILD_IMPERATIVE_RT})
set(MGE_VERSION_SCRIPT ${PROJECT_SOURCE_DIR}/src/version.ld CACHE INTERNAL "Path to linker version script")
# Write out megbrain_build_config.h
# It defines macros needed by both megbrain and dnn
configure_file(src/megbrain_build_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/genfiles/megbrain_build_config.h)
......@@ -831,3 +841,8 @@ if(MSVC OR WIN32)
endif()
endforeach()
endif()
if(MGE_WITH_JIT_MLIR)
add_subdirectory(tools/mlir/mgb-opt)
add_subdirectory(tools/mlir/mgb-file-check)
endif()
......@@ -682,6 +682,53 @@ protected:
size_t workspace_in_bytes);
};
/**
* \brief base class for AdaptivePooling
*/
class AdaptivePoolingBase : public OperatorBase {
DEF_OPR_IMPL_CTOR(AdaptivePoolingBase, OperatorBase);
DEF_OPR_PARAM(AdaptivePooling);
protected:
param::Pooling deduce_pooling_param(const TensorLayout& src,
const TensorLayout& dst);
};
class AdaptivePoolingForward : public AdaptivePoolingBase {
DEF_OPR_IMPL(AdaptivePoolingForward, AdaptivePoolingBase, 1, 1);
public:
/**
* \param[in] src input tensor
* \param[out] dst output tensor
*/
virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) = 0;
virtual size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst) = 0;
};
using AdaptivePooling = AdaptivePoolingForward;
class AdaptivePoolingBackward : public AdaptivePoolingBase {
DEF_OPR_IMPL(AdaptivePoolingBackward, AdaptivePoolingBase, 3, 1);
public:
/**
* \param[in] src the `src' parameter in AdaptivePoolingForward::exec
* \param[in] dst the `dst' parameter in AdaptivePoolingForward::exec
* \param[in] diff the backpropagated gradient wrt. dst
* \param[out] grad the backpropagated gradient wrt. src
*/
virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst,
_megdnn_tensor_in diff, _megdnn_tensor_out grad,
_megdnn_workspace workspace) = 0;
virtual size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) = 0;
};
/**
* \brief base class for Local
*/
......
......@@ -179,6 +179,11 @@ pdef('Axis').add_fields('int32', 'axis', 0)
add_enum_alias('Format', 'ConvolutionV0')
)
(pdef('AdaptivePooling').
add_enum_alias('Mode', 'Pooling').
add_enum_alias('Format', 'ConvolutionV0')
)
(pdef('LRN',
'see ImageNet Classification with Deep Convolutional Neural Networks for'
' meaning of the fields').
......
......@@ -55,8 +55,12 @@ void AtlasComputingContext::memcpy(void* dst, const void* src,
default:
megdnn_throw("bad atlas memcpy kind");
}
#if MGB_USE_ATLAS_ASYNC_API
acl_check(aclrtMemcpyAsync(dst, size_in_bytes, src, size_in_bytes,
atlas_kind, m_ctx.stream));
#else
acl_check(aclrtMemcpy(dst, size_in_bytes, src, size_in_bytes, atlas_kind));
#endif
}
void AtlasComputingContext::memset(void* dst, int value, size_t size_in_bytes) {
......@@ -65,7 +69,11 @@ void AtlasComputingContext::memset(void* dst, int value, size_t size_in_bytes) {
}
void AtlasComputingContext::synchronize() {
#if MGB_USE_ATLAS_ASYNC_API
acl_check(aclrtSynchronizeStream(m_ctx.stream));
#else
return;
#endif
}
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/common/adaptive_pooling.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "src/common/utils.h"
namespace megdnn {
param::Pooling AdaptivePoolingBase::deduce_pooling_param(
const TensorLayout& src, const TensorLayout& dst) {
megdnn_assert(param().format == param::AdaptivePooling::Format::NCHW);
size_t IH = src.shape[2], IW = src.shape[3], OH = dst.shape[2],
OW = dst.shape[3];
param::Pooling ret;
ret.mode = param().mode;
ret.format = param().format;
ret.pad_h = ret.pad_w = 0;
ret.stride_h = floor(IH / OH);
ret.stride_w = floor(IW / OW);
ret.window_h = IH - (OH - 1) * ret.stride_h;
ret.window_w = IW - (OW - 1) * ret.stride_w;
return ret;
}
} // namespace megdnn
// vim: syntax=cpp.doxygen
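As a quick illustration of the deduction above (a standalone sketch, not part of this patch; the helper name is made up), the stride is the integer quotient of the input and output extents, and the window is whatever remains so that the last window ends exactly at the input border:

```python
# Sketch mirroring AdaptivePoolingBase::deduce_pooling_param for NCHW layouts.
def deduce_pooling_param(ih, iw, oh, ow):
    stride_h, stride_w = ih // oh, iw // ow   # floor(IH / OH), floor(IW / OW)
    window_h = ih - (oh - 1) * stride_h       # extent covered by the last window
    window_w = iw - (ow - 1) * stride_w
    return {"pad": (0, 0), "stride": (stride_h, stride_w), "window": (window_h, window_w)}

# e.g. pooling a 7x9 feature map down to 3x4:
print(deduce_pooling_param(7, 9, 3, 4))  # stride (2, 2), window (3, 3)
```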
......@@ -392,8 +392,6 @@ TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const {
TensorLayout result{dtype, format};
result.ndim = tshape.ndim;
for (size_t i = 0; i < tshape.ndim; i++) {
megdnn_throw_if(!tshape.shape[i], tensor_reshape_error,
megdnn_mangle("target shape is 0"));
result.shape[i] = tshape.shape[i];
result.stride[i] = (tshape.shape[i] == 1);
}
......@@ -409,8 +407,6 @@ TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const {
for (size_t i = 0; i < tshape.ndim; ++i) {
int target_idx = tshape.ndim - i - 1;
int cur_idx = ndim - i - 1;
megdnn_throw_if(!tshape.shape[target_idx], tensor_reshape_error,
megdnn_mangle("target shape is 0"));
size_t cur_shape = (cur_idx >= 0 ? shape[cur_idx] : 1),
cur_stride = (cur_idx >= 0 ? stride[cur_idx] : 0);
if (tshape.shape[target_idx] != cur_shape) {
......@@ -434,10 +430,16 @@ TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const {
bool TensorLayout::try_reshape(TensorLayout& result,
const TensorShape& tshp) const {
megdnn_assert(tshp.ndim);
bool is_empty_shape = false;
for (size_t i = 0; i < tshp.ndim; ++i) {
megdnn_throw_if(!tshp.shape[i], tensor_reshape_error,
megdnn_mangle(ssprintf("bad target tshp: %s",
tshp.to_string().c_str())));
if (!tshp.shape[i]) {
megdnn_throw_if(!format.is_default(), tensor_reshape_error,
megdnn_mangle(ssprintf("bad target tshp: %s",
tshp.to_string().c_str())));
is_empty_shape = true;
break;
}
}
megdnn_throw_if(
......@@ -454,6 +456,11 @@ bool TensorLayout::try_reshape(TensorLayout& result,
result.format = this->format;
result.TensorShape::operator=(tshp);
if (is_empty_shape) {
result.init_contiguous_stride();
return true;
}
size_t sdim = 0, prod = 1, cont_sdim = 0;
for (size_t i = 0; i < tshp.ndim; ++i) {
megdnn_assert(cont_sdim < cont.ndim);
......
......@@ -199,6 +199,8 @@ private:
cb(Remap) \
cb(RemapBackwardData) \
cb(RemapBackwardMat) \
cb(AdaptivePoolingForward) \
cb(AdaptivePoolingBackward) \
/*!
* \brief specialize HandleImpl::create_operator for a single opr type;
......
/**
* \file dnn/src/cuda/adaptive_pooling/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/cuda/adaptive_pooling/opr_impl.h"
#include "src/cuda/utils.h"
namespace megdnn {
namespace cuda {
void AdaptivePoolingForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
_megdnn_workspace workspace) {
auto opr = handle()->create_operator<PoolingForward>();
opr->param() = deduce_pooling_param(src.layout, dst.layout);
opr->exec(src, dst, workspace);
}
size_t AdaptivePoolingForwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) {
auto opr = handle()->create_operator<PoolingForward>();
opr->param() = deduce_pooling_param(src, dst);
return opr->get_workspace_in_bytes(src, dst);
}
void AdaptivePoolingBackwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in dst,
_megdnn_tensor_in diff,
_megdnn_tensor_out grad,
_megdnn_workspace workspace) {
auto opr = handle()->create_operator<PoolingBackward>();
opr->param() = deduce_pooling_param(src.layout, dst.layout);
opr->exec(src, dst, diff, grad, workspace);
}
size_t AdaptivePoolingBackwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad) {
auto opr = handle()->create_operator<PoolingBackward>();
opr->param() = deduce_pooling_param(src, dst);
return opr->get_workspace_in_bytes(src, dst, diff, grad);
}
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/cuda/adaptive_pooling/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/cuda/cudnn_wrapper.h"
#include "src/cuda/utils.h"
namespace megdnn {
namespace cuda {
class AdaptivePoolingForwardImpl final : public AdaptivePoolingForward {
public:
using AdaptivePoolingForward::AdaptivePoolingForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst) override;
};
class AdaptivePoolingBackwardImpl final : public AdaptivePoolingBackward {
public:
using AdaptivePoolingBackward::AdaptivePoolingBackward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst,
_megdnn_tensor_in diff, _megdnn_tensor_out grad,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) override;
};
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -11,6 +11,7 @@
#include "src/common/handle_impl.h"
#include "src/cuda/adaptive_pooling/opr_impl.h"
#include "src/cuda/add_update/opr_impl.h"
#include "src/cuda/argmxx/opr_impl.h"
#include "src/cuda/argsort/opr_impl.h"
......
......@@ -72,6 +72,7 @@ namespace indexing_multi_axis_vec {
#define cb0(_dtype) \
MEGDNN_FOREACH_TENSOR_NDIM(INST, DTypeTrait<_dtype>::ctype)
MEGDNN_FOREACH_COMPUTING_DTYPE(cb0)
cb0(::megdnn::dtype::Bool)
#undef cb0
#undef INST
......
......@@ -39,6 +39,11 @@ __device__ void atomicAdd(megdnn::dt_int16 *, megdnn::dt_int16) {
((int*)0)[0] = 1;
}
__device__ void atomicAdd(megdnn::dt_bool *, megdnn::dt_bool) {
__trap();
((int*)0)[0] = 1;
}
#define KERN_APPLY_OPR_OPR \
::megdnn::cuda::indexing_multi_axis_vec::OprAtomicIncr
#include "./kern_apply_opr_impl.cuinl"
......
......@@ -120,6 +120,7 @@ void ExecImpl<Opr>::dispatch_exec() {
case DTypeTrait<_dtype>::enumv: \
return dispatch_exec_ctype<DTypeTrait<_dtype>::ctype>();
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
cb(::megdnn::dtype::Bool)
#undef cb
default:
megdnn_throw("bad dtype");
......
/**
* \file dnn/src/naive/adaptive_pooling/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/naive/adaptive_pooling/opr_impl.h"
#include "src/common/opr_delegate.h"
#include "src/common/utils.h"
#include "src/naive/handle.h"
namespace megdnn {
namespace naive {
void AdaptivePoolingForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_out dst,
_megdnn_workspace workspace) {
MEGDNN_DISPATCH_CPU_KERN(static_cast<naive::HandleImpl*>(handle()), {
auto opr = inplace_cpu_handle()->create_operator<PoolingForward>();
opr->param() = deduce_pooling_param(src.layout, dst.layout);
opr->exec(src, dst, workspace);
});
}
void AdaptivePoolingBackwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in dst,
_megdnn_tensor_in diff,
_megdnn_tensor_out grad,
_megdnn_workspace workspace) {
MEGDNN_DISPATCH_CPU_KERN(static_cast<naive::HandleImpl*>(handle()), {
auto opr = inplace_cpu_handle()->create_operator<PoolingBackward>();
opr->param() = deduce_pooling_param(src.layout, dst.layout);
opr->exec(src, dst, diff, grad, workspace);
});
}
size_t AdaptivePoolingBackwardImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst,
const TensorLayout& diff, const TensorLayout& grad) {
auto opr = inplace_cpu_handle()->create_operator<PoolingBackward>();
opr->param() = deduce_pooling_param(src, dst);
return opr->get_workspace_in_bytes(src, dst, diff, grad);
}
} // namespace naive
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/naive/adaptive_pooling/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/oprs.h"
#include "src/common/utils.h"
namespace megdnn {
namespace naive {
class AdaptivePoolingForwardImpl : public AdaptivePoolingForward {
public:
using AdaptivePoolingForward::AdaptivePoolingForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout&,
const TensorLayout&) override {
return 0;
}
};
class AdaptivePoolingBackwardImpl : public AdaptivePoolingBackward {
public:
using AdaptivePoolingBackward::AdaptivePoolingBackward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in dst,
_megdnn_tensor_in diff, _megdnn_tensor_out grad,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst,
const TensorLayout& diff,
const TensorLayout& grad) override;
};
} // namespace naive
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -13,6 +13,7 @@
#include "src/common/handle_impl.h"
#include "src/naive/adaptive_pooling/opr_impl.h"
#include "src/naive/add_update/opr_impl.h"
#include "src/naive/argmxx/opr_impl.h"
#include "src/naive/argsort/opr_impl.h"
......
......@@ -88,6 +88,7 @@ void dispatch_exec(HandleImpl *handle,
}
switch (data.layout.dtype.enumv()) {
MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
cb(::megdnn::dtype::Bool)
default:
megdnn_throw(megdnn_mangle("bad dtype"));
}
......
/**
* \file dnn/test/common/adaptive_pooling.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <cstddef>
#include "megdnn/basic_types.h"
#include "megdnn/opr_param_defs.h"
namespace megdnn {
namespace test {
namespace adaptive_pooling {
struct TestArg {
param::AdaptivePooling param;
TensorShape ishape;
TensorShape oshape;
TestArg(param::AdaptivePooling param, TensorShape ishape,
TensorShape oshape)
: param(param), ishape(ishape), oshape(oshape) {}
};
inline std::vector<TestArg> get_args() {
std::vector<TestArg> args;
using Param = param::AdaptivePooling;
using Mode = param::AdaptivePooling::Mode;
for (size_t i = 36; i < 40; ++i) {
args.emplace_back(Param{Mode::AVERAGE}, TensorShape{2, 3, i, i + 1},
TensorShape{2, 3, i - 4, i - 2});
args.emplace_back(Param{Mode::MAX}, TensorShape{2, 3, i, i + 1},
TensorShape{2, 3, i - 4, i - 2});
}
for (size_t i = 5; i < 10; ++i) {
args.emplace_back(Param{Mode::AVERAGE}, TensorShape{2, 3, i, i + 1},
TensorShape{2, 3, i - 3, i - 2});
args.emplace_back(Param{Mode::MAX}, TensorShape{2, 3, i, i + 1},
TensorShape{2, 3, i - 3, i - 2});
}
return args;
}
} // namespace adaptive_pooling
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -41,6 +41,8 @@ DEF(Images2NeibsForward, 2, true, true);
DEF(Images2NeibsBackward, 2, true, false);
DEF(PoolingForward, 2, true, true);
DEF(PoolingBackward, 4, true, false);
DEF(AdaptivePoolingForward, 2, true, false);
DEF(AdaptivePoolingBackward, 4, true, false);
DEF(LocalForward, 3, true, true);
DEF(LocalBackwardData, 3, true, false);
DEF(LocalBackwardFilter, 3, true, false);
......
/**
* \file dnn/test/cuda/adaptive_pooling.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "test/cuda/fixture.h"
#include "megdnn/tensor_iter.h"
#include "test/common/adaptive_pooling.h"
#include "test/common/checker.h"
#include "src/common/utils.h"
#include "test/cuda/utils.h"
#include <cudnn.h>
#include "test/cuda/benchmark.h"
namespace megdnn {
namespace test {
TEST_F(CUDA, ADAPTIVE_POOLING_FORWARD) {
auto args = adaptive_pooling::get_args();
using Format = param::AdaptivePooling::Format;
DType dtype = dtype::Float32();
for (auto&& arg : args) {
auto param = arg.param;
auto src = arg.ishape;
auto dst = arg.oshape;
param.format = Format::NCHW;
Checker<AdaptivePooling> checker(handle_cuda());
checker.set_epsilon(1e-2);
checker.set_param(param).set_dtype(0, dtype).set_dtype(1, dtype).exec(
TensorShapeArray{src, dst, {}});
}
}
TEST_F(CUDA, ADAPTIVE_POOLING_BACKWARD) {
auto args = adaptive_pooling::get_args();
for (auto&& arg : args) {
Checker<AdaptivePoolingBackward> checker(handle_cuda());
TensorLayout ilayout = TensorLayout(arg.ishape, dtype::Float32());
TensorLayout olayout = TensorLayout(arg.oshape, dtype::Float32());
auto constraint = [this,
arg](CheckerHelper::TensorValueArray& tensors_orig) {
megdnn_assert(tensors_orig.size() == 4);
auto opr = handle_cuda()->create_operator<AdaptivePoolingForward>();
opr->param() = arg.param;
auto tensors_cuda_storage = CheckerHelper::alloc_tensors(
handle_cuda(),
{tensors_orig[0].layout, tensors_orig[1].layout}, 0);
auto&& tensors_cuda = *tensors_cuda_storage;
auto span = tensors_cuda[0].layout.span();
auto dst = static_cast<dt_byte*>(tensors_cuda[0].raw_ptr) +
span.low_byte;
auto src = static_cast<const dt_byte*>(tensors_orig[0].raw_ptr) +
span.low_byte;
megdnn_memcpy_H2D(handle_cuda(), dst, src, span.dist_byte());
auto workspace_size = opr->get_workspace_in_bytes(
tensors_cuda[0].layout, tensors_cuda[1].layout);
auto workspace_cuda = megdnn_malloc(handle_cuda(), workspace_size);
Workspace workspace{static_cast<dt_byte*>(workspace_cuda),
workspace_size};
opr->exec(tensors_cuda[0], tensors_cuda[1], workspace);
megdnn_free(handle_cuda(), workspace_cuda);
span = tensors_cuda[1].layout.span();
dst = static_cast<dt_byte*>(tensors_orig[1].raw_ptr) +
span.low_byte;
src = static_cast<const dt_byte*>(tensors_cuda[1].raw_ptr) +
span.low_byte;
megdnn_memcpy_D2H(handle_cuda(), dst, src, span.dist_byte());
};
DType dtype = dtype::Float32();
checker.set_tensors_constraint(constraint)
.set_dtype(0, dtype)
.set_dtype(1, dtype)
.set_dtype(2, dtype)
.set_dtype(3, dtype)
.set_param(arg.param)
.exec(TensorShapeArray{ilayout, olayout, olayout, ilayout});
}
}
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/oprs/nn.h"
......@@ -37,7 +38,7 @@ std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
......@@ -614,11 +615,8 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
param.stride_h = param.stride_w = 1;
param.format = param::ConvBias::Format::CHWN4;
param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
checker.set_param(param).execs({{4, 12, 12, 32, 4},
{4, 3, 3, 16, 4},
{4, 1, 1, 1, 4},
{},
{}});
checker.set_param(param).execs(
{{4, 12, 12, 32, 4}, {4, 3, 3, 16, 4}, {4, 1, 1, 1, 4}, {}, {}});
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
......@@ -1076,7 +1074,6 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
}
#if CUDA_VERSION >= 10020
/// \note: we only check several cases and block sizes in megdnn_test, the full
/// testcases are written in cutlass repository
......@@ -1234,8 +1231,7 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
handle_cuda(), get_resnet50_bench_args(64),
dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
"INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
param::ConvBias::Format::NCHW4);
"INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4);
}
#endif
} // namespace test
......
......@@ -47,8 +47,7 @@ add_custom_target(gen_opr_py DEPENDS ${GEN_OPS_FILE})
##################### end of opdef generation #########################
set(VERSION_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/src/version.ld)
add_custom_target(_version_ld SOURCES ${VERSION_SCRIPT})
add_custom_target(_version_ld SOURCES ${MGE_VERSION_SCRIPT})
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/pybind11 ${PROJECT_BINARY_DIR}/third_party/pybind11)
pybind11_add_module(${MODULE_NAME} NO_EXTRAS ${SRCS})
......@@ -57,8 +56,21 @@ if (APPLE)
elseif (MSVC OR WIN32)
# Windows does not support implicitly importing data members from DLL.
target_link_libraries(${MODULE_NAME} PRIVATE megbrain megdnn)
message("-- CMAKE_MSVC_RUNTIME_LIBRARY: ${CMAKE_MSVC_RUNTIME_LIBRARY}")
set_target_properties(${MODULE_NAME} PROPERTIES MSVC_RUNTIME_LIBRARY "${CMAKE_MSVC_RUNTIME_LIBRARY}")
else()
target_link_libraries(${MODULE_NAME} PRIVATE megengine_export -Wl,--version-script=${VERSION_SCRIPT})
if (MGE_WITH_PYTHON_MODULE)
# used to fix a runtime crash when building both mgb (MGE_WITH_PYTHON_MODULE) and imperative (MGE_BUILD_IMPERATIVE_RT)
target_link_libraries(${MODULE_NAME} PRIVATE megengine_export -Wl,--version-script=${MGE_VERSION_SCRIPT})
else()
# used to reduce the whl size by depending on megbrain/dnn directly; otherwise cmake creates two cuda fatbin
# elf sections, one in megengine_export and one in the target that depends on megengine_export
target_link_libraries(${MODULE_NAME} PRIVATE megbrain megdnn -Wl,--version-script=${MGE_VERSION_SCRIPT})
if (MGE_WITH_DISTRIBUTED)
message("-- Imperative configured to link megray")
target_link_libraries(${MODULE_NAME} PRIVATE megray)
endif()
endif()
endif()
target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR})
......
......@@ -76,7 +76,7 @@ from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
from .serialization import load, save
from .tensor import Parameter, Tensor, tensor
from .version import __version__
from .core import cgtools
from .utils import comp_graph_tools as cgtools
_set_fork_exec_path_for_timed_func(
sys.executable,
......
......@@ -20,7 +20,7 @@ class GradManager:
the forward operations start and when all resources should be released. A typical usage of
GradManager is as follows:
.. codeblock::
.. code-block::
gm = GradManager()
gm.attach(model.parameters())
......@@ -32,7 +32,7 @@ class GradManager:
You can also use `record()` and `release()` method instead of `with` context:
.. codeblock::
.. code-block::
gm = GradManager()
gm.attach(model.parameters())
......@@ -50,7 +50,7 @@ class GradManager:
processes. Users will finally get the averaged gradients if an "AllReduce"
callback is registered as follows:
.. codeblock::
.. code-block::
import megengine.distributed as dist
......@@ -71,7 +71,7 @@ class GradManager:
r"""Registers parameters that gradients should be calculated with respect to.
Callback Functions should have a signature like this:
.. codeblock::
.. code-block::
def cb(param: Tensor, grad: Tensor) -> Tensor:
# do something
......@@ -100,6 +100,8 @@ class GradManager:
:param ys: outputs of forward operators, e.g., the loss tensor
:param dys: derivatives of ys
"""
from ..functional import ones_like
global backwarding_grad_manager
cache = backwarding_grad_manager
backwarding_grad_manager = self
......@@ -113,7 +115,7 @@ class GradManager:
if not isinstance(ys, (tuple, list)):
ys = [ys]
if dys is None:
dys = [tensor(1.0).broadcast(y.shape) for y in ys]
dys = [ones_like(y) for y in ys]
if not isinstance(dys, (tuple, list)):
dys = [dys]
try:
......
......@@ -11,4 +11,3 @@ import sys
from .tensor import Tensor
from .tensor.megbrain_graph import Graph
from .utils import comp_graph_tools as cgtools
......@@ -22,11 +22,13 @@ class Device:
else:
self._cn = CompNode(device)
self.logical_name = self._cn.logical_name
def to_c(self):
return self._cn
def __repr__(self):
return "{}({})".format(type(self).__qualname__, self)
return "{}({})".format(type(self).__qualname__, repr(self._cn))
def __str__(self):
return str(self._cn)
......
......@@ -160,7 +160,7 @@ def subtensor_grad_fn(op, inputs, outputs, input_requires_grad):
def make_grad(grad_op, dy):
grad = (
TensorWrapper(0, dtype=dy.dtype, device=dy.device)
.broadcast(TensorWrapper(input_shape))
._broadcast(TensorWrapper(input_shape))
.__wrapped__
)
(dx,) = apply(grad_op, grad, dy, *params)
......@@ -186,7 +186,7 @@ def indexingMultiAxisVec_grad_fn(op, inputs, outputs, input_requires_grad):
def make_grad(grad_op, dy):
grad = (
TensorWrapper(0, dtype=dy.dtype, device=dy.device)
.broadcast(TensorWrapper(input_shape))
._broadcast(TensorWrapper(input_shape))
.__wrapped__
)
(dx,) = apply(grad_op, grad, dy, *params)
......
......@@ -50,8 +50,8 @@ class Function:
"""
Applies operations to ``inputs`` and returns results. It must be overriden by all subclasses.
:param input: Input tensors.
:return: A tuple of Tensor or a single Tensor.
:param input: input tensors.
:return: a tuple of Tensor or a single Tensor.
.. note::
......@@ -64,12 +64,12 @@ class Function:
"""
Compute the gradient of the forward function. It must be overriden by all subclasses.
:param output_grads: gradients of outputs that are returned by :meth:`~.function.Function.forward`
:param output_grads: gradients of outputs that are returned by :meth:`~.function.Function.forward`.
.. note::
.. note::
In case when some tensors of outputs are not related to loss function, the corresponding
values in ``output_grads`` would be ``None``.
In case when some tensors of outputs are not related to loss function, the corresponding
values in ``output_grads`` would be ``None``.
.. note::
......
......@@ -173,7 +173,7 @@ def unpack_getitem(inp, tuple_val, *, allow_newaxis=True):
item.append(True)
v = get_index(v)
assert np.issubdtype(v.dtype, np.integer) or np.issubdtype(
v.dtype, np.bool
v.dtype, np.bool_
), "var type in the subscript must be int or bool"
tensors.append(v)
......@@ -267,7 +267,7 @@ def setitem(tensor, index, value):
value.shape, tmp_result.shape
)
)
value = value.broadcast(tmp_result.shape)
value = value._broadcast(tmp_result.shape)
if use_subtensor:
op = builtin.SetSubtensor(items=items)
else:
......
......@@ -8,6 +8,7 @@
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import collections
import json
import os
import threading
import weakref
from concurrent.futures import Future, ThreadPoolExecutor
......@@ -49,7 +50,16 @@ class Graph(_imperative_rt.ComputingGraph):
def execute(self, *args):
assert self._future is None
self._future = self._executor.submit(self._function.execute, *args)
def wrapped(*args):
try:
self._function.execute(*args)
except Exception as exc:
for i in self._function._all_rendezvous:
i.set_exception(str(exc))
raise exc
self._future = self._executor.submit(wrapped, *args)
def wait(self):
assert self._future is not None
......@@ -275,6 +285,7 @@ def dump_graph(
keep_param_name: bool = False,
keep_opr_priority: bool = False,
strip_info_file=None,
append_json=False
):
"""serialize the computing graph of `output_vars` and get byte result.
......@@ -295,6 +306,9 @@ def dump_graph(
:param keep_opr_priority: whether to keep priority setting for operators
:param strip_info_file: a string for path or a file handler. If it is not None,
then the dump information for code strip would be written to ``strip_info_file``
:param append_json: only checked when `strip_info_file` is not None. If set to
True, the information for code strip will be appended to strip_info_file;
if set to False, strip_info_file will be overwritten
:return: dump result as byte string, and an instance of namedtuple
:class:`CompGraphDumpResult`, whose fields are:
......@@ -342,10 +356,25 @@ def dump_graph(
if strip_info_file is not None:
if isinstance(strip_info_file, str):
strip_info_file = open(strip_info_file, "w")
strip_info = json.loads(_imperative_rt.get_info_for_strip(ov))
strip_info["hash"] = dump_info.content_hash
json.dump(strip_info, strip_info_file)
if not os.path.exists(strip_info_file):
os.mknod(strip_info_file)
strip_info_file = open(strip_info_file, "r+")
new_strip_dict = json.loads(_imperative_rt.get_info_for_strip(ov))
ori_strip_dict = new_strip_dict
json_content = strip_info_file.read()
if append_json and len(json_content) != 0:
# if the json file already has contents, read them first and then append the new information
ori_strip_dict = json.loads(json_content)
for k in ori_strip_dict:
new_strip_dict_v = new_strip_dict.get(k)
if new_strip_dict_v is not None:
for value in new_strip_dict_v:
if not value in ori_strip_dict[k]:
ori_strip_dict[k].append(value)
ori_strip_dict["hash"] = dump_info.content_hash
strip_info_file.seek(0)
strip_info_file.truncate()
json.dump(ori_strip_dict, strip_info_file)
return dump_content, dump_info
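The append path above can be summarized by the following standalone sketch (the helper name and sample keys are illustrative only): existing entries are kept, values from the new dump are appended only if they are not already present, and the content hash is refreshed.

```python
import json

# Hypothetical helper mirroring the append_json branch of dump_graph above.
def merge_strip_info(existing_json: str, new_strip: dict, content_hash: str) -> dict:
    merged = new_strip
    if existing_json:
        merged = json.loads(existing_json)
        for k in merged:                      # only keys already present are extended
            for v in new_strip.get(k) or []:
                if v not in merged[k]:
                    merged[k].append(v)
    merged["hash"] = content_hash
    return merged

print(merge_strip_info('{"oprs": ["Elemwise"]}', {"oprs": ["Elemwise", "Pooling"]}, "abc"))
# {'oprs': ['Elemwise', 'Pooling'], 'hash': 'abc'}
```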
......@@ -358,7 +387,7 @@ CompGraphLoadResult = collections.namedtuple(
def load_graph(fpath):
"""Load a serialized computing graph from file.
:parma fpath: Path or Handle for the output file
:param fpath: Path or Handle of the input file
:return: An instance of namedtuple :class:`CompGraphLoadResult`,
whose fields are:
......
......@@ -40,6 +40,8 @@
# All Megvii Modifications are Copyright (C) 2014-2020 Megvii Inc. All rights reserved.
# --------------------------------------------------------------------------------------
from collections import OrderedDict
from .utils import _toposort, groupby
from .variadic import isvariadic
......@@ -159,5 +161,5 @@ def ordering(signatures):
for s in signatures:
if s not in edges:
edges[s] = []
edges = dict((k, [b for a, b in v]) for k, v in edges.items())
edges = OrderedDict((k, [b for a, b in v]) for k, v in edges.items())
return _toposort(edges)
......@@ -100,6 +100,8 @@ def _(data: DeviceTensorND):
@as_raw_tensor.register(np.ndarray)
def _(array: np.ndarray, dtype=None, device=None):
device = None if device is None else as_device(device).to_c()
if 0 in array.strides:
array = array.squeeze().reshape(array.shape)
return RawTensor(put(array, dtype=dtype, device=device))
......
......@@ -57,7 +57,29 @@ def _transpose(data, axes):
def _broadcast(inp, shape):
def valid_broadcast(src, tar):
def failed():
raise ValueError(
"the input shape {} can not be broadcasted to target shape {}".format(
src, tar
)
)
if isinstance(src, (TensorBase, TensorWrapperBase)):
src = src.numpy()
if isinstance(tar, (TensorBase, TensorWrapperBase)):
tar = tar.numpy()
if len(src) > len(tar):
failed()
for i in range(min(len(src), len(tar))):
if src[-i - 1] != 1 and src[-i - 1] != tar[-i - 1]:
failed()
shape = utils.astensor1d(shape, inp, dtype="int32", device=inp.device)
valid_broadcast(inp.shape, shape)
(result,) = apply(builtin.Broadcast(), inp, shape)
return result
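The rule enforced by `valid_broadcast` can be summarized in a standalone sketch (an illustrative helper, not part of the module): shapes are aligned from the right, the source may not have more dimensions than the target, and every aligned source dimension must be 1 or equal to the target dimension.

```python
# Sketch of the broadcast validity check added above.
def can_broadcast(src, tar):
    if len(src) > len(tar):
        return False
    return all(s == 1 or s == t for s, t in zip(reversed(src), reversed(tar)))

assert can_broadcast((1, 3), (2, 4, 3))   # size-1 dims expand, trailing dims match
assert not can_broadcast((2, 3), (4, 3))  # 2 is neither 1 nor equal to 4
```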
......@@ -158,6 +180,10 @@ def _reduce(mode):
def f(self, axis=None, keepdims: bool = False):
data = self
(data,) = utils.convert_inputs(data)
if mode == "MEAN":
data = data.astype("float32")
elif self.dtype == np.bool_:
data = data.astype("int32")
if axis is None:
data = data.reshape(-1)
assert not keepdims, "can not set axis=None and keepdims=True"
......@@ -180,6 +206,9 @@ def _reduce(mode):
if not keepdims:
result = _remove_axis(result, axis)
if self.dtype == np.bool_:
if mode in ["MIN", "MAX"]:
result = result.astype("bool")
return result
return f
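A standalone numpy sketch of the dtype handling added to `_reduce` above (names are illustrative): MEAN is computed in float32, boolean inputs are reduced as int32, and MIN/MAX results on boolean inputs are cast back to bool.

```python
import numpy as np

# Illustrative re-implementation of the dtype rules in _reduce above.
def reduce_with_dtype_rules(data, mode):
    was_bool = data.dtype == np.bool_
    if mode == "MEAN":
        data = data.astype("float32")
    elif was_bool:
        data = data.astype("int32")
    out = {"SUM": np.sum, "MEAN": np.mean, "MIN": np.min, "MAX": np.max}[mode](data)
    if was_bool and mode in ("MIN", "MAX"):
        out = out.astype("bool")
    return out

a = np.array([False, True, True])
print(reduce_with_dtype_rules(a, "SUM"), reduce_with_dtype_rules(a, "MAX"))  # 2 True
```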
......@@ -203,7 +232,8 @@ def _todo(*_):
def _expand_args(args):
if len(args) == 1:
if isinstance(
args[0], (collections.abc.Sequence, TensorBase, TensorWrapperBase)
args[0],
(collections.abc.Sequence, TensorBase, TensorWrapperBase, np.ndarray),
):
args = args[0]
return args
......@@ -366,7 +396,8 @@ class ArrayMethodMixin(abc.ABC):
def reshape(self, *args):
return _reshape(self, _expand_args(args))
def broadcast(self, *args):
# FIXME: remove this method
def _broadcast(self, *args):
return _broadcast(self, _expand_args(args))
def transpose(self, *args):
......@@ -377,7 +408,38 @@ class ArrayMethodMixin(abc.ABC):
def flatten(self):
return self.reshape(-1)
sum = _reduce("SUM")
def sum(self, axis=None, keepdims: bool = False):
r"""Returns the sum of each row of the input tensor in the given dimension ``axis``.
If ``axis`` is a list of axes, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor, except in the dimension(s) ``axis`` where it is of size 1. Otherwise, ``axis`` is squeezed(see :meth:`~.functional.tensor.squeeze`).
Same for prod/mean/max/min.
:param axis: the dimension or dimensions to reduce.
:param keepdims: whether the output tensor has ndim retained or not.
:return: output tensor.
Examples:
.. testcode::
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.sum().numpy())
print(b.sum().numpy())
Outputs:
.. testoutput::
[2]
[10.]
"""
return _reduce("SUM")(self, axis, keepdims)
prod = _reduce("PRODUCT")
min = _reduce("MIN")
max = _reduce("MAX")
......
......@@ -16,39 +16,74 @@ from ..ops.special import Const
from ..tensor.core import OpBase, TensorBase, TensorWrapperBase, apply
def dtype_promotion(raw_inputs):
def add_dtype(i):
if type(i) == int:
return np.array(i, dtype=np.int32)
if type(i) == float:
return np.array(i, dtype=np.float32)
if type(i) == bool:
return np.array(i, dtype=np.bool_)
return None
scalar_inputs = [
add_dtype(i) for i in raw_inputs if not hasattr(i, "dtype") and add_dtype(i)
]
inputs = [i for i in raw_inputs if hasattr(i, "dtype")]
assert len(scalar_inputs + inputs) > 0
dtype = None
if len(inputs) > 0:
dtype = np.result_type(*inputs)
dtype_all = np.result_type(*(inputs + scalar_inputs))
assert (
dtype != np.float64 and dtype != np.int64
), "unsupport dtype {} by dtype_promotion, please use explict type convert".format(
dtype
)
if dtype_all == np.bool_:
for i in raw_inputs:
if not hasattr(i, "dtype") or i.dtype != np.bool_:
raise TypeError(
"bool dtype can not be operated with an element without bool dtype"
)
if dtype_all == np.float64:
dtype_all = np.float32
return dtype_all
def dtype_promotion(inputs):
"""
Returns the dtype that would result from performing an arithmetic
operation on the provided input tensors and scalars.
"""
# map numpy.dtype.kind to priority
category_priority = {
"f": 3, # floating-point
"i": 2, # signed integer
"u": 2, # unsigned integer
"b": 1, # boolean
}
def scalar2dtype(x):
"""
For scalar `x`, returns its corresponding type. A floating point scalar
has dtype 'float32'. An integral non-boolean scalar has dtype 'int32'.
A boolean scalar has dtype 'bool'.
"""
if isinstance(x, bool):
return np.bool_
if isinstance(x, int):
return np.int32
if isinstance(x, float):
return np.float32
def promote_types(types, cat):
"""
Returns the data type with sufficient size to hold all types of
category `cat` in the list `types`.
"""
used_types = [
i for i in types if category_priority.get(np.dtype(i).kind, 0) == cat
]
assert len(used_types) > 0
res = used_types[0]
for i in used_types:
res = np.promote_types(res, i)
return res
def max_priority(types):
"""
Returns the maximum value of the priority of each type in the list
`types`.
"""
if not types:
return 0
else:
return max([category_priority.get(np.dtype(i).kind, 0) for i in types])
scalars = []
tensors = []
for data in inputs:
if hasattr(data, "dtype"):
tensors.append(data.dtype)
elif isinstance(data, (float, int, bool)):
scalars.append(scalar2dtype(data))
max_pri_scalars = max_priority(scalars)
max_pri_tensors = max_priority(tensors)
assert max_pri_scalars > 0 or max_pri_tensors > 0
if max_pri_scalars > max_pri_tensors:
return promote_types(scalars, max_pri_scalars)
else:
return promote_types(tensors, max_pri_tensors)
def get_device(inputs):
......
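The priority scheme above can be exercised with a small standalone sketch (a simplified re-implementation for illustration, not the module's API): scalar dtypes only win when their category, float > int > bool, outranks every tensor dtype, and promotion then happens only among dtypes of that top category.

```python
import numpy as np

PRIORITY = {"f": 3, "i": 2, "u": 2, "b": 1}  # floating > integer > boolean

def promote(tensor_dtypes, scalar_dtypes):
    def pri(ds):
        return max((PRIORITY.get(np.dtype(d).kind, 0) for d in ds), default=0)
    chosen = scalar_dtypes if pri(scalar_dtypes) > pri(tensor_dtypes) else tensor_dtypes
    top = pri(chosen)
    used = [d for d in chosen if PRIORITY.get(np.dtype(d).kind, 0) == top]
    res = used[0]
    for d in used:
        res = np.promote_types(res, d)
    return res

print(promote([np.int32], [np.float32]))  # float32: a float scalar outranks integer tensors
print(promote([np.float16], [np.int32]))  # float16: an int scalar cannot widen float tensors
```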
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .comp_graph_tools import *
......@@ -26,7 +26,7 @@ def _clear_plasma_store():
# `_PlasmaStoreManager.__del__` will not be called automatically in subprocess,
# so this function should be called explicitly
global MGE_PLASMA_STORE_MANAGER
if MGE_PLASMA_STORE_MANAGER is not None:
if MGE_PLASMA_STORE_MANAGER is not None and MGE_PLASMA_STORE_MANAGER.refcount == 0:
del MGE_PLASMA_STORE_MANAGER
MGE_PLASMA_STORE_MANAGER = None
......@@ -50,6 +50,7 @@ class _PlasmaStoreManager:
stderr=None if debug_flag else subprocess.DEVNULL,
)
self.__initialized = True
self.refcount = 1
def __del__(self):
if self.__initialized and self.plasma_store.returncode is None:
......@@ -83,6 +84,8 @@ class PlasmaShmQueue:
"Exception happened in starting plasma_store: {}\n"
"Tips: {}".format(str(e), err_info)
)
else:
MGE_PLASMA_STORE_MANAGER.refcount += 1
self.socket_name = MGE_PLASMA_STORE_MANAGER.socket_name
......@@ -133,6 +136,8 @@ class PlasmaShmQueue:
def close(self):
self.queue.close()
self.disconnect_client()
global MGE_PLASMA_STORE_MANAGER
MGE_PLASMA_STORE_MANAGER.refcount -= 1
_clear_plasma_store()
def cancel_join_thread(self):
......
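A minimal sketch of the reference-counting pattern introduced above for the shared plasma store (names are made up for illustration): the global manager is created once, every queue that connects bumps the count, and teardown only happens when the count returns to zero.

```python
# Hypothetical stand-in for _PlasmaStoreManager / _clear_plasma_store above.
_MANAGER = None

class _Manager:
    def __init__(self):
        self.refcount = 1          # the creator holds the first reference

def acquire():
    global _MANAGER
    if _MANAGER is None:
        _MANAGER = _Manager()
    else:
        _MANAGER.refcount += 1
    return _MANAGER

def release():
    global _MANAGER
    _MANAGER.refcount -= 1
    if _MANAGER.refcount == 0:     # mirrors the guard added to _clear_plasma_store
        _MANAGER = None

acquire(); acquire()
release(); assert _MANAGER is not None   # still one user left
release(); assert _MANAGER is None       # last user gone, store torn down
```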
......@@ -34,14 +34,14 @@ default_collate_err_msg_format = (
class Collator:
r"""
Used for merge a list of samples to form a mini-batch of Tenor(s). Used when using batched loading from a dataset.
modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py
Used for merging a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a dataset.
Modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py
"""
def apply(self, inputs):
"""
input : sequence_N(tuple(CHW, C, CK))
output : tuple(NCHW, NC, NCK)
:param input: sequence_N(tuple(CHW, C, CK)).
:return: tuple(NCHW, NC, NCK).
"""
elem = inputs[0]
elem_type = type(elem)
......
......@@ -43,7 +43,7 @@ class DataLoader:
):
r"""Provides a convenient way to iterate on a given dataset.
`DataLoader` combines a dataset with sampler, transform and collator,
`DataLoader` combines a dataset with `sampler`, `transform` and `collator`,
make it flexible to get minibatch continually from a dataset.
:type dataset: Dataset
......@@ -53,21 +53,21 @@ class DataLoader:
If specified, :attr:`shuffle` must be ``False``.
:type transform: Transform
:param transform: defined the transforming strategy for a sampled batch.
(default: ``None``)
Default: None
:type collator: Collator
:param collator: defined the merging strategy for a transformed batch.
(default: ``None``)
Default: None
:type num_workers: int
:param num_workers: the number of sub-process to load, transform and collate
the batch. ``0`` means using single-process. (default: ``0``)
the batch. ``0`` means using single-process. Default: 0
:type timeout: int
:param timeout: if positive, means the timeout value(second) for collecting a
batch from workers. (default: 0)
batch from workers. Default: 0
:type divide: bool
:param divide: define the parallelism strategy in multi-processing mode.
``True`` means one batch is divided into :attr:`num_workers` pieces, and
the workers will process these pieces parallelly. ``False`` means
different sub-process will process different batch. (default: ``False``)
different sub-process will process different batch. Default: False
"""
......
......@@ -12,7 +12,7 @@ from typing import Tuple
class Dataset(ABC):
r"""
An abstract class for all Datasets
An abstract class for all Datasets.
"""
@abstractmethod
......@@ -22,8 +22,8 @@ class Dataset(ABC):
class MapDataset(Dataset):
r"""
An abstract class for map data
__getitem__ and __len__ method are aditionally needed
An abstract class for map data.
__getitem__ and __len__ methods are additionally needed.
"""
@abstractmethod
......@@ -41,8 +41,8 @@ class MapDataset(Dataset):
class StreamDataset(Dataset):
r"""
An abstract class for stream data
__iter__ method is aditionally needed
An abstract class for stream data.
__iter__ method is additionally needed.
"""
@abstractmethod
......
......@@ -21,7 +21,7 @@ logger = get_logger(__name__)
class CIFAR10(VisionDataset):
r""" ``Dataset`` for CIFAR10 meta data
r""" ``Dataset`` for CIFAR10 meta data.
"""
url_path = "http://www.cs.utoronto.ca/~kriz/"
......
......@@ -118,7 +118,7 @@ class COCO(VisionDataset):
self.ids = ids
self.json_category_id_to_contiguous_id = {
v: i + 1 for i, v in enumerate(self.cats.keys())
v: i + 1 for i, v in enumerate(sorted(self.cats.keys()))
}
self.contiguous_category_id_to_json_id = {
......
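The effect of sorting the category keys above can be seen in a tiny standalone example (the category table is hypothetical): enumerating sorted ids yields a deterministic contiguous-id mapping regardless of dict iteration order.

```python
cats = {18: "dog", 1: "person", 3: "car"}  # hypothetical COCO-style category table
json_to_contiguous = {v: i + 1 for i, v in enumerate(sorted(cats.keys()))}
print(json_to_contiguous)  # {1: 1, 3: 2, 18: 3}
```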
......@@ -30,19 +30,18 @@ class ImageFolder(VisionDataset):
r"""
ImageFolder is a class for loading image data and labels from an organized folder.
the folder is expected to be organized as followed
root/cls/xxx.img_ext
The folder is expected to be organized as follows: root/cls/xxx.img_ext
labels are indices of sorted classes in the root directory
Labels are indices of sorted classes in the root directory.
:param root: root directory of an image folder
:param root: root directory of an image folder.
:param loader: a function used to load image from path,
if ``None``, default function that loads
images with PILwill be called
images with PIL will be called.
:param check_valid_func: a function used to check if files in folder are
expected image files, if ``None``, default function
that checks file extensions will be called
:param class_name: if ``True``, return class name instead of class index
that checks file extensions will be called.
:param class_name: if ``True``, return class name instead of class index.
"""
super().__init__(root, order=("image", "image_category"))
......
......@@ -31,7 +31,7 @@ logger = get_logger(__name__)
class ImageNet(ImageFolder):
r"""
Load ImageNet from raw files or folder, expected folder looks like
Load ImageNet from raw files or folder. Expected folder looks like:
.. code-block:: bash
......@@ -60,25 +60,25 @@ class ImageNet(ImageFolder):
def __init__(self, root: str = None, train: bool = True, **kwargs):
r"""
initialization:
Initialization:
* if ``root`` contains ``self.target_folder`` depent on ``train``:
* if ``root`` contains ``self.target_folder`` depending on ``train``:
* initialize ImageFolder with target_folder
* initialize ImageFolder with target_folder.
* else:
* if all raw files are in ``root``:
* parse ``self.target_folder`` from raw files
* initialize ImageFolder with ``self.target_folder``
* parse ``self.target_folder`` from raw files.
* initialize ImageFolder with ``self.target_folder``.
* else:
* raise error
* raise error.
:param root: root directory of imagenet data, if root is ``None``, used default_dataset_root
:param train: if ``True``, load the train split, otherwise load the validation split
:param root: root directory of imagenet data, if root is ``None``, use default_dataset_root.
:param train: if ``True``, load the train split, otherwise load the validation split.
"""
# process the root path
......
......@@ -22,12 +22,12 @@ logger = get_logger(__name__)
class MNIST(VisionDataset):
r""" ``Dataset`` for MNIST meta data
r""" ``Dataset`` for MNIST meta data.
"""
url_path = "http://yann.lecun.com/exdb/mnist/"
"""
url prefix for downloading raw file
Url prefix for downloading raw file.
"""
raw_file_name = [
"train-images-idx3-ubyte.gz",
......@@ -36,7 +36,7 @@ class MNIST(VisionDataset):
"t10k-labels-idx1-ubyte.gz",
]
"""
raw file names of both training set and test set (10k)
Raw file names of both training set and test set (10k).
"""
raw_file_md5 = [
"f68b3c2dcbeaaa9fbdd348bbdeb94873",
......@@ -45,7 +45,7 @@ class MNIST(VisionDataset):
"ec29112dd5afa0611ce80d1b7f02629c",
]
"""
md5 for checking raw files
Md5 for checking raw files.
"""
def __init__(
......@@ -57,10 +57,10 @@ class MNIST(VisionDataset):
):
r"""
:param root: path for mnist dataset downloading or loading, if ``None``,
set ``root`` to the ``_default_root``
:param train: if ``True``, loading trainingset, else loading test set
set ``root`` to the ``_default_root``.
:param train: if ``True``, loading trainingset, else loading test set.
:param download: if raw files do not exist and ``download`` is set to ``True``,
download raw files and process, otherwise raise ValueError, default is True
download and process the raw files, otherwise raise ValueError. Default: True.
"""
super().__init__(root, order=("image", "image_category"))
......
......@@ -81,7 +81,7 @@ class Objects365(VisionDataset):
self.ids = ids
self.json_category_id_to_contiguous_id = {
v: i + 1 for i, v in enumerate(self.cats.keys())
v: i + 1 for i, v in enumerate(sorted(self.cats.keys()))
}
self.contiguous_category_id_to_json_id = {
......
......@@ -75,6 +75,8 @@ class PascalVOC(VisionDataset):
else:
raise NotImplementedError
self.img_infos = dict()
def __getitem__(self, index):
target = []
for k in self.order:
......@@ -107,9 +109,8 @@ class PascalVOC(VisionDataset):
mask = mask[:, :, np.newaxis]
target.append(mask)
elif k == "info":
if image is None:
image = cv2.imread(self.images[index], cv2.IMREAD_COLOR)
info = [image.shape[0], image.shape[1], self.file_names[index]]
info = self.get_img_info(index, image)
info = [info["height"], info["width"], info["file_name"]]
target.append(info)
else:
raise NotImplementedError
......@@ -119,6 +120,17 @@ class PascalVOC(VisionDataset):
def __len__(self):
return len(self.images)
def get_img_info(self, index, image=None):
if index not in self.img_infos:
if image is None:
image = cv2.imread(self.images[index], cv2.IMREAD_COLOR)
self.img_infos[index] = dict(
height=image.shape[0],
width=image.shape[1],
file_name=self.file_names[index],
)
return self.img_infos[index]
def _trans_mask(self, mask):
label = np.ones(mask.shape[:2]) * 255
for i in range(len(self.class_colors)):
......@@ -171,25 +183,3 @@ class PascalVOC(VisionDataset):
"train",
"tvmonitor",
)
class_colors = [
[0, 0, 128],
[0, 128, 0],
[0, 128, 128],
[128, 0, 0],
[128, 0, 128],
[128, 128, 0],
[128, 128, 128],
[0, 0, 64],
[0, 0, 192],
[0, 128, 64],
[0, 128, 192],
[128, 0, 64],
[128, 0, 192],
[128, 128, 64],
[128, 128, 192],
[0, 64, 0],
[0, 64, 128],
[0, 192, 0],
[0, 192, 128],
[128, 64, 0],
]
......@@ -28,25 +28,25 @@ class Sampler(ABC):
seed=None,
):
r"""
An abstract class for all sampler
An abstract class for all samplers.
:type dataset: `dataset`
:param dataset: dataset to sample from
:param dataset: dataset to sample from.
:type batch_size: positive integer
:param batch_size: batch size for batch method
:param batch_size: batch size for batch method.
:type drop_last: bool
:param drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. (default: ``False``)
be smaller. Default: False
:type num_samples: positive integer
:param num_samples: number of samples assigned to one rank
:param num_samples: number of samples assigned to one rank.
:type world_size: positive integer
:param world_size: number of ranks
:param world_size: number of ranks.
:type rank: non-negative integer within 0 and world_size
:param rank: rank id, non-negative interger within 0 and ``world_size``
:param rank: rank id, non-negative integer within 0 and ``world_size``.
:type seed: non-negative integer
:param seed: seed for random operators
:param seed: seed for random operators.
"""
if (
not isinstance(batch_size, int)
......@@ -103,15 +103,15 @@ class Sampler(ABC):
def sample(self):
"""
return a list contains all sample indices
Return a list contains all sample indices.
"""
raise NotImplementedError
def scatter(self, indices) -> List:
r"""
scatter method is used for splitting indices into subset, each subset
Scatter method is used for splitting indices into subsets; each subset
will be assigned to a rank. Indices are evenly split by default.
If customized indices assignment method is needed, please rewrite this method
If customized indices assignment method is needed, please rewrite this method.
"""
total_size = self.num_samples * self.world_size
......@@ -127,7 +127,7 @@ class Sampler(ABC):
def batch(self) -> Iterator[List[Any]]:
r"""
batch method provides a batch indices generator
Batch method provides a batch indices generator.
"""
indices = list(self.sample())
......@@ -156,7 +156,7 @@ class SequentialSampler(Sampler):
rank=None,
):
r"""
Sample elements sequentially
Sample elements sequentially.
"""
super().__init__(dataset, batch_size, drop_last, None, world_size, rank)
if indices is not None and not isinstance(indices, collections.abc.Sequence):
......@@ -168,7 +168,7 @@ class SequentialSampler(Sampler):
def sample(self) -> Iterator[Any]:
r"""
return a generator
Return a generator.
"""
if self.indices is None:
return iter(range(len(self.dataset)))
......@@ -188,7 +188,7 @@ class RandomSampler(Sampler):
seed=None,
):
r"""
Sample elements randomly without replacement
Sample elements randomly without replacement.
"""
super().__init__(dataset, batch_size, drop_last, None, world_size, rank, seed)
if indices is not None and not isinstance(indices, collections.abc.Sequence):
......@@ -218,10 +218,10 @@ class ReplacementSampler(Sampler):
seed=None,
):
r"""
Sample elements randomly with replacement
Sample elements randomly with replacement.
:type weights: List
:param weights: weights for sampling indices, it could be unnormalized weights
:param weights: weights for sampling indices, it could be unnormalized weights.
"""
super().__init__(
dataset, batch_size, drop_last, num_samples, world_size, rank, seed
......@@ -250,7 +250,7 @@ class ReplacementSampler(Sampler):
class Infinite(Sampler):
r"""Infinite Sampler warper for basic sampler"""
r"""Infinite Sampler warper for basic sampler."""
def sample(self):
raise NotImplementedError("sample method not supported in Infinite")
......
......@@ -12,7 +12,7 @@ from typing import Sequence, Tuple
class Transform(ABC):
"""
rewrite apply method in subclass
Rewrite apply method in subclass.
"""
def apply_batch(self, inputs: Sequence[Tuple]):
......
......@@ -15,7 +15,7 @@ import numpy as np
def wrap_keepdims(func):
"""Wraper to keep the dimension of input images unchanged"""
"""Wraper to keep the dimension of input images unchanged."""
@functools.wraps(func)
def wrapper(image, *args, **kwargs):
......@@ -34,10 +34,10 @@ def wrap_keepdims(func):
@wrap_keepdims
def to_gray(image):
r"""
Change BGR format image's color space to gray
Change BGR format image's color space to gray.
:param image: Input BGR format image, with (H, W, C) shape
:return: Gray format image, with (H, W, C) shape
:param image: input BGR format image, with `(H, W, C)` shape.
:return: gray format image, with `(H, W, C)` shape.
"""
return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
......@@ -45,10 +45,10 @@ def to_gray(image):
@wrap_keepdims
def to_bgr(image):
r"""
Change gray format image's color space to BGR
Change gray format image's color space to BGR.
:param image: input Gray format image, with (H, W, C) shape
:return: BGR format image, with (H, W, C) shape
:param image: input Gray format image, with `(H, W, C)` shape.
:return: BGR format image, with `(H, W, C)` shape.
"""
return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
......@@ -56,18 +56,18 @@ def to_bgr(image):
@wrap_keepdims
def pad(input, size, value):
r"""
Pad input data with *value* and given *size*
Pad input data with *value* and given *size*.
:param input: Input data, with (H, W, C) shape
:param size: Padding size of input data, it could be integer or sequence.
If it's an integer, the input data will be padded in four directions.
If it's a sequence contains two integer, the bottom and right side
:param input: input data, with `(H, W, C)` shape.
:param size: padding size of input data, it could be integer or sequence.
If it is an integer, the input data will be padded in four directions.
If it is a sequence contains two integer, the bottom and right side
of input data will be padded.
If it's a sequence contains four integer, the top, bottom, left, right
If it is a sequence contains four integer, the top, bottom, left, right
side of input data will be padded with given size.
:param value: Padding value of data, could be a sequence of int or float.
if it's float value, the dtype of image will be casted to float32 also.
:return: Padded image
:param value: padding value of data, could be a sequence of int or float.
If it is float value, the dtype of image will be casted to float32 also.
:return: padded image.
"""
if isinstance(size, int):
size = (size, size, size, size)
......@@ -81,14 +81,18 @@ def pad(input, size, value):
@wrap_keepdims
def flip(image, flipCode):
r"""
Accordding to the flipCode (the type of flip), flip the input image
According to the flipCode (the type of flip), flip the input image.
:param image: Input image, with (H, W, C) shape
:param image: input image, with `(H, W, C)` shape.
:param flipCode: code that indicates the type of flip.
1 : Flip horizontally
0 : Flip vertically
-1 : Flip horizontally and vertically
:return: BGR format image, with (H, W, C) shape
* 1 : Flip horizontally
* 0 : Flip vertically
* -1: Flip horizontally and vertically
:return: BGR format image, with `(H, W, C)` shape.
"""
return cv2.flip(image, flipCode=flipCode)
......@@ -96,12 +100,12 @@ def flip(image, flipCode):
@wrap_keepdims
def resize(input, size, interpolation=cv2.INTER_LINEAR):
r"""
resize the input data to given size
Resize the input data to given size.
:param input: Input data, could be image or masks, with (H, W, C) shape
:param size: Target size of input data, with (height, width) shape.
:param interpolation: Interpolation method.
:return: Resized data, with (H, W, C) shape
:param input: input data, could be image or masks, with `(H, W, C)` shape.
:param size: target size of input data, with (height, width) shape.
:param interpolation: interpolation method.
:return: resized data, with `(H, W, C)` shape.
"""
if len(size) != 2:
raise ValueError("resize needs (h, w), but got {}".format(size))
......
......@@ -44,26 +44,26 @@ __all__ = [
class VisionTransform(Transform):
r"""
Base class of all transforms used in computer vision.
calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*()
Calling logic: apply_batch() -> apply() -> _apply_image() and other _apply_*()
method. If you want to implement a self-defined transform method for image,
rewrite _apply_image method in subclass.
:param order: Input type order. Input is a tuple contains different structures,
:param order: input type order. Input is a tuple containing different structures,
order is used to specify the order of structures. For example, if your input
is (image, boxes) type, then the order should be ("image", "boxes").
Current available strings & data type are describe below:
is (image, boxes) type, then the ``order`` should be ("image", "boxes").
Currently available strings and data types are described below:
* "image": input image, with shape of (H, W, C)
* "coords": coordinates, with shape of (N, 2)
* "boxes": bounding boxes, with shape of (N, 4), "xyxy" format,
* "image": input image, with shape of `(H, W, C)`.
* "coords": coordinates, with shape of `(N, 2)`.
* "boxes": bounding boxes, with shape of `(N, 4)`, "xyxy" format,
the 1st "xy" represents top left point of a box,
the 2nd "xy" represents right bottom point.
* "mask": map used for segmentation, with shape of (H, W, 1)
* "keypoints": keypoints with shape of (N, K, 3), N for number of instances,
* "mask": map used for segmentation, with shape of `(H, W, 1)`.
* "keypoints": keypoints with shape of `(N, K, 3)`, N for number of instances,
and K for number of keypoints in one instance. The first two elements of
the last axis are the coordinates of keypoints and the 3rd element is
the label of keypoints.
* "polygons": A sequence contains numpy array, its length is number of instances.
* "polygons": a sequence containing numpy arrays, its length is the number of instances.
Each numpy array represents polygon coordinate of one instance.
* "category": categories for some data type. For example, "image_category"
means category of the input image and "boxes_category" means categories of
......@@ -94,11 +94,11 @@ class VisionTransform(Transform):
self.order = order
def apply_batch(self, inputs: Sequence[Tuple]):
r"""Apply transform on batch input data"""
r"""Apply transform on batch input data."""
return tuple(self.apply(input) for input in inputs)
def apply(self, input: Tuple):
r"""Apply transform on single input data"""
r"""Apply transform on single input data."""
if not isinstance(input, tuple):
input = (input,)
......@@ -156,10 +156,10 @@ class VisionTransform(Transform):
class ToMode(VisionTransform):
r"""Change input data to a target mode.
For example, most transforms use HWC mode image,
while the Neural Network might use CHW mode input tensor
while the neural network might use CHW mode input tensor.
:param mode: Output mode of input. Use "CHW" mode by default.
:param order: The same with :class:`VisionTransform`
:param mode: output mode of input. Default: "CHW"
:param order: the same with :class:`VisionTransform`
"""
def __init__(self, mode="CHW", *, order=None):
......@@ -185,14 +185,14 @@ class Compose(VisionTransform):
r"""
Composes several transforms together.
:param transforms: List of :class:`VisionTransform` to compose.
:param batch_compose: Whether use shuffle_indices for batch data or not.
:param transforms: list of :class:`VisionTransform` to compose.
:param batch_compose: whether use shuffle_indices for batch data or not.
If True, use original input sequence.
Otherwise, the shuffle_indices will be used for transforms.
:param shuffle_indices: Indices used for random shuffle, start at 1.
:param shuffle_indices: indices used for random shuffle, start at 1.
For example, if shuffle_indices is [(1, 3), (2, 4)], then the 1st and 3rd transform
will be randomly shuffled, and the 2nd and 4th transforms will also be shuffled.
:param order: The same with :class:`VisionTransform`
:param order: the same with :class:`VisionTransform`
Examples:
......@@ -264,8 +264,8 @@ class TorchTransformCompose(VisionTransform):
some tensor-based transforms in torchvision are not supported,
such as Normalize and ToTensor.
:param transforms: The same with ``Compose``
:param order: The same with :class:`VisionTransform`
:param transforms: the same with ``Compose``.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, transforms, *, order=None):
......@@ -303,16 +303,16 @@ class TorchTransformCompose(VisionTransform):
class Pad(VisionTransform):
r"""Pad the input data.
:param size: Padding size of input image, it could be integer or sequence.
If it's an integer, the input image will be padded in four directions.
If it's a sequence contains two integer, the bottom and right side
:param size: padding size of input image, it could be integer or sequence.
If it is an integer, the input image will be padded in four directions.
If it is a sequence containing two integers, the bottom and right side
of image will be padded.
If it's a sequence contains four integer, the top, bottom, left, right
If it is a sequence containing four integers, the top, bottom, left, right
side of image will be padded with given size.
:param value: Padding value of image, could be a sequence of int or float.
if it's float value, the dtype of image will be casted to float32 also.
:param mask_value: Padding value of segmentation map.
:param order: The same with :class:`VisionTransform`
:param value: padding value of image, could be a sequence of int or float.
If it is a float value, the dtype of image will be cast to float32 as well.
:param mask_value: padding value of segmentation map.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, size=0, value=0, mask_value=0, *, order=None):
......@@ -350,15 +350,15 @@ class Pad(VisionTransform):
class Resize(VisionTransform):
r"""Resize the input data.
:param output_size: Target size of image, with (height, width) shape.
:param interpolation: Interpolation method. All methods are listed below:
:param output_size: target size of image, with (height, width) shape.
:param interpolation: interpolation method. All methods are listed below:
* cv2.INTER_NEAREST – a nearest-neighbor interpolation.
* cv2.INTER_LINEAR – a bilinear interpolation (used by default).
* cv2.INTER_AREA – resampling using pixel area relation.
* cv2.INTER_CUBIC – a bicubic interpolation over 4×4 pixel neighborhood.
* cv2.INTER_LANCZOS4 – a Lanczos interpolation over 8×8 pixel neighborhood.
:param order: The same with :class:`VisionTransform`
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, output_size, interpolation=cv2.INTER_LINEAR, *, order=None):
......@@ -476,8 +476,8 @@ class ShortestEdgeResize(VisionTransform):
class RandomResize(VisionTransform):
r"""Resize the input data randomly.
:param scale_range: .
:param order: The same with :class:`VisionTransform`
:param scale_range: range of scaling.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, scale_range, interpolation=cv2.INTER_LINEAR, *, order=None):
......@@ -519,13 +519,13 @@ class RandomResize(VisionTransform):
class RandomCrop(VisionTransform):
r"""Crop the input data randomly. Before applying the crop transform,
pad the image first. And if target size is still bigger than the size of
pad the image first. If target size is still bigger than the size of
padded image, pad the image size to target size.
:param output_size: Target size of output image, with (height, width) shape.
:param padding_size: The same with `size` in ``Pad``
:param padding_value: The same with `value` in ``Pad``
:param order: The same with :class:`VisionTransform`
:param output_size: target size of output image, with (height, width) shape.
:param padding_size: the same with `size` in ``Pad``.
:param padding_value: the same with `value` in ``Pad``.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(
......@@ -580,10 +580,10 @@ class RandomResizedCrop(VisionTransform):
aspect ratio (default: 3/4 to 1.33) of the original aspect ratio is made.
After applying the crop transform, the input data will be resized to the given size.
:param output_size: Target size of output image, with (height, width) shape.
:param scale_range: Range of size of the origin size cropped. Default: (0.08, 1.0)
:param ratio_range: Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
:param order: The same with :class:`VisionTransform`
:param output_size: target size of output image, with (height, width) shape.
:param scale_range: range of size of the origin size cropped. Default: (0.08, 1.0)
:param ratio_range: range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
:param order: the same with :class:`VisionTransform`.
"""
def __init__(
......@@ -666,8 +666,8 @@ class RandomResizedCrop(VisionTransform):
class CenterCrop(VisionTransform):
r"""Crops the given the input data at the center.
:param output_size: Target size of output image, with (height, width) shape.
:param order: The same with :class:`VisionTransform`
:param output_size: target size of output image, with (height, width) shape.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, output_size, *, order=None):
......@@ -710,7 +710,7 @@ class RandomHorizontalFlip(VisionTransform):
r"""Horizontally flip the input data randomly with a given probability.
:param p: probability of the input data being flipped. Default: 0.5
:param order: The same with :class:`VisionTransform`
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, prob: float = 0.5, *, order=None):
......@@ -742,7 +742,7 @@ class RandomVerticalFlip(VisionTransform):
r"""Vertically flip the input data randomly with a given probability.
:param p: probability of the input data being flipped. Default: 0.5
:param order: The same with :class:`VisionTransform`
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, prob: float = 0.5, *, order=None):
......@@ -776,9 +776,9 @@ class Normalize(VisionTransform):
this transform will normalize each channel of the input data.
``output[channel] = (input[channel] - mean[channel]) / std[channel]``
:param mean: Sequence of means for each channel.
:param std: Sequence of standard deviations for each channel.
:param order: The same with :class:`VisionTransform`
:param mean: sequence of means for each channel.
:param std: sequence of standard deviations for each channel.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, mean=0.0, std=1.0, *, order=None):
......@@ -802,7 +802,7 @@ class GaussianNoise(VisionTransform):
:param mean: Gaussian mean used to generate noise.
:param std: Gaussian standard deviation used to generate noise.
:param order: The same with :class:`VisionTransform`
:param order: the same with :class:`VisionTransform`
"""
def __init__(self, mean=0.0, std=1.0, *, order=None):
......@@ -826,9 +826,9 @@ class GaussianNoise(VisionTransform):
class BrightnessTransform(VisionTransform):
r"""Adjust brightness of the input data.
:param value: How much to adjust the brightness. Can be any
non negative number. 0 gives the original image
:param order: The same with :class:`VisionTransform`
:param value: how much to adjust the brightness. Can be any
non negative number. 0 gives the original image.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, value, *, order=None):
......@@ -857,9 +857,9 @@ class BrightnessTransform(VisionTransform):
class ContrastTransform(VisionTransform):
r"""Adjust contrast of the input data.
:param value: How much to adjust the contrast. Can be any
non negative number. 0 gives the original image
:param order: The same with :class:`VisionTransform`
:param value: how much to adjust the contrast. Can be any
non negative number. 0 gives the original image.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, value, *, order=None):
......@@ -888,9 +888,9 @@ class ContrastTransform(VisionTransform):
class SaturationTransform(VisionTransform):
r"""Adjust saturation of the input data.
:param value: How much to adjust the saturation. Can be any
non negative number. 0 gives the original image
:param order: The same with :class:`VisionTransform`
:param value: how much to adjust the saturation. Can be any
non negative number. 0 gives the original image.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, value, *, order=None):
......@@ -919,9 +919,9 @@ class SaturationTransform(VisionTransform):
class HueTransform(VisionTransform):
r"""Adjust hue of the input data.
:param value: How much to adjust the hue. Can be any number
between 0 and 0.5, 0 gives the original image
:param order: The same with :class:`VisionTransform`
:param value: how much to adjust the hue. Can be any number
between 0 and 0.5; 0 gives the original image.
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, value, *, order=None):
......@@ -957,19 +957,19 @@ class HueTransform(VisionTransform):
class ColorJitter(VisionTransform):
r"""Randomly change the brightness, contrast, saturation and hue of an image.
:param brightness: How much to jitter brightness.
:param brightness: how much to jitter brightness.
Chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
or the given [min, max]. Should be non negative numbers.
:param contrast: How much to jitter contrast.
:param contrast: how much to jitter contrast.
Chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
or the given [min, max]. Should be non negative numbers.
:param saturation: How much to jitter saturation.
:param saturation: how much to jitter saturation.
Chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
or the given [min, max]. Should be non negative numbers.
:param hue: How much to jitter hue.
:param hue: how much to jitter hue.
Chosen uniformly from [-hue, hue] or the given [min, max].
Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
:param order: The same with :class:`VisionTransform`
:param order: the same with :class:`VisionTransform`.
"""
def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, *, order=None):
......
......@@ -7,6 +7,7 @@
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import os
import re
from .core._imperative_rt.common import CompNode, DeviceType
from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
......@@ -22,10 +23,8 @@ __all__ = [
def _valid_device(inp):
if isinstance(inp, str) and len(inp) == 4:
if inp[0] in {"x", "c", "g"} and inp[1:3] == "pu":
if inp[3] == "x" or inp[3].isdigit():
return True
if isinstance(inp, str) and re.match("^[cxg]pu(\d+|\d+:\d+|x)$", inp):
return True
return False
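A quick standalone check of the new regex, showing which device names it accepts (the pattern is copied from the code above):

import re

pattern = re.compile(r"^[cxg]pu(\d+|\d+:\d+|x)$")
for name in ["cpu0", "gpu3", "xpux", "gpu0:1", "cpu", "tpu0"]:
    print(name, bool(pattern.match(name)))
# cpu0, gpu3, xpux and gpu0:1 match; "cpu" (missing index) and "tpu0" do not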
......@@ -71,11 +70,11 @@ def set_default_device(device: str = "xpux"):
'multithread' device type is available for inference, which implements
multi-threading parallelism at the operator level. For example,
'multithread4' will compute with 4 threads. which implements
'multithread4' will compute with 4 threads.
The default value is 'xpux' to specify any device available. The priority of using gpu is higher when both gpu and cpu are available.
It can also be set by environmental variable `MGE_DEFAULT_DEVICE`.
It can also be set by environment variable `MGE_DEFAULT_DEVICE`.
"""
assert _valid_device(device), "Invalid device name {}".format(device)
CompNode._set_default_device(device)
......@@ -99,13 +98,13 @@ def set_prealloc_config(
growth_factor=2.0,
device_type=DeviceType.CUDA,
):
"""specifies how to pre-allocate from raw dev allocator
"""Specifies how to pre-allocate from raw device allocator.
:param alignment: specifies the alignment in bytes.
:param min_req: min request size in bytes.
:param max_overhead: max overhead above required size in bytes.
:growth_factor: request size / cur allocated
:device_type: the device type
:param growth_factor: `request size / cur allocated`
:param device_type: the device type
"""
assert alignment > 0
......
......@@ -102,7 +102,7 @@ def _(op: RemoteRecv):
def collective_comm(inp, mode, group, device):
"""Helper function for applying collective communication functions"""
"""Helper function for applying collective communication functions."""
assert isinstance(group, Group)
if group is None:
return inp
......@@ -123,11 +123,11 @@ def collective_comm(inp, mode, group, device):
def reduce_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create reduce_sum operator for collective communication
"""Create reduce_sum operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.REDUCE_SUM
return collective_comm(inp, mode, group, device)
......@@ -136,11 +136,11 @@ def reduce_sum(
def broadcast(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create broadcast operator for collective communication
"""Create broadcast operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.BROADCAST
return collective_comm(inp, mode, group, device)
......@@ -149,11 +149,11 @@ def broadcast(
def all_gather(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_gather operator for collective communication
"""Create all_gather operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.ALL_GATHER
return collective_comm(inp, mode, group, device)
......@@ -162,11 +162,11 @@ def all_gather(
def reduce_scatter_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create reduce_scatter_sum operator for collective communication
"""Create reduce_scatter_sum operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.REDUCE_SCATTER_SUM
return collective_comm(inp, mode, group, device)
......@@ -175,11 +175,11 @@ def reduce_scatter_sum(
def all_reduce_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_reduce_sum operator for collective communication
"""Create all_reduce_sum operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.ALL_REDUCE_SUM
return collective_comm(inp, mode, group, device)
......@@ -188,11 +188,11 @@ def all_reduce_sum(
def all_reduce_max(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_reduce_max operator for collective communication
"""Create all_reduce_max operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.ALL_REDUCE_MAX
return collective_comm(inp, mode, group, device)
......@@ -201,11 +201,11 @@ def all_reduce_max(
def all_reduce_min(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_reduce_min operator for collective communication
"""Create all_reduce_min operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.ALL_REDUCE_MIN
return collective_comm(inp, mode, group, device)
......@@ -214,11 +214,11 @@ def all_reduce_min(
def gather(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create gather operator for collective communication
"""Create gather operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.GATHER
return collective_comm(inp, mode, group, device)
......@@ -227,11 +227,11 @@ def gather(
def scatter(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create scatter operator for collective communication
"""Create scatter operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.SCATTER
return collective_comm(inp, mode, group, device)
......@@ -240,21 +240,21 @@ def scatter(
def all_to_all(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_to_all operator for collective communication
"""Create all_to_all operator for collective communication.
:param inp: input tensor
:param group: communication group
:param device: execute placement
:param inp: input tensor.
:param group: communication group.
:param device: execution device.
"""
mode = CollectiveCommMode.ALL_TO_ALL
return collective_comm(inp, mode, group, device)
def remote_send(inp: Tensor, dest_rank: int) -> Tensor:
"""Send a Tensor to a remote process
"""Send a Tensor to a remote process.
:param inp: tensor to send
:param dest_rank: destination process rank
:param inp: tensor to send.
:param dest_rank: destination process rank.
"""
op = RemoteSend()
op.key = "{}->{}".format(get_rank(), dest_rank)
......@@ -266,12 +266,12 @@ def remote_send(inp: Tensor, dest_rank: int) -> Tensor:
def remote_recv(
src_rank: int, shape: Tuple[int], dtype: type, device: Optional[str] = None
) -> Tensor:
"""Receive a Tensor from a remote process
"""Receive a Tensor from a remote process.
:param src_rank: source process rank
:param shape: the shape of the tensor to receive
:param dtype: the data type of the tensor to receive
:param device: the device to place the received tensor
:param src_rank: source process rank.
:param shape: the shape of the tensor to receive.
:param dtype: the data type of the tensor to receive.
:param device: the device to place the received tensor.
"""
key = "{}->{}".format(src_rank, get_rank())
......
......@@ -83,12 +83,12 @@ def init_process_group(
) -> None:
"""Initialize the distributed process group and specify the device used in the current process
:param master_ip: IP address of the master node
:param port: Port available for all processes to communicate
:param world_size: Total number of processes participating in the job
:param rank: Rank of the current process
:param device: The GPU device id to bind this process to
:param backend: Communicator backend, currently support 'nccl' and 'ucx'
:param master_ip: ip address of the master node.
:param port: port available for all processes to communicate.
:param world_size: total number of processes participating in the job.
:param rank: rank of the current process.
:param device: the GPU device id to bind this process to.
:param backend: communicator backend, currently support 'nccl' and 'ucx'.
"""
if not isinstance(master_ip, str):
raise TypeError("Expect type str but got {}".format(type(master_ip)))
......@@ -127,50 +127,50 @@ def init_process_group(
def is_distributed() -> bool:
"""Return True if the distributed process group has been initialized"""
"""Return True if the distributed process group has been initialized."""
return _sd is not None
def get_rank() -> int:
"""Get the rank of the current process"""
"""Get the rank of the current process."""
return _sd.proc_rank if _sd is not None else 0
def get_world_size() -> int:
"""Get the total number of processes participating in the job"""
"""Get the total number of processes participating in the job."""
return _sd.world_size if _sd is not None else 1
def get_backend() -> str:
"""Get the backend str"""
"""Get the backend str."""
assert _sd is not None, "please call init_process_group first"
return _sd.backend if _sd is not None else None
def get_py_server_addr() -> Tuple[str, int]:
"""Get master_ip and port of python XML RPC server"""
"""Get master_ip and port of python XML RPC server."""
assert _sd is not None, "please call init_process_group first"
return _sd.master_ip, _sd.py_server_port
def get_mm_server_addr() -> Tuple[str, int]:
"""Get master_ip and port of C++ mm_server"""
"""Get master_ip and port of C++ mm_server."""
assert _sd is not None, "please call init_process_group first"
return _sd.master_ip, _sd.mm_server_port
def get_client() -> Client:
"""Get client of python XML RPC server"""
"""Get client of python XML RPC server."""
assert _sd is not None, "please call init_process_group first"
return _sd.client
def new_group(proc_ranks: List[int]) -> Group:
"""Build a subgroup containing certain ranks"""
"""Build a subgroup containing certain ranks."""
return Group(proc_ranks)
def group_barrier(group: Optional[Group] = WORLD) -> None:
"""Block until all ranks in the group reach this barrier"""
"""Block until all ranks in the group reach this barrier."""
assert isinstance(group, Group)
_sd.client.group_barrier(group.key, group.size)
......@@ -17,11 +17,112 @@ import numpy as np
from megengine.autodiff.grad_manager import GradManager, get_backwarding_grad_manager
from megengine.device import get_default_device, get_device_count
from ..functional.param_pack import get_offsets, pack_allreduce_split
from ..core.ops.builtin import ParamPackConcat, ParamPackSplit
from ..core.tensor.core import apply
from ..functional.utils import copy
from ..tensor import Tensor
from ..utils.future import Future
from .functional import all_reduce_sum, broadcast
from .group import WORLD, group_barrier, is_distributed
from .group import WORLD, Group, group_barrier, is_distributed
def param_pack_split(inp: Tensor, offsets: list, shapes: list):
r"""
Splits the input tensor into a list of tensors as the given offsets and shapes describe,
only used for ``parampack``.
:param inp: input tensor.
:param offsets: offsets of outputs, length of `2 * n`,
where n is the number of tensors you want to split,
format `[begin0, end0, begin1, end1]`.
:param shapes: tensor shapes of outputs.
:return: split tensors.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_split
a = tensor(np.ones((10,), np.int32))
b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)])
print(b.numpy())
print(c.numpy())
Outputs:
.. testoutput::
[1]
[[1 1 1]
[1 1 1]
[1 1 1]]
"""
op = ParamPackSplit()
op.offsets = offsets
op.shapes = shapes
return apply(op, inp)
def param_pack_concat(inps: list, offsets: Tensor, offsets_val: list):
r"""
Returns concatenated tensor, only used for ``parampack``.
:param inps: input tensors.
:param offsets: device value of offsets.
:param offsets_val: offsets of inputs, length of `2 * n`,
format `[begin0, end0, begin1, end1]`.
:return: concatenated tensor.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_concat
a = tensor(np.ones((1,), np.int32))
b = tensor(np.ones((3, 3), np.int32))
offsets_val = [0, 1, 1, 10]
offsets = tensor(offsets_val, np.int32)
c = param_pack_concat([a, b], offsets, offsets_val)
print(c.numpy())
Outputs:
.. testoutput::
[1 1 1 1 1 1 1 1 1 1]
"""
op = ParamPackConcat()
op.offsets = offsets_val
return apply(op, *inps, offsets)[0]
def get_offsets(shapes):
offsets = []
offset = 0
for shape in shapes:
offsets.append(offset)
offset += int(np.prod(shape))
offsets.append(offset)
return offsets
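For reference, the accumulation done by get_offsets, repeated as a standalone snippet for the shapes used in the param_pack examples above:

import numpy as np

shapes = [(1,), (3, 3)]
offsets, offset = [], 0
for shape in shapes:
    offsets.append(offset)           # begin of this tensor in the packed buffer
    offset += int(np.prod(shape))
    offsets.append(offset)           # end of this tensor in the packed buffer
print(offsets)  # [0, 1, 1, 10], matching offsets_val in the examples above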
def pack_allreduce_split(pack_list, shapes, group, reduce_method):
offsets_val = get_offsets(shapes)
offsets = Tensor(offsets_val)
packed_grads = param_pack_concat(pack_list, offsets, offsets_val)
packed_grads = all_reduce_sum(packed_grads, group, group.comp_node)
if reduce_method == "mean":
packed_grads /= group.size
grads = param_pack_split(packed_grads, offsets_val, shapes)
return grads
class TensorFuture(Future):
......@@ -54,28 +155,43 @@ def synchronized(func: Callable):
return wrapper
def get_device_count_by_fork(device_type: str):
q = mp.Queue()
def _get_device_count_worker(queue, device_type):
num = get_device_count(device_type)
queue.put(num)
def worker(queue):
num = get_device_count(device_type)
queue.put(num)
p = mp.Process(target=worker, args=(q,))
def get_device_count_by_fork(device_type: str):
"""Get device count in fork thread.
See https://stackoverflow.com/questions/22950047/cuda-initialization-error-after-fork
for more information.
"""
q = mp.Queue()
p = mp.Process(target=_get_device_count_worker, args=(q, device_type))
p.start()
p.join()
return q.get()
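A usage sketch; the import path is an assumption inferred from this diff (the helper sits next to the other functions shown here) and may differ in your build:

from megengine.distributed.helper import get_device_count_by_fork  # assumed path

# the count is queried in a forked child process, so the parent process
# never initializes CUDA (see the Stack Overflow link cited above)
n_gpus = get_device_count_by_fork("gpu")
print("visible GPUs:", n_gpus)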
def bcast_list_(params, group):
for p in params:
p._reset(broadcast(p, group))
def bcast_list_(inps: list, group: Group = WORLD):
"""Broadcast tensors between given group.
:param inps: input tensors.
:param group: communication group.
"""
for inp in inps:
inp._reset(broadcast(inp, group))
class AllreduceCallback:
def __init__(self, reduce_method, group=WORLD):
"""Allreduce Callback with tensor fusion optimization.
:param reduce_method: the method to reduce gradients.
:param group: communication group.
"""
def __init__(self, reduce_method: str, group: Group = WORLD):
reduce_method = reduce_method.lower()
assert reduce_method in ["sum", "mean"]
assert reduce_method in ["sum", "mean"], "reduce_method should be sum or mean"
self._reduce_method = reduce_method
self._group = group
self._marked_gm = WeakSet()
......@@ -88,6 +204,7 @@ class AllreduceCallback:
self._futures_dict = dict()
self._packing_list = defaultdict(list)
self._packing_size = defaultdict(int)
self._grad_origin_device = dict()
def _pack(self, dtype):
grad_list = [self._gradients_dict[p] for p in self._packing_list[dtype]]
......@@ -109,6 +226,7 @@ class AllreduceCallback:
self._params.append(param)
self._futures_dict[param] = TensorFuture(ack=False)
self._gradients_dict[param] = grad
self._grad_origin_device[param] = str(grad.device)
dtype_str = str(np.dtype(param.dtype))
dtype_size = np.dtype(param.dtype).itemsize
......@@ -123,6 +241,7 @@ class AllreduceCallback:
self._pack(dtype)
for param in self._params:
grad = self._gradients_dict[param]
grad = copy(grad, self._grad_origin_device[param])
self._futures_dict[param].set(grad)
self._reset()
......
......@@ -15,7 +15,7 @@ from .util import get_free_ports
def _run_wrapped(func, master_ip, port, world_size, rank, dev, args, kwargs):
"""init distributed process group and run wrapped function"""
"""Init distributed process group and run wrapped function."""
init_process_group(
master_ip=master_ip, port=port, world_size=world_size, rank=rank, device=dev
)
......@@ -23,7 +23,7 @@ def _run_wrapped(func, master_ip, port, world_size, rank, dev, args, kwargs):
def launcher(func):
"""decorator for launching multiple processes in single-machine multi-gpu training"""
"""Decorator for launching multiple processes in single-machine multi-gpu training."""
n_gpus = get_device_count_by_fork("gpu")
......
......@@ -21,6 +21,12 @@ from .util import get_free_ports
class Methods:
"""Distributed Server Method.
Used for exchanging information between distributed nodes.
:param mm_server_port: multiple machine rpc server port.
"""
def __init__(self, mm_server_port):
self.lock = threading.Lock()
self.mm_server_port = mm_server_port
......@@ -31,51 +37,65 @@ class Methods:
self.dict_barrier_event = defaultdict(threading.Event)
def connect(self):
"""Method for checking connection success."""
return True
def get_mm_server_port(self):
"""Get multiple machine rpc server port."""
return self.mm_server_port
def set_is_grad(self, rank_peer, is_grad):
def set_is_grad(self, key, is_grad):
"""Mark send/recv need gradiants by key.
:param key: key to match send/recv op.
:param is_grad: whether this op need grad.
"""
with self.lock:
future = self.dict_is_grad[rank_peer]
future = self.dict_is_grad[key]
future.set(is_grad)
return True
def check_is_grad(self, rank_peer):
def check_is_grad(self, key):
"""Check whether send/recv need gradiants.
:param key: key to match send/recv op.
"""
with self.lock:
future = self.dict_is_grad[rank_peer]
future = self.dict_is_grad[key]
ret = future.get()
with self.lock:
del self.dict_is_grad[rank_peer]
del self.dict_is_grad[key]
return ret
def set_remote_tracer(self, rank_peer, tracer_set):
def set_remote_tracer(self, key, tracer_set):
"""Set tracer dict for tracing send/recv op.
:param key: key to match send/recv op.
:param tracer_set: valid tracer set.
"""
with self.lock:
future = self.dict_remote_tracer[rank_peer]
future = self.dict_remote_tracer[key]
future.set(tracer_set)
return True
def check_remote_tracer(self, rank_peer):
def check_remote_tracer(self, key):
"""Get tracer dict for send/recv op.
:param key: key to match send/recv op.
"""
with self.lock:
future = self.dict_remote_tracer[rank_peer]
future = self.dict_remote_tracer[key]
ret = future.get()
with self.lock:
del self.dict_remote_tracer[rank_peer]
del self.dict_remote_tracer[key]
return ret
def set_pack_list(self, key, pack_list):
with self.lock:
future = self.dict_pack_list[key]
future.set(pack_list)
return True
def get_pack_list(self, key):
with self.lock:
future = self.dict_pack_list[key]
return future.get()
def group_barrier(self, key, size):
"""A barrier wait for all group member.
:param key: group key to match each other.
:param size: group size.
"""
with self.lock:
self.dict_barrier_counter[key] += 1
counter = self.dict_barrier_counter[key]
......@@ -94,12 +114,23 @@ class ThreadXMLRPCServer(ThreadingMixIn, SimpleXMLRPCServer):
def start_server(py_server_port, mm_server_port):
"""Start python distributed server and multiple machine server.
:param py_server_port: python server port.
:param mm_server_port: multiple machine server port.
"""
server = ThreadXMLRPCServer(("0.0.0.0", py_server_port), logRequests=False)
server.register_instance(Methods(mm_server_port))
server.serve_forever()
class Server:
"""Distributed Server for distributed training.
It should be running on the master node.
:param port: python server port.
"""
def __init__(self, port):
self.py_server_port = get_free_ports(1)[0] if port == 0 else port
self.mm_server_port = create_mm_server("0.0.0.0", 0)
......@@ -112,12 +143,19 @@ class Server:
class Client:
"""Distributed Client for distributed training.
:param master_ip: ip address of master node.
:param port: port of server at master node.
"""
def __init__(self, master_ip, port):
self.master_ip = master_ip
self.port = port
self.connect()
def connect(self):
"""Check connection success."""
while True:
try:
self.proxy = ServerProxy(
......@@ -129,25 +167,43 @@ class Client:
time.sleep(1)
def get_mm_server_port(self):
"""Get multiple machine server port."""
return self.proxy.get_mm_server_port()
def set_is_grad(self, rank_peer, is_grad):
self.proxy.set_is_grad(rank_peer, is_grad)
def check_is_grad(self, rank_peer):
return self.proxy.check_is_grad(rank_peer)
def set_remote_tracer(self, rank_peer, tracer_set):
self.proxy.set_remote_tracer(rank_peer, tracer_set)
def check_remote_tracer(self, rank_peer):
return self.proxy.check_remote_tracer(rank_peer)
def set_pack_list(self, key, pack_list):
self.proxy.set_pack_list(key, pack_list)
def get_pack_list(self, key):
return self.proxy.get_pack_list(key)
def set_is_grad(self, key, is_grad):
"""Mark send/recv need gradiants by key.
:param key: key to match send/recv op.
:param is_grad: whether this op need grad.
"""
self.proxy.set_is_grad(key, is_grad)
def check_is_grad(self, key):
"""Check whether send/recv need gradiants.
:param key: key to match send/recv op.
"""
return self.proxy.check_is_grad(key)
def set_remote_tracer(self, key, tracer_set):
"""Set tracer dict for tracing send/recv op.
:param key: key to match send/recv op.
:param tracer_set: valid tracer set.
"""
self.proxy.set_remote_tracer(key, tracer_set)
def check_remote_tracer(self, key):
"""Get tracer dict for send/recv op.
:param key: key to match send/recv op.
"""
return self.proxy.check_remote_tracer(key)
def group_barrier(self, key, size):
"""A barrier wait for all group member.
:param key: group key to match each other.
:param size: group size.
"""
self.proxy.group_barrier(key, size)
......@@ -8,13 +8,10 @@
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# pylint: disable=redefined-builtin
from .elemwise import *
from .graph import add_update
from .loss import *
from .math import *
from .nn import *
from .quantized import conv_bias_activation
from .tensor import *
from .utils import accuracy, copy
from .utils import *
from . import distributed # isort:skip
......
......@@ -26,14 +26,14 @@ def set_conv_execution_strategy(option: str):
Available values:
* 'HEURISTIC' uses heuristic to choose the fastest algorithm.
* 'PROFILE' runs possible algorithms on real device to find the best.
* 'PROFILE_HEURISTIC' uses profile result and heuristic to choose the fastest algorithm.
* 'PROFILE_REPRODUCIBLE' uses the fastest of profile result that is also reproducible.
* 'PROFILE' runs possible algorithms on real device to find the best one.
* 'PROFILE_HEURISTIC' uses profiling result and heuristic to choose the fastest algorithm.
* 'PROFILE_REPRODUCIBLE' uses the fastest of profiling result that is also reproducible.
* 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible.
The default strategy is 'HEURISTIC'.
It can also be set through the environmental variable 'MEGENGINE_CONV_EXECUTION_STRATEGY'.
It can also be set through the environment variable 'MEGENGINE_CONV_EXECUTION_STRATEGY'.
"""
valid_option = (
"HEURISTIC",
......
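Besides calling the function, the docstring above notes that the strategy can also come from the environment; a minimal sketch of that route:

import os

# value must be one of the strings listed in the docstring above
os.environ["MEGENGINE_CONV_EXECUTION_STRATEGY"] = "PROFILE_HEURISTIC"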
......@@ -26,23 +26,22 @@ __all__ = [
"acosh",
"atanh",
"ceil",
"clamp",
"clip",
"cos",
"cosh",
"div",
"eq",
"equal",
"exp",
"expm1",
"fast_tanh",
"floor",
"floor_div",
"gt",
"ge",
"greater",
"greater_equal",
"hswish",
"hsigmoid",
"left_shift",
"lt",
"le",
"less",
"less_equal",
"log",
"log1p",
"logical_and",
......@@ -54,7 +53,7 @@ __all__ = [
"mod",
"mul",
"neg",
"ne",
"not_equal",
"pow",
"relu",
"relu6",
......@@ -88,13 +87,6 @@ def _elwise(*args, mode):
return result
def _logical(*args, mode):
op = builtin.CondExecPredLogical(mode=mode)
args = utils.convert_inputs(*args)
(result,) = apply(op, *args)
return result
def _elemwise_multi_type(*args, mode, **kwargs):
op = builtin.ElemwiseMultiType(mode=mode, **kwargs)
args = utils.convert_inputs(*args)
......@@ -106,9 +98,10 @@ def _elemwise_multi_type(*args, mode, **kwargs):
def add(x, y):
"""Element-wise addition.
"""Element-wise `addition`.
At least one operand should be tensor.
Same for sub/mul/div/floor_div/pow/mod/atan2/eq/ne/lt/le/gt/ge/maximum/minmium.
Same for sub/mul/div/floor_div/pow/mod/atan2/equal/not_equal/less/less_equal/greater/greater_equal/maximum/minimum.
:param x: input tensor.
:return: computed tensor.
......@@ -138,68 +131,68 @@ def add(x, y):
def sub(x, y):
"""Element-wise subtraction."""
"""Element-wise `subtraction`."""
return _elwise(x, y, mode="sub")
def mul(x, y):
"""Element-wise multiplication."""
"""Element-wise `multiplication`."""
return _elwise(x, y, mode="mul")
def div(x, y):
"""Element-wise (x / y)."""
"""Element-wise `(x / y)`."""
return _elwise(x, y, mode="true_div")
def floor_div(x, y):
"""Element-wise floor(x / y)."""
"""Element-wise `floor(x / y)`."""
return _elwise(x, y, mode="floor_divide")
def neg(x):
"""Element-wise negation."""
"""Element-wise `negation`."""
return _elwise(x, mode="negate")
def pow(x, y):
"""Element-wise power."""
"""Element-wise `power`."""
return _elwise(x, y, mode="pow")
def mod(x, y):
"""Element-wise remainder of division."""
"""Element-wise `remainder of division`."""
return _elwise(x, y, mode="mod")
def abs(x):
"""Element-wise absolute value."""
"""Element-wise `absolute value`."""
return _elwise(x, mode="abs")
def exp(x):
"""Element-wise exponential."""
"""Element-wise `exponential`."""
return _elwise(x, mode="exp")
def expm1(x):
"""Element-wise exp(x)-1."""
"""Element-wise `exp(x)-1`."""
return _elwise(x, mode="expm1")
def log(x):
"""Element-wise logarithm (base `e`)."""
"""Element-wise `logarithm (base e)`."""
return _elwise(x, mode="log")
def log1p(x):
"""Element-wise log(x+1) (base `e`)."""
"""Element-wise `log(x+1) (base e)`."""
return _elwise(x, mode="log1p")
def sqrt(x: Tensor) -> Tensor:
"""Element-wise sqrt.
For negative input value, return ``NaN``.
"""Element-wise `sqrt`.
Returns ``NaN`` for negative input value.
:param x: input tensor.
:return: computed tensor.
......@@ -229,10 +222,10 @@ def sqrt(x: Tensor) -> Tensor:
def square(x: Tensor) -> Tensor:
"""
Return a new tensor with the square of the elements of input tensor.
Returns a new tensor with the square of the elements of input tensor.
:param inp: The input tensor
:return: The computed tensor
:param inp: input tensor.
:return: computed tensor.
Examples:
......@@ -258,27 +251,27 @@ def square(x: Tensor) -> Tensor:
def round(x):
"""Element-wise rounding to int."""
"""Element-wise `rounding to int`."""
return _elwise(x, mode="round")
def ceil(x):
"""Element-wise ceiling."""
"""Element-wise `ceiling`."""
return _elwise(x, mode="ceil")
def floor(x):
"""Element-wise floor."""
"""Element-wise `floor`."""
return _elwise(x, mode="floor")
def maximum(x, y):
"""Element-wise maximum of array elements."""
"""Element-wise `maximum of array elements`."""
return _elwise(x, y, mode="max")
def minimum(x, y):
"""Element-wise minimum of array elements."""
"""Element-wise `minimum of array elements`."""
return _elwise(x, y, mode="min")
......@@ -286,7 +279,7 @@ def minimum(x, y):
def cos(x):
"""Element-wise cosine.
"""Element-wise `cosine`.
:param x: input tensor.
:return: computed tensor.
......@@ -315,80 +308,71 @@ def cos(x):
def sin(x):
"""Element-wise sine."""
"""Element-wise `sine`."""
return _elwise(x, mode="sin")
def tan(x):
"""Element-wise tangent."""
"""Element-wise `tangent`."""
return sin(x) / cos(x)
def acos(x):
"""Element-wise inverse cosine."""
"""Element-wise `inverse cosine`."""
return _elwise(x, mode="acos")
def asin(x):
"""Element-wise inverse sine."""
"""Element-wise `inverse sine`."""
return _elwise(x, mode="asin")
def atan(x):
"""Element-wise inverse tangent."""
"""Element-wise `inverse tangent`."""
return _elwise(x, 1, mode="atan2")
def atan2(y, x):
"""Element-wise 2-argument arctangent."""
"""Element-wise `2-argument arctangent`."""
return _elwise(y, x, mode="atan2")
def cosh(x):
r"""Element-wise hyperbolic cosine."""
r"""Element-wise `hyperbolic cosine`."""
return 0.5 * (exp(x) + exp(-x))
def sinh(x):
r"""Element-wise hyperbolic sine."""
r"""Element-wise `hyperbolic sine`."""
u = expm1(x)
return 0.5 * u / (u + 1) * (u + 2)
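The expm1-based formula above is an exact rewrite of sinh: with u = exp(x) - 1, 0.5 * u / (u + 1) * (u + 2) = 0.5 * (exp(x) - exp(-x)). A quick standalone check:

import math

x = 0.3
u = math.expm1(x)                      # u = exp(x) - 1
approx = 0.5 * u / (u + 1) * (u + 2)   # equals 0.5 * (exp(x) - exp(-x))
print(approx, math.sinh(x))            # both print ~0.30452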
def tanh(x):
r"""Element-wise hyperbolic tangent."""
r"""Element-wise `hyperbolic tangent`."""
return _elwise(x, mode="tanh")
def asinh(x):
r"""Element-wise inverse hyperbolic sine."""
r"""Element-wise `inverse hyperbolic sine`."""
return log(x + (x ** 2 + 1) ** 0.5)
def acosh(x):
r"""Element-wise inverse hyperbolic cosine."""
r"""Element-wise `inverse hyperbolic cosine`."""
return log(x + (x ** 2 - 1) ** 0.5)
def atanh(x):
r"""Element-wise inverse hyperbolic tangent."""
r"""Element-wise `inverse hyperbolic tangent`."""
return log1p(2 * x / (1 - x)) / 2
def fast_tanh(x):
r"""Element-wise fast tanh; this is an approximation:
.. math::
\text{fast_tanh}(x) = x * (27. + x * x) / (27. + 9. * x * x)
"""
return _elwise(x, mode="fast_tanh")
# bit-twiddling functions
def left_shift(x, y):
"""Element-wise bitwise binary: x << y.
"""Element-wise `bitwise binary: x << y`.
:param x: input tensor, should be int.
:param y: how many bits to be left-shifted.
......@@ -418,7 +402,7 @@ def left_shift(x, y):
def right_shift(x, y):
"""Element-wise bitwise binary: x >> y."""
"""Element-wise `bitwise binary: x >> y`."""
return _elwise(x, y, mode="shr")
......@@ -426,30 +410,30 @@ def right_shift(x, y):
def logical_and(x, y):
"""Element-wise logical and: x && y."""
"""Element-wise `logical and: x && y`."""
return _elwise(x, y, mode="AND")
def logical_not(x):
"""Element-wise logical not: ~x."""
"""Element-wise `logical not: ~x`."""
return _elwise(x, mode="NOT")
def logical_or(x, y):
"""Element-wise logical or: x || y."""
"""Element-wise `logical or: x || y`."""
return _elwise(x, y, mode="OR")
def logical_xor(x, y):
"""Element-wise logical xor: x ^ y."""
"""Element-wise `logical xor: x ^ y`."""
return _elwise(x, y, mode="XOR")
# comparison functions
def eq(x, y):
"""Element-wise (x == y).
def equal(x, y):
"""Element-wise `(x == y)`.
:param x: input tensor 1.
:param y: input tensor 2.
......@@ -465,7 +449,7 @@ def eq(x, y):
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.eq(x, y)
out = F.equal(x, y)
print(out.numpy())
Outputs:
......@@ -479,28 +463,28 @@ def eq(x, y):
return _elwise(x, y, mode="eq")
def ne(x, y):
"""Element-wise (x != y)."""
def not_equal(x, y):
"""Element-wise `(x != y)`."""
return x != y
def lt(x, y):
"""Element-wise (x < y)."""
def less(x, y):
"""Element-wise `(x < y)`."""
return _elwise(x, y, mode="lt")
def le(x, y):
"""Element-wise (x <= y)."""
def less_equal(x, y):
"""Element-wise `(x <= y)`."""
return _elwise(x, y, mode="leq")
def gt(x, y):
"""Element-wise (x > y)."""
def greater(x, y):
"""Element-wise `(x > y)`."""
return _elwise(y, x, mode="lt")
def ge(x, y):
"""Element-wise (x >= y)."""
def greater_equal(x, y):
"""Element-wise `(x >= y)`."""
return _elwise(y, x, mode="leq")
......@@ -508,7 +492,7 @@ def ge(x, y):
def hswish(x):
"""Element-wise x * relu6(x + 3) / 6.
"""Element-wise `x * relu6(x + 3) / 6`.
:param x: input tensor.
:return: computed tensor.
......@@ -534,7 +518,7 @@ def hswish(x):
def hsigmoid(x):
"""Element-wise relu6(x + 3) / 6."""
"""Element-wise `relu6(x + 3) / 6`."""
return relu6(x + 3) / 6
......@@ -544,16 +528,16 @@ def relu(x):
def relu6(x):
"""Element-wise min(max(x, 0), 6)."""
"""Element-wise `min(max(x, 0), 6)`."""
return minimum(maximum(x, 0), 6)
def sigmoid(x):
"""Element-wise 1 / ( 1 + exp( -x ) )."""
"""Element-wise `1 / ( 1 + exp( -x ) )`."""
return _elwise(x, mode="sigmoid")
def clamp(x: Tensor, lower=None, upper=None) -> Tensor:
def clip(x: Tensor, lower=None, upper=None) -> Tensor:
r"""Clamps all elements in input tensor into the range `[` :attr:`lower`, :attr:`upper` `]` and returns
a resulting tensor:
......@@ -578,9 +562,9 @@ def clamp(x: Tensor, lower=None, upper=None) -> Tensor:
import megengine.functional as F
a = tensor(np.arange(5).astype(np.int32))
print(F.clamp(a, 2, 4).numpy())
print(F.clamp(a, lower=3).numpy())
print(F.clamp(a, upper=3).numpy())
print(F.clip(a, 2, 4).numpy())
print(F.clip(a, lower=3).numpy())
print(F.clip(a, upper=3).numpy())
Outputs:
......@@ -596,7 +580,7 @@ def clamp(x: Tensor, lower=None, upper=None) -> Tensor:
), "At least one of 'lower' or 'upper' must not be None"
if lower is not None:
if upper is not None:
assert lower <= upper, "clamp lower bound is bigger that upper bound"
assert lower <= upper, "clip lower bound is bigger that upper bound"
return minimum(maximum(x, lower), upper)
else:
return maximum(x, lower)
......
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# pylint: disable=too-many-lines
from typing import List
from ..tensor import Tensor
def cambricon_subgraph(
inputs: List[Tensor], data: bytes, symbol: str, tensor_dim_mutable: bool,
) -> List[Tensor]:
"""Loads a serialized Cambricon subgraph (i.e. cnrtModel_t) and
execute the operations defined in the subgraph.
:param inputs: list of input tensors of the subgraph.
:param data: the serialized subgraph.
:param symbol: the name of the function in the subgraph.
The function corresponds to a cnmlFusionOp
which is added to the cnmlModel_t/cnrtModel_t.
:param tensor_dim_mutable: whether the input tensors' shapes are mutable
in cnrtModel_t.
"""
raise NotImplementedError
def extern_opr_subgraph(
inputs, output_shapes: List[tuple], dump_name: str, dump_data: bytes,
) -> List[Tensor]:
"""Loads a serialized extern opr subgraph and fake execute the operator.
:param inputs: tensor or list of input tensors.
:param output_shapes: the output shapes.
:param dump_name: the serialized subgraph name.
:param dump_data: the serialized subgraph.
:return: list of tensors.
"""
raise NotImplementedError
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import collections
from typing import Iterable, Optional, Union
from ..tensor import Tensor
def add_update(
dest: Tensor,
delta: Tensor,
*,
alpha: Union[Tensor, float, int] = 1.0,
beta: Union[Tensor, float, int] = 1.0,
bias: Union[Tensor, float, int] = 0.0
):
r"""Modify ``dest`` inplace as follows:
.. math::
dest = alpha * dest + beta * delta + bias
:param dest: input data that will be inplace modified.
:param delta: update value that will be added to ``dest``.
:param alpha: weight ratio of ``dest``. Default: 1.0
:param beta: weight ratio of ``delta``. Default: 1.0
:param bias: bias value appended to the result. Default: 0.0
"""
if beta is not None and beta != 1.0:
delta = delta * beta
if bias is not None and bias != 0.0:
delta = delta + bias
if alpha is not None and alpha != 1.0:
dest *= alpha
dest += delta
return dest
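A small usage sketch of add_update as defined above; the import path is an assumption based on the file shown in this diff (functional/graph.py) and may not be re-exported after this change:

import numpy as np
from megengine import tensor
from megengine.functional.graph import add_update  # assumed path

dest = tensor(np.ones(3, dtype=np.float32))
delta = tensor(np.full(3, 2.0, dtype=np.float32))
add_update(dest, delta, alpha=0.9, beta=1.0, bias=0.1)
print(dest.numpy())  # 0.9 * 1 + 1.0 * 2 + 0.1 = 3.0 for every element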
......@@ -10,14 +10,14 @@ import numpy as np
from ..core.tensor.utils import make_shape_tuple
from ..tensor import Tensor
from .elemwise import abs, eq, exp, log, maximum, pow, relu
from .nn import indexing_one_hot
from .elemwise import abs, equal, exp, log, maximum, pow, relu
from .nn import indexing_one_hot, logsigmoid, logsumexp
from .tensor import where
__all__ = [
"l1_loss",
"square_loss",
"cross_entropy_with_softmax",
"cross_entropy",
"binary_cross_entropy",
"hinge_loss",
]
......@@ -55,7 +55,7 @@ def l1_loss(pred: Tensor, label: Tensor) -> Tensor:
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.l1_loss(ipt, tgt)
loss = F.nn.l1_loss(ipt, tgt)
print(loss.numpy())
Outputs:
......@@ -106,7 +106,7 @@ def square_loss(pred: Tensor, label: Tensor) -> Tensor:
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.square_loss(ipt, tgt)
loss = F.nn.square_loss(ipt, tgt)
print(loss.numpy())
Outputs:
......@@ -120,10 +120,16 @@ def square_loss(pred: Tensor, label: Tensor) -> Tensor:
return (diff ** 2).mean()
def cross_entropy_with_softmax(
pred: Tensor, label: Tensor, axis: int = 1, label_smooth: float = 0
def cross_entropy(
pred: Tensor,
label: Tensor,
axis: int = 1,
with_logits: bool = True,
label_smooth: float = 0,
) -> Tensor:
r"""Returns loss after applying :func:`~.softmax` + :func:`~.cross_entropy`.
r"""Compute the multi-class cross entropy loss (using logits by default).
By default, prediction is assumed to be logits, whose softmax gives probabilities.
It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`.
......@@ -132,11 +138,12 @@ def cross_entropy_with_softmax(
.. math:: y^{LS}_{k}=y_{k}\left(1-\alpha\right)+\alpha/K
where :math:`y^{LS}` and :math:`y` are new label distribution and origin label distribution respectively.
k is the index of label distribution. :math:`\alpha` is label_smooth and :math:`K` is the number of classes.
k is the index of label distribution. :math:`\alpha` is ``label_smooth`` and :math:`K` is the number of classes.
:param pred: input tensor representing the predicted probability.
:param label: input tensor representing the classification label.
:param axis: an axis along which softmax will be applied. Default: 1
:param with_logits: whether to apply softmax first. Default: True
:param label_smooth: a label smoothing of parameter that can re-distribute target distribution. Default: 0
:return: loss value.
......@@ -150,9 +157,9 @@ def cross_entropy_with_softmax(
data_shape = (1, 2)
label_shape = (1, )
pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(data_shape))
pred = tensor(np.array([0, 0], dtype=np.float32).reshape(data_shape))
label = tensor(np.ones(label_shape, dtype=np.int32))
loss = F.cross_entropy_with_softmax(pred, label)
loss = F.nn.cross_entropy(pred, label)
print(loss.numpy())
Outputs:
......@@ -170,26 +177,41 @@ def cross_entropy_with_softmax(
)
num_classes = pred.shape[axis]
no_label_smooth = (
label_smooth is None or type(label_smooth) in (int, float) and label_smooth == 0
)
if not with_logits:
if no_label_smooth:
return -log(indexing_one_hot(pred, label, axis)).mean()
pred = log(pred)
return (
label_smooth * pred.mean()
- (1 - label_smooth) * indexing_one_hot(pred, label, axis).mean()
)
# Denominator of the softmax
offset = pred.max(axis=axis, keepdims=True).detach()
pred = pred - offset
down = exp(pred).sum(axis=axis, keepdims=True)
down = logsumexp(pred, axis=axis, keepdims=True)
up = indexing_one_hot(pred, label, axis)
if label_smooth != 0:
if not no_label_smooth:
factor = label_smooth / num_classes
up = up * (1 - label_smooth) + pred.sum(axis=axis, keepdims=True) * factor
return (log(down) - up).mean()
return (down - up).mean()
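A standalone numpy sketch of the logits path above: the loss reduces to mean(logsumexp(pred) - pred[label]), with the per-row max subtracted for numerical stability (the helper name here is hypothetical, not MegEngine API):

import numpy as np

def cross_entropy_logits(pred, label):           # pred: (N, C) logits, label: (N,) ints
    m = pred.max(axis=1, keepdims=True)          # the `offset` in the code above
    lse = np.log(np.exp(pred - m).sum(axis=1)) + m[:, 0]   # logsumexp over classes
    picked = pred[np.arange(len(label)), label]  # what indexing_one_hot selects
    return (lse - picked).mean()

pred = np.array([[0.0, 0.0]], dtype=np.float32)
label = np.array([1])
print(cross_entropy_logits(pred, label))         # log(2) ~ 0.6931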
def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor:
r"""Function that measures the Binary Cross Entropy between the target and the prediction.
def binary_cross_entropy(
pred: Tensor, label: Tensor, with_logits: bool = True
) -> Tensor:
r"""Compute the binary cross entropy loss (using logits by default).
By default, prediction is assumed to be logits, whose sigmoid gives probabilities.
:param pred: `(N, *)` where `*` means any number of additional dimensions.
:param pred: `(N, *)`, where `*` means any number of additional dimensions.
:param label: `(N, *)`, same shape as the input.
:param with_logits: bool, whether to apply sigmoid first. Default: True
:return: loss value.
Examples:
......@@ -200,9 +222,9 @@ def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor:
from megengine import tensor
import megengine.functional as F
pred = tensor(np.array([0.5, 0.5], dtype=np.float32).reshape(1, 2))
pred = tensor(np.array([0, 0], dtype=np.float32).reshape(1, 2))
label = tensor(np.ones((1, 2), dtype=np.float32))
loss = F.binary_cross_entropy(pred, label)
loss = F.nn.binary_cross_entropy(pred, label)
print(loss.numpy())
Outputs:
......@@ -212,11 +234,15 @@ def binary_cross_entropy(pred: Tensor, label: Tensor) -> Tensor:
[0.6931]
"""
return -1.0 * (label * log(pred) + (1.0 - label) * log(1 - pred)).mean()
if not with_logits:
return -(label * log(pred) + (1 - label) * log(1 - pred)).mean()
# logsigmoid(pred) and logsigmoid(-pred) have a common sub-expression
# hopefully the backend would optimize this
return -(label * logsigmoid(pred) + (1 - label) * logsigmoid(-pred)).mean()
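The logits branch above relies on log(sigmoid(x)) = -log1p(exp(-x)); a standalone numpy sketch with a hypothetical helper name:

import numpy as np

def bce_with_logits(pred, label):
    log_sig_pos = -np.log1p(np.exp(-pred))   # log(sigmoid(pred))
    log_sig_neg = -np.log1p(np.exp(pred))    # log(sigmoid(-pred)) = log(1 - sigmoid(pred))
    return -(label * log_sig_pos + (1 - label) * log_sig_neg).mean()

pred = np.zeros((1, 2), dtype=np.float32)
label = np.ones((1, 2), dtype=np.float32)
print(bce_with_logits(pred, label))          # log(2) ~ 0.6931, matching the [0.6931] output above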
def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor:
r"""Caculate the hinge loss which is often used in SVMs.
r"""Caculates the hinge loss which is often used in SVM.
The hinge loss can be described as:
......@@ -236,7 +262,7 @@ def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor:
pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32")
label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32")
loss = F.hinge_loss(pred, label)
loss = F.nn.hinge_loss(pred, label)
print(loss.numpy())
Outputs:
......
......@@ -14,11 +14,12 @@ from typing import Optional, Sequence, Tuple, Union
from ..core.ops import builtin
from ..core.ops._internal import param_defs as P
from ..core.ops.special import Const
from ..core.tensor import utils
from ..core.tensor.core import apply
from ..core.tensor.core import TensorBase, TensorWrapperBase, apply
from ..tensor import Tensor
from .elemwise import clamp, exp, log, log1p
from .tensor import add_axis, remove_axis, reshape
from .elemwise import clip, exp, log, log1p
from .tensor import reshape, squeeze
__all__ = [
"argmax",
......@@ -45,7 +46,7 @@ def isnan(inp: Tensor) -> Tensor:
r"""Returns a new tensor representing if each element is ``NaN`` or not.
:param inp: input tensor.
:return: a new tensor representing if each element in inp is NaN or not.
:return: result tensor.
Examples:
......@@ -71,7 +72,7 @@ def isinf(inp: Tensor) -> Tensor:
r"""Returns a new tensor representing if each element is ``Inf`` or not.
:param inp: input tensor.
:return: a new tensor representing if each element in inp is Inf or not.
:return: result tensor.
Examples:
......@@ -84,7 +85,7 @@ def isinf(inp: Tensor) -> Tensor:
print(F.isinf(x).numpy())
Outputs:
.. testoutput::
[False True False]
......@@ -108,7 +109,7 @@ def sign(inp: Tensor):
x = tensor([1, -1, 0])
print(F.sign(x).numpy())
Outputs:
.. testoutput::
......@@ -128,7 +129,7 @@ def sum(
reduce over all of them.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced.
:param axis: dimension to reduce. If None, all dimensions will be reduced.
Default: None
:param keepdims: whether the output tensor has axis retained or not.
Default: False
......@@ -163,7 +164,7 @@ def prod(
reduce over all of them.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
......@@ -199,7 +200,7 @@ def mean(
reduce over all of them.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
......@@ -235,7 +236,7 @@ def var(
reduce over all of them.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
......@@ -275,7 +276,7 @@ def std(
reduce over all of them.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
......@@ -310,7 +311,7 @@ def min(
reduce over all of them.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
......@@ -346,7 +347,7 @@ def max(
reduce over all of them.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
......@@ -373,18 +374,14 @@ def max(
def norm(
inp: Tensor,
p: int = 2,
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims=False,
inp: Tensor, ord: float = None, axis: int = None, keepdims=False,
):
"""Calculates ``p``-norm of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
given axis.
:param inp: input tensor.
:param p: power of value applied to inp. Default: 2
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param ord: order of the norm (the power applied to the absolute values). Default: 2
:param axis: dimension to reduce. If None, input must be a vector. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
......@@ -396,7 +393,7 @@ def norm(
from megengine import tensor
import megengine.functional as F
x = tensor(np.arange(-3, 3, dtype=np.float32).reshape(2,3))
x = tensor(np.arange(-3, 3, dtype=np.float32))
out = F.norm(x)
print(out.numpy())
......@@ -407,13 +404,18 @@ def norm(
[4.3589]
"""
if p == 0:
if axis is None:
if inp.ndim != 1:
raise TypeError("axis is required unless input is a vector")
if ord is None:
ord = 2
if ord == 0:
return sum(inp != 0, axis=axis, keepdims=keepdims)
if p == math.inf:
if ord == math.inf:
return max(abs(inp))
if p == -math.inf:
if ord == -math.inf:
return min(abs(inp))
return sum(abs(inp) ** p, axis=axis, keepdims=keepdims) ** (1.0 / p)
return sum(abs(inp) ** ord, axis=axis, keepdims=keepdims) ** (1.0 / ord)
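The dispatch on ord above mirrors the usual vector-norm definitions. A NumPy sketch under the same conventions (illustrative helper, not the MegEngine implementation):

import numpy as np

def vector_norm(x, ord=None, axis=None, keepdims=False):
    if ord is None:
        ord = 2
    if ord == 0:        # count of non-zero entries
        return (x != 0).sum(axis=axis, keepdims=keepdims)
    if ord == np.inf:   # largest absolute value
        return np.abs(x).max()
    if ord == -np.inf:  # smallest absolute value
        return np.abs(x).min()
    return (np.abs(x) ** ord).sum(axis=axis, keepdims=keepdims) ** (1.0 / ord)

x = np.arange(-3, 3, dtype=np.float32)
print(vector_norm(x))      # ~4.3589, as in the docstring example
print(np.linalg.norm(x))   # reference value from NumPy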
def argmin(
......@@ -426,7 +428,7 @@ def argmin(
reduce over all of them.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
......@@ -458,7 +460,7 @@ def argmin(
(inp,) = apply(op, inp)
if not keepdims:
inp = remove_axis(inp, ai)
inp = squeeze(inp, ai)
return inp
......@@ -470,7 +472,7 @@ def argmin(
op = builtin.Argmin(axis=axis)
(result,) = apply(op, inp)
if not keepdims:
result = remove_axis(result, axis)
result = squeeze(result, axis)
return result
......@@ -484,7 +486,7 @@ def argmax(
reduce over all of them.
:param inp: input tensor.
:param axis: dimension to reduce. If None, all the dimensions will be reduced. Default: None
:param axis: dimension to reduce. If None, all dimensions will be reduced. Default: None
:param keepdims: whether the output tensor has axis retained or not. Default: False
:return: output tensor.
......@@ -516,7 +518,7 @@ def argmax(
(inp,) = apply(op, inp)
if not keepdims:
inp = remove_axis(inp, ai)
inp = squeeze(inp, ai)
return inp
......@@ -528,45 +530,40 @@ def argmax(
op = builtin.Argmax(axis=axis)
(result,) = apply(op, inp)
if not keepdims:
result = remove_axis(result, axis)
result = squeeze(result, axis)
return result
def normalize(
inp: Tensor,
p: int = 2,
axis: Optional[Union[int, Sequence[int]]] = None,
eps: float = 1e-12,
inp: Tensor, ord: float = None, axis: int = None, eps: float = 1e-12,
) -> Tensor:
r"""Performs :math:`L_p` normalization of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
given axis.
For a tensor inp of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each
For a tensor of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each
:math:`n_{dim}` -element vector :math:`v` along dimension :attr:`axis` is transformed as:
.. math::
v = \frac{v}{\max(\lVert v \rVert_p, \epsilon)}.
:param inp: input tensor.
:param p: power of value applied to inp. Default: 2
:param axis: dimension to reduce. If None, all the dimensions will be reduced
to calculate the norm. Default: None
:param ord: order of the norm used for normalization. Default: 2
:param axis: dimension to reduce. If None, input must be a vector. Default: None
:param eps: a small value to avoid division by zero. Default: 1e-12
:return: normalized output tensor.
"""
if axis is None:
return inp / clamp(norm(inp, p, axis), lower=eps)
return inp / clip(norm(inp, ord, axis), lower=eps)
else:
return inp / clamp(norm(inp, p, axis, keepdims=True), lower=eps)
return inp / clip(norm(inp, ord, axis, keepdims=True), lower=eps)
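In other words, normalize divides the input by the norm computed above, with eps guarding against division by zero. A NumPy sketch (illustrative):

import numpy as np

def l2_normalize(x, axis=None, eps=1e-12):
    if axis is None:
        n = np.linalg.norm(x.ravel())              # whole input treated as one vector
    else:
        n = np.linalg.norm(x, axis=axis, keepdims=True)
    return x / np.maximum(n, eps)

x = np.array([[3.0, 4.0], [0.0, 0.0]])
print(l2_normalize(x, axis=1))  # first row becomes a unit vector, zero row stays zero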
def argsort(inp: Tensor, descending: bool = False) -> Tensor:
r"""Sorts the target 2d matrix by row, return both the sorted tensor and indices.
r"""Returns the indices that would sort the input tensor.
:param inp: input tensor, if 2d, each row will be sorted.
:param descending: Sort in descending order, where the largest comes first. Default: False
:return: Tuple of two tensors `(sorted_tensor, indices_of_int32)`.
:param inp: input tensor. If it is 2d, the result is an array of indices showing how to sort each row of the input tensor.
:param descending: sort in descending order, where the largest comes first. Default: False
:return: int32 indices indicating how to sort the input.
Examples:
......@@ -603,6 +600,31 @@ def argsort(inp: Tensor, descending: bool = False) -> Tensor:
def sort(inp: Tensor, descending: bool = False) -> Tuple[Tensor, Tensor]:
r"""Returns sorted tensor and the indices would sort the input tensor.
:param inp: input tensor. If it's 2d, the result would be sorted by row.
:param descending: sort in descending order, where the largest comes first. Default: False
:return: tuple of two tensors `(sorted_tensor, indices_of_int32)`.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.array([1,2], dtype=np.float32))
out, indices = F.sort(x)
print(out.numpy())
Outputs:
.. testoutput::
[1. 2.]
"""
assert len(inp.shape) <= 2, "Input should be 1d or 2d"
if descending:
order = P.Argsort.Order.DESCENDING
......@@ -625,13 +647,13 @@ def topk(
kth_only: bool = False,
no_sort: bool = False,
) -> Tuple[Tensor, Tensor]:
r"""Selects the ``Top-K(by default)`` smallest elements of 2d matrix by row.
r"""Selects the ``Top-K``(by default) smallest elements of 2d matrix by row.
:param inp: input tensor, if 2d, each row will be sorted.
:param inp: input tensor. If input tensor is 2d, each row will be sorted.
:param k: number of elements needed.
:param descending: if true, return the largest elements instead. Default: False
:param kth_only: if true, only the k-th element will be returned. Default: False
:param no_sort: if true, the returned elements can be unordered. Default: False
:param descending: if True, return the largest elements instead. Default: False
:param kth_only: if True, only the k-th element will be returned. Default: False
:param no_sort: if True, the returned elements can be unordered. Default: False
:return: tuple of two tensors `(topk_tensor, indices_of_int32)`.
Examples:
......@@ -665,15 +687,18 @@ def topk(
mode = Mode.VALUE_IDX_SORTED
op = builtin.TopK(mode=mode)
if not isinstance(k, (TensorBase, TensorWrapperBase)):
(k,) = Const(k, dtype="int32", device=inp.device)(inp)
if len(inp.shape) == 1:
inp = inp.reshape(1, -1)
res = apply(op, inp, Tensor(k, dtype="int32"))
res = apply(op, inp, k)
if kth_only:
tns = res[0]
else:
tns, ind = res[0][0], res[1][0]
else:
res = apply(op, inp, Tensor(k, dtype="int32"))
res = apply(op, inp, k)
if kth_only:
tns = res
else:
......
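A NumPy sketch of the row-wise Top-K selection performed above (smallest by default, largest when descending=True); the helper name is illustrative:

import numpy as np

def topk_by_row(x, k, descending=False):
    x = np.atleast_2d(x)                                    # a 1d input is treated as one row
    order = np.argsort(-x if descending else x, axis=1)[:, :k]
    values = np.take_along_axis(x, order, axis=1)
    return values, order.astype(np.int32)

vals, idx = topk_by_row(np.array([3.0, 1.0, 2.0, 5.0]), k=2)
print(vals, idx)   # [[1. 2.]] [[1 2]]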
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import numpy as np
from ..tensor import Tensor
from .distributed import all_reduce_sum
from .tensor import param_pack_concat, param_pack_split
def get_offsets(shapes):
offsets = []
offset = 0
for shape in shapes:
offsets.append(offset)
offset += int(np.prod(shape))
offsets.append(offset)
return offsets
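The diff view strips indentation; assuming both append calls sit inside the loop (matching the paired begin/end offsets that param_pack_split expects), get_offsets produces (start, end) element offsets per tensor, for example:

# Flattened sizes for [(2, 3), (4,), (1, 1)] are 6, 4 and 1, so:
print(get_offsets([(2, 3), (4,), (1, 1)]))  # [0, 6, 6, 10, 10, 11]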
def pack_allreduce_split(pack_list, shapes, group, reduce_method):
offsets_val = get_offsets(shapes)
offsets = Tensor(offsets_val)
packed_grads = param_pack_concat(pack_list, offsets, offsets_val)
packed_grads = all_reduce_sum(packed_grads, group)
if reduce_method == "mean":
packed_grads /= group.size
grads = param_pack_split(packed_grads, offsets_val, shapes)
return grads
......@@ -34,26 +34,23 @@ def conv_bias_activation(
:param weight: convolution kernel.
:param bias: bias added to the result of convolution
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides of its
spatial dimensions. Only zero-padding is supported. Default: 0
:param padding: size of the paddings added to the input on both sides of its spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups to divide input and output channels into,
so as to perform a "grouped convolution". When groups is not 1,
in_channels and out_channels must be divisible by groups,
:param groups: number of groups into which the input and output channels are divided, so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
:type conv_mode: string or :class:`P.Convolution.Mode`.
:param conv_mode: supports 'CROSS_CORRELATION' or 'CONVOLUTION'. Default:
'CROSS_CORRELATION'
:param dtype: support for np.dtype, Default: np.int8
:param dtype: support for ``np.dtype``, Default: np.int8
:param scale: scale used if quantization is applied. Default: 0.0
:param zero_point: zero point used if quint8 quantization is applied. Default: 0.0
:type compute_mode: string or
:class:`P.Convolution.ComputeMode`.
:param compute_mode: when set to 'DEFAULT', no special requirements will be
placed on the precision of intermediate results. When set to 'FLOAT32',
Float32 would be used for accumulator and intermediate result, but only
effective when input and output are of Float16 dtype.
:param compute_mode: when set to "DEFAULT", no special requirements will be
placed on the precision of intermediate results. When set to "FLOAT32",
"Float32" would be used for accumulator and intermediate result, but only effective when input and output are of Float16 dtype.
"""
ph, pw = _pair(padding)
......
......@@ -11,18 +11,24 @@ from typing import Iterable, Union
import numpy as np
from ..core.ops.builtin import Copy
from ..core._wrap import device as as_device
from ..core.ops.builtin import Copy, Identity
from ..core.tensor import Tensor
from ..core.tensor.core import apply
from .math import topk as _topk
from .tensor import transpose as _transpose
from .tensor import broadcast_to, transpose
__all__ = [
"topk_accuracy",
"copy",
]
def accuracy(
def topk_accuracy(
logits: Tensor, target: Tensor, topk: Union[int, Iterable[int]] = 1
) -> Union[Tensor, Iterable[Tensor]]:
r"""
Calculate the classification accuracy given predicted logits and ground-truth labels.
Calculates the classification accuracy given predicted logits and ground-truth labels.
:param logits: model predictions of shape `[batch_size, num_classes]`,
representing the probability (likelihood) of each class.
......@@ -40,7 +46,7 @@ def accuracy(
logits = tensor(np.arange(80, dtype=np.int32).reshape(8,10))
target = tensor(np.arange(8, dtype=np.int32))
top1, top5 = F.accuracy(logits, target, (1, 5))
top1, top5 = F.topk_accuracy(logits, target, (1, 5))
print(top1.numpy(), top5.numpy())
Outputs:
......@@ -54,8 +60,8 @@ def accuracy(
_, pred = _topk(logits, k=max(topk), descending=True)
accs = []
for k in topk:
correct = pred[:, :k].detach() == _transpose(target, (0, "x")).broadcast(
target.shape[0], k
correct = pred[:, :k].detach() == broadcast_to(
transpose(target, (0, "x")), (target.shape[0], k)
)
accs.append(correct.astype(np.float32).sum() / target.shape[0])
if len(topk) == 1: # type: ignore[arg-type]
......@@ -63,25 +69,12 @@ def accuracy(
return accs
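The broadcast-and-compare step above has a direct NumPy analogue; a sketch of top-k accuracy under the same convention (helper name and values are illustrative):

import numpy as np

def topk_accuracy_np(logits, target, topk=(1, 5)):
    k = max(topk)
    pred = np.argsort(-logits, axis=1)[:, :k]      # indices of the k largest logits per row
    accs = []
    for kk in topk:
        correct = (pred[:, :kk] == target[:, None]).any(axis=1)
        accs.append(float(correct.mean()))
    return accs

logits = np.arange(80, dtype=np.float32).reshape(8, 10)
target = np.arange(8)
print(topk_accuracy_np(logits, target))  # [0.0, 0.375]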
def zero_grad(inp: Tensor) -> Tensor:
r"""
Returns a tensor which is treated as constant during backward gradient calcuation,
i.e. its gradient is zero.
:param inp: Input tensor.
See implementation of :func:`~.softmax` for example.
"""
print("zero_grad is obsoleted, please use detach instead")
raise NotImplementedError
def copy(inp, cn):
def copy(inp, device=None):
r"""
Copy tensor to another device.
Copies tensor to another device.
:param inp: input tensor.
:param cn: device that you copy to.
:param device: destination device.
Examples:
......@@ -101,4 +94,6 @@ def copy(inp, cn):
[1 2 3]
"""
return apply(Copy(comp_node=cn), inp)[0]
if device is None:
return apply(Identity(), inp)[0]
return apply(Copy(comp_node=as_device(device).to_c()), inp)[0]
......@@ -19,12 +19,12 @@ class InvalidGitHost(FetcherError):
class GitPullError(FetcherError):
"""A git pull error occurred"""
"""A git pull error occurred."""
class GitCheckoutError(FetcherError):
"""A git checkout error occurred"""
"""A git checkout error occurred."""
class InvalidProtocol(FetcherError):
"""The protocol provided was somehow invalid"""
"""The protocol provided was somehow invalid."""
......@@ -106,20 +106,20 @@ class GitSSHFetcher(RepoFetcherBase):
:param git_host:
host address of git repo.
example: github.com
Example: github.com
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
example: ``"brain_sdk/MegBrain[:hub]"``
Example: ``"brain_sdk/MegBrain[:hub]"``
:param use_cache:
whether to use locally fetched code or completely re-fetch
whether to use locally fetched code or completely re-fetch.
:param commit:
commit id on github or gitlab
commit id on github or gitlab.
:param silent:
whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen
displaying on the screen.
:return:
directory where the repo code is stored
directory where the repo code is stored.
"""
if not cls._check_git_host(git_host):
raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host))
......@@ -215,24 +215,24 @@ class GitHTTPSFetcher(RepoFetcherBase):
silent: bool = True,
) -> str:
"""
Fetches git repo by HTTPS protocol
Fetches git repo by HTTPS protocol.
:param git_host:
host address of git repo
example: github.com
host address of git repo.
Example: github.com
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
example: ``"brain_sdk/MegBrain[:hub]"``
Example: ``"brain_sdk/MegBrain[:hub]"``
:param use_cache:
whether to use locally cached code or completely re-fetch
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab
commit id on github or gitlab.
:param silent:
whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen
displaying on the screen.
:return:
directory where the repo code is stored
directory where the repo code is stored.
"""
if not cls._check_git_host(git_host):
raise InvalidGitHost("git_host: '{}' is malformed.".format(git_host))
......
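For context, the repo_info format documented above (``"repo_owner/repo_name[:tag_name/:branch_name]"``) is what the public hub entry points pass down to these fetchers. A hedged usage sketch (the repo name and entry are illustrative, and a network connection plus a published hubconf.py are required):

import megengine.hub as hub

# List the entry points a hub repo exposes, then load one of them.
# "megengine/models" and "resnet18" are example values only.
print(hub.list("megengine/models", git_host="github.com"))
model = hub.load("megengine/models", "resnet18", use_cache=True)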
......@@ -94,24 +94,24 @@ def _init_hub(
commit: str = None,
protocol: str = DEFAULT_PROTOCOL,
):
"""Imports hubmodule like python import
"""Imports hubmodule like python import.
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param git_host:
host address of git repo
host address of git repo.
Example: github.com
:param use_cache:
whether to use locally cached code or completely re-fetch
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab
commit id on github or gitlab.
:param protocol:
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.
:return:
hubconf.py as a python module
a python module.
"""
cache_dir = os.path.expanduser(os.path.join(_get_megengine_home(), "hub"))
os.makedirs(cache_dir, exist_ok=True)
......@@ -137,24 +137,24 @@ def list(
commit: str = None,
protocol: str = DEFAULT_PROTOCOL,
) -> List[str]:
"""Lists all entrypoints available in repo hubconf
"""Lists all entrypoints available in repo hubconf.
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param git_host:
host address of git repo
host address of git repo.
Example: github.com
:param use_cache:
whether to use locally cached code or completely re-fetch
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab
commit id on github or gitlab.
:param protocol:
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.
:return:
all entrypoint names of the model
all entrypoint names of the model.
"""
hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol)
......@@ -182,14 +182,14 @@ def load(
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param entry:
an entrypoint defined in hubconf
an entrypoint defined in hubconf.
:param git_host:
host address of git repo
host address of git repo.
Example: github.com
:param use_cache:
whether to use locally cached code or completely re-fetch
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab
commit id on github or gitlab.
:param protocol:
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.
......@@ -217,9 +217,9 @@ def help(
) -> str:
"""This function returns docstring of entrypoint ``entry`` by following steps:
1. Pull the repo code specified by git and repo_info
1. Pull the repo code specified by git and repo_info.
2. Load the entry defined in repo's hubconf.py
3. Return docstring of function entry
3. Return docstring of function entry.
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
......@@ -228,17 +228,17 @@ def help(
:param entry:
an entrypoint defined in hubconf.py
:param git_host:
host address of git repo
host address of git repo.
Example: github.com
:param use_cache:
whether to use locally cached code or completely re-fetch
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab
commit id on github or gitlab.
:param protocol:
which protocol to use to get the repo, and HTTPS protocol only supports public repo on github.
The value should be one of HTTPS, SSH.
:return:
docstring of entrypoint ``entry``
docstring of entrypoint ``entry``.
"""
hubmodule = _init_hub(repo_info, git_host, use_cache, commit, protocol)
......@@ -255,10 +255,10 @@ def load_serialized_obj_from_url(url: str, model_dir=None) -> Any:
If the object is already present in ``model_dir``, it's deserialized and
returned. If no ``model_dir`` is specified, it will be ``MGE_HOME/serialized``.
:param url: url to serialized object
:param model_dir: dir to cache target serialized file
:param url: url to serialized object.
:param model_dir: dir to cache target serialized file.
:return: loaded object
:return: loaded object.
"""
if model_dir is None:
model_dir = os.path.join(_get_megengine_home(), "serialized")
......
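A hedged usage sketch for the helper above (the URL is a placeholder, not a real artifact):

from megengine.hub import load_serialized_obj_from_url

# Downloads the file (or reuses the cached copy under MGE_HOME/serialized)
# and returns the deserialized object.
obj = load_serialized_obj_from_url("https://example.com/models/checkpoint.pkl")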
......@@ -15,10 +15,10 @@ from typing import Iterator
def load_module(name: str, path: str) -> types.ModuleType:
"""
Loads module specified by name and path
Loads module specified by name and path.
:param name: module name
:param path: module path
:param name: module name.
:param path: module path.
"""
spec = importlib.util.spec_from_file_location(name, path)
module = importlib.util.module_from_spec(spec)
......@@ -27,18 +27,18 @@ def load_module(name: str, path: str) -> types.ModuleType:
def check_module_exists(module: str) -> bool:
"""Checks whether python module exists or not
"""Checks whether python module exists or not.
:param module: name of module
:param module: name of module.
"""
return importlib.util.find_spec(module) is not None
@contextmanager
def cd(target: str) -> Iterator[None]:
"""Changes current directory to target
"""Changes current directory to target.
:param target: target directory
:param target: target directory.
"""
prev = os.getcwd()
os.chdir(os.path.expanduser(target))
......
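A short usage sketch for these helpers, assuming they live in megengine.hub.tools as in this tree (the directory is illustrative):

from megengine.hub.tools import cd, check_module_exists

with cd("~/some/workdir"):                 # hypothetical directory; restored on exit
    pass                                   # run code relative to that directory

print(check_module_exists("numpy"))        # True when numpy is importable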
......@@ -12,7 +12,7 @@ import os
import sys
_all_loggers = []
_default_level_name = os.getenv("MEGENGINE_LOGGING_LEVEL", "ERROR")
_default_level_name = os.getenv("MEGENGINE_LOGGING_LEVEL", "INFO")
_default_level = logging.getLevelName(_default_level_name.upper())
......
......@@ -8,6 +8,7 @@
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from .activation import LeakyReLU, PReLU, ReLU, Sigmoid, Softmax
from .adaptive_pooling import AdaptiveAvgPool2d, AdaptiveMaxPool2d
from .batchnorm import BatchNorm1d, BatchNorm2d, SyncBatchNorm
from .concat import Concat
from .conv import Conv2d, ConvRelu2d, ConvTranspose2d, LocalConv2d
......
......@@ -20,10 +20,10 @@ class Softmax(Module):
.. math::
\text{Softmax}(x_{i}) = \frac{exp(x_i)}{\sum_j exp(x_j)}
It is applied to an n-dimensional input Tensor and rescaling them so that the elements of the
n-dimensional output Tensor lie in the range of `[0, 1]` and sum to 1.
It is applied to all elements along axis, and rescales elements so that
they stay in the range `[0, 1]` and sum to 1.
:param axis: An axis along which softmax will be applied. By default,
:param axis: the axis along which softmax will be applied. By default,
softmax will apply along the highest ranked axis.
Examples:
......@@ -55,6 +55,9 @@ class Softmax(Module):
def forward(self, inputs):
return softmax(inputs, self.axis)
def _module_info_string(self) -> str:
return "axis={axis}".format(axis=self.axis)
class Sigmoid(Module):
r"""
......@@ -138,8 +141,7 @@ class PReLU(Module):
\end{cases}
Here :math:`a` is a learnable parameter. When called without arguments, `PReLU()` uses
a single parameter :math:`a` across all input channels. If called with `PReLU(num_of_channels)`,
a separate :math:`a` is used for each input channel.
a single parameter :math:`a` across all input channels. If called with `PReLU(num_of_channels)`, each input channel will have its own :math:`a`.
:param num_parameters: number of :math:`a` to learn; only two
values are legitimate: 1 or the number of channels of the input. Default: 1
......
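A NumPy sketch of the PReLU transform with a learnable slope a (illustrative; here a is a plain array rather than a trained Parameter):

import numpy as np

def prelu(x, a):
    # a broadcasts against x: a scalar gives one slope for all channels,
    # an array shaped (C, 1, 1) gives one slope per channel of an (N, C, H, W) input
    return np.maximum(x, 0) + a * np.minimum(x, 0)

x = np.array([[-1.0, 0.5], [2.0, -3.0]])
print(prelu(x, 0.25))   # negative entries are scaled by 0.25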
This diff is collapsed.
......@@ -11,7 +11,7 @@ from typing import Optional
import numpy as np
from ..distributed.group import WORLD, Group
from ..functional import batch_norm2d, sync_batch_norm
from ..functional.nn import batch_norm, sync_batch_norm
from ..tensor import Parameter, Tensor
from . import init
from .module import Module
......@@ -96,7 +96,7 @@ class _BatchNorm(Module):
else:
exponential_average_factor = 0.0 # useless
output = batch_norm2d(
output = batch_norm(
inp,
self.running_mean if self.track_running_stats else None,
self.running_var if self.track_running_stats else None,
......@@ -113,6 +113,13 @@ class _BatchNorm(Module):
return output
def _module_info_string(self) -> str:
s = (
"{num_features}, eps={eps}, momentum={momentum}, affine={affine}, "
"track_running_stats={track_running_stats}"
)
return s.format(**self.__dict__)
class SyncBatchNorm(_BatchNorm):
r"""
......@@ -213,8 +220,8 @@ class BatchNorm2d(_BatchNorm):
of 0.9.
If :attr:`track_running_stats` is set to ``False``, this layer will not
keep running estimates, and batch statistics are instead used during
evaluation time.
keep running estimates; batch statistics are used during
evaluation instead.
.. note::
This :attr:`momentum` argument is different from one used in optimizer
......@@ -229,15 +236,14 @@ class BatchNorm2d(_BatchNorm):
Spatial Batch Normalization.
:type num_features: int
:param num_features: usually the :math:`C` from an input of size
:math:`(N, C, H, W)` or the highest ranked dimension of an input with
:param num_features: usually :math:`C` from an input of shape
:math:`(N, C, H, W)` or the highest ranked dimension of an input
less than 4D.
:type eps: float
:param eps: a value added to the denominator for numerical stability.
Default: 1e-5
:type momentum: float
:param momentum: the value used for the `running_mean` and `running_var`
computation.
:param momentum: the value used for the ``running_mean`` and ``running_var`` computation.
Default: 0.9
:type affine: bool
:param affine: a boolean value that when set to True, this module has
......
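A NumPy sketch of how the running statistics are blended with batch statistics under the default momentum of 0.9 (sketch only; the exact update convention is an assumption and should be checked against the source):

import numpy as np

def update_running_stats(running_mean, running_var, batch, momentum=0.9, eps=1e-5):
    batch_mean = batch.mean(axis=0)
    batch_var = batch.var(axis=0)
    # assumed convention: keep `momentum` of the old estimate, blend in the rest
    running_mean = momentum * running_mean + (1 - momentum) * batch_mean
    running_var = momentum * running_var + (1 - momentum) * batch_var
    normalized = (batch - batch_mean) / np.sqrt(batch_var + eps)  # training-time normalization
    return running_mean, running_var, normalized

batch = np.random.RandomState(0).randn(4, 3).astype(np.float32)
rm, rv, _ = update_running_stats(np.zeros(3), np.ones(3), batch)
print(rm, rv)   # the estimates drift slowly toward the batch statistics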
......@@ -11,7 +11,7 @@ from .module import Module
class Dropout(Module):
r"""Randomly set input elements to zeros with the probability :math:`drop\_prob` during training.
r"""Randomly sets input elements to zeros with the probability :math:`drop\_prob` during training.
Commonly used in large networks to prevent overfitting.
Note that we perform dropout only during training; we also rescale (multiply) the output tensor
by :math:`\frac{1}{1 - drop\_prob}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`.
......@@ -28,3 +28,6 @@ class Dropout(Module):
return dropout(inputs, self.drop_prob, training=True)
else:
return inputs
def _module_info_string(self) -> str:
return "drop_prob={drop_prob}".format(drop_prob=self.drop_prob)
The remaining file diffs are collapsed.