Commit b309890c authored by Megvii Engine Team

docs(mge): pytest for sphinx docstring

GitOrigin-RevId: 8bed12562a2a9b16c120916870697c1908f002c9
Parent 1b568517
......@@ -17,7 +17,8 @@ def get_backwarding_grad_manager():
class GradManager:
r"""GradManager manages auto differentiation and all resources required to perform it.
r"""
GradManager manages auto differentiation and all resources required to perform it.
Our auto differentiation framework requires that the user explicitly indicates when
the forward operations start and when all resources should be released. A typical usage of
......@@ -71,7 +72,8 @@ class GradManager:
self._gradients = dict()
def attach(self, params: list, callbacks=None):
r"""Registers parameters that gradients should be calculated with respect to.
r"""
Registers parameters that gradients should be calculated with respect to.
Callback Functions should have a signature like this:
.. code-block::
......@@ -99,7 +101,8 @@ class GradManager:
return self
def detach(self, params: list):
r"""Remove specific registered parameters and callback functions.
r"""
Remove specific registered parameters and callback functions.
:param params: registered parameters
"""
......@@ -125,7 +128,8 @@ class GradManager:
return self
def backward(self, ys, dys=None):
r"""Performs back-propagation and computes gradients.
r"""
Performs back-propagation and computes gradients.
:param ys: outputs of forward operators, e.g., the loss tensor
:param dys: derivatives of ys
......@@ -165,7 +169,8 @@ class GradManager:
backwarding_grad_manager = cache
def record(self):
r"""Starts recording forward operations.
r"""
Starts recording forward operations.
"""
if self._recording:
raise RuntimeError("already recording")
......@@ -190,7 +195,8 @@ class GradManager:
self._grad.wrt(param_wrapper, callback=callback)
def release(self):
r"""Stops recording and releases resources for gradients calculation.
r"""
Stops recording and releases resources for gradients calculation.
"""
if self._grad is not None:
self._grad.__exit__(None, None, None)
......
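The GradManager methods documented above fit together as a short training-step sketch. This is only an illustration built from the interface shown here (``attach``, ``record``, ``backward``, ``release``); it assumes gradients are accumulated into the ``grad`` attribute of attached parameters.

.. code-block:: python

    import numpy as np
    import megengine as mge
    import megengine.functional as F
    from megengine.autodiff import GradManager

    w = mge.Parameter(np.zeros((3,), dtype="float32"))
    x = mge.tensor(np.ones((3,), dtype="float32"))

    gm = GradManager().attach([w])   # register parameters to differentiate w.r.t.
    gm.record()                      # start recording forward operations
    loss = F.sum((w * x - 1.0) ** 2)
    gm.backward(loss)                # back-propagate from the loss
    gm.release()                     # stop recording and free resources
    print(w.grad.numpy())            # gradient on the attached parameter (assumed API)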
......@@ -15,7 +15,8 @@ if os.environ.get("MEGENGINE_USE_SYMBOLIC_SHAPE"):
def use_symbolic_shape() -> bool:
"""Returns whether tensor.shape returns a tensor instead of a tuple
"""
Returns whether tensor.shape returns a tensor instead of a tuple
"""
return _use_symbolic_shape
......
......@@ -78,7 +78,8 @@ class auto:
class _EnumDict(dict):
"""Track enum member order and ensure member names are not reused.
"""
Track enum member order and ensure member names are not reused.
EnumMeta will use the names found in self._member_names as the
enumeration member names.
......@@ -91,7 +92,8 @@ class _EnumDict(dict):
self._last_values = []
def __setitem__(self, key, value):
"""Changes anything not dundered or not a descriptor.
"""
Changes anything not dundered or not a descriptor.
If an enum member name is used twice, an error is raised; duplicate
values are not checked for.
......@@ -303,7 +305,8 @@ class EnumMeta(type):
def __call__(
cls, value, names=None, *, module=None, qualname=None, type=None, start=1
):
"""Either returns an existing member, or creates a new enum class.
"""
Either returns an existing member, or creates a new enum class.
This method is used both when an enum class is given a value to match
to an enumeration member (i.e. Color(3)) and for the functional API
......@@ -353,7 +356,8 @@ class EnumMeta(type):
] + self._member_names_
def __getattr__(cls, name):
"""Return the enum member matching `name`
"""
Return the enum member matching `name`
We use __getattr__ instead of descriptors or inserting into the enum
class' __dict__ in order to support `name` and `value` being both
......@@ -379,7 +383,8 @@ class EnumMeta(type):
@property
def __members__(cls):
"""Returns a mapping of member name->value.
"""
Returns a mapping of member name->value.
This mapping lists all enum members, including aliases. Note that this
is a read-only view of the internal mapping.
......@@ -394,7 +399,8 @@ class EnumMeta(type):
return (cls._member_map_[name] for name in reversed(cls._member_names_))
def __setattr__(cls, name, value):
"""Block attempts to reassign Enum members.
"""
Block attempts to reassign Enum members.
A simple assignment to the class namespace only changes one of the
several possible ways to get an Enum member from the Enum class,
......@@ -409,7 +415,8 @@ class EnumMeta(type):
def _create_(
cls, class_name, names=None, *, module=None, qualname=None, type=None, start=1
):
"""Convenience method to create a new Enum class.
"""
Convenience method to create a new Enum class.
`names` can be:
......@@ -465,7 +472,8 @@ class EnumMeta(type):
@staticmethod
def _get_mixins_(bases):
"""Returns the type for creating enum members, and the first inherited
"""
Returns the type for creating enum members, and the first inherited
enum class.
bases: the tuple of bases that was given to __new__
......@@ -510,7 +518,8 @@ class EnumMeta(type):
@staticmethod
def _find_new_(classdict, member_type, first_enum):
"""Returns the __new__ to be used for creating the enum members.
"""
Returns the __new__ to be used for creating the enum members.
classdict: the class dictionary given to __new__
member_type: the data type whose __new__ will be used by default
......@@ -556,7 +565,8 @@ class EnumMeta(type):
class Enum(metaclass=EnumMeta):
"""Generic enumeration.
"""
Generic enumeration.
Derive from this class to define new enumerations.
......
......@@ -188,7 +188,8 @@ class OpNode:
def optimize_for_inference(dest_vars, **kwargs):
r"""Applies optimize_for_inference pass for computing graph.
r"""
Applies optimize_for_inference pass for computing graph.
:param dest_vars: list of output vars in the computing graph
......@@ -287,7 +288,8 @@ def dump_graph(
strip_info_file=None,
append_json=False
):
"""serialize the computing graph of `output_vars` and get byte result.
"""
serialize the computing graph of `output_vars` and get byte result.
:param output_vars: output variables which are the graph's end point.
......@@ -385,7 +387,8 @@ CompGraphLoadResult = collections.namedtuple(
def load_graph(fpath):
"""Load a serialized computing graph from file.
"""
Load a serialized computing graph from file.
:param fpath: Path or Handle of the input file
:return: An instance of namedtuple :class:`CompGraphLoadResult`,
......
......@@ -69,7 +69,8 @@ def ambiguity_warn(dispatcher, ambiguities):
def variadic_signature_matches_iter(types, full_signature):
"""Check if a set of input types matches a variadic signature.
"""
Check if a set of input types matches a variadic signature.
Notes
-----
......@@ -288,7 +289,8 @@ class Dispatcher(CDispatcher):
__repr__ = __str__
def dispatch(self, *types):
"""Deterimine appropriate implementation for this type signature
"""
Deterimine appropriate implementation for this type signature
This method is internal. Users should call this object as a function.
Implementation resolution occurs within the ``__call__`` method.
......
......@@ -110,7 +110,8 @@ def _toposort(edges):
def reverse_dict(d):
"""Reverses direction of dependence dict
"""
Reverses direction of dependence dict
>>> d = {'a': (1, 2), 'b': (2, 3), 'c':()}
>>> reverse_dict(d) # doctest: +SKIP
......@@ -156,7 +157,8 @@ def groupby(func, seq):
def typename(type):
"""Get the name of `type`.
"""
Get the name of `type`.
Parameters
----------
......
......@@ -72,7 +72,8 @@ class VariadicSignatureType(type):
def isvariadic(obj):
"""Check whether the type `obj` is variadic.
"""
Check whether the type `obj` is variadic.
Parameters
----------
......@@ -95,7 +96,8 @@ def isvariadic(obj):
class VariadicSignatureMeta(type):
"""A metaclass that overrides ``__getitem__`` on the class. This is used to
"""
A metaclass that overrides ``__getitem__`` on the class. This is used to
generate a new type for Variadic signatures. See the Variadic class for
examples of how this behaves.
"""
......@@ -117,7 +119,8 @@ class VariadicSignatureMeta(type):
class Variadic(metaclass=VariadicSignatureMeta):
"""A class whose getitem method can be used to generate a new type
"""
A class whose getitem method can be used to generate a new type
representing a specific variadic signature.
Examples
......
......@@ -389,7 +389,8 @@ class ArrayMethodMixin(abc.ABC):
return self.reshape(-1)
def sum(self, axis=None, keepdims: bool = False):
r"""Returns the sum of each row of the input tensor in the given dimension ``axis``.
r"""
Returns the sum of each row of the input tensor in the given dimension ``axis``.
If ``axis`` is a list of axes, reduce over all of them.
If ``keepdims`` is ``True``, the shape of the output tensor is the same as the input tensor, except in the dimension(s) ``axis`` where it is of size 1. Otherwise, ``axis`` is squeezed (see :meth:`~.functional.tensor.squeeze`).
......
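A small illustration of the ``sum`` method described above (a sketch using the usual ``megengine.tensor`` constructor):

.. code-block:: python

    import numpy as np
    from megengine import tensor

    x = tensor(np.arange(6, dtype="float32").reshape(2, 3))
    print(x.sum(axis=1).numpy())                        # [ 3. 12.]
    print(x.sum(axis=1, keepdims=True).numpy().shape)   # (2, 1): reduced axis kept as size 1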
......@@ -59,7 +59,8 @@ class _PlasmaStoreManager:
class PlasmaShmQueue:
def __init__(self, maxsize: int = 0):
r"""Use pyarrow in-memory plasma store to implement shared memory queue.
r"""
Use pyarrow in-memory plasma store to implement shared memory queue.
Compared to the native `multiprocessing.Queue`, `PlasmaShmQueue` avoids pickle/unpickle
and communication overhead, leading to better performance in multi-process
......
......@@ -42,7 +42,8 @@ class DataLoader:
timeout: int = 0,
divide: bool = False,
):
r"""Provides a convenient way to iterate on a given dataset.
r"""
Provides a convenient way to iterate on a given dataset.
`DataLoader` combines a dataset with `sampler`, `transform` and `collator`,
making it flexible to get minibatches continually from a dataset.
......
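A minimal sketch of wiring a dataset, a sampler, and `DataLoader` together; the `ArrayDataset` and `RandomSampler` names are assumed from the `megengine.data` package and are not part of this diff.

.. code-block:: python

    import numpy as np
    from megengine.data import DataLoader, RandomSampler      # names assumed
    from megengine.data.dataset import ArrayDataset           # name assumed

    images = np.random.rand(100, 3, 32, 32).astype("float32")
    labels = np.random.randint(0, 10, size=(100,)).astype("int32")

    dataset = ArrayDataset(images, labels)
    sampler = RandomSampler(dataset, batch_size=16)            # sampler yields batches of indices
    loader = DataLoader(dataset, sampler=sampler)

    for batch_images, batch_labels in loader:
        pass  # each batch is a numpy minibatch assembled by the default collator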
......@@ -23,7 +23,8 @@ from .meta_vision import VisionDataset
class Cityscapes(VisionDataset):
r"""`Cityscapes <http://www.cityscapes-dataset.com/>`_ Dataset.
r"""
`Cityscapes <http://www.cityscapes-dataset.com/>`_ Dataset.
"""
supported_order = (
......
......@@ -46,7 +46,8 @@ def has_valid_annotation(anno, order):
class COCO(VisionDataset):
r"""`MS COCO <http://cocodataset.org/#home>`_ Dataset.
r"""
`MS COCO <http://cocodataset.org/#home>`_ Dataset.
"""
supported_order = (
......
......@@ -23,7 +23,8 @@ from .meta_vision import VisionDataset
class Objects365(VisionDataset):
r"""`Objects365 <https://www.objects365.org/overview.html>`_ Dataset.
r"""
`Objects365 <https://www.objects365.org/overview.html>`_ Dataset.
"""
supported_order = (
......
......@@ -24,7 +24,8 @@ from .meta_vision import VisionDataset
class PascalVOC(VisionDataset):
r"""`Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ Dataset.
r"""
`Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ Dataset.
"""
supported_order = (
......
......@@ -154,7 +154,8 @@ class VisionTransform(Transform):
class ToMode(VisionTransform):
r"""Change input data to a target mode.
r"""
Change input data to a target mode.
For example, most transforms use HWC mode image,
while the neural network might use CHW mode input tensor.
......@@ -301,7 +302,8 @@ class TorchTransformCompose(VisionTransform):
class Pad(VisionTransform):
r"""Pad the input data.
r"""
Pad the input data.
:param size: padding size of the input image; it can be an integer or a sequence.
If it is an integer, the input image will be padded in all four directions.
......@@ -348,7 +350,8 @@ class Pad(VisionTransform):
class Resize(VisionTransform):
r"""Resize the input data.
r"""
Resize the input data.
:param output_size: target size of image, with (height, width) shape.
:param interpolation: interpolation method. All methods are listed below:
......@@ -474,7 +477,8 @@ class ShortestEdgeResize(VisionTransform):
class RandomResize(VisionTransform):
r"""Resize the input data randomly.
r"""
Resize the input data randomly.
:param scale_range: range of scaling.
:param order: the same with :class:`VisionTransform`.
......@@ -518,7 +522,8 @@ class RandomResize(VisionTransform):
class RandomCrop(VisionTransform):
r"""Crop the input data randomly. Before applying the crop transform,
r"""
Crop the input data randomly. Before applying the crop transform,
pad the image first. If target size is still bigger than the size of
padded image, pad the image size to target size.
......@@ -575,7 +580,8 @@ class RandomCrop(VisionTransform):
class RandomResizedCrop(VisionTransform):
r"""Crop the input data to random size and aspect ratio.
r"""
Crop the input data to random size and aspect ratio.
A crop of random size (default: 0.08 to 1.0) of the original size and a random
aspect ratio (default: 3/4 to 1.33) of the original aspect ratio is made.
After applying the crop transform, the input data will be resized to the given size.
......@@ -664,7 +670,8 @@ class RandomResizedCrop(VisionTransform):
class CenterCrop(VisionTransform):
r"""Crops the given the input data at the center.
r"""
Crops the given the input data at the center.
:param output_size: target size of output image, with (height, width) shape.
:param order: the same with :class:`VisionTransform`.
......@@ -707,7 +714,8 @@ class CenterCrop(VisionTransform):
class RandomHorizontalFlip(VisionTransform):
r"""Horizontally flip the input data randomly with a given probability.
r"""
Horizontally flip the input data randomly with a given probability.
:param p: probability of the input data being flipped. Default: 0.5
:param order: the same with :class:`VisionTransform`.
......@@ -739,7 +747,8 @@ class RandomHorizontalFlip(VisionTransform):
class RandomVerticalFlip(VisionTransform):
r"""Vertically flip the input data randomly with a given probability.
r"""
Vertically flip the input data randomly with a given probability.
:param p: probability of the input data being flipped. Default: 0.5
:param order: the same with :class:`VisionTransform`.
......@@ -771,7 +780,8 @@ class RandomVerticalFlip(VisionTransform):
class Normalize(VisionTransform):
r"""Normalize the input data with mean and standard deviation.
r"""
Normalize the input data with mean and standard deviation.
Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels,
this transform will normalize each channel of the input data.
``output[channel] = (input[channel] - mean[channel]) / std[channel]``
......@@ -797,7 +807,8 @@ class Normalize(VisionTransform):
class GaussianNoise(VisionTransform):
r"""Add random gaussian noise to the input data.
r"""
Add random gaussian noise to the input data.
Gaussian noise is generated with given mean and std.
:param mean: Gaussian mean used to generate noise.
......@@ -824,7 +835,8 @@ class GaussianNoise(VisionTransform):
class BrightnessTransform(VisionTransform):
r"""Adjust brightness of the input data.
r"""
Adjust brightness of the input data.
:param value: how much to adjust the brightness. Can be any
non-negative number. 0 gives the original image.
......@@ -855,7 +867,8 @@ class BrightnessTransform(VisionTransform):
class ContrastTransform(VisionTransform):
r"""Adjust contrast of the input data.
r"""
Adjust contrast of the input data.
:param value: how much to adjust the contrast. Can be any
non-negative number. 0 gives the original image.
......@@ -886,7 +899,8 @@ class ContrastTransform(VisionTransform):
class SaturationTransform(VisionTransform):
r"""Adjust saturation of the input data.
r"""
Adjust saturation of the input data.
:param value: how much to adjust the saturation. Can be any
non-negative number. 0 gives the original image.
......@@ -917,7 +931,8 @@ class SaturationTransform(VisionTransform):
class HueTransform(VisionTransform):
r"""Adjust hue of the input data.
r"""
Adjust hue of the input data.
:param value: how much to adjust the hue. Can be any number
between 0 and 0.5; 0 gives the original image.
......@@ -955,7 +970,8 @@ class HueTransform(VisionTransform):
class ColorJitter(VisionTransform):
r"""Randomly change the brightness, contrast, saturation and hue of an image.
r"""
Randomly change the brightness, contrast, saturation and hue of an image.
:param brightness: how much to jitter brightness.
Chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
......
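The vision transforms above are typically chained; a sketch assuming a `Compose` container is available alongside them in `megengine.data.transform`, with parameter names as documented above:

.. code-block:: python

    from megengine.data.transform import (      # import path assumed
        Compose, Resize, RandomHorizontalFlip, Normalize, ToMode,
    )

    transform = Compose([
        Resize((224, 224)),                      # (height, width)
        RandomHorizontalFlip(),                  # flip with the default probability
        Normalize(mean=128.0, std=58.0),         # per-channel sequences also accepted
        ToMode("CHW"),                           # HWC image -> CHW tensor layout
    ])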
......@@ -40,7 +40,8 @@ def _str2device_type(type_str: str, allow_unspec: bool = True):
def get_device_count(device_type: str) -> int:
"""Gets number of devices installed on this system.
"""
Gets number of devices installed on this system.
:param device_type: device type, one of 'gpu' or 'cpu'
"""
......@@ -54,7 +55,8 @@ def get_device_count(device_type: str) -> int:
def is_cuda_available() -> bool:
"""Returns whether cuda device is available on this system.
"""
Returns whether cuda device is available on this system.
"""
t = _str2device_type("gpu")
......@@ -62,7 +64,8 @@ def is_cuda_available() -> bool:
def set_default_device(device: str = "xpux"):
r"""Sets default computing node.
r"""
Sets default computing node.
:param device: default device type. The type can be 'cpu0', 'cpu1', etc.,
or 'gpu0', 'gpu1', etc., to specify the particular cpu or gpu to use.
......@@ -81,7 +84,8 @@ def set_default_device(device: str = "xpux"):
def get_default_device() -> str:
r"""Gets default computing node.
r"""
Gets default computing node.
It returns the value set by :func:`~.set_default_device`.
"""
......@@ -98,7 +102,8 @@ def set_prealloc_config(
growth_factor=2.0,
device_type=DeviceType.CUDA,
):
"""Specifies how to pre-allocate from raw device allocator.
"""
Specifies how to pre-allocate from raw device allocator.
:param alignment: specifies the alignment in bytes.
:param min_req: min request size in bytes.
......
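A quick sketch of the device helpers documented above (assuming they are re-exported at the top-level ``megengine`` namespace; otherwise import them from ``megengine.device``):

.. code-block:: python

    import megengine as mge

    if mge.is_cuda_available():
        print("gpus:", mge.get_device_count("gpu"))
    else:
        mge.set_default_device("cpu0")     # fall back to the first CPU explicitly
    print(mge.get_default_device())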
......@@ -123,7 +123,8 @@ def collective_comm(inp, mode, group, device):
def reduce_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create reduce_sum operator for collective communication.
"""
Create reduce_sum operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -136,7 +137,8 @@ def reduce_sum(
def broadcast(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create broadcast operator for collective communication.
"""
Create broadcast operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -149,7 +151,8 @@ def broadcast(
def all_gather(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_gather operator for collective communication.
"""
Create all_gather operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -162,7 +165,8 @@ def all_gather(
def reduce_scatter_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create reduce_scatter_sum operator for collective communication.
"""
Create reduce_scatter_sum operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -175,7 +179,8 @@ def reduce_scatter_sum(
def all_reduce_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_reduce_sum operator for collective communication.
"""
Create all_reduce_sum operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -188,7 +193,8 @@ def all_reduce_sum(
def all_reduce_max(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_reduce_max operator for collective communication.
"""
Create all_reduce_max operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -201,7 +207,8 @@ def all_reduce_max(
def all_reduce_min(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_reduce_min operator for collective communication.
"""
Create all_reduce_min operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -214,7 +221,8 @@ def all_reduce_min(
def gather(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create gather operator for collective communication.
"""
Create gather operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -227,7 +235,8 @@ def gather(
def scatter(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create scatter operator for collective communication.
"""
Create scatter operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -240,7 +249,8 @@ def scatter(
def all_to_all(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = ""
) -> Tensor:
"""Create all_to_all operator for collective communication.
"""
Create all_to_all operator for collective communication.
:param inp: input tensor.
:param group: communication group.
......@@ -251,7 +261,8 @@ def all_to_all(
def remote_send(inp: Tensor, dest_rank: int) -> Tensor:
"""Send a Tensor to a remote process.
"""
Send a Tensor to a remote process.
:param inp: tensor to send.
:param dest_rank: destination process rank.
......@@ -266,7 +277,8 @@ def remote_send(inp: Tensor, dest_rank: int) -> Tensor:
def remote_recv(
src_rank: int, shape: Tuple[int], dtype: type, device: Optional[str] = None
) -> Tensor:
"""Receive a Tensor from a remote process.
"""
Receive a Tensor from a remote process.
:param src_rank: source process rank.
:param shape: the shape of the tensor to receive.
......
......@@ -81,7 +81,8 @@ def init_process_group(
device: int,
backend: Optional[str] = "nccl",
) -> None:
"""Initialize the distributed process group and specify the device used in the current process
"""
Initialize the distributed process group and specify the device used in the current process
:param master_ip: ip address of the master node.
:param port: port available for all processes to communicate.
......
......@@ -140,7 +140,8 @@ class TensorFuture(Future):
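A sketch of how the collectives above are used inside each worker once the process group has been set up with ``init_process_group`` (the module path of the collective helpers is assumed):

.. code-block:: python

    import megengine.distributed as dist
    from megengine import tensor
    from megengine.distributed.functional import all_reduce_sum   # path assumed

    # executed by every worker after init_process_group(...) has been called
    x = tensor([float(dist.get_rank())])
    total = all_reduce_sum(x)       # every rank receives the sum over all ranks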
def synchronized(func: Callable):
"""Decorator. Decorated function will synchronize when finished.
"""
Decorator. Decorated function will synchronize when finished.
Specifically, we use this to prevent data race during hub.load"""
@functools.wraps(func)
......@@ -161,7 +162,8 @@ def _get_device_count_worker(queue, device_type):
def get_device_count_by_fork(device_type: str):
"""Get device count in fork thread.
"""
Get device count in fork thread.
See https://stackoverflow.com/questions/22950047/cuda-initialization-error-after-fork
for more information.
"""
......@@ -173,7 +175,8 @@ def get_device_count_by_fork(device_type: str):
def bcast_list_(inps: list, group: Group = WORLD):
"""Broadcast tensors between given group.
"""
Broadcast tensors between given group.
:param inps: input tensors.
:param group: communication group.
......@@ -183,7 +186,8 @@ def bcast_list_(inps: list, group: Group = WORLD):
class AllreduceCallback:
"""Allreduce Callback with tensor fusion optimization.
"""
Allreduce Callback with tensor fusion optimization.
:param reduce_method: the method to reduce gradients.
:param group: communication group.
......
......@@ -21,7 +21,8 @@ from .util import get_free_ports
class Methods:
"""Distributed Server Method.
"""
Distributed Server Method.
Used for exchanging information between distributed nodes.
:param mm_server_port: multiple machine rpc server port.
......@@ -45,7 +46,8 @@ class Methods:
return self.mm_server_port
def set_is_grad(self, key, is_grad):
"""Mark send/recv need gradiants by key.
"""
Mark send/recv need gradiants by key.
:param key: key to match send/recv op.
:param is_grad: whether this op need grad.
......@@ -56,7 +58,8 @@ class Methods:
return True
def check_is_grad(self, key):
"""Check whether send/recv need gradiants.
"""
Check whether send/recv need gradiants.
:param key: key to match send/recv op.
"""
......@@ -68,7 +71,8 @@ class Methods:
return ret
def set_remote_tracer(self, key, tracer_set):
"""Set tracer dict for tracing send/recv op.
"""
Set tracer dict for tracing send/recv op.
:param key: key to match send/recv op.
:param tracer_set: valid tracer set.
......@@ -79,7 +83,8 @@ class Methods:
return True
def check_remote_tracer(self, key):
"""Get tracer dict for send/recv op.
"""
Get tracer dict for send/recv op.
:param key: key to match send/recv op.
"""
......@@ -91,7 +96,8 @@ class Methods:
return ret
def group_barrier(self, key, size):
"""A barrier wait for all group member.
"""
A barrier wait for all group member.
:param key: group key to match each other.
:param size: group size.
......@@ -114,7 +120,8 @@ class ThreadXMLRPCServer(ThreadingMixIn, SimpleXMLRPCServer):
def start_server(py_server_port, mm_server_port):
"""Start python distributed server and multiple machine server.
"""
Start python distributed server and multiple machine server.
:param py_server_port: python server port.
:param mm_server_port: multiple machine server port.
......@@ -125,7 +132,8 @@ def start_server(py_server_port, mm_server_port):
class Server:
"""Distributed Server for distributed training.
"""
Distributed Server for distributed training.
Should be running at the master node.
:param port: python server port.
......@@ -143,7 +151,8 @@ class Server:
class Client:
"""Distributed Client for distributed training.
"""
Distributed Client for distributed training.
:param master_ip: ip address of master node.
:param port: port of server at master node.
......@@ -171,7 +180,8 @@ class Client:
return self.proxy.get_mm_server_port()
def set_is_grad(self, key, is_grad):
"""Mark send/recv need gradiants by key.
"""
Mark send/recv need gradiants by key.
:param key: key to match send/recv op.
:param is_grad: whether this op need grad.
......@@ -179,14 +189,16 @@ class Client:
self.proxy.set_is_grad(key, is_grad)
def check_is_grad(self, key):
"""Check whether send/recv need gradiants.
"""
Check whether send/recv need gradiants.
:param key: key to match send/recv op.
"""
return self.proxy.check_is_grad(key)
def set_remote_tracer(self, key, tracer_set):
"""Set tracer dict for tracing send/recv op.
"""
Set tracer dict for tracing send/recv op.
:param key: key to match send/recv op.
:param tracer_set: valid tracer set.
......@@ -194,14 +206,16 @@ class Client:
self.proxy.set_remote_tracer(key, tracer_set)
def check_remote_tracer(self, key):
"""Get tracer dict for send/recv op.
"""
Get tracer dict for send/recv op.
:param key: key to match send/recv op.
"""
return self.proxy.check_remote_tracer(key)
def group_barrier(self, key, size):
"""A barrier wait for all group member.
"""
A barrier wait for all group member.
:param key: group key to match each other.
:param size: group size.
......
......@@ -12,7 +12,8 @@ from typing import List
def get_free_ports(num: int) -> List[int]:
"""Get one or more free ports.
"""
Get one or more free ports.
"""
socks, ports = [], []
for i in range(num):
......
......@@ -12,7 +12,8 @@ _conv_execution_strategy = os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY", "HEURI
def get_conv_execution_strategy() -> str:
"""Returns the execuation strategy of :class:`~.Conv2d`.
"""
Returns the execuation strategy of :class:`~.Conv2d`.
See :func:`~.set_conv_execution_strategy` for possible return values
"""
......@@ -20,7 +21,8 @@ def get_conv_execution_strategy() -> str:
def set_conv_execution_strategy(option: str):
"""Sets the execuation strategy of :class:`~.Conv2d`.
"""
Sets the execuation strategy of :class:`~.Conv2d`.
:param option: Decides how :class:`~.Conv2d` algorithm is chosen.
Available values:
......
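Both knobs shown above reach the same setting; the environment variable is read when the module is imported, while the setter changes the strategy at runtime (the module path of the setter is assumed):

.. code-block:: python

    import os
    os.environ["MEGENGINE_CONV_EXECUTION_STRATEGY"] = "HEURISTIC"  # set before importing megengine

    # or at runtime:
    from megengine.functional.debug_param import (                 # path assumed
        get_conv_execution_strategy, set_conv_execution_strategy,
    )
    set_conv_execution_strategy("HEURISTIC")
    print(get_conv_execution_strategy())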
......@@ -100,7 +100,8 @@ def _elemwise_multi_type(*args, mode, **kwargs):
def add(x, y):
"""Element-wise `addition`.
"""
Element-wise `addition`.
At least one operand should be a tensor.
Same for sub/mul/div/floor_div/pow/mod/atan2/equal/not_equal/less/less_equal/greater/greater_equal/maximum/minimum.
......@@ -193,7 +194,8 @@ def log1p(x):
def sqrt(x: Tensor) -> Tensor:
"""Element-wise `sqrt`.
"""
Element-wise `sqrt`.
Returns ``NaN`` for negative input value.
:param x: input tensor.
......@@ -209,7 +211,7 @@ def sqrt(x: Tensor) -> Tensor:
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.sqrt(x)
print(out.numpy())
print(out.numpy().round(decimals=4))
Outputs:
......@@ -239,7 +241,7 @@ def square(x: Tensor) -> Tensor:
data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.square(data)
print(out.numpy())
print(out.numpy().round(decimals=4))
Outputs:
......@@ -281,7 +283,8 @@ def minimum(x, y):
def cos(x):
"""Element-wise `cosine`.
"""
Element-wise `cosine`.
:param x: input tensor.
:return: computed tensor.
......@@ -296,7 +299,7 @@ def cos(x):
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.cos(x)
print(out.numpy())
print(out.numpy().round(decimals=4))
Outputs:
......@@ -374,7 +377,8 @@ def atanh(x):
def left_shift(x, y):
"""Element-wise `bitwise binary: x << y`.
"""
Element-wise `bitwise binary: x << y`.
:param x: input tensor, should be int.
:param y: how many bits to be left-shifted.
......@@ -435,7 +439,8 @@ def logical_xor(x, y):
def equal(x, y):
"""Element-wise `(x == y)`.
"""
Element-wise `(x == y)`.
:param x: input tensor 1.
:param y: input tensor 2.
......@@ -494,7 +499,8 @@ def greater_equal(x, y):
def hswish(x):
"""Element-wise `x * relu6(x + 3) / 6`.
"""
Element-wise `x * relu6(x + 3) / 6`.
:param x: input tensor.
:return: computed tensor.
......@@ -509,7 +515,7 @@ def hswish(x):
x = tensor(np.arange(5).astype(np.float32))
out = F.hswish(x)
print(out.numpy())
print(out.numpy().round(decimals=4))
.. testoutput::
......@@ -540,7 +546,8 @@ def sigmoid(x):
def clip(x: Tensor, lower=None, upper=None) -> Tensor:
r"""Clamps all elements in input tensor into the range `[` :attr:`lower`, :attr:`upper` `]` and returns
r"""
Clamps all elements in input tensor into the range `[` :attr:`lower`, :attr:`upper` `]` and returns
a resulting tensor:
.. math::
......
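A short illustration of the element-wise helpers above, using the ``lower``/``upper`` keywords shown in the ``clip`` signature:

.. code-block:: python

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    x = tensor(np.array([-2.0, 0.25, 9.0], dtype=np.float32))
    y = F.clip(x, lower=0.0, upper=4.0)      # [0.   0.25 4.  ]
    print(F.sqrt(y).numpy())                 # [0.  0.5 2. ]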
......@@ -24,7 +24,8 @@ __all__ = [
def l1_loss(pred: Tensor, label: Tensor) -> Tensor:
r"""Calculates the mean absolute error (MAE) between
r"""
Calculates the mean absolute error (MAE) between
each element in the pred :math:`x` and label :math:`y`.
The mean absolute error can be described as:
......@@ -70,7 +71,8 @@ def l1_loss(pred: Tensor, label: Tensor) -> Tensor:
def square_loss(pred: Tensor, label: Tensor) -> Tensor:
r"""Calculates the mean squared error (squared L2 norm) between
r"""
Calculates the mean squared error (squared L2 norm) between
each element in the pred :math:`x` and label :math:`y`.
The mean squared error can be described as:
......@@ -127,7 +129,8 @@ def cross_entropy(
with_logits: bool = True,
label_smooth: float = 0,
) -> Tensor:
r"""Computes the multi-class cross entropy loss (using logits by default).
r"""
Computes the multi-class cross entropy loss (using logits by default).
By default (``with_logits`` is True), ``pred`` is assumed to be logits;
class probabilities are given by softmax.
......@@ -161,7 +164,7 @@ def cross_entropy(
pred = tensor(np.array([0, 0], dtype=np.float32).reshape(data_shape))
label = tensor(np.ones(label_shape, dtype=np.int32))
loss = F.nn.cross_entropy(pred, label)
print(loss.numpy())
print(loss.numpy().round(decimals=4))
Outputs:
......@@ -195,7 +198,8 @@ def cross_entropy(
def binary_cross_entropy(
pred: Tensor, label: Tensor, with_logits: bool = True
) -> Tensor:
r"""Computes the binary cross entropy loss (using logits by default).
r"""
Computes the binary cross entropy loss (using logits by default).
By default (``with_logits`` is True), ``pred`` is assumed to be logits;
class probabilities are given by sigmoid.
......@@ -216,7 +220,7 @@ def binary_cross_entropy(
pred = tensor(np.array([0, 0], dtype=np.float32).reshape(1, 2))
label = tensor(np.ones((1, 2), dtype=np.float32))
loss = F.nn.binary_cross_entropy(pred, label)
print(loss.numpy())
print(loss.numpy().round(decimals=4))
Outputs:
......@@ -233,7 +237,8 @@ def binary_cross_entropy(
def hinge_loss(pred: Tensor, label: Tensor, norm: str = "L1") -> Tensor:
r"""Caculates the hinge loss which is often used in SVM.
r"""
Caculates the hinge loss which is often used in SVM.
The hinge loss can be described as:
......
......@@ -43,7 +43,8 @@ __all__ = [
def isnan(inp: Tensor) -> Tensor:
r"""Returns a new tensor representing if each element is ``NaN`` or not.
r"""
Returns a new tensor representing if each element is ``NaN`` or not.
:param inp: input tensor.
:return: result tensor.
......@@ -69,7 +70,8 @@ def isnan(inp: Tensor) -> Tensor:
def isinf(inp: Tensor) -> Tensor:
r"""Returns a new tensor representing if each element is ``Inf`` or not.
r"""
Returns a new tensor representing if each element is ``Inf`` or not.
:param inp: input tensor.
:return: result tensor.
......@@ -95,7 +97,8 @@ def isinf(inp: Tensor) -> Tensor:
def sign(inp: Tensor):
r"""Returns a new tensor representing the sign of each element in input tensor.
r"""
Returns a new tensor representing the sign of each element in input tensor.
:param inp: input tensor.
:return: the sign of input tensor.
......@@ -125,7 +128,8 @@ def sum(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""Returns the sum of input tensor along given axis. If axis is a list of dimensions,
r"""
Returns the sum of input tensor along given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: input tensor.
......@@ -160,7 +164,8 @@ def sum(
def prod(
inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None, keepdims=False
) -> Tensor:
r"""Returns the product of input tensor along given axis. If axis is a list of dimensions,
r"""
Returns the product of input tensor along given axis. If axis is a list of dimensions,
reduce over all of them.
:param inp: input tensor.
......@@ -195,7 +200,8 @@ def mean(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
"""Returns the mean value of input tensor along
"""
Returns the mean value of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
......@@ -231,7 +237,8 @@ def var(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
"""Returns the variance value of input tensor along
"""
Returns the variance value of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
......@@ -250,7 +257,7 @@ def var(
data = tensor(np.arange(1, 7, dtype=np.float32).reshape(2, 3))
out = F.var(data)
print(out.numpy())
print(out.numpy().round(decimals=4))
Outputs:
......@@ -271,7 +278,8 @@ def std(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
"""Returns the standard deviation of input tensor along
"""
Returns the standard deviation of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
......@@ -290,7 +298,7 @@ def std(
data = tensor(np.arange(1, 7, dtype=np.float32).reshape(2, 3))
out = F.std(data, axis=1)
print(out.numpy())
print(out.numpy().round(decimals=4))
Outputs:
......@@ -306,7 +314,8 @@ def min(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""Returns the min value of input tensor along
r"""
Returns the min value of input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
......@@ -342,7 +351,8 @@ def max(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""Returns the max value of the input tensor along
r"""
Returns the max value of the input tensor along
given axis. If axis is a list of dimensions,
reduce over all of them.
......@@ -376,7 +386,8 @@ def max(
def norm(
inp: Tensor, ord: float = None, axis: int = None, keepdims=False,
):
"""Calculates ``p``-norm of input tensor along
"""
Calculates ``p``-norm of input tensor along
given axis.
:param inp: input tensor.
......@@ -395,7 +406,7 @@ def norm(
x = tensor(np.arange(-3, 3, dtype=np.float32))
out = F.norm(x)
print(out.numpy())
print(out.numpy().round(decimals=4))
Outputs:
......@@ -423,7 +434,8 @@ def argmin(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""Returns the indices of the minimum values along
r"""
Returns the indices of the minimum values along
given axis. If axis is a list of dimensions,
reduce over all of them.
......@@ -481,7 +493,8 @@ def argmax(
axis: Optional[Union[int, Sequence[int]]] = None,
keepdims: bool = False,
) -> Tensor:
r"""Returns the indices of the maximum values along
r"""
Returns the indices of the maximum values along
given axis. If axis is a list of dimensions,
reduce over all of them.
......@@ -537,7 +550,8 @@ def argmax(
def normalize(
inp: Tensor, ord: float = None, axis: int = None, eps: float = 1e-12,
) -> Tensor:
r"""Performs :math:`L_p` normalization of input tensor along
r"""
Performs :math:`L_p` normalization of input tensor along
given axis.
For a tensor of shape :math:`(n_0, ..., n_{dim}, ..., n_k)`, each
......@@ -559,7 +573,8 @@ def normalize(
def argsort(inp: Tensor, descending: bool = False) -> Tensor:
r"""Returns the indices that would sort the input tensor.
r"""
Returns the indices that would sort the input tensor.
:param inp: input tensor. If it's 2d, the result would be an array of indices showing how to sort each row of the input tensor.
:param descending: sort in descending order, where the largest comes first. Default: False
......@@ -600,7 +615,8 @@ def argsort(inp: Tensor, descending: bool = False) -> Tensor:
def sort(inp: Tensor, descending: bool = False) -> Tuple[Tensor, Tensor]:
r"""Returns sorted tensor and the indices would sort the input tensor.
r"""
Returns sorted tensor and the indices would sort the input tensor.
:param inp: input tensor. If it's 2d, the result would be sorted by row.
:param descending: sort in descending order, where the largest comes first. Default: False
......@@ -647,7 +663,8 @@ def topk(
kth_only: bool = False,
no_sort: bool = False,
) -> Tuple[Tensor, Tensor]:
r"""Selects the ``Top-K``(by default) smallest elements of 2d matrix by row.
r"""
Selects the ``Top-K``(by default) smallest elements of 2d matrix by row.
:param inp: input tensor. If input tensor is 2d, each row will be sorted.
:param k: number of elements needed.
......
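A sketch of the sorting and selection helpers above on a small 2d tensor:

.. code-block:: python

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    x = tensor(np.array([[5.0, 1.0, 3.0], [2.0, 4.0, 6.0]], dtype=np.float32))
    values, indices = F.topk(x, 2)                  # two smallest elements per row (default)
    print(values.numpy())                           # [[1. 3.] [2. 4.]]
    print(indices.numpy())                          # [[1 2] [0 1]]
    print(F.argsort(x, descending=True).numpy())    # [[0 2 1] [2 1 0]]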
......@@ -75,7 +75,8 @@ def expand_hw(x):
def linear(inp: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor:
"""Applies a linear transformation to the input tensor.
"""
Applies a linear transformation to the input tensor.
Refer to :class:`~.module.linear.Linear` for more information.
......@@ -101,7 +102,8 @@ def conv2d(
conv_mode="CROSS_CORRELATION",
compute_mode="DEFAULT",
) -> Tensor:
"""2D convolution operation.
"""
2D convolution operation.
Refer to :class:`~.Conv2d` for more information.
......@@ -166,7 +168,8 @@ def conv_transpose2d(
conv_mode="CROSS_CORRELATION",
compute_mode="DEFAULT",
) -> Tensor:
"""2D transposed convolution operation.
"""
2D transposed convolution operation.
Refer to :class:`~.ConvTranspose2d` for more information.
......@@ -227,7 +230,8 @@ def local_conv2d(
dilation: Union[int, Tuple[int, int]] = 1,
conv_mode="CROSS_CORRELATION",
):
"""Applies spatial 2D convolution over an groupped channeled image with untied kernels.
"""
Applies spatial 2D convolution over an groupped channeled image with untied kernels.
"""
assert conv_mode == "CROSS_CORRELATION" or conv_mode.name == "CROSS_CORRELATION"
......@@ -261,7 +265,8 @@ def max_pool2d(
stride: Optional[Union[int, Tuple[int, int]]] = None,
padding: Union[int, Tuple[int, int]] = 0,
) -> Tensor:
"""Applies a 2D max pooling over an input tensor.
"""
Applies a 2D max pooling over an input tensor.
Refer to :class:`~.MaxPool2d` for more information.
......@@ -298,7 +303,8 @@ def avg_pool2d(
padding: Union[int, Tuple[int, int]] = 0,
mode: str = "AVERAGE_COUNT_EXCLUDE_PADDING",
) -> Tensor:
"""Applies 2D average pooling over an input tensor.
"""
Applies 2D average pooling over an input tensor.
Refer to :class:`~.AvgPool2d` for more information.
......@@ -332,7 +338,8 @@ def avg_pool2d(
def adaptive_max_pool2d(
inp: Tensor, oshp: Union[Tuple[int, int], int, Tensor],
) -> Tensor:
"""Applies a 2D max adaptive pooling over an input.
"""
Applies a 2D max adaptive pooling over an input.
Refer to :class:`~.AdaptiveMaxPool2d` for more information.
......@@ -353,7 +360,8 @@ def adaptive_max_pool2d(
def adaptive_avg_pool2d(
inp: Tensor, oshp: Union[Tuple[int, int], int, Tensor],
) -> Tensor:
"""Applies a 2D average adaptive pooling over an input.
"""
Applies a 2D average adaptive pooling over an input.
Refer to :class:`~.AdaptiveAvgPool2d` for more information.
......@@ -390,7 +398,8 @@ def leaky_relu(inp: Tensor, negative_slope: float = 0.01) -> Tensor:
def softplus(inp: Tensor) -> Tensor:
r"""Applies the element-wise function:
r"""
Applies the element-wise function:
.. math::
\text{softplus}(x) = \log(1 + \exp(x))
......@@ -416,7 +425,7 @@ def softplus(inp: Tensor) -> Tensor:
x = tensor(np.arange(-3, 3, dtype=np.float32))
y = F.softplus(x)
print(y.numpy())
print(y.numpy().round(decimals=4))
Outputs:
......@@ -429,7 +438,8 @@ def softplus(inp: Tensor) -> Tensor:
def logsoftmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
r"""Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional
r"""
Applies the :math:`\log(\text{Softmax}(x))` function to an n-dimensional
input Tensor. The LogSoftmax formulation can be simplified as:
.. math::
......@@ -456,7 +466,7 @@ def logsoftmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5)
y = F.logsoftmax(x, axis=1)
print(y.numpy())
print(y.numpy().round(decimals=4))
Outputs:
......@@ -470,7 +480,8 @@ def logsoftmax(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
def logsigmoid(inp: Tensor) -> Tensor:
r"""Applies the element-wise function:
r"""
Applies the element-wise function:
.. math::
\text{logsigmoid}(x) = \log(\frac{ 1 }{ 1 + \exp(-x)})
......@@ -490,13 +501,13 @@ def logsigmoid(inp: Tensor) -> Tensor:
x = tensor(np.arange(-5, 5, dtype=np.float32))
y = F.logsigmoid(x)
print(y.numpy())
print(y.numpy().round(decimals=4))
Outputs:
.. testoutput::
[-5.0067 -4.0181 -3.0486 -2.1269 -1.3133 -0.6931 -0.3133 -0.1269 -0.0486
[-5.0067 -4.0182 -3.0486 -2.1269 -1.3133 -0.6931 -0.3133 -0.1269 -0.0486
-0.0181]
"""
......@@ -539,7 +550,7 @@ def logsumexp(
x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5)
y = F.logsumexp(x, axis=1, keepdims=False)
print(y.numpy())
print(y.numpy().round(decimals=4))
Outputs:
......@@ -589,7 +600,7 @@ def softmax(inp: Tensor, axis: Optional[int] = None) -> Tensor:
x = tensor(np.arange(-5, 5, dtype=np.float32)).reshape(2,5)
out = F.softmax(x)
print(out.numpy())
print(out.numpy().round(decimals=4))
Outputs:
......@@ -619,7 +630,8 @@ def batch_norm(
eps: float = 1e-5,
inplace: bool = True
):
r"""Applies batch normalization to the input.
r"""
Applies batch normalization to the input.
Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information.
......@@ -734,7 +746,8 @@ def sync_batch_norm(
eps_mode="ADDITIVE",
group=WORLD,
) -> Tensor:
r"""Applies synchronized batch normalization to the input.
r"""
Applies synchronized batch normalization to the input.
Refer to :class:`~.BatchNorm2d` and :class:`~.BatchNorm1d` for more information.
......@@ -835,7 +848,8 @@ def sync_batch_norm(
def one_hot(inp: Tensor, num_classes: int) -> Tensor:
r"""Performs one-hot encoding for the input tensor.
r"""
Performs one-hot encoding for the input tensor.
:param inp: input tensor.
:param num_classes: number of classes denotes the last dimension of the output tensor.
......@@ -878,7 +892,8 @@ def warp_perspective(
border_val: float = 0.0,
interp_mode: str = "LINEAR",
):
r"""Applies perspective transformation to batched 2D images.
r"""
Applies perspective transformation to batched 2D images.
The input images are transformed to the output images by the transformation matrix:
......@@ -1094,13 +1109,13 @@ def svd(inp: Tensor, full_matrices=False, compute_uv=True) -> Tensor:
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2,3))
_, y, _ = F.svd(x)
print(y.numpy())
print(y.numpy().round(decimals=3))
Outputs:
.. testoutput::
[7.3485 1. ]
[7.348 1. ]
"""
op = builtin.SVD(full_matrices=full_matrices, compute_uv=compute_uv)
......@@ -1115,7 +1130,8 @@ def interpolate(
mode: str = "BILINEAR",
align_corners: bool = None,
) -> Tensor:
r"""Down/up samples the input tensor to either the given size or with the given scale_factor. ``size`` can not coexist with ``scale_factor``.
r"""
Down/up samples the input tensor to either the given size or with the given scale_factor. ``size`` can not coexist with ``scale_factor``.
:param inp: input tensor.
:param size: size of the output tensor. Default: None
......@@ -1257,7 +1273,8 @@ def interpolate(
def dropout(inp: Tensor, drop_prob: float, training: bool = True) -> Tensor:
"""Returns a new tensor where each of the elements are randomly set to zero
"""
Returns a new tensor where each of the elements are randomly set to zero
with probability P = ``drop_prob``. Optionally rescale the output tensor if ``training`` is True.
:param inp: input tensor.
......@@ -1302,7 +1319,8 @@ def embedding(
max_norm: Optional[float] = None,
norm_type: Optional[float] = None,
):
"""Applies lookup table for embedding.
"""
Applies lookup table for embedding.
:param inp: tensor with indices.
:param weight: learnable weights to embed from.
......@@ -1329,7 +1347,8 @@ def roi_pooling(
mode: str = "max",
scale: float = 1.0,
) -> Tensor:
"""Applies roi pooling on input feature.
"""
Applies roi pooling on input feature.
:param inp: tensor that represents the input feature, `(N, C, H, W)` images.
:param rois: `(K, 5)` boxes. First column is the index into N. The other 4 columns are xyxy.
......@@ -1350,7 +1369,7 @@ def roi_pooling(
inp = tensor(np.random.randn(1, 1, 128, 128))
rois = tensor(np.random.random((4, 5)))
y = F.nn.roi_pooling(inp, rois, (2, 2))
print(y.numpy()[0])
print(y.numpy()[0].round(decimals=4))
Outputs:
......@@ -1382,7 +1401,8 @@ def roi_align(
sample_points: Union[int, tuple, list] = 2,
aligned: bool = True,
) -> Tensor:
"""Applies roi align on input feature.
"""
Applies roi align on input feature.
:param inp: tensor that represents the input feature, shape is `(N, C, H, W)`.
:param rois: `(N, 5)` boxes. First column is the box index. The other 4 columns are ``xyxy``.
......@@ -1407,7 +1427,7 @@ def roi_align(
inp = tensor(np.random.randn(1, 1, 128, 128))
rois = tensor(np.random.random((4, 5)))
y = F.nn.roi_align(inp, rois, (2, 2))
print(y.numpy()[0])
print(y.numpy()[0].round(decimals=4))
Outputs:
......@@ -1444,7 +1464,8 @@ def roi_align(
def indexing_one_hot(
src: Tensor, index: Tensor, axis: int = 1, keepdims=False
) -> Tensor:
r"""One-hot indexing for some axes.
r"""
One-hot indexing for some axes.
:param src: input tensor.
:param index: index tensor.
......
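As a quick sanity check of the pooling helper above (the same input reappears in the ``AdaptiveMaxPool2d`` example later in this commit):

.. code-block:: python

    import numpy as np
    import megengine.functional as F
    from megengine import tensor

    x = tensor(np.arange(16, dtype="float32").reshape(1, 1, 4, 4))
    y = F.max_pool2d(x, kernel_size=2, stride=2)
    print(y.numpy())      # [[[[ 5.  7.]
                          #    [13. 15.]]]]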
......@@ -28,7 +28,8 @@ def conv_bias_activation(
conv_mode="CROSS_CORRELATION",
compute_mode="DEFAULT",
) -> Tensor:
"""Convolution bias with activation operation, only for inference.
"""
Convolution bias with activation operation, only for inference.
:param inp: feature map of the convolution operation.
:param weight: convolution kernel.
......
......@@ -58,7 +58,8 @@ __all__ = [
def eye(N, M=None, *, dtype="float32", device: Optional[CompNode] = None) -> Tensor:
"""Returns a 2D tensor with ones on the diagonal and zeros elsewhere.
"""
Returns a 2D tensor with ones on the diagonal and zeros elsewhere.
:param shape: expected shape of output tensor.
:param dtype: data type. Default: None
......@@ -100,7 +101,8 @@ def eye(N, M=None, *, dtype="float32", device: Optional[CompNode] = None) -> Ten
def full(shape, value, dtype="float32", device=None):
"""Returns a tensor with given shape and value.
"""
Returns a tensor with given shape and value.
"""
if isinstance(shape, int):
shape = (shape,)
......@@ -113,7 +115,8 @@ def full(shape, value, dtype="float32", device=None):
def ones(shape, dtype="float32", device=None):
"""Returns a ones tensor with given shape.
"""
Returns a ones tensor with given shape.
:param shape: shape of the output tensor.
:return: output tensor filled with ones.
......@@ -139,13 +142,15 @@ def ones(shape, dtype="float32", device=None):
def zeros(shape, dtype="float32", device=None):
"""Returns a zero tensor with given shape.
"""
Returns a zero tensor with given shape.
"""
return full(shape, 0.0, dtype=dtype, device=device)
def zeros_like(inp: Tensor) -> Tensor:
"""Returns a zero tensor with the same shape as input tensor.
"""
Returns a zero tensor with the same shape as input tensor.
:param inp: input tensor.
:return: output zero tensor.
......@@ -174,13 +179,15 @@ def zeros_like(inp: Tensor) -> Tensor:
def ones_like(inp: Tensor) -> Tensor:
"""Returns a ones tensor with the same shape as input tensor.
"""
Returns a ones tensor with the same shape as input tensor.
"""
return ones(inp.shape, dtype=inp.dtype, device=inp.device)
def full_like(inp: Tensor, value: Union[int, float]) -> Tensor:
"""Returns a tensor filled with given value with the same shape as input tensor.
"""
Returns a tensor filled with given value with the same shape as input tensor.
"""
return full(inp.shape, value, dtype=inp.dtype, device=inp.device)
......@@ -274,7 +281,8 @@ def concat(inps: Iterable[Tensor], axis: int = 0, device=None) -> Tensor:
def stack(inps, axis=0, device=None):
"""Concats a sequence of tensors along a new axis.
"""
Concats a sequence of tensors along a new axis.
The input tensors must have the same shape.
:param inps: input tensors.
......@@ -316,7 +324,8 @@ def stack(inps, axis=0, device=None):
def split(inp, nsplits_or_sections, axis=0):
"""Splits the input tensor into several smaller tensors.
"""
Splits the input tensor into several smaller tensors.
When nsplits_or_sections is int, the last tensor may be smaller than others.
:param inp: input tensor.
......@@ -334,7 +343,7 @@ def split(inp, nsplits_or_sections, axis=0):
x = tensor(np.random.random((2,3,4,5)), dtype=np.float32)
out = F.split(x, 2, axis=3)
print(out[0].shape, out[1].shape)
print(out[0].numpy().shape, out[1].numpy().shape)
Outputs:
......@@ -400,7 +409,8 @@ def _get_idx(index, axis):
def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor:
# TODO: rewrite doc
r"""Gathers data from input tensor on axis using index.
r"""
Gathers data from input tensor on axis using index.
For a 3-D tensor, the output is specified by::
......@@ -472,7 +482,8 @@ def gather(inp: Tensor, axis: int, index: Tensor) -> Tensor:
def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor:
# TODO: rewrite doc
r"""Writes all values from the tensor source into input tensor
r"""
Writes all values from the tensor source into input tensor
at the indices specified in the index tensor.
For each value in source, its output index is specified by its index
......@@ -577,7 +588,8 @@ def scatter(inp: Tensor, axis: int, index: Tensor, source: Tensor) -> Tensor:
def where(mask: Tensor, x: Tensor, y: Tensor) -> Tensor:
r"""Selects elements either from Tensor x or Tensor y, according to mask.
r"""
Selects elements either from Tensor x or Tensor y, according to mask.
.. math::
......@@ -764,7 +776,8 @@ AxisDesc = AxisAddRemove.AxisDesc
def flatten(inp: Tensor, start_axis: int = 0, end_axis: int = -1) -> Tensor:
r"""Reshapes the tensor by flattening the sub-tensor from dimension ``start_axis`` to dimension ``end_axis``.
r"""
Reshapes the tensor by flattening the sub-tensor from dimension ``start_axis`` to dimension ``end_axis``.
:param inp: input tensor.
:param start_axis: start dimension that the sub-tensor to be flattened. Default: 0
......@@ -819,7 +832,7 @@ def expand_dims(inp: Tensor, axis: Union[int, Sequence[int]]) -> Tensor:
x = tensor([1, 2])
out = F.expand_dims(x, 0)
print(out.shape)
print(out.numpy().shape)
Outputs:
......@@ -865,7 +878,7 @@ def squeeze(inp: Tensor, axis: Optional[Union[int, Sequence[int]]] = None) -> Te
x = tensor(np.array([1, 2], dtype=np.int32).reshape(1, 1, 2, 1))
out = F.squeeze(x, 3)
print(out.shape)
print(out.numpy().shape)
Outputs:
......@@ -884,7 +897,8 @@ def linspace(
dtype="float32",
device: Optional[CompNode] = None,
) -> Tensor:
r"""Returns equally spaced numbers over a specified interval.
r"""
Returns equally spaced numbers over a specified interval.
:param start: starting value of the sequence, should be a scalar.
:param stop: last value of the sequence, should be a scalar.
......@@ -928,7 +942,8 @@ def arange(
dtype="float32",
device: Optional[CompNode] = None,
) -> Tensor:
r"""Returns a tensor with values from start to stop with adjacent interval step.
r"""
Returns a tensor with values from start to stop with adjacent interval step.
:param start: starting value of the sequence, should be a scalar.
:param stop: ending value of the sequence, should be a scalar.
......
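A few of the creation helpers above in one place (a sketch; all of them also accept the ``dtype`` and ``device`` keywords as documented):

.. code-block:: python

    import megengine.functional as F

    print(F.eye(3).numpy())                    # 3x3 identity
    print(F.full((2, 2), 7.0).numpy())         # [[7. 7.] [7. 7.]]
    print(F.arange(0, 5).numpy())              # [0. 1. 2. 3. 4.]
    print(F.linspace(0.0, 1.0, 5).numpy())     # [0.   0.25 0.5  0.75 1.  ]
    print(F.ones((2, 3)).sum(axis=0).numpy())  # [2. 2. 2.]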
......@@ -11,7 +11,8 @@ import functools
def get_ndtuple(value, *, n, allow_zero: bool = True):
r"""Converts possibly 1D tuple to n-dim tuple.
r"""
Converts possibly 1D tuple to n-dim tuple.
:param value: value to fill into the generated tuple.
:param n: how many elements will the tuple have.
......
......@@ -43,7 +43,8 @@ PROTOCOLS = {
def _get_megengine_home() -> str:
"""MGE_HOME setting complies with the XDG Base Directory Specification
"""
MGE_HOME setting complies with the XDG Base Directory Specification
"""
megengine_home = os.path.expanduser(
os.getenv(
......@@ -94,7 +95,8 @@ def _init_hub(
commit: str = None,
protocol: str = DEFAULT_PROTOCOL,
):
"""Imports hubmodule like python import.
"""
Imports hubmodule like python import.
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
......@@ -137,7 +139,8 @@ def list(
commit: str = None,
protocol: str = DEFAULT_PROTOCOL,
) -> List[str]:
"""Lists all entrypoints available in repo hubconf.
"""
Lists all entrypoints available in repo hubconf.
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
......@@ -175,7 +178,8 @@ def load(
protocol: str = DEFAULT_PROTOCOL,
**kwargs
) -> Any:
"""Loads model from github or gitlab repo, with pretrained weights.
"""
Loads model from github or gitlab repo, with pretrained weights.
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
......@@ -215,7 +219,8 @@ def help(
commit: str = None,
protocol: str = DEFAULT_PROTOCOL,
) -> str:
"""This function returns docstring of entrypoint ``entry`` by following steps:
"""
This function returns docstring of entrypoint ``entry`` by following steps:
1. Pull the repo code specified by git and repo_info.
2. Load the entry defined in repo's hubconf.py
......@@ -250,7 +255,8 @@ def help(
def load_serialized_obj_from_url(url: str, model_dir=None) -> Any:
"""Loads MegEngine serialized object from the given URL.
"""
Loads MegEngine serialized object from the given URL.
If the object is already present in ``model_dir``, it's deserialized and
returned. If no ``model_dir`` is specified, it will be ``MGE_HOME/serialized``.
......
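A sketch of the hub workflow described above; the repo name and entrypoint (``megengine/models``, ``resnet18``) and the ``pretrained`` keyword are assumptions about a particular hub repo, not part of this diff:

.. code-block:: python

    import megengine.hub as hub

    print(hub.list("megengine/models"))               # entrypoints defined in the repo's hubconf.py
    print(hub.help("megengine/models", "resnet18"))   # docstring of one entrypoint
    model = hub.load("megengine/models", "resnet18", pretrained=True)  # kwargs go to the entrypoint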
......@@ -27,7 +27,8 @@ def load_module(name: str, path: str) -> types.ModuleType:
def check_module_exists(module: str) -> bool:
"""Checks whether python module exists or not.
"""
Checks whether python module exists or not.
:param module: name of module.
"""
......@@ -36,7 +37,8 @@ def check_module_exists(module: str) -> bool:
@contextmanager
def cd(target: str) -> Iterator[None]:
"""Changes current directory to target.
"""
Changes current directory to target.
:param target: target directory.
"""
......
......@@ -519,7 +519,8 @@ class trace:
optimize_for_inference=True,
**kwargs
):
r"""Serializes trace to file system.
r"""
Serializes trace to file system.
:param file: output file, could be file object or filename.
:param arg_names: names of the input tensors in the traced function.
......
......@@ -17,7 +17,8 @@ _default_level = logging.getLevelName(_default_level_name.upper())
def set_log_file(fout, mode="a"):
r"""Sets log output file.
r"""
Sets log output file.
:type fout: str or file-like
:param fout: file-like object that supports write and flush, or string for
......@@ -38,37 +39,44 @@ class MegEngineLogFormatter(logging.Formatter):
max_lines = 256
def _color_exc(self, msg):
r"""Sets the color of message as the execution type.
r"""
Sets the color of message as the execution type.
"""
return "\x1b[34m{}\x1b[0m".format(msg)
def _color_dbg(self, msg):
r"""Sets the color of message as the debugging type.
r"""
Sets the color of message as the debugging type.
"""
return "\x1b[36m{}\x1b[0m".format(msg)
def _color_warn(self, msg):
r"""Sets the color of message as the warning type.
r"""
Sets the color of message as the warning type.
"""
return "\x1b[1;31m{}\x1b[0m".format(msg)
def _color_err(self, msg):
r"""Sets the color of message as the error type.
r"""
Sets the color of message as the error type.
"""
return "\x1b[1;4;31m{}\x1b[0m".format(msg)
def _color_omitted(self, msg):
r"""Sets the color of message as the omitted type.
r"""
Sets the color of message as the omitted type.
"""
return "\x1b[35m{}\x1b[0m".format(msg)
def _color_normal(self, msg):
r"""Sets the color of message as the normal type.
r"""
Sets the color of message as the normal type.
"""
return msg
def _color_date(self, msg):
r"""Sets the color of message the same as date.
r"""
Sets the color of message the same as date.
"""
return "\x1b[32m{}\x1b[0m".format(msg)
......@@ -142,7 +150,8 @@ class MegEngineLogFormatter(logging.Formatter):
def get_logger(name=None, formatter=MegEngineLogFormatter):
r"""Gets megengine logger with given name.
r"""
Gets megengine logger with given name.
"""
logger = logging.getLogger(name)
......@@ -161,7 +170,8 @@ def get_logger(name=None, formatter=MegEngineLogFormatter):
def set_log_level(level, update_existing=True):
"""Sets default logging level.
"""
Sets default logging level.
:type level: int e.g. logging.INFO
:param level: logging level given by python :mod:`logging` module
......@@ -198,7 +208,8 @@ try:
_imperative_rt_logger.set_log_level(_imperative_rt_logger.LogLevel.Debug)
def set_mgb_log_level(level):
r"""Sets megbrain log level
r"""
Sets megbrain log level
:type level: int e.g. logging.INFO
:param level: new log level
......@@ -218,7 +229,8 @@ except ImportError as exc:
@contextlib.contextmanager
def replace_mgb_log_level(level):
r"""Replaces megbrain log level in a block and restore after exiting.
r"""
Replaces megbrain log level in a block and restore after exiting.
:type level: int e.g. logging.INFO
:param level: new log level
......@@ -231,7 +243,8 @@ def replace_mgb_log_level(level):
def enable_debug_log():
r"""Sets logging level to debug for all components.
r"""
Sets logging level to debug for all components.
"""
set_log_level(logging.DEBUG)
set_mgb_log_level(logging.DEBUG)
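A short sketch of the logging helpers above (assumption: they are importable from ``megengine.logger``; the log filename is illustrative):

    import logging
    from megengine.logger import get_logger, set_log_file, set_log_level

    set_log_level(logging.INFO)   # default level for megengine loggers
    set_log_file("train.log")     # accepts a filename or a file-like object
    logger = get_logger(__name__)
    logger.info("training started")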
......@@ -27,7 +27,8 @@ class _AdaptivePoolNd(Module):
class AdaptiveMaxPool2d(_AdaptivePoolNd):
r"""Applies a 2D max adaptive pooling over an input.
r"""
Applies a 2D max adaptive pooling over an input.
For instance, given an input of the size :math:`(N, C, H, W)` and
an output shape :math:`(OH, OW)`, this layer generates the output of
......@@ -62,7 +63,7 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):
.. testoutput::
[[[[5. 7.]
[[[[ 5. 7.]
[13. 15.]]]]
"""
......@@ -72,7 +73,8 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):
class AdaptiveAvgPool2d(_AdaptivePoolNd):
r"""Applies a 2D average pooling over an input.
r"""
Applies a 2D average pooling over an input.
For instance, given an input of the size :math:`(N, C, H, W)` and
an output shape :math:`(OH, OW)`, this layer generates the output of
......@@ -105,7 +107,7 @@ class AdaptiveAvgPool2d(_AdaptivePoolNd):
.. testoutput::
[[[[2.5 4.5]
[[[[ 2.5 4.5]
[10.5 12.5]]]]
"""
......
......@@ -87,7 +87,8 @@ class _ConvNd(Module):
class Conv2d(_ConvNd):
r"""Applies a 2D convolution over an input tensor.
r"""
Applies a 2D convolution over an input tensor.
For instance, given an input of the size :math:`(N, C_{\text{in}}, H, W)`,
this layer generates an output of the size
......@@ -145,7 +146,7 @@ class Conv2d(_ConvNd):
m = M.Conv2d(in_channels=3, out_channels=1, kernel_size=3)
inp = mge.tensor(np.arange(0, 96).astype("float32").reshape(2, 3, 4, 4))
oup = m(inp)
print(oup.shape)
print(oup.numpy().shape)
Outputs:
......@@ -232,7 +233,8 @@ class Conv2d(_ConvNd):
class ConvTranspose2d(_ConvNd):
r"""Applies a 2D transposed convolution over an input tensor.
r"""
Applies a 2D transposed convolution over an input tensor.
This module is also known as a deconvolution or a fractionally-strided convolution.
:class:`ConvTranspose2d` can be seen as the gradient of :class:`Conv2d` operation
......@@ -340,7 +342,8 @@ class ConvTranspose2d(_ConvNd):
class LocalConv2d(Conv2d):
r"""Applies a spatial convolution with untied kernels over an groupped channeled input 4D tensor.
r"""
Applies a spatial convolution with untied kernels over an groupped channeled input 4D tensor.
It is also known as the locally connected layer.
:param in_channels: number of input channels.
......
......@@ -11,7 +11,8 @@ from .module import Module
class Dropout(Module):
r"""Randomly sets input elements to zeros with the probability :math:`drop\_prob` during training.
r"""
Randomly sets input elements to zeros with the probability :math:`drop\_prob` during training.
Commonly used in large networks to prevent overfitting.
Note that we perform dropout only during training; we also rescale (multiply) the output tensor
by :math:`\frac{1}{1 - drop\_prob}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`.
......
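An illustrative sketch of the training/inference behaviour described for Dropout (assumption: the constructor argument is named ``drop_prob``, matching the formula above):

    import numpy as np
    import megengine as mge
    import megengine.module as M

    drop = M.Dropout(drop_prob=0.2)
    x = mge.tensor(np.ones((4, 4), dtype="float32"))
    drop.train()        # dropout active: surviving entries scaled by 1 / (1 - 0.2)
    y_train = drop(x)
    drop.eval()         # behaves like Identity during inference
    y_eval = drop(x)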
......@@ -93,7 +93,7 @@ class Embedding(Module):
)
self.reset_parameters()
else:
if initial_weight.shape != (num_embeddings, embedding_dim):
if initial_weight.numpy().shape != (num_embeddings, embedding_dim):
raise ValueError(
"The weight shape should match num_embeddings and embedding_dim"
)
......
......@@ -18,7 +18,8 @@ from ..tensor import Tensor
def fill_(tensor: Tensor, val: Union[float, int]) -> None:
"""Fills the given ``tensor`` with value ``val``.
"""
Fills the given ``tensor`` with value ``val``.
:param tensor: tensor to be initialized.
:param val: value to be filled throughout the tensor.
......@@ -27,7 +28,8 @@ def fill_(tensor: Tensor, val: Union[float, int]) -> None:
def zeros_(tensor: Tensor) -> None:
"""Fills the given ``tensor`` with scalar value `0`.
"""
Fills the given ``tensor`` with scalar value `0`.
:param tensor: tensor to be initialized.
"""
......@@ -35,7 +37,8 @@ def zeros_(tensor: Tensor) -> None:
def ones_(tensor: Tensor) -> None:
"""Fills the given ``tensor`` with the scalar value `1`.
"""
Fills the given ``tensor`` with the scalar value `1`.
:param tensor: tensor to be initialized.
"""
......@@ -43,7 +46,8 @@ def ones_(tensor: Tensor) -> None:
def uniform_(tensor: Tensor, a: float = 0.0, b: float = 1.0) -> None:
r"""Fills the given ``tensor`` with random value sampled from uniform distribution
r"""
Fills the given ``tensor`` with random value sampled from uniform distribution
:math:`\mathcal{U}(\text{a}, \text{b})`.
:param tensor: tensor to be initialized.
......@@ -54,7 +58,8 @@ def uniform_(tensor: Tensor, a: float = 0.0, b: float = 1.0) -> None:
def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None:
r"""Fills the given ``tensor`` with random value sampled from normal distribution
r"""
Fills the given ``tensor`` with random value sampled from normal distribution
:math:`\mathcal{N}(\text{mean}, \text{std}^2)`.
:param tensor: tensor to be initialized.
......@@ -67,7 +72,8 @@ def normal_(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> None:
def calculate_gain(
nonlinearity: str, param: Optional[Union[int, float]] = None
) -> float:
r"""Returns a recommended gain value (see the table below) for the given nonlinearity
r"""
Returns a recommended gain value (see the table below) for the given nonlinearity
function.
================= ====================================================
......@@ -168,7 +174,8 @@ def calculate_correct_fan(tensor: Tensor, mode: str) -> float:
def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None:
r"""Fills tensor with random values sampled from :math:`\mathcal{U}(-a, a)`
r"""
Fills tensor with random values sampled from :math:`\mathcal{U}(-a, a)`
where
.. math::
......@@ -188,7 +195,8 @@ def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None:
def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None:
r"""Fills tensor with random values sampled from
r"""
Fills tensor with random values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where
.. math::
......@@ -209,7 +217,8 @@ def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None:
def msra_uniform_(
tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
) -> None:
r"""Fills tensor wilth random values sampled from
r"""
Fills tensor wilth random values sampled from
:math:`\mathcal{U}(-\text{bound}, \text{bound})` where
.. math::
......@@ -238,7 +247,8 @@ def msra_uniform_(
def msra_normal_(
tensor: Tensor, a: float = 0, mode: str = "fan_in", nonlinearity: str = "leaky_relu"
) -> None:
r"""Fills tensor wilth random values sampled from
r"""
Fills tensor wilth random values sampled from
:math:`\mathcal{N}(0, \text{std}^2)` where
.. math::
......
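A minimal sketch of the initializers documented above (assumption: they are exposed as ``megengine.module.init``, matching the file being edited):

    import megengine.module as M
    from megengine.module import init

    m = M.Linear(in_features=3, out_features=4)
    init.zeros_(m.bias)          # fill the bias with the scalar value 0
    init.msra_uniform_(m.weight, a=0, mode="fan_in", nonlinearity="leaky_relu")
    print(init.calculate_gain("relu"))   # recommended gain for ReLU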
......@@ -14,7 +14,8 @@ from .module import Module
class Linear(Module):
r"""Applies a linear transformation to the input. For instance, if input
r"""
Applies a linear transformation to the input. For instance, if input
is x, then output y is:
.. math::
......@@ -39,7 +40,7 @@ class Linear(Module):
m = M.Linear(in_features=3, out_features=1)
inp = mge.tensor(np.arange(0, 6).astype("float32").reshape(2, 3))
oup = m(inp)
print(oup.shape)
print(oup.numpy().shape)
Outputs:
......
......@@ -57,7 +57,8 @@ def _is_module(obj):
class Module(metaclass=ABCMeta):
"""Base Module class.
"""
Base Module class.
"""
def __init__(self):
......@@ -76,7 +77,8 @@ class Module(metaclass=ABCMeta):
pass
def register_forward_pre_hook(self, hook: Callable) -> HookHandler:
"""Registers a hook to handle forward inputs. `hook` should be a function.
"""
Registers a hook to handle forward inputs. `hook` should be a function.
:param hook: a function that receives `module` and `inputs`, then returns
a modified `inputs` or `None`.
......@@ -85,7 +87,8 @@ class Module(metaclass=ABCMeta):
return HookHandler(self._forward_pre_hooks, hook)
def register_forward_hook(self, hook: Callable) -> HookHandler:
"""Registers a hook to handle forward results. `hook` should be a function that
"""
Registers a hook to handle forward results. `hook` should be a function that
receives `module`, `inputs` and `outputs`, then returns a modified `outputs` or `None`.
This method returns a handler with :meth:`~.HookHandler.remove` interface to delete the hook.
......@@ -118,7 +121,8 @@ class Module(metaclass=ABCMeta):
predicate: Callable[[Any], bool] = lambda _: True,
seen: Optional[Set[int]] = None
) -> Union[Iterable[Any], Iterable[Tuple[str, Any]]]:
"""Scans the module object and returns an iterable for the :class:`~.Tensor`
"""
Scans the module object and returns an iterable for the :class:`~.Tensor`
and :class:`~.Module` attributes that agree with the ``predicate``. For multiple
calls of this function with same arguments, the order of objects within the
returned iterable is guaranteed to be identical, as long as all the involved
......@@ -165,7 +169,8 @@ class Module(metaclass=ABCMeta):
)
def parameters(self, recursive: bool = True, **kwargs) -> Iterable[Parameter]:
r"""Returns an iterable for the :class:`~.Parameter` of the module.
r"""
Returns an iterable for the :class:`~.Parameter` of the module.
:param recursive: If ``True``, returns all :class:`~.Parameter` within this
module, else only returns :class:`~.Parameter` that are direct attributes
......@@ -190,7 +195,8 @@ class Module(metaclass=ABCMeta):
def named_parameters(
self, prefix: Optional[str] = None, recursive: bool = True, **kwargs
) -> Iterable[Tuple[str, Parameter]]:
"""Returns an iterable for key :class:`~.Parameter` pairs of the module, where
"""
Returns an iterable for key :class:`~.Parameter` pairs of the module, where
``key`` is the dotted path from this module to the :class:`~.Parameter`.
:param prefix: prefix prepended to the keys.
......@@ -219,7 +225,8 @@ class Module(metaclass=ABCMeta):
)
def buffers(self, recursive: bool = True, **kwargs) -> Iterable[Tensor]:
"""Returns an iterable for the buffers of the module.
"""
Returns an iterable for the buffers of the module.
Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`.
......@@ -234,7 +241,8 @@ class Module(metaclass=ABCMeta):
def named_buffers(
self, prefix: Optional[str] = None, recursive: bool = True, **kwargs
) -> Iterable[Tuple[str, Tensor]]:
"""Returns an iterable for key buffer pairs of the module, where
"""
Returns an iterable for key buffer pairs of the module, where
``key`` is the dotted path from this module to the buffer.
Buffer is defined to be :class:`~.Tensor` excluding :class:`~.Parameter`.
......@@ -253,7 +261,8 @@ class Module(metaclass=ABCMeta):
)
def children(self, **kwargs) -> "Iterable[Module]":
"""Returns an iterable for all the submodules that are direct attributes of this
"""
Returns an iterable for all the submodules that are direct attributes of this
module.
"""
yield from self._flatten(
......@@ -261,7 +270,8 @@ class Module(metaclass=ABCMeta):
)
def named_children(self, **kwargs) -> "Iterable[Tuple[str, Module]]":
"""Returns an iterable of key-submodule pairs for all the submodules that are
"""
Returns an iterable of key-submodule pairs for all the submodules that are
direct attributes of this module, where 'key' is the attribute name of
submodules.
"""
......@@ -270,7 +280,8 @@ class Module(metaclass=ABCMeta):
)
def modules(self, **kwargs) -> "Iterable[Module]":
"""Returns an iterable for all the modules within this module, including itself.
"""
Returns an iterable for all the modules within this module, including itself.
"""
if "with_parent" in kwargs and kwargs["with_parent"]:
yield self, None
......@@ -281,7 +292,8 @@ class Module(metaclass=ABCMeta):
def named_modules(
self, prefix: Optional[str] = None, **kwargs
) -> "Iterable[Tuple[str, Module]]":
"""Returns an iterable of key-module pairs for all the modules within this
"""
Returns an iterable of key-module pairs for all the modules within this
module, including itself, where 'key' is the dotted path from this module to the
submodules.
......@@ -296,7 +308,8 @@ class Module(metaclass=ABCMeta):
)
def apply(self, fn: "Callable[[Module], Any]") -> None:
"""Applies function ``fn`` to all the modules within this module, including
"""
Applies function ``fn`` to all the modules within this module, including
itself.
:param fn: the function to be applied on modules.
......@@ -306,14 +319,16 @@ class Module(metaclass=ABCMeta):
@deprecated(version="1.0")
def zero_grad(self) -> None:
"""Sets all parameters' grads to zero
"""
Sets all parameters' grads to zero
"""
for param in self.parameters():
if param.grad is not None:
param.grad.reset_zero()
def train(self, mode: bool = True, recursive: bool = True) -> None:
"""Sets training mode of all the modules within this module (including itself) to
"""
Sets training mode of all the modules within this module (including itself) to
``mode``. This effectively sets the ``training`` attributes of those modules
to ``mode``, but only has effect on certain modules (e.g.
:class:`~.BatchNorm2d`, :class:`~.Dropout`, :class:`~.Observer`)
......@@ -331,7 +346,8 @@ class Module(metaclass=ABCMeta):
self.apply(fn)
def eval(self) -> None:
"""Sets training mode of all the modules within this module (including itself) to
"""
Sets training mode of all the modules within this module (including itself) to
``False``. See :meth:`~.Module.train` for details.
"""
self.train(False)
......@@ -351,7 +367,8 @@ class Module(metaclass=ABCMeta):
def replace_param(
self, params: dict, start_pos: int, seen: Optional[Set[int]] = None
):
"""Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to
"""
Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to
speed up multi-machine training.
"""
offset = 0
......@@ -377,7 +394,8 @@ class Module(metaclass=ABCMeta):
return offset
def state_dict(self, rst=None, prefix="", keep_var=False):
r"""Returns a dictionary containing whole states of the module.
r"""
Returns a dictionary containing whole states of the module.
"""
def is_state(obj):
......@@ -407,7 +425,8 @@ class Module(metaclass=ABCMeta):
state_dict: Union[dict, Callable[[str, Tensor], Optional[np.ndarray]]],
strict=True,
):
r"""Loads a given dictionary created by :func:`state_dict` into this module.
r"""
Loads a given dictionary created by :func:`state_dict` into this module.
If ``strict`` is ``True``, the keys of :func:`state_dict` must exactly match the keys
returned by :func:`state_dict`.
......@@ -485,7 +504,8 @@ class Module(metaclass=ABCMeta):
)
def _load_state_dict_with_closure(self, closure):
"""Advance state_dict load through callable ``closure`` whose signature is
"""
Advance state_dict load through callable ``closure`` whose signature is
``closure(key: str, var: Tensor) -> Union[np.ndarry, None]``
"""
assert callable(closure), "closure must be a function"
......@@ -536,7 +556,8 @@ class Module(metaclass=ABCMeta):
super().__delattr__(name)
def _module_info_string(self) -> str:
r"""Set the extra representation of the module.
r"""
Set the extra representation of the module.
"""
return ""
......
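A small sketch tying together the traversal and serialization helpers of ``Module`` above (the two-layer network is hypothetical):

    import megengine.module as M

    net = M.Sequential(M.Linear(10, 20), M.Linear(20, 5))
    for name, _ in net.named_parameters():
        print(name)                  # dotted path from net to each Parameter
    state = net.state_dict()         # whole state of the module
    net.load_state_dict(state)       # strict key matching by default
    net.eval()                       # leave training mode for all submodules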
......@@ -36,7 +36,8 @@ class _PoolNd(Module):
class MaxPool2d(_PoolNd):
r"""Applies a 2D max pooling over an input.
r"""
Applies a 2D max pooling over an input.
For instance, given an input of the size :math:`(N, C, H, W)` and
:attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of
......@@ -83,7 +84,8 @@ class MaxPool2d(_PoolNd):
class AvgPool2d(_PoolNd):
r"""Applies a 2D average pooling over an input.
r"""
Applies a 2D average pooling over an input.
For instance, given an input of the size :math:`(N, C, H, W)` and
:attr:`kernel_size` :math:`(kH, kW)`, this layer generates the output of
......
......@@ -19,7 +19,8 @@ from .module import QuantizedModule
class Conv2d(Float.Conv2d, QuantizedModule):
r"""Quantized version of :class:`~.qat.conv.Conv2d`."""
r"""Applies a 2D convolution over a quantized input tensor, used for inference only.
r"""
Applies a 2D convolution over a quantized input tensor, used for inference only.
The parameters are the same as :class:`~.Conv2d`.
"""
......
......@@ -11,7 +11,8 @@ from .conv import Conv2d
class _ConvBnActivation2d(Conv2d):
r"""Applies a 2D convolution over a quantized input tensor, used for inference only.
r"""
Applies a 2D convolution over a quantized input tensor, used for inference only.
The parameters are the same as :class:`~.Conv2d`.
"""
......
......@@ -12,7 +12,8 @@ from .module import Module
class Sequential(Module):
r"""A sequential container.
r"""
A sequential container.
Modules will be added to it in the order they are passed in the constructor.
Alternatively, an ordered dict of modules can also be passed in.
......@@ -29,10 +30,9 @@ class Sequential(Module):
from collections import OrderedDict
batch_size = 64
data = mge.tensor(np.zeros((batch_size, 1, 28, 28)), dtype=np.float32)
data = mge.tensor(np.zeros((batch_size, 28 * 28)), dtype=np.float32)
label = mge.tensor(np.zeros(batch_size,), dtype=np.int32)
data = data.reshape(batch_size, -1)
net0 = M.Sequential(
M.Linear(28 * 28, 320),
M.Linear(320, 10)
......@@ -40,10 +40,9 @@ class Sequential(Module):
pred0 = net0(data)
modules = OrderedDict()
modules["fc0"] = nn.Linear(28 * 28, 320)
modules["fc1"] = nn.Linear(320, 10)
net1 = nn.Sequential(modules)
modules["fc0"] = M.Linear(28 * 28, 320)
modules["fc1"] = M.Linear(320, 10)
net1 = M.Sequential(modules)
pred1 = net1(data)
"""
......
......@@ -16,7 +16,8 @@ from .optimizer import Optimizer
class Adadelta(Optimizer):
r"""Implements Adadelta algorithm.
r"""
Implements Adadelta algorithm.
It has been proposed in `"ADADELTA: An Adaptive Learning Rate Method" <https://arxiv.org/abs/1212.5701>`_.
......
......@@ -16,7 +16,8 @@ from .optimizer import Optimizer
class Adagrad(Optimizer):
r"""Implements Adagrad algorithm.
r"""
Implements Adagrad algorithm.
It has been proposed in `"Adaptive Subgradient Methods for Online Learning
and Stochastic Optimization" <http://jmlr.org/papers/v12/duchi11a.html>`_.
......
......@@ -13,7 +13,8 @@ from .optimizer import Optimizer
class Adam(Optimizer):
r"""Implements Adam algorithm proposed in `"Adam: A Method for Stochastic Optimization" <https://arxiv.org/abs/1412.6980>`_.
r"""
Implements Adam algorithm proposed in `"Adam: A Method for Stochastic Optimization" <https://arxiv.org/abs/1412.6980>`_.
:param params: iterable of parameters to optimize or dicts defining
parameter groups.
......
......@@ -12,7 +12,8 @@ from .optimizer import Optimizer
class LRScheduler(metaclass=ABCMeta):
r"""Base class for all learning rate based schedulers.
r"""
Base class for all learning rate based schedulers.
:param optimizer: wrapped optimizer.
:param current_epoch: the index of current epoch. Default: -1
......@@ -44,14 +45,16 @@ class LRScheduler(metaclass=ABCMeta):
self.step()
def state_dict(self):
r"""Returns the state of the scheduler as a :class:`dict`.
r"""
Returns the state of the scheduler as a :class:`dict`.
It contains an entry for every variable in self.__dict__ which
is not the optimizer.
"""
raise NotImplementedError
def load_state_dict(self, state_dict):
r"""Loads the schedulers state.
r"""
Loads the schedulers state.
:type state_dict: dict
:param state_dict: scheduler state.
......
......@@ -14,7 +14,8 @@ from .optimizer import Optimizer
class MultiStepLR(LRScheduler):
r"""Decays the learning rate of each parameter group by gamma once the
r"""
Decays the learning rate of each parameter group by gamma once the
number of epoch reaches one of the milestones.
:param optimizer: wrapped optimizer.
......@@ -44,7 +45,8 @@ class MultiStepLR(LRScheduler):
super().__init__(optimizer, current_epoch)
def state_dict(self):
r"""Returns the state of the scheduler as a :class:`dict`.
r"""
Returns the state of the scheduler as a :class:`dict`.
It contains an entry for every variable in self.__dict__ which
is not the optimizer.
"""
......@@ -55,7 +57,8 @@ class MultiStepLR(LRScheduler):
}
def load_state_dict(self, state_dict):
r"""Loads the schedulers state.
r"""
Loads the schedulers state.
:type state_dict: dict
:param state_dict: scheduler state.
......
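A usage sketch of ``MultiStepLR`` above (assumptions: it is importable from ``megengine.optimizer`` together with ``SGD``; the milestones and gamma values are illustrative):

    import megengine.module as M
    import megengine.optimizer as optim

    net = M.Linear(4, 2)
    opt = optim.SGD(net.parameters(), lr=0.1)
    scheduler = optim.MultiStepLR(opt, milestones=[30, 60], gamma=0.1)
    for epoch in range(90):
        # ... run one training epoch here ...
        scheduler.step()   # decays lr by gamma once epochs 30 and 60 are reached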
......@@ -28,7 +28,8 @@ required = _RequiredParameter()
class Optimizer(metaclass=ABCMeta):
r"""Base class for all optimizers.
r"""
Base class for all optimizers.
:param params: specifies what Tensors should be optimized.
:param defaults: a dict of default parameters of Optimizer, like learning rate or momentum.
......@@ -72,7 +73,8 @@ class Optimizer(metaclass=ABCMeta):
self._create_state(group)
def add_param_group(self, param_group: dict):
r"""Add a param group to ``param_groups`` of the :class:`~megengine.optim.optimizer.Optimizer`.
r"""
Add a param group to ``param_groups`` of the :class:`~megengine.optim.optimizer.Optimizer`.
This can be useful when fine tuning a pre-trained network as frozen layers can be made
trainable and added to the :class:`~megengine.optim.optimizer.Optimizer` as training progresses.
......@@ -137,7 +139,8 @@ class Optimizer(metaclass=ABCMeta):
return params
def step(self):
r"""Performs a single optimization step.
r"""
Performs a single optimization step.
"""
for group in self.param_groups:
......@@ -158,14 +161,16 @@ class Optimizer(metaclass=ABCMeta):
param.grad.reset_zero()
def clear_grad(self):
r"""Set the grad attribute to None for all parameters.
r"""
Set the grad attribute to None for all parameters.
"""
for param_group in self.param_groups:
for param in param_group["params"]:
param.grad = None
def state_dict(self) -> Dict:
r"""Export the optimizer state.
r"""
Export the optimizer state.
:return: optimizer state. Can be loaded by :meth:`load_state_dict`.
"""
......@@ -191,7 +196,8 @@ class Optimizer(metaclass=ABCMeta):
return {"param_groups": param_groups, "state": state}
def load_state_dict(self, state: dict):
r"""Loads the optimizer state.
r"""
Loads the optimizer state.
:param state: optimizer state. Should be an object returned
from a call to :meth:`state_dict`.
......
......@@ -13,7 +13,8 @@ from .optimizer import Optimizer
class SGD(Optimizer):
r"""Implements stochastic gradient descent.
r"""
Implements stochastic gradient descent.
Nesterov momentum is based on the formula from
`"On the importance of initialization and momentum in deep learning" <http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf>`_ .
......
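A bare-bones sketch of the ``Optimizer`` interface above (gradient computation is elided; the learning rate and momentum values are illustrative):

    import megengine.module as M
    import megengine.optimizer as optim

    net = M.Linear(4, 2)
    opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    opt.clear_grad()           # set every parameter's grad attribute to None
    # ... compute a loss and back-propagate to fill the grads here ...
    opt.step()                 # perform a single optimization step
    state = opt.state_dict()   # exportable state, reloadable via load_state_dict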
......@@ -174,7 +174,8 @@ class HistogramObserver(MinMaxObserver):
self.histogram = Tensor([-1] + [0.0] * (bins - 1), dtype="float32")
def _non_linear_param_search(self):
r"""Non-linear parameter search.
r"""
Non-linear parameter search.
An approximation for L2 error minimization for selecting min/max.
By selecting new min/max, we filter out outliers in input distribution.
"""
......
......@@ -43,7 +43,8 @@ def register_method_to_class(cls):
class QuantMode(Enum):
"""Quantization mode enumerate class.
"""
Quantization mode enumerate class.
"""
SYMMERTIC = 1
......@@ -63,13 +64,15 @@ qparam_dict = {
def get_qparam_dict(mode: QuantMode):
"""Return the quantization parameters dictionary according to the mode.
"""
Return the quantization parameters dictionary according to the mode.
"""
return qparam_dict.get(mode, None)
def fake_quant_tensor(inp: Tensor, qmin: int, qmax: int, q_dict: Dict) -> Tensor:
"""Apply fake quantization to the inp tensor.
"""
Apply fake quantization to the inp tensor.
:param inp: the input tensor to be fake-quantized.
:param qmin: the lower bound of the quantized integer range.
......@@ -91,7 +94,8 @@ def fake_quant_tensor(inp: Tensor, qmin: int, qmax: int, q_dict: Dict) -> Tensor
def fake_quant_bias(bias: Tensor, inp: Tensor, w_qat: Tensor) -> Tensor:
"""Apply fake quantization to bias, with the special scale from input tensor
"""
Apply fake quantization to bias, with the special scale from input tensor
and weight tensor, the quantized type set to qint32 also.
:param bias: the bias tensor which need to be faked.
......
......@@ -21,7 +21,8 @@ __all__ = ["normal", "uniform"]
def normal(
mean: float = 0, std: float = 1, size: Optional[Iterable[int]] = None
) -> Tensor:
r"""Random variable with Gaussian distribution :math:`N(\mu, \sigma)`.
r"""
Random variable with Gaussian distribution :math:`N(\mu, \sigma)`.
:param size: output tensor size.
:param mean: the mean or expectation of the distribution.
......@@ -59,7 +60,8 @@ def normal(
def uniform(
low: float = 0, high: float = 1, size: Optional[Iterable[int]] = None
) -> Tensor:
r"""Random variable with uniform distribution $U(0, 1)$.
r"""
Random variable with uniform distribution $U(0, 1)$.
:param size: output tensor size.
:param low: lower range.
......
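A quick sketch of the two samplers above (assumption: they are exposed as ``megengine.random.normal`` and ``megengine.random.uniform``, matching the ``__all__`` of this file):

    import megengine.random as rand

    x = rand.normal(mean=0, std=1, size=(2, 3))     # Gaussian N(0, 1) samples
    y = rand.uniform(low=0, high=1, size=(2, 3))    # uniform U(0, 1) samples
    print(x.numpy().shape, y.numpy().shape)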
......@@ -14,7 +14,8 @@ from .utils.max_recursion_limit import max_recursion_limit
def save(obj, f, pickle_module=pickle, pickle_protocol=pickle.HIGHEST_PROTOCOL):
r"""Save an object to disk file.
r"""
Save an object to disk file.
:type obj: object
:param obj: object to save. Only ``module`` or ``state_dict`` are allowed.
......@@ -81,7 +82,8 @@ def _get_callable_map_location(map_location):
def load(f, map_location=None, pickle_module=pickle):
r"""Load an object saved with save() from a file.
r"""
Load an object saved with save() from a file.
:type f: text file object
:param f: a string of file name or a text file object from which to load.
......
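A round-trip sketch of the serialization helpers above (the checkpoint filename is illustrative):

    import megengine as mge
    import megengine.module as M

    net = M.Linear(3, 1)
    mge.save(net.state_dict(), "checkpoint.pkl")   # pickle-based dump to disk
    state = mge.load("checkpoint.pkl")
    net.load_state_dict(state)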
......@@ -97,5 +97,6 @@ tensor = Tensor
class Parameter(Tensor):
r"""A kind of Tensor that is to be considered a module parameter.
r"""
A kind of Tensor that is to be considered a module parameter.
"""
......@@ -17,7 +17,8 @@ from ..core.tensor.raw_tensor import as_raw_tensor
def get_dep_vars(var: VarNode, var_type: str = None) -> List[VarNode]:
"""Returns :class:`.tensor.core.megbrain_graph.VarNode` of type ``var_type`` that input ``var``
"""
Returns :class:`.tensor.core.megbrain_graph.VarNode` of type ``var_type`` that input ``var``
depends on. If ``var_type`` is None, returns all types.
"""
outputs = []
......@@ -46,14 +47,16 @@ def get_dep_vars(var: VarNode, var_type: str = None) -> List[VarNode]:
def get_owner_opr_inputs(var: VarNode) -> List[VarNode]:
"""Gets the inputs of owner opr of a variable.
"""
Gets the inputs of owner opr of a variable.
"""
assert isinstance(var, VarNode)
return var.owner.inputs
def get_owner_opr_type(var: VarNode) -> str:
"""Gets the type of owner opr of a variable.
"""
Gets the type of owner opr of a variable.
"""
assert isinstance(var, VarNode)
......@@ -61,14 +64,16 @@ def get_owner_opr_type(var: VarNode) -> str:
def get_opr_type(opr: OperatorNode) -> str:
"""Gets the type of an opr.
"""
Gets the type of an opr.
"""
assert isinstance(opr, OperatorNode)
return opr.type
def graph_traversal(outputs: VarNode):
"""Helper function to traverse the computing graph and return enough useful information.
"""
Helper function to traverse the computing graph and return enough useful information.
:param outputs: model outputs.
:return: tuple (map_oprs, map_vars, var2oprs, opr2receivers, indegree2opr, opr2indegree)
......@@ -124,7 +129,8 @@ def graph_traversal(outputs: VarNode):
def get_oprs_seq(outputs: List[VarNode], prune_reshape=False) -> List[OperatorNode]:
"""Gets oprs in some topological order for a dumped model.
"""
Gets oprs in some topological order for a dumped model.
:param outputs: model outputs.
:param prune_reshape: whether to prune the useless operators during inference.
......@@ -194,7 +200,8 @@ def get_oprs_seq(outputs: List[VarNode], prune_reshape=False) -> List[OperatorNo
def replace_vars(dst: VarNode, varmap: Dict[VarNode, VarNode]) -> List[VarNode]:
"""Replaces vars in the graph.
"""
Replaces vars in the graph.
:param dst: target vars representing the graph.
:param varmap: the map that specifies how to replace the vars.
......@@ -221,7 +228,8 @@ def replace_vars(dst: VarNode, varmap: Dict[VarNode, VarNode]) -> List[VarNode]:
def replace_oprs(
dst: List[VarNode], oprmap: Dict[OperatorNode, OperatorNode]
) -> List[VarNode]:
"""Replaces operators in the graph.
"""
Replaces operators in the graph.
:param dst: target vars representing the graph.
:param oprmap: the map that specifies how to replace the operators.
......@@ -246,7 +254,8 @@ def replace_oprs(
def set_priority_to_id(dest_vars):
"""For all oprs in the subgraph constructed by dest_vars,
"""
For all oprs in the subgraph constructed by dest_vars,
sets each opr's priority to its id if the original priority is zero.
:param dest_vars: target vars representing the graph.
"""
......@@ -258,7 +267,8 @@ def set_priority_to_id(dest_vars):
def load_and_inference(file, inp_data_list: List[numpy.ndarray]) -> List[numpy.ndarray]:
"""Loads a serialized computing graph and run inference with input data.
"""
Loads a serialized computing graph and run inference with input data.
:param file: path or handle of the input file.
:param inp_data_list: list of input data.
......
......@@ -16,7 +16,8 @@ if platform.system() != "Windows":
class AlternativeRecursionLimit:
r"""A reentrant context manager for setting global recursion limits.
r"""
A reentrant context manager for setting global recursion limits.
"""
def __init__(self, new_py_limit):
......@@ -73,6 +74,7 @@ _max_recursion_limit_context_manager = AlternativeRecursionLimit(2 ** 31 - 1)
def max_recursion_limit():
r"""Sets recursion limit to the max possible value.
r"""
Sets recursion limit to the max possible value.
"""
return _max_recursion_limit_context_manager
......@@ -12,7 +12,8 @@ import numpy as np
def load_tensor_binary(fobj):
"""Load a tensor dumped by the :class:`BinaryOprIODump` plugin; the actual
"""
Load a tensor dumped by the :class:`BinaryOprIODump` plugin; the actual
tensor value dump is implemented by ``mgb::debug::dump_tensor``.
Multiple values can be compared by ``tools/compare_binary_iodump.py``.
......
......@@ -57,7 +57,8 @@ def _tabulate_confluence(tab, **kwargs):
def main(passed_args=None): # pylint: disable=too-many-statements
"""Analyses profile info from :mod:`~.utils.profile_analyzer` .
"""
Analyses profile info from :mod:`~.utils.profile_analyzer` .
Run this file with ``--help`` to get more usage.
"""
......
......@@ -15,7 +15,8 @@ import numpy as np
class NonExistNum:
"""An object that behaves like a number but means a field does not exist; It is
"""
An object that behaves like a number but means a field does not exist; It is
always greater than any real number.
"""
......@@ -64,15 +65,18 @@ class OprProfRst:
"""A dict containing operator info: name, id and type."""
time_dict = None
"""A mapping from ``"host"`` or ``"device"`` to list of profiling
"""
A mapping from ``"host"`` or ``"device"`` to list of profiling
results."""
footprint = None
"""A mapping from ``"memory"`` or ``"computation"`` to the actual number
"""
A mapping from ``"memory"`` or ``"computation"`` to the actual number
of corresponding operations."""
def __init__(self, entry: dict):
"""Opr profiling initialization, which sets up name, type and id of opr_info.
"""
Opr profiling initialization, which sets up name, type and id of opr_info.
:param entry: profiling json exec_graph items.
"""
......@@ -84,7 +88,8 @@ class OprProfRst:
self.footprint = collections.defaultdict(NonExistNum)
def update_device_prof_info(self, dev_time: dict):
"""Updates device profiling info.
"""
Updates device profiling info.
:param dev_time: device time for single opr,
is an attribute of profiling result.
......@@ -93,7 +98,8 @@ class OprProfRst:
self.time_dict["device"].append(copy.deepcopy(dev_time))
def update_host_prof_info(self, host_time: dict):
"""Updates host profiling info.
"""
Updates host profiling info.
:param host_time: host time for single opr,
is an attribute of profiling result.
......@@ -102,7 +108,8 @@ class OprProfRst:
self.time_dict["host"].append(copy.deepcopy(host_time))
def update_footprint(self, footprint: dict):
"""Updates opr footprint.
"""
Updates opr footprint.
:param footprint: footprint for single opr,
is an attribute of profiling result.
......@@ -128,7 +135,8 @@ class Record:
]
def __init__(self, time: float, info: dict, footprint: dict):
"""Initializes single record.
"""
Initializes single record.
:param time: opr running time, evaluated by applying a user-provided
function to OprProfRst.
......@@ -153,7 +161,8 @@ class Record:
self.opr_id = int(self.opr_id)
def get_column_by_name(self, name: str = None):
"""Extracts column value by its column name.
"""
Extracts column value by its column name.
:param name: column name, None for time.
"""
......@@ -165,7 +174,8 @@ class Record:
class ProfileAnalyzer:
def __init__(self, obj: dict, opr_filter: Callable = lambda opr, inp, out: True):
"""Initializes ProfileAnalyzer.
"""
Initializes ProfileAnalyzer.
:param obj: dict dumped from json str.
:param opr_filter: function that filter oprs.
......@@ -202,7 +212,8 @@ class ProfileAnalyzer:
def _aggregate(
self, records: List[Record], aop: Union[str, Callable], atype: Optional[str]
) -> List[Record]:
"""Aggregate operation.
"""
Aggregate operation.
:param records: selected records.
:param aop: aggregate operation, if aop is str, we would replace it
......@@ -247,7 +258,8 @@ class ProfileAnalyzer:
return rst
def _sort(self, records: List[Record], sort_by: str) -> List[Record]:
"""Sort operation.
"""
Sort operation.
:param records: the records after aggregate operation.
:param sort_by: keyword for sorting the list.
......@@ -271,7 +283,8 @@ class ProfileAnalyzer:
sort_by: str = None,
top_k: int = 0,
) -> List[Record]:
"""Select operation.
"""
Select operation.
:param time_func: time function provided by the user, applied to every
OprProfRst.
......@@ -304,7 +317,8 @@ class TimeFuncHelper:
@staticmethod
def _eval_time(prof_type, end_key, func, opr_prof):
"""Eval time.
"""
Eval time.
:type prof_type: str
:param prof_type: 'host' or 'device'.
......@@ -325,7 +339,8 @@ class TimeFuncHelper:
@staticmethod
def eval_time_func(prof_type: str, end_key: str, func: Callable) -> float:
"""Eval oprerator profile time.
"""
Eval oprerator profile time.
:param prof_type: 'host' or 'device'.
:param end_key: 'kern' or 'end'.
......@@ -338,7 +353,8 @@ class TimeFuncHelper:
def _min_start(
prof_type, end_key, func, opr_prof
): # pylint: disable=unused-argument
"""Eval minimum start time.
"""
Eval minimum start time.
:type prof_type: str
:param prof_type: 'host' or 'device'.
......@@ -360,7 +376,8 @@ class TimeFuncHelper:
def min_start_func(
prof_type: str, end_key: str, func: Callable
) -> float: # pylint: disable=unused-argument
"""Eval oprerator profile min start time.
"""
Eval oprerator profile min start time.
:param prof_type: 'host' or 'device'.
:param end_key: 'kern' or 'end'.
......@@ -371,7 +388,8 @@ class TimeFuncHelper:
@staticmethod
def _max_end(prof_type, end_key, func, opr_prof): # pylint: disable=unused-argument
"""Eval maximum end time
"""
Eval maximum end time
:type prof_type: str
:param prof_type: 'host' or 'device'.
......@@ -391,7 +409,8 @@ class TimeFuncHelper:
@staticmethod
def max_end_func(prof_type: str, end_key: str, func: Callable) -> float:
"""Eval oprerator profile max end time.
"""
Eval oprerator profile max end time.
:param prof_type: 'host' or 'device'.
:param end_key: 'kern' or 'end'.
......
......@@ -169,7 +169,7 @@ class Profiler:
Examples:
.. testcode::
.. code-block::
import megengine as mge
import megengine.module as M
......
......@@ -3,7 +3,8 @@ from ..core._imperative_rt.imperative import sync
class TensorSanityCheck:
r"""An object that checks whether the input tensors of each operator have changed before and after the operation.
r"""
An object that checks whether the input tensors of each operator have changed before and after the operation.
Examples:
......
......@@ -11,7 +11,8 @@ import functools
def get_ndtuple(value, *, n, allow_zero=True):
r"""Converts possibly 1D tuple to nd tuple.
r"""
Converts possibly 1D tuple to nd tuple.
:type allow_zero: bool
:param allow_zero: whether to allow zero tuple value."""
......
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
[pytest]
markers =
isolated_distributed: marks distributed tests that should run without cuda use
in main thread (deselect with '-m "not isolated_distributed"')
#!/bin/bash -e
test_dirs="test megengine"
test_dirs="megengine test"
TEST_PLAT=$1
......