Commit c5f8b583 authored by: Megvii Engine Team

docs(docstring): transfer to google style

GitOrigin-RevId: a71245c553e763bf1ed5b04913da4d2a1d4cd2c5
Parent 76ce81e8
......@@ -11,38 +11,37 @@ from ..core.tensor import amp
class autocast:
r"""
A class to control autocast mode for amp as a context manager or a decorator.
r"""A class to control autocast mode for amp as a context manager or a decorator.
Args:
enabled: Whether autocast mode is enabled.
low_prec_dtype: Set amp autocast mode's lower precision dtype. It will change
the target dtype in tensor casting for better speed and memory. Default: float16.
high_prec_dtype: Set amp autocast mode's higher precision dtype. It will
change the target dtype in tensor casting for better precision. Default: float32.
Examples:
.. code-block::
# used as decorator
@autocast()
def train_step(image, label):
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
gm.backward(loss)
opt.step().clear_grad()
return loss
# used as context manager
def train_step(image, label):
with autocast():
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
gm.backward(loss)
opt.step().clear_grad()
return loss
"""
def __init__(
......
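A minimal sketch of the context-manager form with the documented dtype arguments made explicit; the tensors and the matmul are illustrative placeholders, not part of the original docstring:

.. code-block:: python

    import megengine as mge
    import megengine.functional as F

    # Illustrative tensors; inside the block, eligible ops run in float16.
    x = mge.tensor([[1.0, 2.0], [3.0, 4.0]])
    w = mge.tensor([[0.5, 0.5], [0.5, 0.5]])
    with mge.amp.autocast(enabled=True, low_prec_dtype="float16", high_prec_dtype="float32"):
        y = F.matmul(x, w)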
......@@ -16,50 +16,51 @@ from ..tensor import Tensor
class GradScaler:
r"""
A helper class that performs grad scaling to prevent from data overflow in
r"""A helper class that performs grad scaling to prevent from data overflow in
:class:`~.autocast` mode.
Args:
init_scale: Initial scale factor.
growth_factor: Factor that the scale is multiplied by in actual
:meth:`update` stage. If growth_factor is 0, scale_factor will not update.
backoff_factor: Factor that the scale is multiplied by when encountering
overflow grad.
growth_interval: The interval between two scale update stages.
Example:
.. code-block::
gm = GradManager()
opt = ...
scaler = GradScaler()
gm.attach(model.parameters())
@autocast()
def train_step(image, label):
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
scaler.backward(gm, loss)
opt.step().clear_grad()
return loss
For more flexible usage, ``scaler.backward`` could be split into three lines:
.. code-block::
@autocast()
def train_step(image, label):
with gm:
logits = model(image)
loss = F.nn.cross_entropy(logits, label)
gm.backward(loss, dy=megengine.tensor(scaler.scale_factor))
scaler.unscale(gm.attached_tensors())
scaler.update()
opt.step().clear_grad()
return loss
This is useful when grads need to be accumulated over multiple batches.
"""
def __init__(
......@@ -86,18 +87,18 @@ class GradScaler:
unscale_grad: bool = True,
update_scale: bool = "if_unscale_grad"
):
r"""
A wrapper of GradManager's :meth:`~.GradManager.backward`, used to scale
r"""A wrapper of GradManager's :meth:`~.GradManager.backward`, used to scale
``y``'s grad and unscale parameters' grads.
Args:
gm: The GradManager to be wrapped.
y: Same as GradManager backward's ``y``.
dy: Same as GradManager backward's ``dy``. Will be multiplied
by ``scale_factor``.
unscale_grad: Whether to do :meth:`unscale` at the same time. Could be
``False`` if grads need to be accumulated.
update_scale: Same as :meth:`unscale`'s ``update``. Will be ignored
if ``unscale_grad`` is ``False``.
"""
# These checks should be consistent with GradManager's
if y is None:
......@@ -121,11 +122,11 @@ class GradScaler:
self.update()
def unscale(self, grad_tensors: Iterable[Tensor]):
r"""
Unscale all ``grad_tensors``'s grad.
r"""Unscale all ``grad_tensors``'s grad.
Args:
grad_tensors: Tensors whose grads need to be unscaled. Should be all tensors
that are affected by ``target`` tensor in GradManager's backward.
"""
# use float64 for better precision
inv_scale = Tensor(1.0 / self.scale_factor)
......@@ -151,7 +152,8 @@ class GradScaler:
def update(self, new_scale: float = None):
r"""Update the scale factor according to whether encountered overflow grad.
If ``new_scale`` is provided, internal update mechanism will be ignored.
"""
if self.growth_interval == 0:
return
......
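The ``update`` method above also accepts an explicit ``new_scale``; a hedged sketch of overriding the internal mechanism, with illustrative numeric values:

.. code-block:: python

    from megengine.amp import GradScaler

    # Illustrative parameters: growth_factor=0 freezes automatic updates
    # (per the docstring), so the scale only changes on explicit request.
    scaler = GradScaler(init_scale=2.0 ** 16, growth_factor=0)
    scaler.update(new_scale=2.0 ** 14)  # bypasses the internal update mechanism
    print(scaler.scale_factor)          # 16384.0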
......@@ -32,8 +32,7 @@ _global_priority = 0
class GradManager:
r"""
GradManager computes gradients or more generally, vector-Jacobian product, by reverse mode
r"""GradManager computes gradients or more generally, vector-Jacobian product, by reverse mode
automatic differentiation (a.k.a. back propagation).
Reverse mode autodiff normally reuses many intermediate tensors for best computation efficiency.
......@@ -120,7 +119,6 @@ class GradManager:
gm = GradManager()
gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))
"""
def __init__(self):
......@@ -136,8 +134,7 @@ class GradManager:
return [spec.tensor() for spec in self._attach_specs.values()]
def attach(self, tensors: Iterable[Tensor], callbacks=None):
r"""
Instruct GradManager to track operations on tensors, so that gradients with respect
r"""Instruct GradManager to track operations on tensors, so that gradients with respect
to those tensors could be evaluated later.
:meth:`attach` also accepts a list of callbacks, which will be called with the tensor and
......@@ -188,8 +185,9 @@ class GradManager:
multiple uses of a GradManager, which is unrelated to whether resources is timely
released within a single use.
Args:
tensors: tensor or list of tensors to track
callbacks: callback or list of callbacks
"""
if callbacks is None:
callbacks = []
......@@ -234,8 +232,7 @@ class GradManager:
y: Union[Tensor, List[Tensor]] = None,
dy: Union[Tensor, List[Tensor]] = None,
):
r"""
Compute gradients (or vector-Jacobian product) for all attached tensors, accumulate to
r"""Compute gradients (or vector-Jacobian product) for all attached tensors, accumulate to
corresponding .grad attribute, and release resources along the way.
:meth:`backward` computes the vector-Jacobian product :math:`dx_j = \sum_{i} dy_i J_{ij}`
......@@ -257,8 +254,9 @@ class GradManager:
process of this call. When the call successfully finishes, the GradManager will be put back
to an inactive state.
Args:
y: tensor or list of tensors
dy: tensor or list of tensors. Defaults to 1 if y is scalar
"""
push_scope("backward")
set_option("record_computing_path", 0)
......@@ -310,8 +308,7 @@ class GradManager:
pop_scope("backward")
def record(self):
r"""
Start recording operations
r"""Start recording operations
After this call, you will be able to call :meth:`backward`.
"""
......@@ -342,8 +339,7 @@ class GradManager:
self._grad.wrt(tensor, callback=callback)
def release(self):
r"""
Stop recording operations and release resources kept for gradient computation
r"""Stop recording operations and release resources kept for gradient computation
After this call, you will not be able to call :meth:`backward`.
"""
......
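Pulling :meth:`attach`, :meth:`record` (entered via the ``with`` statement) and :meth:`backward` together, a minimal hedged sketch with placeholder data:

.. code-block:: python

    import megengine as mge
    import megengine.functional as F
    from megengine.autodiff import GradManager

    w = mge.Parameter([1.0])  # placeholder parameter to track
    x = mge.tensor([2.0])     # placeholder input

    gm = GradManager()
    gm.attach([w])            # instruct gm to track operations on w
    with gm:                  # enters the recording state
        loss = F.sum(w * x)
        gm.backward(loss)     # accumulates into w.grad, then releases resources
    print(w.grad)             # gradient of loss w.r.t. w, i.e. [2.]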
......@@ -15,16 +15,12 @@ if os.environ.get("MEGENGINE_USE_SYMBOLIC_SHAPE"):
def use_symbolic_shape() -> bool:
"""
Returns whether tensor.shape returns a tensor instead of a tuple
"""
r"""Returns whether tensor.shape returns a tensor instead of a tuple"""
return _use_symbolic_shape
def set_symbolic_shape(option: bool):
""" Sets whether tensor.shape returns a tensor instead of a tuple
"""
r"""Sets whether tensor.shape returns a tensor instead of a tuple"""
global _use_symbolic_shape
_org = _use_symbolic_shape
_use_symbolic_shape = option
......
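A hedged sketch of toggling the option; the import path is an assumption based on this file's location, and ``set_symbolic_shape`` is assumed to return the previous value (the ``_org`` variable above suggests it does):

.. code-block:: python

    # Assumed import path for the two helpers shown above.
    from megengine.core._trace_option import set_symbolic_shape, use_symbolic_shape

    prev = set_symbolic_shape(True)  # tensor.shape now returns a tensor
    assert use_symbolic_shape()
    set_symbolic_shape(prev)         # restore the previous setting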
......@@ -88,67 +88,56 @@ class Grad:
class Function(ops.PyOpBase):
"""
Defines a block of operations with customizable differentiation.
r"""Defines a block of operations with customizable differentiation.
The computation should be defined in ``forward`` method, with gradient
computation defined in ``backward`` method.
Each instance of ``Function`` should be used only once during forwarding.
Examples:
.. code-block::
class Sigmoid(Function):
def forward(self, x):
y = 1 / (1 + F.exp(-x))
self.y = y
return y
def backward(self, dy):
y = self.y
return dy * y * (1-y)
"""
def forward(self, *args, **kwargs):
"""
Applies operations to ``inputs`` and returns results. It must be overriden by all subclasses.
:param input: input tensors.
:return: a tuple of Tensor or a single Tensor.
.. note::
This method should return a tuple of Tensor or a single Tensor representing the output
of the function.
.. note::
positional arguments should all be Tensor
r"""Applies operations to ``inputs`` and returns results. It must be overriden by all subclasses.
Args:
input: input tensors.
Returns:
a tuple of Tensor or a single Tensor.
Note:
* This method should return a tuple of Tensor or a single Tensor representing the output
of the function.
* positional arguments should all be Tensor
"""
raise NotImplementedError
def backward(self, *output_grads):
"""
Compute the gradient of the forward function. It must be overriden by all subclasses.
:param output_grads: gradients of outputs that are returned by :meth:`forward`.
.. note::
In case when some tensors of outputs are not related to loss function, the corresponding
values in ``output_grads`` would be ``None``.
.. note::
This method should return a tuple which containing the gradients of all inputs, in the same order
as the ``inputs`` argument of :meth:`forward` . A ``Tensor`` could be returned
instead if there is only one input. If users want to stop the propagation of some gradients,
the corresponding returned values should be set ``None`` .
r"""Compute the gradient of the forward function. It must be overriden by all subclasses.
Args:
output_grads: gradients of outputs that are returned by :meth:`forward`.
Note:
* In case when some tensors of outputs are not related to loss function, the corresponding
values in ``output_grads`` would be ``None``.
* This method should return a tuple containing the gradients of all inputs, in the same order
as the ``inputs`` argument of :meth:`forward`. A ``Tensor`` could be returned
instead if there is only one input. If users want to stop the propagation of some gradients,
the corresponding returned values should be set to ``None``.
"""
raise NotImplementedError
......
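A hedged usage sketch for the ``Sigmoid`` example above, assuming that class has been defined as shown; note the docstring's rule that each ``Function`` instance is used only once during forwarding:

.. code-block:: python

    import megengine as mge
    from megengine.autodiff import GradManager

    x = mge.tensor([0.0, 1.0, -1.0])
    gm = GradManager()
    gm.attach([x])
    with gm:
        y = Sigmoid()(x)      # a fresh instance per forward pass
        gm.backward(y.sum())
    print(x.grad)             # equals y * (1 - y), from backward()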
......@@ -12,16 +12,14 @@ _low_prec_dtype = "float16"
@property
def enabled(mod):
r"""
Get or set amp autocast mode enabled or not.
r"""Get or set amp autocast mode enabled or not.
Examples:
.. code-block::
import megengine as mge
mge.amp.enabled = True
"""
return _enabled
......@@ -34,17 +32,15 @@ def enabled(mod, enabled: bool):
@property
def high_prec_dtype(mod):
r"""
Get or set amp autocast mode's higher precision dtype. It will change the
r"""Get or set amp autocast mode's higher precision dtype. It will change the
target dtype in tensor casting for better precision. Default: float32.
Examples:
.. code-block::
import megengine as mge
mge.amp.high_prec_dtype = "float32"
"""
return _high_prec_dtype
......@@ -57,17 +53,15 @@ def high_prec_dtype(mod, dtype: str):
@property
def low_prec_dtype(mod):
r"""
Get or set amp autocast mode's lower precision dtype. It will change the
r"""Get or set amp autocast mode's lower precision dtype. It will change the
target dtype in tensor casting for better speed and memory. Default: float16.
Examples:
.. code-block::
import megengine as mge
mge.amp.low_prec_dtype = "float16"
"""
return _low_prec_dtype
......
......@@ -389,9 +389,7 @@ class ArrayMethodMixin(abc.ABC):
@property
def ndim(self):
r"""
Returns the number of dimensions of self :class:`~.Tensor`.
"""
r"""Returns the number of dimensions of self :class:`~.Tensor`."""
shape = self._tuple_shape
if shape is None:
raise ValueError("unkown ndim")
......@@ -399,8 +397,7 @@ class ArrayMethodMixin(abc.ABC):
@property
def size(self):
r"""
Returns the size of the self :class:`~.Tensor`.
r"""Returns the size of the self :class:`~.Tensor`.
The returned value is a subclass of :class:`tuple`.
"""
shape = self.shape
......@@ -410,14 +407,11 @@ class ArrayMethodMixin(abc.ABC):
@property
def T(self):
r"""
alias of :attr:`~.Tensor.transpose`.
"""
r"""alias of :attr:`~.Tensor.transpose`."""
return self.transpose()
def item(self, *args):
r"""
Returns the value of this :class:`~.Tensor` as a standard Python :class:`numbers.Number`.
r"""Returns the value of this :class:`~.Tensor` as a standard Python :class:`numbers.Number`.
This only works for tensors with one element. For other cases, see :meth:`~.tolist`.
"""
if not args:
......@@ -427,8 +421,7 @@ class ArrayMethodMixin(abc.ABC):
return self[args].item()
def tolist(self):
r"""
Returns the tensor as a (nested) list.
r"""Returns the tensor as a (nested) list.
For scalars, a standard Python number is returned, just like with :meth:`~.item`.
Tensors are automatically moved to the CPU first if necessary.
......@@ -437,16 +430,13 @@ class ArrayMethodMixin(abc.ABC):
return self.numpy().tolist()
def astype(self, dtype):
r"""
Returns a :class:`Tensor` with the same data and number of elements
r"""Returns a :class:`Tensor` with the same data and number of elements
with the specified :attr:`~.Tensor.dtype`.
"""
return astype(self, dtype)
def reshape(self, *args):
r"""
See :func:`~.reshape`.
"""
r"""See :func:`~.reshape`."""
return _reshape(self, _expand_args(args))
# FIXME: remove this method
......@@ -454,9 +444,7 @@ class ArrayMethodMixin(abc.ABC):
return _broadcast(self, _expand_args(args))
def transpose(self, *args):
r"""
See :func:`~.transpose`.
"""
r"""See :func:`~.transpose`."""
if self.ndim == 0:
assert (
len(args) == 0
......@@ -469,172 +457,170 @@ class ArrayMethodMixin(abc.ABC):
return _transpose(self, _expand_args(args))
def flatten(self):
r"""
See :func:`~.flatten`.
"""
r"""See :func:`~.flatten`."""
return self.reshape(-1)
def sum(self, axis=None, keepdims: bool = False):
r"""
Returns the sum of each row of the input tensor in the given dimension ``axis``.
r"""Returns the sum of each row of the input tensor in the given dimension ``axis``.
If ``axis`` is a list of axes, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.
Returns:
output tensor.
Examples:
.. testcode::
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.sum().numpy())
print(b.sum().numpy())
Outputs:
.. testoutput::
2
10.0
"""
return _reduce("sum")(self, axis, keepdims)
def prod(self, axis=None, keepdims: bool = False):
r"""
Returns the product of each row of the input tensor in the given dimension ``axis``.
r"""Returns the product of each row of the input tensor in the given dimension ``axis``.
If ``axis`` is a list of axes, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.
Returns:
output tensor.
Examples:
.. testcode::
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.prod().numpy())
print(b.prod().numpy())
Outputs:
.. testoutput::
0
24.0
"""
return _reduce("product")(self, axis, keepdims)
def min(self, axis=None, keepdims: bool = False):
r"""
Returns the min value of each row of the input tensor in the given dimension ``axis``.
r"""Returns the min value of each row of the input tensor in the given dimension ``axis``.
If ``axis`` is a list of axes, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.
Returns:
output tensor.
Examples:
.. testcode::
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.min().numpy())
print(b.min().numpy())
Outputs:
.. testoutput::
False
1.0
"""
return _reduce("min")(self, axis, keepdims)
def max(self, axis=None, keepdims: bool = False):
r"""
Returns the max value of each row of the input tensor in the given dimension ``axis``.
r"""Returns the max value of each row of the input tensor in the given dimension ``axis``.
If ``axis`` is a list of axes, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.
Returns:
output tensor.
Examples:
.. testcode::
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.max().numpy())
print(b.max().numpy())
Outputs:
.. testoutput::
True
4.0
"""
return _reduce("max")(self, axis, keepdims)
def mean(self, axis=None, keepdims: bool = False):
r"""
Returns the mean value of each row of the input tensor in the given dimension ``axis``.
r"""Returns the mean value of each row of the input tensor in the given dimension ``axis``.
If ``axis`` is a list of axes, reduce over all of them.
If ``keepdims`` is ``True``, the shape of output tensor is the same as the input tensor,
except in the dimension(s) ``axis`` where it is of size 1.
Otherwise, ``axis`` is squeezed (see :func:`~.squeeze`).
Args:
axis: the dimension or dimensions to reduce.
keepdims: whether the output tensor has ndim retained or not.
Returns:
output tensor.
Examples:
.. testcode::
from megengine import tensor
a = tensor([False, True, True, False])
b = tensor([1.0, 2.0, 3.0, 4.0])
print(a.mean().numpy())
print(b.mean().numpy())
Outputs:
.. testoutput::
0.5
2.5
"""
return _reduce("mean")(self, axis, keepdims)
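The testcode blocks above only exercise full reductions; a short hedged sketch of the documented ``axis`` and ``keepdims`` arguments:

.. code-block:: python

    from megengine import tensor

    m = tensor([[1.0, 2.0], [3.0, 4.0]])
    print(m.sum(axis=0).numpy())                  # [4. 6.]
    print(m.mean(axis=1, keepdims=True).numpy())  # [[1.5] [3.5]], shape (2, 1)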
......@@ -47,17 +47,17 @@ class QuantDtypeMeta(
["name", "cname", "np_dtype_str", "qmin", "qmax", "is_unsigned"],
)
):
r"""
Store metadata for quantize dtype. Could be used to create custom quant dtype
r"""Store metadata for quantize dtype. Could be used to create custom quant dtype
for QAT when the network doesn't need to be converted for inference, but only
to export network metadata for third-party platform inference.
Args:
name: a unique name string.
cname: used in :func:`~.create_quantized_dtype` for model dump and inference.
np_dtype_str: used in :func:`~.create_quantized_dtype` to generate ``np.dtype``.
qmin: an int number indicating quant dtype's lowerbound.
qmax: an int number indicating quant dtype's upperbound.
is_unsigned: a helper value that could be inferred from np_dtype_str.
"""
def __new__(
......@@ -77,7 +77,7 @@ class QuantDtypeMeta(
return self
def __deepcopy__(self, _):
"""
r"""
Ignore deepcopy so that a dtype meta can be treated as singleton, for more
strict check in :meth:`~.FakeQuantize.fake_quant_forward`.
"""
......@@ -113,17 +113,17 @@ def _check_zero_point(zp: int, dtype_meta: QuantDtypeMeta):
def create_quantized_dtype(
dtype_meta: QuantDtypeMeta, scale: float, zp: Union[int, None]
):
r"""
Get quantized dtype with metadata attribute according to _metadata_dict.
r"""Get quantized dtype with metadata attribute according to _metadata_dict.
Note that unsigned dtype must have ``zero_point`` and signed dtype must
not have ``zero_point``, to be consistent with tensor generated by calling
compiled function from `CompGraph.compile(inputs, outspec)`.
Args:
dtype_meta: a QuantDtypeMeta indicating which dtype to return. the
``cname`` attribute cannot be ``None``.
scale: a number for scale to store in dtype's metadata
zp: a number for zero_point to store in dtype's metadata
"""
if dtype_meta.cname is None:
raise ValueError("dtype {} without cname attr is not supported.")
......@@ -152,8 +152,7 @@ def create_quantized_dtype(
def quint8(scale, zero_point):
"""
Consturct a quantized unsigned int8 data type with ``scale`` (float) and
r"""Consturct a quantized unsigned int8 data type with ``scale`` (float) and
``zero_point`` (uint8). The real value represented by a quint8 data type is
float_val = scale * (uint8_val - zero_point)
"""
......@@ -161,24 +160,21 @@ def quint8(scale, zero_point):
def qint8(scale):
"""
Construct a quantized int8 data type with ``scale`` (float). The real value
r"""Construct a quantized int8 data type with ``scale`` (float). The real value
represented by a qint8 data type is float_val = scale * int8_val
"""
return create_quantized_dtype(_builtin_quant_dtypes["qint8"], scale, None)
def qint32(scale):
"""
Construct a quantized int32 data type with ``scale`` (float). The real value
r"""Construct a quantized int32 data type with ``scale`` (float). The real value
represented by a qint32 data type is float_val = scale * int32_val
"""
return create_quantized_dtype(_builtin_quant_dtypes["qint32"], scale, None)
def quint4(scale, zero_point):
"""
Consturct a quantized unsigned int4 data type with ``scale`` (float) and
r"""Consturct a quantized unsigned int4 data type with ``scale`` (float) and
``zero_point`` (uint8). The real value represented by a quint4 data type is
float_val = scale * (uint4_val - zero_point)
"""
......@@ -186,8 +182,7 @@ def quint4(scale, zero_point):
def qint4(scale):
"""
Construct a quantized int4 data type with ``scale`` (float). The real value
r"""Construct a quantized int4 data type with ``scale`` (float). The real value
represented by a qint4 data type is float_val = scale * int4_val
"""
return create_quantized_dtype(_builtin_quant_dtypes["qint4"], scale, None)
......@@ -244,95 +239,95 @@ def _convert_from_quantized_dtype(arr: np.ndarray, dtype_meta: QuantDtypeMeta):
def convert_to_quint8(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a quint8 one with specified params.
r"""Quantize a float NumPy ndarray into a quint8 one with specified params.
Args:
arr: Input ndarray.
q: Target data type, should be a quint8.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["quint8"])
def convert_from_quint8(arr: np.ndarray):
"""
Dequantize a quint8 NumPy ndarray into a float one.
r"""Dequantize a quint8 NumPy ndarray into a float one.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["quint8"])
def convert_to_qint8(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a qint8 one with specified params.
r"""Quantize a float NumPy ndarray into a qint8 one with specified params.
Args:
arr: Input ndarray.
q: Target data type, should be a qint8.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["qint8"])
def convert_from_qint8(arr: np.ndarray):
"""
Dequantize a qint8 NumPy ndarray into a float one.
r"""Dequantize a qint8 NumPy ndarray into a float one.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["qint8"])
def convert_to_qint32(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a qint32 one with specified params.
r"""Quantize a float NumPy ndarray into a qint32 one with specified params.
Args:
arr: Input ndarray.
q: Target data type, should be a qint32.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["qint32"])
def convert_from_qint32(arr):
"""
Dequantize a qint32 NumPy ndarray into a float one.
r"""Dequantize a qint32 NumPy ndarray into a float one.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["qint32"])
def convert_to_quint4(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a quint4 one with specified params.
r"""Quantize a float NumPy ndarray into a quint4 one with specified params.
Args:
arr: Input ndarray.
q: Target data type, should be a quint4.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["quint4"])
def convert_from_quint4(arr: np.ndarray):
"""
Dequantize a quint4 NumPy ndarray into a float one.
r"""Dequantize a quint4 NumPy ndarray into a float one.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["quint4"])
def convert_to_qint4(arr: np.ndarray, q: np.dtype):
"""
Quantize a float NumPy ndarray into a qint4 one with specified params.
r"""Quantize a float NumPy ndarray into a qint4 one with specified params.
Args:
arr: Input ndarray.
q: Target data type, should be a qint4.
"""
return _convert_to_quantized_dtype(arr, q, _builtin_quant_dtypes["qint4"])
def convert_from_qint4(arr: np.ndarray):
"""
Dequantize a qint4 NumPy ndarray into a float one.
r"""Dequantize a qint4 NumPy ndarray into a float one.
Args:
arr: Input ndarray.
"""
return _convert_from_quantized_dtype(arr, _builtin_quant_dtypes["qint4"])
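A hedged round-trip sketch for the qint8 helpers above; the import path is assumed from this module's location:

.. code-block:: python

    import numpy as np
    # Assumed import path for the helpers defined in this file.
    from megengine.core.tensor.dtype import qint8, convert_to_qint8, convert_from_qint8

    scale = 0.1
    q = qint8(scale)                          # qint8 dtype carrying scale metadata
    arr = np.array([0.25, -0.5, 1.0], dtype=np.float32)
    quantized = convert_to_qint8(arr, q)      # int8 values, roughly arr / scale
    restored = convert_from_qint8(quantized)  # scale * int8_val, close to arr
    print(quantized, restored)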
......@@ -24,11 +24,11 @@ from .core import TensorBase
def set_priority_to_id(dest_vars):
"""
For all oprs in the subgraph constructed by dest_vars,
r"""For all oprs in the subgraph constructed by dest_vars,
sets its priority to id if its original priority is zero.
Args:
dest_vars: target vars representing the graph.
"""
dest_vec = []
for i in dest_vars:
......@@ -220,54 +220,50 @@ class OpNode:
def optimize_for_inference(dest_vars, **kwargs):
r"""
Applies optimize_for_inference pass for computing graph.
:param dest_vars: list of output vars in the computing graph
:Keyword Arguments:
* enable_io16xc32 --
whether to use float16 for I/O between oprs and use
float32 as internal computation precision. Note the output var would be
changed to float16.
* enable_ioc16 --
whether to use float16 for both I/O and computation
precision.
* enable_hwcd4 --
whether to use NHWCD4 data layout. This is faster on some
OpenCL backend.
* enable_nchw88 --
whether to use NCHW88 data layout, currently
used in X86 AVX backend.
* enable_nchw44 --
whether to use NCHW44 data layout, currently
used in arm backend.
* enable_nchw44_dot --
whether to use NCHW44_dot data layout, currently
used in armv8.2+dotprod backend.
* enable_nchw4 --
whether to use NCHW4 data layout, currently
used in nvidia backend(based on cudnn).
* enable_nchw32 --
whether to use NCHW32 data layout, currently
used in nvidia backend with tensorcore(based on cudnn).
* enable_chwn4 --
whether to use CHWN4 data layout, currently
used in nvidia backend with tensorcore.
* enable_nchw64 --
whether to use NCHW64 data layout, used for fast int4
support on Nvidia GPU.
* enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty
into one opr.
* enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
input for inference on nvidia backend(this optimization pass will
result in mismatch of the precision of output of training and
inference)
* enable_fuse_preprocess: whether to fuse astype\pad channel\dimshuffle and
etc opr from h2d opr.
r"""Applies optimize_for_inference pass for computing graph.
Args:
dest_vars: list of output vars in the computing graph
Keyword Arguments:
* enable_io16xc32 --
whether to use float16 for I/O between oprs and use
float32 as internal computation precision. Note the output var would be
changed to float16.
* enable_ioc16 --
whether to use float16 for both I/O and computation
precision.
* enable_hwcd4 --
whether to use NHWCD4 data layout. This is faster on some
OpenCL backend.
* enable_nchw88 --
whether to use NCHW88 data layout, currently
used in X86 AVX backend.
* enable_nchw44 --
whether to use NCHW44 data layout, currently
used in arm backend.
* enable_nchw44_dot --
whether to use NCHW44_dot data layout, currently
used in armv8.2+dotprod backend.
* enable_nchw4 --
whether to use NCHW4 data layout, currently
used in nvidia backend(based on cudnn).
* enable_nchw32 --
whether to use NCHW32 data layout, currently
used in nvidia backend with tensorcore(based on cudnn).
* enable_chwn4 --
whether to use CHWN4 data layout, currently
used in nvidia backend with tensorcore.
* enable_nchw64 --
whether to use NCHW64 data layout, used for fast int4
support on Nvidia GPU.
* enable_fuse_conv_bias_nonlinearity: whether to fuse conv+bias+nonlinearty
into one opr.
* enable_fuse_conv_bias_with_z: whether to fuse conv_bias with z
input for inference on nvidia backend(this optimization pass will
result in mismatch of the precision of output of training and
inference)
* enable_fuse_preprocess --
whether to fuse astype, pad channel, dimshuffle and
other oprs from the h2d opr.
"""
inference_options = GraphOptimizeOptions()
inference_optimize_layout_transform_map = {
......@@ -305,11 +301,13 @@ def optimize_for_inference(dest_vars, **kwargs):
def deserialize_infer_option(x: int) -> Dict[str, bool]:
r"""
Deserailize optimize options generated by ``imperative_rt.GraphOptimizeOptions``.
r"""Deserailize optimize options generated by ``imperative_rt.GraphOptimizeOptions``.
Args:
x: inference options represented by int.
Returns:
inference options represented by dict.
"""
inference_options = GraphOptimizeOptions.deserialize(x)
......@@ -346,13 +344,12 @@ def deserialize_infer_option(x: int) -> Dict[str, bool]:
def modify_opr_algo_strategy_inplace(dest_vars, strategy: str):
"""
C++ graph version of :func:`~.set_execution_strategy`. Used to inplacely modify
r"""C++ graph version of :func:`~.set_execution_strategy`. Used to inplacely modify
dumped graph's fast-run strategy.
Args:
dest_vars: list of output vars in the computing graph.
strategy: fast-run algorithms strategy.
"""
dest_vars = _unwrap(dest_vars)
_imperative_rt.modify_opr_algo_strategy_inplace(dest_vars, strategy)
......@@ -383,39 +380,40 @@ def dump_graph(
append_json=False,
metadata=None
) -> Tuple[bytes, CompGraphDumpResult]:
"""
serialize the computing graph of `output_vars` and get byte result.
:param output_vars: output variables which are the graph's end point.
.. note::
The underlying C++ API only accepts a var list. If a dict is given,
the vars would be renamed to the given names.
:param keep_var_name: level for keeping variable names:
* 0: none of the names are kept
* 1: (default)keep names of output vars
* 2: keep names of all (output and internal) vars
:param keep_opr_name: whether to keep operator names.
:param keep_param_name: whether to keep param names, so param values can be
easily manipulated after loading model
:param keep_opr_priority: whether to keep priority setting for operators
:param strip_info_file: a string for path or a file handler. if is not None,
then the dump information for code strip would be written to ``strip_info_file``
:param append_json: will be check when `strip_info_file` is not None. if set
true, the information for code strip will be append to strip_info_file.
if set false, will rewrite strip_info_file
:return: dump result as byte string, and an instance of namedtuple
r"""serialize the computing graph of `output_vars` and get byte result.
Args:
output_vars: output variables which are the graph's end point.
keep_var_name: level for keeping variable names:
* 0: none of the names are kept
* 1: (default)keep names of output vars
* 2: keep names of all (output and internal) vars
keep_opr_name: whether to keep operator names.
keep_param_name: whether to keep param names, so param values can be
easily manipulated after loading model
keep_opr_priority: whether to keep priority setting for operators
strip_info_file: a string for path or a file handler. if is not None,
then the dump information for code strip would be written to ``strip_info_file``
append_json: will be checked when ``strip_info_file`` is not None. If set
to True, the information for code strip will be appended to ``strip_info_file``;
if set to False, ``strip_info_file`` will be rewritten.
Note:
The underlying C++ API only accepts a var list. If a dict is given,
the vars would be renamed to the given names.
Returns:
dump result as byte string, and an instance of namedtuple
:class:`CompGraphDumpResult`, whose fields are:
* ``nr_opr`` number of operators dumped
* ``tot_bytes`` total bytes for the whole graph
* ``tensor_value_bytes`` bytes consumed for dumping tensor values
* ``inputs`` names of input tensors
* ``params`` list of names of dumped params
* ``outputs`` names of output vars
"""
if isinstance(output_vars, dict):
used_vars = set()
......@@ -483,17 +481,19 @@ CompGraphLoadResult = collections.namedtuple(
def load_graph(fpath) -> CompGraphLoadResult:
"""
Load a serialized computing graph from file.
r"""Load a serialized computing graph from file.
Args:
fpath: Path or Handle of the input file
Returns:
An instance of namedtuple :class:`CompGraphLoadResult`,
whose fields are:
* ``graph`` loaded CompGraph
* ``output_vars_dict`` A Python dict, mapping name to output SymbolVar
* ``output_vars_list`` A Python list, containing output vars in the
order passed to serialize_comp_graph_to_file
"""
output_vars_map = []
output_vars_list = []
......
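A hedged sketch of consuming the documented ``CompGraphLoadResult`` fields; the import path is assumed from this module's location and ``model.mge`` is a placeholder path to a graph previously produced by ``dump_graph``:

.. code-block:: python

    # Assumed import path; "model.mge" is a placeholder file.
    from megengine.core.tensor.megbrain_graph import load_graph

    ret = load_graph("model.mge")
    graph = ret.graph                  # loaded CompGraph
    print(list(ret.output_vars_dict))  # names mapped to output SymbolVars
    print(ret.output_vars_list)        # output vars in the order used at dump time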
......@@ -24,12 +24,12 @@ _enable_convert_inputs = True
def get_convert_inputs():
""" get the curerent state of `_enable_convert_inputs` """
r"""get the curerent state of `_enable_convert_inputs`"""
return _enable_convert_inputs
def set_convert_inputs(flag):
""" This function is a temporary workaround for reducing the overhead of operator
r"""This function is a temporary workaround for reducing the overhead of operator
invocations. The function `convert_inputs` is disabled if the global state
`_enable_convert_inputs` is set to `False`, otherwise enabled. This function is for
internal use only, and should be removed when the tensor-like system is refactored.
......@@ -137,11 +137,11 @@ def setscalar(x):
def astensor1d(x, *reference, dtype=None, device=None):
"""
Convert something to 1D tensor. Support following types
* sequence of scalar literal / tensor
* numpy array
* tensor (returned as is, regardless of dtype and device)
"""Convert something to 1D tensor. Support following types
* sequence of scalar literal / tensor
* numpy array
* tensor (returned as is, regardless of dtype and device)
"""
try:
ndim = x.ndim
......
......@@ -33,16 +33,11 @@ default_collate_err_msg_format = (
class Collator:
r"""
Used for merging a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a dataset.
r"""Used for merging a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a dataset.
Modified from https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py
"""
def apply(self, inputs):
"""
:param inputs: sequence_N(tuple(CHW, C, CK)).
:return: tuple(NCHW, NC, NCK).
"""
elem = inputs[0]
elem_type = type(elem)
if (
......
......@@ -44,28 +44,28 @@ def raise_timeout_error():
class DataLoader:
r"""Provides a convenient way to iterate on a given dataset.
DataLoader combines a dataset with
:class:`~.Sampler`, :class:`~.Transform` and :class:`~.Collator`,
making it flexible to get a minibatch continually from a dataset.
Args:
dataset: dataset from which to load the minibatch.
sampler: defines the strategy to sample data from the dataset.
transform: defines the transforming strategy for a sampled batch.
Default: None
collator: defines the merging strategy for a transformed batch.
Default: None
num_workers: the number of sub-processes to load, transform and collate
the batch. ``0`` means using single-process. Default: 0
timeout: if positive, means the timeout value (in seconds) for collecting a
batch from workers. Default: 0
timeout_event: callback function triggered by timeout, default to raise
runtime error.
divide: defines the parallel strategy in multi-processing mode.
``True`` means one batch is divided into :attr:`num_workers` pieces, and
the workers will process these pieces in parallel. ``False`` means
different sub-process will process different batch. Default: False
"""
__initialized = False
......
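A minimal hedged sketch wiring the documented arguments together; the array data is an illustrative placeholder:

.. code-block:: python

    import numpy as np
    from megengine.data import DataLoader, SequentialSampler
    from megengine.data.dataset import ArrayDataset

    images = np.random.random((10, 3, 4, 4)).astype("float32")
    labels = np.arange(10).astype("int32")
    dataset = ArrayDataset(images, labels)
    sampler = SequentialSampler(dataset, batch_size=4)
    dataloader = DataLoader(dataset, sampler=sampler)  # default Transform/Collator
    for batch_images, batch_labels in dataloader:
        print(batch_images.shape, batch_labels.shape)  # (4, 3, 4, 4) (4,)
        break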
......@@ -11,8 +11,7 @@ from typing import Tuple
class Dataset(ABC):
r"""
An abstract base class for all datasets.
r"""An abstract base class for all datasets.
``__getitem__`` and ``__len__`` methods are additionally needed.
"""
......@@ -31,8 +30,7 @@ class Dataset(ABC):
class StreamDataset(Dataset):
r"""
An abstract class for stream data.
r"""An abstract class for stream data.
``__iter__`` method is additionally needed.
"""
......@@ -53,10 +51,9 @@ class StreamDataset(Dataset):
class ArrayDataset(Dataset):
r"""
ArrayDataset is a dataset for numpy array data.
r"""ArrayDataset is a dataset for numpy array data.
One or more numpy arrays are needed to initiate the dataset.
And the dimensions representing the sample number are expected to be the same.
"""
......
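A hedged sketch of the map-style contract described above, implementing ``__getitem__`` and ``__len__`` on a toy dataset:

.. code-block:: python

    import numpy as np
    from megengine.data.dataset import Dataset

    class SquaresDataset(Dataset):
        # Illustrative map-style dataset: item i is the pair (i, i**2).
        def __init__(self, n=100):
            self.data = np.arange(n)

        def __getitem__(self, index):
            x = self.data[index]
            return x, x ** 2

        def __len__(self):
            return len(self.data)

    ds = SquaresDataset()
    print(len(ds), ds[3])  # 100 (3, 9)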
......@@ -21,8 +21,7 @@ logger = get_logger(__name__)
class CIFAR10(VisionDataset):
r""" :class:`~.Dataset` for CIFAR10 meta data.
"""
r""":class:`~.Dataset` for CIFAR10 meta data."""
url_path = "http://www.cs.utoronto.ca/~kriz/"
raw_file_name = "cifar-10-python.tar.gz"
......@@ -138,8 +137,7 @@ class CIFAR10(VisionDataset):
class CIFAR100(CIFAR10):
r""" :class:`~.Dataset` for CIFAR100 meta data.
"""
r""":class:`~.Dataset` for CIFAR100 meta data."""
url_path = "http://www.cs.utoronto.ca/~kriz/"
raw_file_name = "cifar-100-python.tar.gz"
......
......@@ -23,9 +23,7 @@ from .meta_vision import VisionDataset
class Cityscapes(VisionDataset):
r"""
`Cityscapes <http://www.cityscapes-dataset.com/>`_ Dataset.
"""
r"""`Cityscapes <http://www.cityscapes-dataset.com/>`_ Dataset."""
supported_order = (
"image",
......
......@@ -46,9 +46,7 @@ def has_valid_annotation(anno, order):
class COCO(VisionDataset):
r"""
`MS COCO <http://cocodataset.org/#home>`_ Dataset.
"""
r"""`MS COCO <http://cocodataset.org/#home>`_ Dataset."""
supported_order = (
"image",
......
......@@ -26,22 +26,21 @@ from .utils import is_img
class ImageFolder(VisionDataset):
r"""
ImageFolder is a class for loading image data and labels from a organized folder.
r"""ImageFolder is a class for loading image data and labels from a organized folder.
The folder is expected to be organized as follows: root/cls/xxx.img_ext
Labels are indices of sorted classes in the root directory.
Args:
root: root directory of an image folder.
loader: a function used to load image from path,
if ``None``, default function that loads
images with PIL will be called.
check_valid_func: a function used to check if files in folder are
expected image files, if ``None``, default function
that checks file extensions will be called.
class_name: if ``True``, return class name instead of class index.
"""
def __init__(self, root: str, check_valid_func=None, class_name: bool = False):
......
......@@ -30,11 +30,10 @@ logger = get_logger(__name__)
class ImageNet(ImageFolder):
r"""
Load ImageNet from raw files or folder. Expected folder looks like:
.. code-block:: bash
r"""Load ImageNet from raw files or folder. Expected folder looks like:
.. code-block:: shell
${root}/
| [REQUIRED TAR FILES]
|- ILSVRC2012_img_train.tar
......@@ -45,22 +44,8 @@ class ImageNet(ImageFolder):
|- val/cls/xxx.${img_ext}
|- ILSVRC2012_devkit_t12/data/meta.mat
|- ILSVRC2012_devkit_t12/data/ILSVRC2012_validation_ground_truth.txt
If the image folders don't exist, raw tar files are required to get extracted and processed.
"""
raw_file_meta = {
"train": ("ILSVRC2012_img_train.tar", "1d675b47d978889d74fa0da5fadfb00e"),
"val": ("ILSVRC2012_img_val.tar", "29b22e2961454d5413ddabcf34fc5622"),
"devkit": ("ILSVRC2012_devkit_t12.tar.gz", "fa75699e90414af021442c21a62c3abf"),
} # ImageNet raw files
default_train_dir = "train"
default_val_dir = "val"
default_devkit_dir = "ILSVRC2012_devkit_t12"
def __init__(self, root: str = None, train: bool = True, **kwargs):
r"""
Initialization:
* if ``root`` contains ``self.target_folder`` depending on ``train``:
......@@ -77,10 +62,22 @@ class ImageNet(ImageFolder):
* raise error.
Args:
root: root directory of imagenet data, if root is ``None``, use default_dataset_root.
train: if ``True``, load the train split, otherwise load the validation split.
"""
raw_file_meta = {
"train": ("ILSVRC2012_img_train.tar", "1d675b47d978889d74fa0da5fadfb00e"),
"val": ("ILSVRC2012_img_val.tar", "29b22e2961454d5413ddabcf34fc5622"),
"devkit": ("ILSVRC2012_devkit_t12.tar.gz", "fa75699e90414af021442c21a62c3abf"),
} # ImageNet raw files
default_train_dir = "train"
default_val_dir = "val"
default_devkit_dir = "ILSVRC2012_devkit_t12"
def __init__(self, root: str = None, train: bool = True, **kwargs):
# process the root path
if root is None:
self.root = self._default_root
......
......@@ -22,8 +22,7 @@ logger = get_logger(__name__)
class MNIST(VisionDataset):
r""" :class:`~.Dataset` for MNIST meta data.
"""
r""":class:`~.Dataset` for MNIST meta data."""
url_path = "http://yann.lecun.com/exdb/mnist/"
"""
......
......@@ -23,9 +23,7 @@ from .meta_vision import VisionDataset
class Objects365(VisionDataset):
r"""
`Objects365 <https://www.objects365.org/overview.html>`_ Dataset.
"""
r"""`Objects365 <https://www.objects365.org/overview.html>`_ Dataset."""
supported_order = (
"image",
......
......@@ -24,9 +24,7 @@ from .meta_vision import VisionDataset
class PascalVOC(VisionDataset):
r"""
`Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ Dataset.
"""
r"""`Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ Dataset."""
supported_order = (
"image",
......
......@@ -17,9 +17,7 @@ import megengine.distributed as dist
class Sampler(ABC):
r"""
An abstract base class for all Sampler
"""
r"""An abstract base class for all Sampler"""
@abstractmethod
def __init__(self):
......@@ -27,19 +25,19 @@ class Sampler(ABC):
class MapSampler(Sampler):
r"""
Sampler for map dataset.
:param dataset: dataset to sample from.
:param batch_size: batch size for batch method.
:param drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
:param num_samples: number of samples assigned to one rank.
:param world_size: number of ranks.
:param rank: rank id, non-negative interger within 0 and ``world_size``.
:param seed: seed for random operators.
r"""Sampler for map dataset.
Args:
dataset: dataset to sample from.
batch_size: batch size for batch method.
drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
num_samples: number of samples assigned to one rank.
world_size: number of ranks.
rank: rank id, non-negative integer within 0 and ``world_size``.
seed: seed for random operators.
"""
def __init__(
......@@ -106,14 +104,11 @@ class MapSampler(Sampler):
return int(math.ceil(self.num_samples / self.batch_size))
def sample(self):
"""
Return a list contains all sample indices.
"""
r"""Return a list contains all sample indices."""
raise NotImplementedError
def scatter(self, indices) -> List:
r"""
Scatter method is used for splitting indices into subset, each subset
r"""Scatter method is used for splitting indices into subset, each subset
will be assigned to a rank. Indices are evenly split by default.
If a customized indices assignment method is needed, please override this method.
"""
......@@ -130,9 +125,7 @@ class MapSampler(Sampler):
return indices
def batch(self) -> Iterator[List[Any]]:
r"""
Batch method provides a batch indices generator.
"""
r"""Batch method provides a batch indices generator."""
indices = list(self.sample())
# user might pass the world_size parameter without dist,
......@@ -150,18 +143,15 @@ class MapSampler(Sampler):
class StreamSampler(Sampler):
r"""
Sampler for stream dataset.
.. warning::
r"""Sampler for stream dataset.
Warning:
In the case of multiple machines, sampler should ensure that each worker gets
different data. But this class cannot do it yet, please build your own
dataset and sampler to achieve this goal.
Usually, :meth:`~.StreamDataset.__iter__` can return different iterator by
``rank = dist.get_rank()``. So that they will get different data.
"""
def __init__(self, batch_size=1):
......@@ -175,18 +165,18 @@ class StreamSampler(Sampler):
class SequentialSampler(MapSampler):
r"""
Sample elements sequentially.
:param dataset: dataset to sample from.
:param batch_size: batch size for batch method.
:param drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
:param indices: indice of samples.
:param world_size: number of ranks.
:param rank: rank id, non-negative interger within 0 and ``world_size``.
r"""Sample elements sequentially.
Args:
dataset: dataset to sample from.
batch_size: batch size for batch method.
drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
indices: indices of samples.
world_size: number of ranks.
rank: rank id, non-negative integer within 0 and ``world_size``.
"""
def __init__(
......@@ -207,9 +197,7 @@ class SequentialSampler(MapSampler):
self.indices = indices
def sample(self) -> Iterator[Any]:
r"""
Return a generator.
"""
r"""Return a generator."""
if self.indices is None:
return iter(range(len(self.dataset)))
else:
......@@ -217,19 +205,19 @@ class SequentialSampler(MapSampler):
class RandomSampler(MapSampler):
r"""
Sample elements randomly without replacement.
:param dataset: dataset to sample from.
:param batch_size: batch size for batch method.
:param drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
:param indices: indice of samples.
:param world_size: number of ranks.
:param rank: rank id, non-negative interger within 0 and ``world_size``.
:param seed: seed for random operators.
r"""Sample elements randomly without replacement.
Args:
dataset: dataset to sample from.
batch_size: batch size for batch method.
drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
indices: indices of samples.
world_size: number of ranks.
rank: rank id, non-negative integer within 0 and ``world_size``.
seed: seed for random operators.
"""
def __init__(
......@@ -258,20 +246,20 @@ class RandomSampler(MapSampler):
class ReplacementSampler(MapSampler):
r"""
Sample elements randomly with replacement.
:param dataset: dataset to sample from.
:param batch_size: batch size for batch method.
:param drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
:param num_samples: number of samples assigned to one rank.
:param weights: weights for sampling indices, it could be unnormalized weights.
:param world_size: number of ranks.
:param rank: rank id, non-negative interger within 0 and ``world_size``.
:param seed: seed for random operators.
r"""Sample elements randomly with replacement.
Args:
dataset: dataset to sample from.
batch_size: batch size for batch method.
drop_last: set ``True`` to drop the last incomplete batch,
if the dataset size is not divisible by the batch size. If ``False`` and
the size of dataset is not divisible by the batch_size, then the last batch will
be smaller. Default: False
num_samples: number of samples assigned to one rank.
weights: weights for sampling indices, it could be unnormalized weights.
world_size: number of ranks.
rank: rank id, non-negative integer within 0 and ``world_size``.
seed: seed for random operators.
"""
def __init__(
......
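A hedged sketch contrasting the sequential and random samplers documented above; the one-array dataset is illustrative:

.. code-block:: python

    import numpy as np
    from megengine.data import RandomSampler, SequentialSampler
    from megengine.data.dataset import ArrayDataset

    dataset = ArrayDataset(np.arange(10))
    seq = SequentialSampler(dataset, batch_size=4)      # [0..3], [4..7], [8, 9]
    rnd = RandomSampler(dataset, batch_size=4, seed=0)  # shuffled without replacement
    print(list(seq.batch()))
    print(list(rnd.batch()))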
......@@ -59,15 +59,13 @@ class _PlasmaStoreManager:
class PlasmaShmQueue:
def __init__(self, maxsize: int = 0):
r"""
Use pyarrow in-memory plasma store to implement shared memory queue.
r"""Use pyarrow in-memory plasma store to implement shared memory queue.
Compared to native `multiprocessing.Queue`, `PlasmaShmQueue` avoids pickle/unpickle
and communication overhead, leading to better performance in multi-process
application.
Args:
maxsize: maximum size of the queue, `None` means no limit. (default: ``None``)
"""
# Lazy start the plasma store manager
......
......@@ -11,9 +11,7 @@ from typing import Sequence, Tuple
class Transform(ABC):
"""
Rewrite apply method in subclass.
"""
r"""Rewrite apply method in subclass."""
def apply_batch(self, inputs: Sequence[Tuple]):
return tuple(self.apply(input) for input in inputs)
......
......@@ -15,7 +15,7 @@ import numpy as np
def wrap_keepdims(func):
"""Wraper to keep the dimension of input images unchanged."""
r"""Wraper to keep the dimension of input images unchanged."""
@functools.wraps(func)
def wrapper(image, *args, **kwargs):
......@@ -33,41 +33,47 @@ def wrap_keepdims(func):
@wrap_keepdims
def to_gray(image):
r"""
Change BGR format image's color space to gray.
r"""Change BGR format image's color space to gray.
:param image: input BGR format image, with `(H, W, C)` shape.
:return: gray format image, with `(H, W, C)` shape.
Args:
image: input BGR format image, with `(H, W, C)` shape.
Returns:
gray format image, with `(H, W, C)` shape.
"""
return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
@wrap_keepdims
def to_bgr(image):
r"""
Change gray format image's color space to BGR.
r"""Change gray format image's color space to BGR.
Args:
image: input Gray format image, with `(H, W, C)` shape.
:param image: input Gray format image, with `(H, W, C)` shape.
:return: BGR format image, with `(H, W, C)` shape.
Returns:
BGR format image, with `(H, W, C)` shape.
"""
return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
@wrap_keepdims
def pad(input, size, value):
r"""
Pad input data with *value* and given *size*.
:param input: input data, with `(H, W, C)` shape.
:param size: padding size of input data, it could be integer or sequence.
If it is an integer, the input data will be padded in four directions.
If it is a sequence contains two integer, the bottom and right side
of input data will be padded.
If it is a sequence contains four integer, the top, bottom, left, right
side of input data will be padded with given size.
:param value: padding value of data, could be a sequence of int or float.
If it is float value, the dtype of image will be casted to float32 also.
:return: padded image.
r"""Pad input data with *value* and given *size*.
Args:
input: input data, with `(H, W, C)` shape.
size: padding size of input data; it could be an integer or a sequence.
If it is an integer, the input data will be padded in all four directions.
If it is a sequence containing two integers, the bottom and right sides
of input data will be padded.
If it is a sequence containing four integers, the top, bottom, left and right
sides of input data will be padded with the given sizes.
value: padding value of data; it could be a sequence of int or float.
If it is a float value, the dtype of the image will also be cast to float32.
Returns:
padded image.
"""
if isinstance(size, int):
size = (size, size, size, size)
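The three accepted forms of ``size`` described above, as a quick sketch (the output shapes follow from the padding rules; ``np`` is NumPy):

.. code-block::

    import numpy as np

    img = np.zeros((4, 4, 3), np.uint8)
    pad(img, 1, 0).shape             # all four sides           -> (6, 6, 3)
    pad(img, (2, 2), 0).shape        # bottom and right only    -> (6, 6, 3)
    pad(img, (1, 1, 2, 2), 0).shape  # top, bottom, left, right -> (6, 8, 3)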
......@@ -80,32 +86,33 @@ def pad(input, size, value):
@wrap_keepdims
def flip(image, flipCode):
r"""
Accordding to the flipCode (the type of flip), flip the input image.
:param image: input image, with `(H, W, C)` shape.
:param flipCode: code that indicates the type of flip.
r"""Accordding to the flipCode (the type of flip), flip the input image.
* 1 : Flip horizontally
Args:
image: input image, with `(H, W, C)` shape.
flipCode: code that indicates the type of flip.
* 0 : Flip vertically
* 1 : Flip horizontally
* 0 : Flip vertically
* -1: Flip horizontally and vertically
* -1: Flip horizontally and vertically
:return: BGR format image, with `(H, W, C)` shape.
Returns:
flipped image, with `(H, W, C)` shape.
"""
return cv2.flip(image, flipCode=flipCode)
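The three ``flipCode`` values in action, as a small sketch:

.. code-block::

    import numpy as np

    img = np.arange(4, dtype=np.uint8).reshape(2, 2, 1)
    flip(img, 1)   # flip horizontally
    flip(img, 0)   # flip vertically
    flip(img, -1)  # flip both horizontally and vertically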
@wrap_keepdims
def resize(input, size, interpolation=cv2.INTER_LINEAR):
r"""
Resize the input data to given size.
r"""Resize the input data to given size.
Args:
input: input data, could be image or masks, with `(H, W, C)` shape.
size: target size of input data, with (height, width) shape.
interpolation: interpolation method.
:param input: input data, could be image or masks, with `(H, W, C)` shape.
:param size: target size of input data, with (height, width) shape.
:param interpolation: interpolation method.
:return: resized data, with `(H, W, C)` shape.
Returns:
resized data, with `(H, W, C)` shape.
"""
if len(size) != 2:
raise ValueError("resize needs (h, w), but got {}".format(size))
......
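Note that ``size`` is given as ``(height, width)``; a quick sketch:

.. code-block::

    import numpy as np

    img = np.zeros((32, 48, 3), np.float32)
    out = resize(img, (16, 24))  # out.shape == (16, 24, 3)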
......@@ -54,10 +54,10 @@ _device_type_set = {"cpu", "gpu", "xpu", "rocm"}
def get_device_count(device_type: str) -> int:
"""
Gets number of devices installed on this system.
r"""Gets number of devices installed on this system.
:param device_type: device type, one of 'gpu' or 'cpu'
Args:
device_type: device type, one of {'cpu', 'gpu', 'xpu', 'rocm'}
"""
assert device_type in _device_type_set, "device must be one of {}".format(
_device_type_set
......@@ -67,73 +67,59 @@ def get_device_count(device_type: str) -> int:
def is_cuda_available() -> bool:
"""
Returns whether cuda device is available on this system.
"""
r"""Returns whether cuda device is available on this system."""
t = _str2device_type("gpu")
return CompNode._get_device_count(t, False) > 0
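A small sketch combining the device queries above (assuming both helpers are re-exported at the ``megengine`` top level):

.. code-block::

    import megengine as mge

    if mge.is_cuda_available():
        print("gpu count:", mge.get_device_count("gpu"))
    else:
        print("cpu count:", mge.get_device_count("cpu"))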
def is_cambricon_available() -> bool:
"""
Returns whether cambricon device is available on this system.
"""
r"""Returns whether cambricon device is available on this system."""
t = _str2device_type("cambricon")
return CompNode._get_device_count(t, False) > 0
def is_atlas_available() -> bool:
"""
Returns whether atlas device is available on this system.
"""
r"""Returns whether atlas device is available on this system."""
t = _str2device_type("atlas")
return CompNode._get_device_count(t, False) > 0
def is_rocm_available() -> bool:
"""Returns whether rocm device is available on this system.
"""
r"""Returns whether rocm device is available on this system."""
t = _str2device_type("rocm")
return CompNode._get_device_count(t, False) > 0
def set_default_device(device: str = "xpux"):
r"""
Sets default computing node.
:param device: default device type. The type can be 'cpu0', 'cpu1', etc.,
or 'gpu0', 'gpu1', etc., to specify the particular cpu or gpu to use.
'cpux' and 'gpux' can also be used to specify any number of cpu or gpu devices.
'multithread' device type is avaliable when inference, which implements
multi-threading parallelism at the operator level. For example,
'multithread4' will compute with 4 threads.
The default value is 'xpux' to specify any device available. The priority of using gpu is higher when both gpu and cpu are available.
It can also be set by environment variable `MGE_DEFAULT_DEVICE`.
r"""Sets default computing node.
Args:
device: default device type.
Note:
* The type can be 'cpu0', 'cpu1', etc., or 'gpu0', 'gpu1', etc.,
to specify the particular CPU or GPU to use.
* 'cpux' and 'gpux' can also be used to specify any number of CPU or GPU devices.
* The default value is 'xpux' to specify any device available.
* The priority of using GPU is higher when both GPU and CPU are available.
* 'multithread' device type is available for inference,
which implements multi-threading parallelism at the operator level.
For example, 'multithread4' will compute with 4 threads.
* It can also be set by environment variable ``MGE_DEFAULT_DEVICE``.
"""
assert _valid_device(device), "Invalid device name {}".format(device)
CompNode._set_default_device(device)
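A short sketch of the device-selection API described above:

.. code-block::

    import megengine as mge

    mge.set_default_device("cpu0")   # pin computation to the first CPU
    print(mge.get_default_device())  # -> "cpu0"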
def get_default_device() -> str:
r"""
Gets default computing node.
r"""Gets default computing node.
It returns the value set by :func:`~.set_default_device`.
"""
return CompNode._get_default_device()
def get_mem_status_bytes(device: Optional[str] = None):
r"""
Get total and free memory on the computing device in bytes.
"""
r"""Get total and free memory on the computing device in bytes."""
if device is None:
device = get_default_device()
tot, free = CompNode(device).get_mem_status_bytes
......@@ -150,15 +136,17 @@ def set_prealloc_config(
growth_factor=2.0,
device_type=DeviceType.CUDA,
):
"""
Specifies how to pre-allocate from raw device allocator.
:param alignment: specifies the alignment in bytes.
:param min_req: min request size in bytes.
:param max_overhead: max overhead above required size in bytes.
:param growth_factor: `request size / cur allocated`
:param device_type: the device type
r"""Specifies how to pre-allocate from raw device allocator.
Args:
alignment: specifies the alignment in bytes.
min_req: min request size in bytes.
max_overhead: max overhead above required size in bytes.
growth_factor: `request size / cur allocated`.
device_type: the device type.
"""
assert alignment > 0
assert min_req > 0
......
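A hedged sketch of a call matching the signature above (the concrete byte sizes are illustrative assumptions, not recommended values):

.. code-block::

    set_prealloc_config(
        alignment=64,                   # 64-byte aligned blocks
        min_req=32 * 1024 * 1024,       # request at least 32 MB at a time
        max_overhead=16 * 1024 * 1024,  # allow up to 16 MB slack
        growth_factor=2.0,              # double the request size as usage grows
        device_type=DeviceType.CUDA,
    )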
......@@ -31,17 +31,15 @@ from .server import Client, Server
@mproperty
def backend(mod):
r"""
Get or set backend of collective communication.
r"""Get or set backend of collective communication.
Available backends are ['nccl', 'shm', 'rccl']
Examples:
.. code-block::
import megengine.distributed as dist
dist.backend = "nccl"
.. code-block::
import megengine.distributed as dist
dist.backend = "nccl"
"""
assert group._sd, "please call init_process_group first"
return group._sd.backend
......
......@@ -50,7 +50,7 @@ def _backend():
def collective_comm(inp, mode, group, device):
"""Helper function for applying collective communication functions."""
r"""Helper function for applying collective communication functions."""
assert isinstance(group, Group)
if group is None:
return inp
......@@ -158,8 +158,7 @@ class _ReduceSum(Function):
def reduce_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Reduce tensor data across the specified group by sum.
r"""Reduce tensor data across the specified group by sum.
Only root process will receive the final result.
Args:
......@@ -176,22 +175,20 @@ def reduce_sum(
Reduced tensor if in root process, None in other processes.
Examples:
.. code-block::
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = reduce_sum(input)
# Rank 0 # output: Tensor([1])
# Rank 1 # output: None
input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = reduce_sum(input, group)
# Rank 0 # output: None
# Rank 1 # output: Tensor([1])
.. code-block::
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = reduce_sum(input)
# Rank 0 # output: Tensor([1])
# Rank 1 # output: None
input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = reduce_sum(input, group)
# Rank 0 # output: None
# Rank 1 # output: Tensor([1])
"""
op = _ReduceSum(group, device)
(out,) = apply(op, inp)
......@@ -222,8 +219,7 @@ class _Broadcast(Function):
def broadcast(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Broadcast tensor data from root process to others.
r"""Broadcast tensor data from root process to others.
Args:
inp: Input tensor.
......@@ -240,21 +236,20 @@ def broadcast(
Examples:
.. code-block::
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = broadcast(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([0])
.. code-block::
input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = broadcast(input, group)
# Rank 0 # output: Tensor([1])
# Rank 1 # output: Tensor([1])
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = broadcast(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([0])
input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = broadcast(input, group)
# Rank 0 # output: Tensor([1])
# Rank 1 # output: Tensor([1])
"""
shape, dtype = _bcast_shape_dtype(group, inp)
if group.rank != 0:
......@@ -278,8 +273,7 @@ def _bcast_param(
def all_gather(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0,
) -> Tensor:
r"""
Gather tensors across the specified group and concat them at first dimension.
r"""Gather tensors across the specified group and concat them at first dimension.
Args:
inp: Input tensor.
......@@ -298,21 +292,20 @@ def all_gather(
Examples:
.. code-block::
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = all_gather(input)
# Rank 0 # output: Tensor([0 1])
# Rank 1 # output: Tensor([0 1])
.. code-block::
input = Tensor([rank])
group = Group([1, 0])
output = all_gather(input, group)
# Rank 0 # output: Tensor([1 0])
# Rank 1 # output: Tensor([1 0])
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = all_gather(input)
# Rank 0 # output: Tensor([0 1])
# Rank 1 # output: Tensor([0 1])
input = Tensor([rank])
group = Group([1, 0])
output = all_gather(input, group)
# Rank 0 # output: Tensor([1 0])
# Rank 1 # output: Tensor([1 0])
"""
mode = CollectiveComm.Mode.ALL_GATHER
out = collective_comm(inp, mode, group, device)
......@@ -338,8 +331,7 @@ def all_gather(
def reduce_scatter_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0
) -> Tensor:
r"""
Reduce tensors across the specified group by sum and split them at first dimension.
r"""Reduce tensors across the specified group by sum and split them at first dimension.
Args:
inp: Input tensor.
......@@ -358,21 +350,20 @@ def reduce_scatter_sum(
Examples:
.. code-block::
input = Tensor([0 1])
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([0 1])
output = reduce_scatter_sum(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([2])
.. code-block::
input = Tensor([0 1])
group = Group([1, 0])
output = reduce_scatter_sum(input, group)
# Rank 0 # output: Tensor([2])
# Rank 1 # output: Tensor([0])
input = Tensor([0 1])
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([0 1])
output = reduce_scatter_sum(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([2])
input = Tensor([0 1])
group = Group([1, 0])
output = reduce_scatter_sum(input, group)
# Rank 0 # output: Tensor([2])
# Rank 1 # output: Tensor([0])
"""
group_size = group.size if group is not None else 1
assert (
......@@ -398,8 +389,7 @@ def reduce_scatter_sum(
def all_reduce_sum(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Reduce tensors across the specified group by sum.
r"""Reduce tensors across the specified group by sum.
Args:
inp: Input tensor.
......@@ -416,15 +406,14 @@ def all_reduce_sum(
Examples:
.. code-block::
input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_sum(input)
# Rank 0 # output: Tensor(1)
# Rank 1 # output: Tensor(1)
.. code-block::
input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_sum(input)
# Rank 0 # output: Tensor(1)
# Rank 1 # output: Tensor(1)
"""
mode = CollectiveComm.Mode.ALL_REDUCE_SUM
return collective_comm(inp, mode, group, device)
......@@ -433,8 +422,7 @@ def all_reduce_sum(
def all_reduce_max(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Reduce tensors across the specified group by max.
r"""Reduce tensors across the specified group by max.
Args:
inp: Input tensor.
......@@ -451,15 +439,14 @@ def all_reduce_max(
Examples:
.. code-block::
input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_max(input)
# Rank 0 # output: Tensor(1)
# Rank 1 # output: Tensor(1)
.. code-block::
input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_max(input)
# Rank 0 # output: Tensor(1)
# Rank 1 # output: Tensor(1)
"""
mode = CollectiveComm.Mode.ALL_REDUCE_MAX
return collective_comm(inp, mode, group, device)
......@@ -468,8 +455,7 @@ def all_reduce_max(
def all_reduce_min(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None,
) -> Tensor:
r"""
Reduce tensors across the specified group by min.
r"""Reduce tensors across the specified group by min.
Args:
inp: Input tensor.
......@@ -486,15 +472,14 @@ def all_reduce_min(
Examples:
.. code-block::
input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_min(input)
# Rank 0 # output: Tensor(0)
# Rank 1 # output: Tensor(0)
.. code-block::
input = Tensor(rank)
# Rank 0 # input: Tensor(0)
# Rank 1 # input: Tensor(1)
output = all_reduce_min(input)
# Rank 0 # output: Tensor(0)
# Rank 1 # output: Tensor(0)
"""
mode = CollectiveComm.Mode.ALL_REDUCE_MIN
return collective_comm(inp, mode, group, device)
......@@ -520,8 +505,7 @@ class _Gather(Function):
def gather(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0,
) -> Tensor:
r"""
Gather tensors across the specified group.
r"""Gather tensors across the specified group.
Only root process will receive the final result.
Args:
......@@ -534,27 +518,23 @@ def gather(
Specify "gpu0:1" to execute this operator on diffrent cuda stream,
1 is stream id, and default stream id is 0.
axis: The concat axis for collective_comm result
The default axis is 0
Returns:
Result tensor if in root process, None if in other process
Examples:
.. code-block::
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = gather(input)
# Rank 0 # output: Tensor([0 1])
# Rank 1 # output: None
.. code-block::
input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = gather(input, group)
# Rank 0 # output: None
# Rank 1 # output: Tensor([1 0])
input = Tensor([rank])
# Rank 0 # input: Tensor([0])
# Rank 1 # input: Tensor([1])
output = gather(input)
# Rank 0 # output: Tensor([0 1])
# Rank 1 # output: None
input = Tensor([rank])
group = Group([1, 0]) # first rank is root
output = gather(input, group)
# Rank 0 # output: None
# Rank 1 # output: Tensor([1 0])
"""
assert (
axis < inp.ndim
......@@ -607,8 +587,7 @@ class _Scatter(Function):
def scatter(
inp: Tensor, group: Optional[Group] = WORLD, device: Optional[str] = None, axis=0,
) -> Tensor:
r"""
Split tensor in root process at first dimension.
r"""Split tensor in root process at first dimension.
Args:
inp: Input tensor.
......@@ -627,21 +606,20 @@ def scatter(
Examples:
.. code-block::
input = Tensor([0 1]) + rank*2
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([2 3])
output = scatter(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([1])
.. code-block::
input = Tensor([0 1]) + rank*2
group = Group([1, 0]) # first rank is root
output = scatter(input, group)
# Rank 0 # output: Tensor([3])
# Rank 1 # output: Tensor([2])
input = Tensor([0 1]) + rank*2
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([2 3])
output = scatter(input)
# Rank 0 # output: Tensor([0])
# Rank 1 # output: Tensor([1])
input = Tensor([0 1]) + rank*2
group = Group([1, 0]) # first rank is root
output = scatter(input, group)
# Rank 0 # output: Tensor([3])
# Rank 1 # output: Tensor([2])
"""
shape, dtype = _bcast_shape_dtype(group, inp)
if group.rank != 0:
......@@ -680,8 +658,7 @@ def all_to_all(
split_axis: int = 0,
concat_axis: int = 0,
) -> Tensor:
r"""
Each process scatter input tensor to all processes and return gathered tensor.
r"""Each process scatter input tensor to all processes and return gathered tensor.
Args:
inp: Input tensor.
......@@ -694,29 +671,26 @@ def all_to_all(
1 is stream id, and default stream id is 0.
split_axis: The axis along which collective_comm will split data;
the default axis is 0.
concat_axis: The axis along which collective_comm will concat data;
the default axis is 0.
Returns:
Result tensor.
Examples:
.. code-block::
input = Tensor([0 1]) + rank*2
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([2 3])
output = all_to_all(input)
# Rank 0 # output: Tensor([0 2])
# Rank 1 # output: Tensor([1 3])
.. code-block::
input = Tensor([0 1]) + rank*2
group = Group([1, 0])
output = all_to_all(input, group)
# Rank 0 # output: Tensor([0 3])
# Rank 1 # output: Tensor([2 1])
input = Tensor([0 1]) + rank*2
# Rank 0 # input: Tensor([0 1])
# Rank 1 # input: Tensor([2 3])
output = all_to_all(input)
# Rank 0 # output: Tensor([0 2])
# Rank 1 # output: Tensor([1 3])
input = Tensor([0 1]) + rank*2
group = Group([1, 0])
output = all_to_all(input, group)
# Rank 0 # output: Tensor([0 3])
# Rank 1 # output: Tensor([2 1])
"""
group_size = group.size if group is not None else 1
assert (
......@@ -805,8 +779,7 @@ class _RemoteRecv(Function):
def remote_send(inp: Tensor, dest_rank: int):
r"""
Send tensor to another process.
r"""Send tensor to another process.
Args:
inp: Tensor to send.
......@@ -816,17 +789,15 @@ def remote_send(inp: Tensor, dest_rank: int):
None.
Examples:
.. code-block::
if rank == 0:
data = mge.tensor(1)
# Tensor(1)
F.distributed.remote_send(data, 1) # return None
else:
data = F.distributed.remote_recv(0)
# Tensor(1)
.. code-block::
if rank == 0:
data = mge.tensor(1)
# Tensor(1)
F.distributed.remote_send(data, 1) # return None
else:
data = F.distributed.remote_recv(0)
# Tensor(1)
"""
group = _SendRecvGroup(get_rank(), dest_rank)
_bcast_shape_dtype(group, inp)
......@@ -844,8 +815,7 @@ def remote_send(inp: Tensor, dest_rank: int):
def remote_recv(src_rank: int, device: Optional[str] = None, inp=None) -> Tensor:
r"""
Receive a tensor from another process.
r"""Receive a tensor from another process.
Args:
src_rank: Rank of source process.
......@@ -862,14 +832,13 @@ def remote_recv(src_rank: int, device: Optional[str] = None, inp=None) -> Tensor
.. code-block::
if rank == 0:
data = mge.tensor(1)
# Tensor(1)
F.distributed.remote_send(data, 1) # return None
else:
data = F.distributed.remote_recv(0)
# Tensor(1)
if rank == 0:
data = mge.tensor(1)
# Tensor(1)
F.distributed.remote_send(data, 1) # return None
else:
data = F.distributed.remote_recv(0)
# Tensor(1)
"""
group = _SendRecvGroup(src_rank, get_rank())
shape, dtype = _bcast_shape_dtype(group, None)
......
......@@ -36,15 +36,13 @@ _sd = None
class Group:
r"""
Include ranked nodes running collective communication (See :mod:`~.functional.distributed`).
r"""Include ranked nodes running collective communication (See :mod:`~.functional.distributed`).
By default collectives operate on the default group (also called ``WORLD``)
and require all processes to enter the distributed function call.
By default collectives operate on the default group (also called ``WORLD``)
and require all processes to enter the distributed function call.
:param proc_ranks: rank list of the group, the first one is root rank.
Args:
proc_ranks: rank list of the group, the first one is root rank.
"""
def __init__(self, proc_ranks):
......@@ -116,15 +114,15 @@ def init_process_group(
backend: Optional[str] = "auto",
device_type: str = "xpu",
) -> None:
"""
Initialize the distributed process group and specify the device used in the current process
:param master_ip: ip address of the master node.
:param port: port available for all processes to communicate.
:param world_size: total number of processes participating in the job.
:param rank: rank of the current process.
:param device: the GPU device id to bind this process to.
:param backend: communicator backend, currently support 'nccl' and 'shm'.
r"""Initialize the distributed process group and specify the device used in the current process
Args:
master_ip: ip address of the master node.
port: port available for all processes to communicate.
world_size: total number of processes participating in the job.
rank: rank of the current process.
device: the GPU device id to bind this process to.
backend: communicator backend, currently support 'nccl' and 'shm'.
"""
physical_device_type = what_is_xpu() if device_type == "xpu" else device_type
if not isinstance(master_ip, str):
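A minimal sketch for one process of a 4-process, single-machine job (``rank`` comes from your process spawner; the ip and port are illustrative assumptions):

.. code-block::

    init_process_group(
        master_ip="localhost",
        port=23456,
        world_size=4,
        rank=rank,
        device=rank,  # bind process i to GPU i
    )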
......@@ -180,10 +178,10 @@ def _set_machine_ranks(ranks) -> None:
@contextmanager
def override_backend(new_backend: str):
"""
Override distributed backend
r"""Override distributed backend
:param new_backend: communicator backend set in this context.
Args:
new_backend: communicator backend set in this context.
"""
global _sd
assert _sd, "please call init_process_group first"
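A short sketch: temporarily run collectives over shared memory (assuming ``all_reduce_sum`` is imported from :mod:`~.functional.distributed` and the process group is already initialized):

.. code-block::

    with override_backend("shm"):
        out = all_reduce_sum(inp)  # uses the "shm" backend only inside this block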
......@@ -196,51 +194,51 @@ def override_backend(new_backend: str):
def is_distributed() -> bool:
"""Return True if the distributed process group has been initialized."""
r"""Return True if the distributed process group has been initialized."""
return _sd is not None
def get_rank() -> int:
"""Get the rank of the current process."""
r"""Get the rank of the current process."""
return _sd.proc_rank if _sd is not None else 0
def get_world_size() -> int:
"""Get the total number of processes participating in the job."""
r"""Get the total number of processes participating in the job."""
return _sd.world_size if _sd is not None else 1
def get_backend() -> str:
"""Get the backend str."""
r"""Get the backend str."""
assert _sd is not None, "please call init_process_group first"
return _sd.backend if _sd is not None else None
def get_py_server_addr() -> Tuple[str, int]:
"""Get master_ip and port of python XML RPC server."""
r"""Get master_ip and port of python XML RPC server."""
assert _sd is not None, "please call init_process_group first"
return _sd.master_ip, _sd.py_server_port
def get_mm_server_addr() -> Tuple[str, int]:
"""Get master_ip and port of C++ mm_server."""
r"""Get master_ip and port of C++ mm_server."""
assert _sd is not None, "please call init_process_group first"
return _sd.master_ip, _sd.mm_server_port
def get_client() -> Client:
"""Get client of python XML RPC server."""
r"""Get client of python XML RPC server."""
assert _sd is not None, "please call init_process_group first"
return _sd.client
def new_group(proc_ranks: List[int]) -> Group:
"""Build a subgroup containing certain ranks."""
r"""Build a subgroup containing certain ranks."""
return Group(proc_ranks)
def group_barrier(group: Group = WORLD) -> None:
"""Block until all ranks in the group reach this barrier."""
r"""Block until all ranks in the group reach this barrier."""
# if running with single node, skip it
if _sd is None:
return
......
......@@ -28,39 +28,40 @@ from .group import WORLD, Group, group_barrier, is_distributed, override_backend
def param_pack_split(inp: Tensor, offsets: list, shapes: list):
r"""
Returns split tensor to tensor list as offsets and shapes described,
only used for ``parampack``.
r"""Returns split tensor to tensor list as offsets and shapes described,
only used for ``parampack``.
:param inp: input tensor.
:param offsets: offsets of outputs, length of `2 * n`,
Args:
inp: input tensor.
offsets: offsets of outputs, of length `2 * n`,
where n is the number of output tensors,
in the format `[begin0, end0, begin1, end1]`.
:param shapes: tensor shapes of outputs.
:return: splitted tensors.
shapes: tensor shapes of outputs.
Examples:
Returns:
split tensors.
.. testcode::
Examples:
import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_split
.. testcode::
a = tensor(np.ones((10,), np.int32))
b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)])
print(b.numpy())
print(c.numpy())
import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_split
Outputs:
a = tensor(np.ones((10,), np.int32))
b, c = param_pack_split(a, [0, 1, 1, 10], [(1,), (3, 3)])
print(b.numpy())
print(c.numpy())
.. testoutput::
Outputs:
[1]
[[1 1 1]
[1 1 1]
[1 1 1]]
.. testoutput::
[1]
[[1 1 1]
[1 1 1]
[1 1 1]]
"""
op = ParamPackSplit()
op.offsets = offsets
......@@ -73,36 +74,37 @@ def param_pack_split(inp: Tensor, offsets: list, shapes: list):
def param_pack_concat(inps: list, offsets: Tensor, offsets_val: list):
r"""
Returns concated tensor, only used for ``parampack``.
r"""Returns concated tensor, only used for ``parampack``.
:param inps: input tensors.
:param offsets: device value of offsets.
:param offsets_val: offsets of inputs, length of `2 * n`,
Args:
inps: input tensors.
offsets: device value of offsets.
offsets_val: offsets of inputs, length of `2 * n`,
format `[begin0, end0, begin1, end1]`.
:return: concated tensor.
Examples:
Returns:
concatenated tensor.
.. testcode::
Examples:
import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_concat
.. testcode::
a = tensor(np.ones((1,), np.int32))
b = tensor(np.ones((3, 3), np.int32))
offsets_val = [0, 1, 1, 10]
offsets = tensor(offsets_val, np.int32)
c = param_pack_concat([a, b], offsets, offsets_val)
print(c.numpy())
import numpy as np
from megengine import tensor
from megengine.distributed.helper import param_pack_concat
Outputs:
a = tensor(np.ones((1,), np.int32))
b = tensor(np.ones((3, 3), np.int32))
offsets_val = [0, 1, 1, 10]
offsets = tensor(offsets_val, np.int32)
c = param_pack_concat([a, b], offsets, offsets_val)
print(c.numpy())
.. testoutput::
Outputs:
[1 1 1 1 1 1 1 1 1 1]
.. testoutput::
[1 1 1 1 1 1 1 1 1 1]
"""
op = ParamPackConcat()
op.offsets = offsets_val
......@@ -165,9 +167,9 @@ class TensorFuture(Future):
def synchronized(func: Callable):
r"""Decorator. Decorated function will synchronize when finished.
Specifically, we use this to prevent data race during hub.load
"""
Decorator. Decorated function will synchronize when finished.
Specifically, we use this to prevent data race during hub.load"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
......@@ -199,23 +201,23 @@ get_device_count_by_fork = deprecated_func(
def bcast_list_(inps: list, group: Group = WORLD):
"""
Broadcast tensors between given group.
r"""Broadcast tensors between given group.
:param inps: input tensors.
:param group: communication group.
Args:
inps: input tensors.
group: communication group.
"""
for inp in inps:
inp._reset(_bcast_param(inp, group))
class AllreduceCallback:
"""
Allreduce Callback with tensor fusion optimization.
r"""Allreduce Callback with tensor fusion optimization.
:param reduce_method: the method to reduce gradiants.
:param group: communication group.
:param backend: override distributed backend in allreduce
Args:
reduce_method: the method to reduce gradients.
group: communication group.
backend: override the distributed backend in allreduce.
"""
def __init__(self, reduce_method: str, group: Group = WORLD, backend: str = None):
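A hedged sketch of wiring the callback into gradient computation (assuming the :class:`~.GradManager` ``callbacks`` hook accepts it, as in typical data-parallel setups; ``model`` is your own module):

.. code-block::

    from megengine.autodiff import GradManager

    gm = GradManager()
    cb = AllreduceCallback("mean")  # average gradients across ranks
    gm.attach(model.parameters(), callbacks=[cb])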
......
......@@ -39,7 +39,7 @@ def _run_wrapped(
queue: mp.Queue,
machine_ranks: list,
):
"""Init distributed process group and run wrapped function."""
r"""Init distributed process group and run wrapped function."""
_check_device_initialized(device_type, dev)
init_process_group(
master_ip=master_ip,
......@@ -64,15 +64,16 @@ def _run_wrapped(
class launcher:
"""Decorator for launching multiple processes in single-machine multi-gpu training.
:param func: the function you want to launch in distributed mode.
:param n_gpus: how many devices each node.
:param world_size: how many devices totally.
:param rank_start: start number for rank.
:param master_ip: ip address for master node (where the rank 0 is).
:param port: server port for distributed server.
:param backend: set default collective communication backend.
r"""Decorator for launching multiple processes in single-machine multi-gpu training.
Args:
func: the function you want to launch in distributed mode.
n_gpus: how many devices on each node.
world_size: how many devices in total.
rank_start: start number for rank.
master_ip: ip address of the master node (where rank 0 is).
port: server port for distributed server.
backend: set default collective communication backend.
"""
def __new__(cls, *args, **kwargs):
......
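A hedged sketch of the decorator in use for a 2-GPU, single-machine job (``get_rank`` is assumed to be imported from :mod:`~.distributed.group`):

.. code-block::

    @launcher(n_gpus=2)
    def worker():
        print("running on rank", get_rank())

    worker()  # spawns 2 processes, one per GPU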
......@@ -20,11 +20,11 @@ from ..utils.future import Future
class Methods:
"""
Distributed Server Method.
r"""Distributed Server Method.
Used for exchange information between distributed nodes.
:param mm_server_port: multiple machine rpc server port.
Args:
mm_server_port: multiple machine rpc server port.
"""
def __init__(self, mm_server_port):
......@@ -39,19 +39,19 @@ class Methods:
self.bcast_dict = {}
def connect(self):
"""Method for checking connection success."""
r"""Method for checking connection success."""
return True
def get_mm_server_port(self):
"""Get multiple machine rpc server port."""
r"""Get multiple machine rpc server port."""
return self.mm_server_port
def set_is_grad(self, key, is_grad):
"""
Mark send/recv need gradiants by key.
r"""Mark send/recv need gradiants by key.
:param key: key to match send/recv op.
:param is_grad: whether this op need grad.
Args:
key: key to match send/recv op.
is_grad: whether this op needs grad.
"""
with self.lock:
future = self.dict_is_grad[key]
......@@ -59,10 +59,10 @@ class Methods:
return True
def check_is_grad(self, key):
"""
Check whether send/recv need gradiants.
r"""Check whether send/recv need gradiants.
:param key: key to match send/recv op.
Args:
key: key to match send/recv op.
"""
with self.lock:
future = self.dict_is_grad[key]
......@@ -72,11 +72,11 @@ class Methods:
return ret
def set_remote_tracer(self, key, tracer_set):
"""
Set tracer dict for tracing send/recv op.
r"""Set tracer dict for tracing send/recv op.
:param key: key to match send/recv op.
:param tracer_set: valid tracer set.
Args:
key: key to match send/recv op.
tracer_set: valid tracer set.
"""
with self.lock:
future = self.dict_remote_tracer[key]
......@@ -84,10 +84,10 @@ class Methods:
return True
def check_remote_tracer(self, key):
"""
Get tracer dict for send/recv op.
r"""Get tracer dict for send/recv op.
:param key: key to match send/recv op.
Args:
key: key to match send/recv op.
"""
with self.lock:
future = self.dict_remote_tracer[key]
......@@ -97,11 +97,11 @@ class Methods:
return ret
def group_barrier(self, key, size):
"""
A barrier wait for all group member.
r"""A barrier wait for all group member.
:param key: group key to match each other.
:param size: group size.
Args:
key: group key to match each other.
size: group size.
"""
with self.lock:
self.dict_barrier_counter[key] += 1
......@@ -116,14 +116,14 @@ class Methods:
return True
def user_set(self, key, val):
"""Set user defined key-value pairs across processes."""
r"""Set user defined key-value pairs across processes."""
with self.lock:
future = self.user_dict[key]
future.set(val)
return True
def user_get(self, key):
"""Get user defined key-value pairs across processes."""
r"""Get user defined key-value pairs across processes."""
with self.lock:
future = self.user_dict[key]
return future.get()
......@@ -161,12 +161,12 @@ class ThreadXMLRPCServer(ThreadingMixIn, SimpleXMLRPCServer):
def _start_server(py_server_port, queue):
"""
Start python distributed server and multiple machine server.
r"""Start python distributed server and multiple machine server.
:param py_server_port: python server port.
:param mm_server_port: multiple machine server port.
:param queue: server port will put in this queue, puts exception when process fails.
Args:
py_server_port: python server port.
queue: the server port will be put in this queue; an exception is put instead when the process fails.
"""
try:
mm_server_port = create_mm_server("0.0.0.0", 0)
......@@ -182,11 +182,11 @@ def _start_server(py_server_port, queue):
class Server:
"""
Distributed Server for distributed training.
r"""Distributed Server for distributed training.
Should be running on the master node.
:param port: python server port.
Args:
port: python server port.
"""
def __init__(self, port=0):
......@@ -204,11 +204,11 @@ class Server:
class Client:
"""
Distributed Client for distributed training.
r"""Distributed Client for distributed training.
:param master_ip: ip address of master node.
:param port: port of server at master node.
Args:
master_ip: ip address of master node.
port: port of server at master node.
"""
def __init__(self, master_ip, port):
......@@ -218,7 +218,7 @@ class Client:
self.bcast_dict = defaultdict(lambda: 0)
def connect(self):
"""Check connection success."""
r"""Check connection success."""
while True:
try:
self.proxy = ServerProxy(
......@@ -230,62 +230,62 @@ class Client:
time.sleep(1)
def get_mm_server_port(self):
"""Get multiple machine server port."""
r"""Get multiple machine server port."""
return self.proxy.get_mm_server_port()
def set_is_grad(self, key, is_grad):
"""
Mark send/recv need gradiants by key.
r"""Mark send/recv need gradiants by key.
:param key: key to match send/recv op.
:param is_grad: whether this op need grad.
Args:
key: key to match send/recv op.
is_grad: whether this op needs grad.
"""
self.proxy.set_is_grad(key, is_grad)
def check_is_grad(self, key):
"""
Check whether send/recv need gradiants.
r"""Check whether send/recv need gradiants.
:param key: key to match send/recv op.
Args:
key: key to match send/recv op.
"""
return self.proxy.check_is_grad(key)
def set_remote_tracer(self, key, tracer_set):
"""
Set tracer dict for tracing send/recv op.
r"""Set tracer dict for tracing send/recv op.
:param key: key to match send/recv op.
:param tracer_set: valid tracer set.
Args:
key: key to match send/recv op.
tracer_set: valid tracer set.
"""
self.proxy.set_remote_tracer(key, tracer_set)
def check_remote_tracer(self, key):
"""
Get tracer dict for send/recv op.
r"""Get tracer dict for send/recv op.
:param key: key to match send/recv op.
Args:
key: key to match send/recv op.
"""
return self.proxy.check_remote_tracer(key)
def group_barrier(self, key, size):
"""
A barrier wait for all group member.
r"""A barrier wait for all group member.
:param key: group key to match each other.
:param size: group size.
Args:
key: group key to match each other.
size: group size.
"""
self.proxy.group_barrier(key, size)
def user_set(self, key, val):
"""Set user defined key-value pairs across processes."""
r"""Set user defined key-value pairs across processes."""
return self.proxy.user_set(key, val)
def user_get(self, key):
"""Get user defined key-value pairs across processes."""
r"""Get user defined key-value pairs across processes."""
return self.proxy.user_get(key)
def user_pop(self, key):
"""Get user defined key-value pairs and delete the resources when the get is done"""
r"""Get user defined key-value pairs and delete the resources when the get is done"""
return self.proxy.user_pop(key)
def bcast_val(self, val, key, size):
......
......@@ -30,24 +30,20 @@ def _str2bytes(text: str) -> int:
@property
def eviction_threshold(mod):
r"""
Get or set the eviction threshold in bytes. It can also be set to a string,
r"""Get or set the eviction threshold in bytes. It can also be set to a string,
whose formatting supports byte(B), kilobyte(KB), megabyte(MB) and
gigabyte(GB) units.
.. note::
Note:
When GPU memory usage exceeds this value, DTR will heuristically select
and evict resident tensors until the amount of used memory falls below
this threshold.
Examples:
.. code-block::
.. code-block::
import megengine as mge
mge.dtr.eviction_threshold = "2GB"
import megengine as mge
mge.dtr.eviction_threshold = "2GB"
"""
return _eviction_threshold
......@@ -66,24 +62,21 @@ def eviction_threshold(mod, value: Union[int, str]):
@property
def evictee_minimum_size(mod):
r"""
Get or set the memory threshold of tensors in bytes. It can also be set to a
r"""Get or set the memory threshold of tensors in bytes. It can also be set to a
string, whose formatting supports byte(B), kilobyte(KB), megabyte(MB) and
gigabyte(GB) units.
.. note::
Note:
Only tensors whose size exceeds this threshold will be added to the
candidate set. A tensor that is not added to the candidate set will
never be evicted during its lifetime.
Examples:
.. code-block::
.. code-block::
import megengine as mge
mge.dtr.evictee_minimum_size = "2MB"
import megengine as mge
mge.dtr.evictee_minimum_size = "2MB"
"""
return _evictee_minimum_size
......@@ -102,19 +95,16 @@ def evictee_minimum_size(mod, value: Union[int, str]):
@property
def enable_sqrt_sampling(mod):
r"""
Get or set whether sqrt sampling is allowed. Sqrt sampling means that given
r"""Get or set whether sqrt sampling is allowed. Sqrt sampling means that given
the size of the candidate set is N, only enumerate sqrt(N) tensors. When
the number of tensors is very high, enabling this optimization will speed
up the training.
Examples:
.. code-block::
Examples:
.. code-block::
import megengine as mge
mge.dtr.enable_sqrt_sampling = True
import megengine as mge
mge.dtr.enable_sqrt_sampling = True
"""
return _enable_sqrt_sampling
......@@ -127,9 +117,7 @@ def enable_sqrt_sampling(mod, value: bool):
def enable():
r"""
Enable to record computing path of tensors and to perform DTR policy.
"""
r"""Enable to record computing path of tensors and to perform DTR policy."""
_set_defrag(True)
_set_option("enable_dtr_auto_drop", 1)
_set_option("enable_drop", 1)
......@@ -138,9 +126,7 @@ def enable():
def disable():
r"""
Stop recording computing path of tensors and performing DTR policy.
"""
r"""Stop recording computing path of tensors and performing DTR policy."""
_set_defrag(False)
_set_option("enable_dtr_auto_drop", 0)
_set_option("enable_drop", 0)
......
......@@ -23,8 +23,7 @@ if os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY") != None:
def get_execution_strategy() -> Strategy:
"""
Returns the execution strategy of :class:`~module..Conv2d` and :func:`~.matmul`
r"""Returns the execution strategy of :class:`~module..Conv2d` and :func:`~.matmul`
See :func:`~.set_execution_strategy` for possible return values
"""
......@@ -32,31 +31,32 @@ def get_execution_strategy() -> Strategy:
def set_execution_strategy(option):
"""
Sets the execution strategy of :class:`~module.Conv2d` and :func:`~.matmul`
r"""Sets the execution strategy of :class:`~module.Conv2d` and :func:`~.matmul`
Args:
option: Decides how :class:`~.module.Conv2d`and :func:`~.matmul` algorithms are chosen.
Available value Strategy
:param option: Decides how :class:`~module.Conv2d`and :func:`~.matmul` algorithms are chosen.
Available value Strategy
* HEURISTIC uses heuristic to choose the fastest algorithm.
* PROFILE runs possible algorithms on real device to find the best one.
* REPRODUCIBLE uses the algorithms that is reproducible.
* OPTIMIZED uses the algorithms that is optimized.
* HEURISTIC uses heuristic to choose the fastest algorithm.
* PROFILE runs possible algorithms on real device to find the best one.
* REPRODUCIBLE uses the algorithms that is reproducible.
* OPTIMIZED uses the algorithms that is optimized.
The default strategy is HEURISTIC, this options can be combined to
form a combination option, e.g. PROFILE | REPRODUCIBLE
can combined a option that uses the fastest of profiling result that is also reproducible.
The default strategy is HEURISTIC. These options can be combined to
form a combined option, e.g. PROFILE | REPRODUCIBLE
chooses the fastest algorithm among the profiling results that is also reproducible.
Available values string:
Available values string:
* 'HEURISTIC' uses heuristic to choose the fastest algorithm.
* 'PROFILE' runs possible algorithms on real device to find the best one.
* 'PROFILE_HEURISTIC' uses profiling result and heuristic to choose the fastest algorithm.
* 'PROFILE_REPRODUCIBLE' uses the fastest of profiling result that is also reproducible.
* 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible.
* 'HEURISTIC' uses heuristic to choose the fastest algorithm.
* 'PROFILE' runs possible algorithms on real device to find the best one.
* 'PROFILE_HEURISTIC' uses profiling result and heuristic to choose the fastest algorithm.
* 'PROFILE_REPRODUCIBLE' uses the fastest of profiling result that is also reproducible.
* 'HEURISTIC_REPRODUCIBLE' uses heuristic to choose the fastest algorithm that is also reproducible.
The default strategy is 'HEURISTIC'.
The default strategy is 'HEURISTIC'.
It can also be set through the environment variable 'MEGENGINE_EXECUTION_STRATEGY'.
It can also be set through the environment variable 'MEGENGINE_EXECUTION_STRATEGY'.
"""
valid_string_option = {
"REPRODUCIBLE": Strategy.REPRODUCIBLE,
......
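Both spellings described above, as a sketch (``Strategy`` is the enum already referenced by this module's signatures):

.. code-block::

    # string form
    set_execution_strategy("PROFILE_REPRODUCIBLE")

    # flag-combination form
    set_execution_strategy(Strategy.PROFILE | Strategy.REPRODUCIBLE)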
......@@ -78,182 +78,163 @@ def _elemwise_multi_type(*args, mode, **kwargs):
def add(x, y):
"""
Element-wise `addition`.
At least one operand should be tensor.
Same for sub/mul/div/floor_div/pow/mod/atan2/equal/not_equal/less/less_equal/greater/greater_equal/maximum/minmium.
:param x: input tensor.
:return: computed tensor.
r"""Element-wise `addition`.
Examples:
.. testcode::
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.add(x, y)
print(out.numpy())
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.add(x, y)
print(out.numpy())
Outputs:
Outputs:
.. testoutput::
[[ 0. 2. 4.]
[ 6. 8. 10.]]
.. testoutput::
[[ 0. 2. 4.]
[ 6. 8. 10.]]
"""
return _elwise(x, y, mode=Elemwise.Mode.ADD)
def sub(x, y):
"""Element-wise `subtraction`."""
r"""Element-wise `subtraction`."""
return _elwise(x, y, mode=Elemwise.Mode.SUB)
def mul(x, y):
"""Element-wise `multiplication`."""
r"""Element-wise `multiplication`."""
return _elwise(x, y, mode=Elemwise.Mode.MUL)
def div(x, y):
"""Element-wise `(x / y)`."""
r"""Element-wise `(x / y)`."""
return _elwise(x, y, mode=Elemwise.Mode.TRUE_DIV)
def floor_div(x, y):
"""Element-wise `floor(x / y)`."""
r"""Element-wise `floor(x / y)`."""
return _elwise(x, y, mode=Elemwise.Mode.FLOOR_DIV)
def neg(x):
"""Element-wise `negation`."""
r"""Element-wise `negation`."""
return _elwise(x, mode=Elemwise.Mode.NEGATE)
def pow(x, y):
"""Element-wise `power`."""
r"""Element-wise `power`."""
return _elwise(x, y, mode=Elemwise.Mode.POW)
def mod(x, y):
"""Element-wise `remainder of division`."""
r"""Element-wise `remainder of division`."""
return _elwise(x, y, mode=Elemwise.Mode.MOD)
def abs(x):
"""Element-wise `absolute value`."""
r"""Element-wise `absolute value`."""
return _elwise(x, mode=Elemwise.Mode.ABS)
def exp(x):
"""Element-wise `exponential`."""
r"""Element-wise `exponential`."""
return _elwise(x, mode=Elemwise.Mode.EXP)
def expm1(x):
"""Element-wise `exp(x)-1`."""
r"""Element-wise `exp(x)-1`."""
return _elwise(x, mode=Elemwise.Mode.EXPM1)
def log(x):
"""Element-wise `logarithm (base e)`."""
r"""Element-wise `logarithm (base e)`."""
return _elwise(x, mode=Elemwise.Mode.LOG)
def log1p(x):
"""Element-wise `log(x+1) (base e)`."""
r"""Element-wise `log(x+1) (base e)`."""
return _elwise(x, mode=Elemwise.Mode.LOG1P)
def sqrt(x: Tensor) -> Tensor:
"""
Element-wise `sqrt`.
Returns ``NaN`` for negative input value.
:param x: input tensor.
:return: computed tensor.
r"""Element-wise `sqrt`.
Examples:
.. testcode::
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.sqrt(x)
print(out.numpy().round(decimals=4))
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.sqrt(x)
print(out.numpy().round(decimals=4))
Outputs:
Outputs:
.. testoutput::
[[0. 1. 1.4142]
[1.7321 2. 2.2361]]
.. testoutput::
[[0. 1. 1.4142]
[1.7321 2. 2.2361]]
"""
return x ** 0.5
def square(x: Tensor) -> Tensor:
"""
Returns a new tensor with the square of the elements of input tensor.
:param inp: input tensor.
:return: computed tensor.
r"""Element-wise `square`.
Examples:
.. testcode::
.. testcode::
import numpy as np
import megengine as mge
import megengine.functional as F
import numpy as np
import megengine as mge
import megengine.functional as F
data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.square(data)
print(out.numpy().round(decimals=4))
data = mge.tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.square(data)
print(out.numpy().round(decimals=4))
Outputs:
Outputs:
.. testoutput::
[[ 0. 1. 4.]
[ 9. 16. 25.]]
.. testoutput::
[[ 0. 1. 4.]
[ 9. 16. 25.]]
"""
return x ** 2
def round(x):
"""Element-wise `rounding to int`."""
r"""Element-wise `rounding to int`."""
return _elwise(x, mode=Elemwise.Mode.ROUND)
def ceil(x):
"""Element-wise `ceiling`."""
r"""Element-wise `ceiling`."""
return _elwise(x, mode=Elemwise.Mode.CEIL)
def floor(x):
"""Element-wise `floor`."""
r"""Element-wise `floor`."""
return _elwise(x, mode=Elemwise.Mode.FLOOR)
def maximum(x, y):
"""Element-wise `maximum of array elements`."""
r"""Element-wise `maximum of array elements`."""
return _elwise(x, y, mode=Elemwise.Mode.MAX)
def minimum(x, y):
"""Element-wise `minimum of array elements`."""
r"""Element-wise `minimum of array elements`."""
return _elwise(x, y, mode=Elemwise.Mode.MIN)
......@@ -261,62 +242,57 @@ def minimum(x, y):
def cos(x):
"""
Element-wise `cosine`.
:param x: input tensor.
:return: computed tensor.
r"""Element-wise `cosine`.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.cos(x)
print(out.numpy().round(decimals=4))
import numpy as np
from megengine import tensor
import megengine.functional as F
Outputs:
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.cos(x)
print(out.numpy().round(decimals=4))
.. testoutput::
Outputs:
[[ 1. 0.5403 -0.4161]
[-0.99 -0.6536 0.2837]]
.. testoutput::
[[ 1. 0.5403 -0.4161]
[-0.99 -0.6536 0.2837]]
"""
return _elwise(x, mode=Elemwise.Mode.COS)
def sin(x):
"""Element-wise `sine`."""
r"""Element-wise `sine`."""
return _elwise(x, mode=Elemwise.Mode.SIN)
def tan(x):
"""Element-wise `tangent`."""
r"""Element-wise `tangent`."""
return sin(x) / cos(x)
def acos(x):
"""Element-wise `inverse cosine`."""
r"""Element-wise `inverse cosine`."""
return _elwise(x, mode=Elemwise.Mode.ACOS)
def asin(x):
"""Element-wise `inverse sine`."""
r"""Element-wise `inverse sine`."""
return _elwise(x, mode=Elemwise.Mode.ASIN)
def atan(x):
"""Element-wise `inverse tangent`."""
r"""Element-wise `inverse tangent`."""
return _elwise(x, 1, mode=Elemwise.Mode.ATAN2)
def atan2(y, x):
"""Element-wise `2-argument arctangent`."""
r"""Element-wise `2-argument arctangent`."""
return _elwise(y, x, mode=Elemwise.Mode.ATAN2)
......@@ -355,38 +331,33 @@ def atanh(x):
def left_shift(x, y):
"""
Element-wise `bitwise binary: x << y`.
r"""Element-wise `bitwise binary: x << y`.
:param x: input tensor, should be int.
:param y: how many bits to be left-shifted.
:return: computed tensor.
Examples:
Examples:
.. testcode::
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
import numpy as np
from megengine import tensor
import megengine.functional as F
x = tensor(np.arange(0, 6, dtype=np.int32).reshape(2, 3))
out = F.left_shift(x, 2)
print(out.numpy())
x = tensor(np.arange(0, 6, dtype=np.int32).reshape(2, 3))
out = F.left_shift(x, 2)
print(out.numpy())
Outputs:
Outputs:
.. testoutput::
.. testoutput::
[[ 0 4 8]
[12 16 20]]
[[ 0 4 8]
[12 16 20]]
"""
return _elwise(x, y, mode=Elemwise.Mode.SHL)
def right_shift(x, y):
"""Element-wise `bitwise binary: x >> y`."""
r"""Element-wise `bitwise binary: x >> y`."""
return _elwise(x, y, mode=Elemwise.Mode.SHR)
......@@ -394,22 +365,22 @@ def right_shift(x, y):
def logical_and(x, y):
"""Element-wise `logical and: x && y`."""
r"""Element-wise `logical and: x && y`."""
return _elwise(x, y, mode=Elemwise.Mode.AND)
def logical_not(x):
"""Element-wise `logical not: ~x`."""
r"""Element-wise `logical not: ~x`."""
return _elwise(x, mode=Elemwise.Mode.NOT)
def logical_or(x, y):
"""Element-wise `logical or: x || y`."""
r"""Element-wise `logical or: x || y`."""
return _elwise(x, y, mode=Elemwise.Mode.OR)
def logical_xor(x, y):
"""Element-wise `logical xor: x ^ y`."""
r"""Element-wise `logical xor: x ^ y`."""
return _elwise(x, y, mode=Elemwise.Mode.XOR)
......@@ -417,59 +388,53 @@ def logical_xor(x, y):
def equal(x, y):
"""
Element-wise `(x == y)`.
:param x: input tensor 1.
:param y: input tensor 2.
:return: computed tensor.
r"""Element-wise `(x == y)`.
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.equal(x, y)
print(out.numpy())
import numpy as np
from megengine import tensor
import megengine.functional as F
Outputs:
x = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
y = tensor(np.arange(0, 6, dtype=np.float32).reshape(2, 3))
out = F.equal(x, y)
print(out.numpy())
.. testoutput::
Outputs:
[[1. 1. 1.]
[1. 1. 1.]]
.. testoutput::
[[1. 1. 1.]
[1. 1. 1.]]
"""
return _elwise(x, y, mode=Elemwise.Mode.EQ)
def not_equal(x, y):
"""Element-wise `(x != y)`."""
r"""Element-wise `(x != y)`."""
return x != y
def less(x, y):
"""Element-wise `(x < y)`."""
r"""Element-wise `(x < y)`."""
return _elwise(x, y, mode=Elemwise.Mode.LT)
def less_equal(x, y):
"""Element-wise `(x <= y)`."""
r"""Element-wise `(x <= y)`."""
return _elwise(x, y, mode=Elemwise.Mode.LEQ)
def greater(x, y):
"""Element-wise `(x > y)`."""
r"""Element-wise `(x > y)`."""
return _elwise(y, x, mode=Elemwise.Mode.LT)
def greater_equal(x, y):
"""Element-wise `(x >= y)`."""
r"""Element-wise `(x >= y)`."""
return _elwise(y, x, mode=Elemwise.Mode.LEQ)
......@@ -477,43 +442,45 @@ def greater_equal(x, y):
def clip(x: Tensor, lower=None, upper=None) -> Tensor:
r"""
Clamps all elements in input tensor into the range `[` :attr:`lower`, :attr:`upper` `]` and returns
r"""Clamps all elements in input tensor into the range ``[ lower, upper ]`` and returns
a resulting tensor:
.. math::
y_i = \begin{cases}
\text{lower} & \text{if } x_i < \text{lower} \\
x_i & \text{if } \text{lower} \leq x_i \leq \text{upper} \\
\text{upper} & \text{if } x_i > \text{upper}
\end{cases}
:param x: input tensor.
:param lower: lower-bound of the range to be clamped to.
:param upper: upper-bound of the range to be clamped to.
:return: output clamped tensor.
Args:
x: input tensor.
lower: lower-bound of the range to be clamped to.
upper: upper-bound of the range to be clamped to.
Examples:
Returns:
output clamped tensor.
.. testcode::
Examples:
import numpy as np
from megengine import tensor
import megengine.functional as F
.. testcode::
a = tensor(np.arange(5).astype(np.int32))
print(F.clip(a, 2, 4).numpy())
print(F.clip(a, lower=3).numpy())
print(F.clip(a, upper=3).numpy())
import numpy as np
from megengine import tensor
import megengine.functional as F
Outputs:
a = tensor(np.arange(5).astype(np.int32))
print(F.clip(a, 2, 4).numpy())
print(F.clip(a, lower=3).numpy())
print(F.clip(a, upper=3).numpy())
.. testoutput::
Outputs:
[2 2 2 3 4]
[3 3 3 3 4]
[0 1 2 3 3]
.. testoutput::
[2 2 2 3 4]
[3 3 3 3 4]
[0 1 2 3 3]
"""
assert (
lower is not None or upper is not None
......
......@@ -23,14 +23,14 @@ def tensorrt_runtime_opr(inputs, *, data: bytes = None):
def cambricon_runtime_opr(inputs, data, symbol, tensor_dim_mutable):
r"""
Load a serialized Cambricon model as a runtime operator in MegEngine.
:param inputs: list of input tensors.
:param data: the serialized Cambricon model.
:param symbol: name of the function in Cambricon model.
:param tensor_dim_mutable: whether the input tensors' shapes are mutable
in ``cnrtModel_t``.
r"""Load a serialized Cambricon model as a runtime operator in MegEngine.
Args:
inputs: list of input tensors.
data: the serialized Cambricon model.
symbol: name of the function in Cambricon model.
tensor_dim_mutable: whether the input tensors' shapes are mutable
in ``cnrtModel_t``.
"""
op = builtin.CambriconRuntime(data, len(data), symbol, tensor_dim_mutable)
......@@ -38,11 +38,11 @@ def cambricon_runtime_opr(inputs, data, symbol, tensor_dim_mutable):
def atlas_runtime_opr(inputs, data):
r"""
Load a serialized Atlas model as a runtime operator in MegEngine.
r"""Load a serialized Atlas model as a runtime operator in MegEngine.
:param inputs: list of input tensors.
:param data: the serialized Atlas model.
Args:
inputs: list of input tensors.
data: the serialized Atlas model.
"""
op = builtin.AtlasRuntime(data, len(data))
......
......@@ -26,9 +26,7 @@ __all__ = [
def _reduce_output(loss_fn):
r"""
Wrapper to apply canonical reductions to loss outputs.
"""
r"""Wrapper to apply canonical reductions to loss outputs."""
@functools.wraps(loss_fn)
def reduced_loss_fn(*args, reduction="mean", **kwargs):
......@@ -45,13 +43,14 @@ def _reduce_output(loss_fn):
@_reduce_output
def l1_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor:
r"""
Calculates the mean absolute error (MAE) between
r"""Calculates the mean absolute error (MAE) between
each element in the pred :math:`x` and label :math:`y`.
The mean absolute error can be described as:
.. math:: \ell(x,y) = mean\left(L \right)
.. math::
\ell(x,y) = mean\left(L \right)
where
......@@ -63,30 +62,32 @@ def l1_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor:
    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
    of :math:`N` elements each. :math:`N` is the batch size.

    Args:
        pred: predicted result from model.
        label: ground truth to compare.
        reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'

    Returns:
        loss value.

    Examples:

    .. testcode::

        import numpy as np
        import megengine as mge
        import megengine.functional as F

        ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
        tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
        loss = F.nn.l1_loss(ipt, tgt)
        print(loss.numpy())

    Outputs:

    .. testoutput::

        2.75
"""
diff = pred - label
return abs(diff)
......@@ -94,53 +95,56 @@ def l1_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor:
@_reduce_output
def square_loss(pred: Tensor, label: Tensor, reduction: str = "mean") -> Tensor:
r"""
Calculates the mean squared error (squared L2 norm) between
r"""Calculates the mean squared error (squared L2 norm) between
each element in the pred :math:`x` and label :math:`y`.
The mean squared error can be described as:
.. math:: \ell(x, y) = mean\left( L \right)
.. math::
\ell(x, y) = mean\left( L \right)
where
.. math::
L = \{l_1,\dots,l_N\}, \quad
l_n = \left( x_n - y_n \right)^2,
L = \{l_1,\dots,l_N\}, \quad
l_n = \left( x_n - y_n \right)^2,
:math:`x` and :math:`y` are tensors of arbitrary shapes with a total
of :math:`N` elements each. :math:`N` is the batch size.
:param pred: predicted result from model.
:param label: ground truth to compare.
:param reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'
:return: loss value.
Args:
pred: predicted result from model.
label: ground truth to compare.
reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'
Returns:
loss value.
Shape:
- pred: :math:`(N, *)` where :math:`*` means any number of additional
dimensions.
- label: :math:`(N, *)`. Same shape as ``pred``.
* pred: :math:`(N, *)` where :math:`*` means any number of additional
dimensions.
* label: :math:`(N, *)`. Same shape as ``pred``.
Examples:
.. testcode::
import numpy as np
import megengine as mge
import megengine.functional as F
.. testcode::
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.nn.square_loss(ipt, tgt)
print(loss.numpy())
import numpy as np
import megengine as mge
import megengine.functional as F
Outputs:
ipt = mge.tensor(np.array([3, 3, 3, 3]).astype(np.float32))
tgt = mge.tensor(np.array([2, 8, 6, 1]).astype(np.float32))
loss = F.nn.square_loss(ipt, tgt)
print(loss.numpy())
.. testoutput::
Outputs:
9.75
.. testoutput::
9.75
"""
diff = pred - label
return diff ** 2
......@@ -155,8 +159,7 @@ def cross_entropy(
label_smooth: float = 0,
reduction: str = "mean",
) -> Tensor:
r"""
Computes the multi-class cross entropy loss (using logits by default).
r"""Computes the multi-class cross entropy loss (using logits by default).
By default(``with_logitis`` is True), ``pred`` is assumed to be logits,
class probabilities are given by softmax.
......@@ -170,35 +173,37 @@ def cross_entropy(
    where :math:`y^{LS}` and :math:`y` are the new and the original label distributions respectively,
    and :math:`k` is the index into the label distribution. :math:`\alpha` is ``label_smooth`` and :math:`K` is the number of classes.
    Args:
        pred: input tensor representing the predicted probability.
        label: input tensor representing the classification label.
        axis: an axis along which softmax will be applied. Default: 1
        with_logits: whether to apply softmax first. Default: True
        label_smooth: a label smoothing parameter that re-distributes the target distribution. Default: 0
        reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'

    Returns:
        loss value.

    Examples:

    .. testcode::

        import numpy as np
        from megengine import tensor
        import megengine.functional as F

        data_shape = (1, 2)
        label_shape = (1, )
        pred = tensor(np.array([0, 0], dtype=np.float32).reshape(data_shape))
        label = tensor(np.ones(label_shape, dtype=np.int32))
        loss = F.nn.cross_entropy(pred, label)
        print(loss.numpy().round(decimals=4))

    Outputs:

    .. testoutput::

        0.6931
"""
n0 = pred.ndim
n1 = label.ndim
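    # A hedged restatement of the label-smoothing rule quoted in the docstring
    # (the exact formula line is truncated in this hunk): with
    # ``alpha = label_smooth`` and :math:`K` classes, the smoothed target is
    #
    #     y_smoothed = (1 - alpha) * y_onehot + alpha / K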
......@@ -226,37 +231,38 @@ def cross_entropy(
def binary_cross_entropy(
pred: Tensor, label: Tensor, with_logits: bool = True, reduction: str = "mean",
) -> Tensor:
r"""
Computes the binary cross entropy loss (using logits by default).
r"""Computes the binary cross entropy loss (using logits by default).
By default(``with_logitis`` is True), ``pred`` is assumed to be logits,
class probabilities are given by sigmoid.
    Args:
        pred: `(N, *)`, where `*` means any number of additional dimensions.
        label: `(N, *)`, same shape as the input.
        with_logits: bool, whether to apply sigmoid first. Default: True
        reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'

    Returns:
        loss value.

    Examples:

    .. testcode::

        import numpy as np
        from megengine import tensor
        import megengine.functional as F

        pred = tensor(np.array([0, 0], dtype=np.float32).reshape(1, 2))
        label = tensor(np.ones((1, 2), dtype=np.float32))
        loss = F.nn.binary_cross_entropy(pred, label)
        print(loss.numpy().round(decimals=4))

    Outputs:

    .. testoutput::

        0.6931
"""
if not with_logits:
return -(label * log(pred) + (1 - label) * log(1 - pred))
......@@ -269,37 +275,38 @@ def binary_cross_entropy(
def hinge_loss(
pred: Tensor, label: Tensor, norm: str = "L1", reduction: str = "mean"
) -> Tensor:
r"""
Caculates the hinge loss which is often used in SVM.
r"""Caculates the hinge loss which is often used in SVM.
The hinge loss can be described as:
.. math:: loss(x, y) = \frac{1}{N}\sum_i\sum_j(max(0, 1 - x_{ij}*y_{ij}))
    Args:
        pred: input tensor representing the predicted probability, shape is `(N, C)`.
        label: input tensor representing the binary classification label, shape is `(N, C)`.
        norm: specify the norm to calculate the loss, should be "L1" or "L2".
        reduction: the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default: 'mean'

    Returns:
        loss value.

    Examples:

    .. testcode::

        from megengine import tensor
        import megengine.functional as F

        pred = tensor([[0.5, -0.5, 0.1], [-0.6, 0.7, 0.8]], dtype="float32")
        label = tensor([[1, -1, -1], [-1, 1, 1]], dtype="float32")
        loss = F.nn.hinge_loss(pred, label)
        print(loss.numpy())

    Outputs:

    .. testoutput::

        1.5
"""
norm = norm.upper()
assert norm in ["L1", "L2"], "norm must be L1 or L2"
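    # The rest of the body is truncated here; a hedged sketch of the
    # per-element term from the docstring formula would be:
    #
    #     loss = relu(1 - pred * label)   # i.e. max(0, 1 - x_ij * y_ij)
    #     loss = loss if norm == "L1" else loss ** 2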
......
......@@ -19,33 +19,16 @@ from .tensor import broadcast_to, transpose
def topk_accuracy(
logits: Tensor, target: Tensor, topk: Union[int, Iterable[int]] = 1
) -> Union[Tensor, Iterable[Tensor]]:
r"""
Calculates the classification accuracy given predicted logits and ground-truth labels.
r"""Calculates the classification accuracy given predicted logits and ground-truth labels.
:param logits: model predictions of shape `[batch_size, num_classes]`,
representing the probability (likelyhood) of each class.
:param target: ground-truth labels, 1d tensor of int32.
:param topk: specifies the topk values, could be an int or tuple of ints. Default: 1
:return: tensor(s) of classification accuracy between 0.0 and 1.0.
Args:
logits: model predictions of shape `[batch_size, num_classes]`,
representing the probability (likelyhood) of each class.
target: ground-truth labels, 1d tensor of int32.
topk: specifies the topk values, could be an int or tuple of ints. Default: 1
Examples:
.. testcode::
import numpy as np
from megengine import tensor
import megengine.functional as F
logits = tensor(np.arange(80, dtype=np.int32).reshape(8,10))
target = tensor(np.arange(8, dtype=np.int32))
top1, top5 = F.metric.topk_accuracy(logits, target, (1, 5))
print(top1.numpy(), top5.numpy())
Outputs:
.. testoutput::
0.0 0.375
Returns:
tensor(s) of classification accuracy between 0.0 and 1.0.
"""
if isinstance(topk, int):
topk = (topk,)
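# A short usage sketch; the values and the expected output come from the
# doctest in the rST version of this docstring.
def _example_topk_accuracy():
    import numpy as np
    from megengine import tensor
    import megengine.functional as F

    logits = tensor(np.arange(80, dtype=np.int32).reshape(8, 10))
    target = tensor(np.arange(8, dtype=np.int32))
    top1, top5 = F.metric.topk_accuracy(logits, target, (1, 5))
    print(top1.numpy(), top5.numpy())  # 0.0 0.375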
......
......@@ -28,32 +28,28 @@ def conv_bias_activation(
conv_mode="cross_correlation",
compute_mode="default",
) -> Tensor:
"""
Convolution bias with activation operation, only for inference.
:param inp: feature map of the convolution operation.
:param weight: convolution kernel.
:param bias: bias added to the result of convolution
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides
of its spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
:type conv_mode: string or :class:`Convolution.Mode`.
:param conv_mode: supports 'cross_correlation' or 'convolution'. Default:
'cross_correlation'
:param dtype: support for ``np.dtype``, Default: np.int8
:type compute_mode: string or
:class:`Convolution.ComputeMode`.
:param compute_mode: when set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result,
but only effective when input and output are of float16 dtype.
r"""Convolution bias with activation operation, only for inference.
Args:
inp: feature map of the convolution operation.
weight: convolution kernel.
bias: bias added to the result of convolution
stride: stride of the 2D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides
of its spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 2D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
conv_mode: supports 'cross_correlation' or 'convolution'. Default:
'cross_correlation'
dtype: support for ``np.dtype``, Default: np.int8
compute_mode: when set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result,
but only effective when input and output are of float16 dtype.
"""
ph, pw = _pair(padding)
sh, sw = _pair_nonzero(stride)
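    # e.g. with groups=4, in_channels=16 and out_channels=32, the shape rule in
    # the Args above gives a weight of (4, 32 // 4, 16 // 4, kh, kw) == (4, 8, 4, kh, kw)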
......@@ -91,32 +87,28 @@ def batch_conv_bias_activation(
conv_mode="cross_correlation",
compute_mode="default",
) -> Tensor:
"""
Batch convolution bias with activation operation, only for inference.
:param inp: feature map of the convolution operation.
:param weight: convolution kernel in batched way.
:param bias: bias added to the result of convolution
:param stride: stride of the 2D convolution operation. Default: 1
:param padding: size of the paddings added to the input on both sides
of its spatial dimensions. Only zero-padding is supported. Default: 0
:param dilation: dilation of the 2D convolution operation. Default: 1
:param groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
:type conv_mode: string or :class:`Convolution.Mode`.
:param conv_mode: supports 'cross_correlation' or 'convolution'. Default:
'cross_correlation'
:param dtype: support for ``np.dtype``, Default: np.int8
:type compute_mode: string or
:class:`Convolution.ComputeMode`.
:param compute_mode: when set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result,
but only effective when input and output are of float16 dtype.
r"""Batch convolution bias with activation operation, only for inference.
Args:
inp: feature map of the convolution operation.
weight: convolution kernel in batched way.
bias: bias added to the result of convolution
stride: stride of the 2D convolution operation. Default: 1
padding: size of the paddings added to the input on both sides
of its spatial dimensions. Only zero-padding is supported. Default: 0
dilation: dilation of the 2D convolution operation. Default: 1
groups: number of groups into which the input and output channels are divided,
so as to perform a "grouped convolution". When ``groups`` is not 1,
``in_channels`` and ``out_channels`` must be divisible by ``groups``,
and the shape of weight should be `(groups, out_channel // groups,
in_channels // groups, height, width)`.
conv_mode: supports 'cross_correlation' or 'convolution'. Default:
'cross_correlation'
dtype: support for ``np.dtype``, Default: np.int8
compute_mode: when set to "default", no special requirements will be
placed on the precision of intermediate results. When set to "float32",
"float32" would be used for accumulator and intermediate result,
but only effective when input and output are of float16 dtype.
"""
ph, pw = _pair(padding)
sh, sw = _pair_nonzero(stride)
......
......@@ -19,37 +19,36 @@ __all__ = ["topk_accuracy"]
def _assert_equal(
expect: Tensor, actual: Tensor, *, maxerr: float = 0.0001, verbose: bool = False
):
r"""
Asserts two tensors equal and returns expected value (first input).
r"""Asserts two tensors equal and returns expected value (first input).
It is a variant of python assert which is symbolically traceable (similar to ``numpy.testing.assert_equal``).
If we want to verify the correctness of model, just ``assert`` its states and outputs.
While sometimes we need to verify the correctness at different backends for *dumped* model
(or in :class:`~jit.trace` context), and no python code could be executed in that case.
Thus we have to use :func:`~functional.utils._assert_equal` instead.
    Args:
        expect: expected tensor value
        actual: tensor to check value
        maxerr: max allowed error; error is defined as the minimal of absolute and relative error
        verbose: whether to print maxerr to stdout during opr exec

    Examples:

    .. testcode::

        import numpy as np
        from megengine import tensor
        import megengine.functional as F

        x = tensor([1, 2, 3], np.float32)
        y = tensor([1, 2, 3], np.float32)
        print(F.utils._assert_equal(x, y, maxerr=0).numpy())

    Outputs:

    .. testoutput::

        [1. 2. 3.]
"""
err = (
abs(expect - actual)
......
......@@ -7,24 +7,24 @@
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
class FetcherError(Exception):
"""Base class for fetch related error."""
r"""Base class for fetch related error."""
class InvalidRepo(FetcherError):
"""The repo provided was somehow invalid."""
r"""The repo provided was somehow invalid."""
class InvalidGitHost(FetcherError):
"""The git host provided was somehow invalid."""
r"""The git host provided was somehow invalid."""
class GitPullError(FetcherError):
"""A git pull error occurred."""
r"""A git pull error occurred."""
class GitCheckoutError(FetcherError):
"""A git checkout error occurred."""
r"""A git checkout error occurred."""
class InvalidProtocol(FetcherError):
"""The protocol provided was somehow invalid."""
r"""The protocol provided was somehow invalid."""
......@@ -102,24 +102,18 @@ class GitSSHFetcher(RepoFetcherBase):
commit: str = None,
silent: bool = True,
) -> str:
"""
Fetches git repo by SSH protocol
:param git_host:
host address of git repo.
Example: github.com
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param use_cache:
whether to use locally fetched code or completely re-fetch.
:param commit:
commit id on github or gitlab.
:param silent:
whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen.
:return:
"""Fetches git repo by SSH protocol
Args:
git_host: host address of git repo. Eg: github.com
repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"``
use_cache: whether to use locally fetched code or completely re-fetch.
commit: commit id on github or gitlab.
silent: whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen.
Returns:
directory where the repo code is stored.
"""
if not cls._check_git_host(git_host):
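        # Hedged usage sketch; the repo string reuses the docstring's example,
        # and GitHTTPSFetcher.fetch below follows the same calling pattern:
        #
        #     repo_dir = GitSSHFetcher.fetch(
        #         git_host="github.com",
        #         repo_info="brain_sdk/MegBrain:hub",
        #         use_cache=True,
        #     )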
......@@ -217,24 +211,19 @@ class GitHTTPSFetcher(RepoFetcherBase):
commit: str = None,
silent: bool = True,
) -> str:
"""
Fetches git repo by HTTPS protocol.
:param git_host:
host address of git repo.
Example: github.com
:param repo_info:
a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified.
Example: ``"brain_sdk/MegBrain[:hub]"``
:param use_cache:
whether to use locally cached code or completely re-fetch.
:param commit:
commit id on github or gitlab.
:param silent:
whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen.
:return:
"""Fetches git repo by HTTPS protocol.
Args:
git_host: host address of git repo. Eg: github.com
repo_info: a string with format ``"repo_owner/repo_name[:tag_name/:branch_name]"`` with an optional
tag/branch. The default branch is ``master`` if not specified. Eg: ``"brain_sdk/MegBrain[:hub]"``
use_cache: whether to use locally cached code or completely re-fetch.
commit: commit id on github or gitlab.
silent: whether to accept the stdout and stderr of the subprocess with PIPE, instead of
displaying on the screen.
Returns:
directory where the repo code is stored.
"""
if not cls._check_git_host(git_host):
......
......@@ -9,12 +9,12 @@
class GraphOptimizationConfig:
r"""
Configuration for graph optimization: False for OFF, True for ON. The default value
r"""Configuration for graph optimization: False for OFF, True for ON. The default value
None means that opt_level will decide whther this optimization will be applied or not.
:param jit_fuse_dimshuffle: whether to fuse dimshuffle in JIT optimization
:param jit_fuse_reduce: whether to fuse reduce in JIT optimization
Args:
jit_fuse_dimshuffle: whether to fuse dimshuffle in JIT optimization
jit_fuse_reduce: whether to fuse reduce in JIT optimization
"""
def __init__(self):
......
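# A hedged sketch of toggling the flags documented above; how the config is
# consumed (e.g. by trace or opt_level) lies outside this hunk.
config = GraphOptimizationConfig()
config.jit_fuse_dimshuffle = True   # force dimshuffle fusion ON in JIT
config.jit_fuse_reduce = False      # force reduce fusion OFF in JIT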
......@@ -14,9 +14,7 @@ from .module import Module
class BatchMatMulActivation(Module):
r"""
Batched :func:`~.matmul` with activation(only :func:`~.relu` supported), no transpose anywhere.
"""
r"""Batched :func:`~.matmul` with activation(only :func:`~.relu` supported), no transpose anywhere."""
def __init__(
self,
......