diff --git a/imperative/python/megengine/autodiff/grad_manager.py b/imperative/python/megengine/autodiff/grad_manager.py
index 001c9f9de4eae5e033a64a2993eeb464b63e594b..f63deede49ab155fedd98098ec9b3403cf52ee42 100644
--- a/imperative/python/megengine/autodiff/grad_manager.py
+++ b/imperative/python/megengine/autodiff/grad_manager.py
@@ -20,42 +20,42 @@ class GradManager:
     the forward operations start and when all resources should be released. A typical usage of
     GradManager is as follows:
 
-        .. code-block::
+    .. code-block::
 
-            gm = GradManager()
-            gm.attach(model.parameters())
-            with gm:
-                # forward operations
-                ...
-                # backward gradients
-                gm.backward(loss)
+        gm = GradManager()
+        gm.attach(model.parameters())
+        with gm:
+            # forward operations
+            ...
+            # backward gradients
+            gm.backward(loss)
 
-    You can also use `record()` and `release()` method instead of `with` context:
+    You can also use ``record()`` and ``release()`` methods instead of ``with`` context:
 
-        .. code-block::
+    .. code-block::
 
-            gm = GradManager()
-            gm.attach(model.parameters())
+        gm = GradManager()
+        gm.attach(model.parameters())
 
-            gm.record()
+        gm.record()
 
-            # forward operations
-            ...
-            # backward gradients
-            gm.backward(loss)
+        # forward operations
+        ...
+        # backward gradients
+        gm.backward(loss)
 
-            gm.release()
+        gm.release()
 
     Typically, in data parallel, we would like to average the gradients across
     processes. Users will finally get the averaged gradients if an "AllReduce"
     callback is registered as follows:
 
-        .. code-block::
+    .. code-block::
 
-            import megengine.distributed as dist
+        import megengine.distributed as dist
 
-            gm = GradManager()
-            gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))
+        gm = GradManager()
+        gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))
 
     """
 
diff --git a/imperative/python/megengine/data/dataloader.py b/imperative/python/megengine/data/dataloader.py
index a92dff7ac876681147ab32e5894839fa25bfc849..2a818a29781e1f5c579d09e563eccbf22d72fac9 100644
--- a/imperative/python/megengine/data/dataloader.py
+++ b/imperative/python/megengine/data/dataloader.py
@@ -50,7 +50,6 @@ class DataLoader:
     :param dataset: dataset from which to load the minibatch.
     :type sampler: Sampler
     :param sampler: defines the strategy to sample data from the dataset.
-        If specified, :attr:`shuffle` must be ``False``.
     :type transform: Transform
     :param transform: defined the transforming strategy for a sampled batch.
         Default: None
diff --git a/imperative/python/megengine/functional/__init__.py b/imperative/python/megengine/functional/__init__.py
index 37455891a26dd00cb60e6ec3a4482034fe0bbb95..2d3240fab3295e4d0ba33b826d7c162031d205ff 100644
--- a/imperative/python/megengine/functional/__init__.py
+++ b/imperative/python/megengine/functional/__init__.py
@@ -17,4 +17,4 @@ from . import distributed  # isort:skip
 
 # delete namespace
 # pylint: disable=undefined-variable
-# del elemwise, graph, loss, math, nn, tensor  # type: ignore[name-defined]
+del elemwise, graph, loss, math, nn, quantized, tensor, utils  # type: ignore[name-defined]
diff --git a/imperative/python/megengine/functional/loss.py b/imperative/python/megengine/functional/loss.py
index 67a296678a4acf4a72aaf239f99fc10b02d9ae24..0ef622d57d032d315c180e4a141ef0d1a3e99866 100644
--- a/imperative/python/megengine/functional/loss.py
+++ b/imperative/python/megengine/functional/loss.py
@@ -127,9 +127,10 @@ def cross_entropy(
     with_logits: bool = True,
     label_smooth: float = 0,
 ) -> Tensor:
-    r"""Compute the multi-class cross entropy loss (using logits by default).
+    r"""Computes the multi-class cross entropy loss (using logits by default).
 
-    By default, prediction is assumed to be logits, whose softmax gives probabilities.
+    By default (``with_logits`` is True), ``pred`` is assumed to be logits,
+    and class probabilities are given by softmax.
 
     It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`.
 
@@ -194,9 +195,10 @@ def cross_entropy(
 def binary_cross_entropy(
     pred: Tensor, label: Tensor, with_logits: bool = True
 ) -> Tensor:
-    r"""Compute the binary cross entropy loss (using logits by default).
+    r"""Computes the binary cross entropy loss (using logits by default).
 
-    By default, prediction is assumed to be logits, whose sigmoid gives probabilities.
+    By default (``with_logits`` is True), ``pred`` is assumed to be logits,
+    and class probabilities are given by sigmoid.
 
     :param pred: `(N, *)`, where `*` means any number of additional dimensions.
     :param label: `(N, *)`, same shape as the input.
diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py
index 7d120b8ea71d5eef88804f1e2a30f3651094afb7..7a5f8829df99ed966e687304efc63b716d73186d 100644
--- a/imperative/python/megengine/functional/nn.py
+++ b/imperative/python/megengine/functional/nn.py
@@ -335,8 +335,8 @@ def adaptive_max_pool2d(
 
     Refer to :class:`~.MaxAdaptivePool2d` for more information.
 
-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
     :return: output tensor.
     """
     assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
@@ -356,8 +356,8 @@ def adaptive_avg_pool2d(
 
     Refer to :class:`~.AvgAdaptivePool2d` for more information.
 
-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
     :return: output tensor.
     """
     assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
diff --git a/imperative/python/megengine/module/adaptive_pooling.py b/imperative/python/megengine/module/adaptive_pooling.py
index 99e7c57d272fdfb231dca5fc3a5f45100b57d83a..c0cbf3b2087a40eea02283ec6412e3693d87e381 100644
--- a/imperative/python/megengine/module/adaptive_pooling.py
+++ b/imperative/python/megengine/module/adaptive_pooling.py
@@ -40,10 +40,10 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):
                 \text{stride[1]} \times w + n)
         \end{aligned}
 
-    Kernel_size and stride can be inferred from input shape and out shape:
-    padding: (0, 0)
-    stride: (floor(IH / OH), floor(IW / OW))
-    kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+    ``kernel_size`` and ``stride`` can be inferred from input shape and out shape:
+     * padding: (0, 0)
+     * stride: (floor(IH / OH), floor(IW / OW))
+     * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
 
     Examples:
 
@@ -83,10 +83,10 @@ class AdaptiveAvgPool2d(_AdaptivePoolNd):
         out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
         input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
 
-    Kernel_size and stride can be inferred from input shape and out shape:
-    padding: (0, 0)
-    stride: (floor(IH / OH), floor(IW / OW))
-    kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+    ``kernel_size`` and ``stride`` can be inferred from input shape and out shape:
+     * padding: (0, 0)
+     * stride: (floor(IH / OH), floor(IW / OW))
+     * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
 
     Examples:
 
diff --git a/imperative/python/megengine/module/module.py b/imperative/python/megengine/module/module.py
index 856c0f01d81cca90360c056aaa06017c0b0b7638..3295d77c7f85c3278e8469ab67b92d78a4407bac 100644
--- a/imperative/python/megengine/module/module.py
+++ b/imperative/python/megengine/module/module.py
@@ -351,7 +351,7 @@ class Module(metaclass=ABCMeta):
     def replace_param(
         self, params: dict, start_pos: int, seen: Optional[Set[int]] = None
     ):
-        """Replaces module's parameters with `params`, used by :class:`~.ParamPack` to
+        """Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to
         speedup multimachine training.
         """
         offset = 0
@@ -411,7 +411,7 @@ class Module(metaclass=ABCMeta):
         If ``strict`` is ``True``, the keys of :func:`state_dict` must exactly match the keys
         returned by :func:`state_dict`.
 
-        Users can also pass a closure: `Function[key: str, var: Tensor] -> Optional[np.ndarray]`
+        Users can also pass a closure: ``Function[key: str, var: Tensor] -> Optional[np.ndarray]``
         as a `state_dict`, in order to handle complex situations. For example, load everything
         except for the final linear classifier:
 
@@ -423,7 +423,7 @@ class Module(metaclass=ABCMeta):
                 for k, v in state_dict.items()
             }, strict=False)
 
-        Here returning `None` means skipping parameter `k`.
+        Here returning ``None`` means skipping parameter ``k``.
 
         To prevent shape mismatch (e.g. load PyTorch weights), we can reshape before loading:
 
@@ -485,9 +485,8 @@ class Module(metaclass=ABCMeta):
         )
 
     def _load_state_dict_with_closure(self, closure):
-        """Advance state_dict load through callable `closure` whose signature is
-
-        `closure(key: str, var: Tensor) -> Union[np.ndarry, None]`
+        """Advance state_dict load through callable ``closure`` whose signature is
+        ``closure(key: str, var: Tensor) -> Union[np.ndarray, None]``
        """
         assert callable(closure), "closure must be a function"