Commit 67013463 authored by Megvii Engine Team

docs(mge): fix some docstring format problem

GitOrigin-RevId: cbc5ab04b368246f1ae6d9e797703c92f2e524c2
Parent 5cc043f0
@@ -20,42 +20,42 @@ class GradManager:
    the forward operations start and when all resources should be released. A typical usage of
    GradManager is as follows:

    .. code-block::

        gm = GradManager()
        gm.attach(model.parameters())
        with gm:
            # forward operations
            ...
            # backward gradients
            gm.backward(loss)

-   You can also use `record()` and `release()` method instead of `with` context:
+   You can also use ``record()`` and ``release()`` method instead of ``with`` context:

    .. code-block::

        gm = GradManager()
        gm.attach(model.parameters())
        gm.record()
        # forward operations
        ...
        # backward gradients
        gm.backward(loss)
        gm.release()

    Typically, in data parallel, we would like to average the gradients across
    processes. Users will finally get the averaged gradients if an "AllReduce"
    callback is registered as follows:

    .. code-block::

        import megengine.distributed as dist

        gm = GradManager()
        gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))
    """
@@ -50,7 +50,6 @@ class DataLoader:
    :param dataset: dataset from which to load the minibatch.
    :type sampler: Sampler
    :param sampler: defines the strategy to sample data from the dataset.
-       If specified, :attr:`shuffle` must be ``False``.
    :type transform: Transform
    :param transform: defined the transforming strategy for a sampled batch.
        Default: None
@@ -17,4 +17,4 @@ from . import distributed  # isort:skip

 # delete namespace
 # pylint: disable=undefined-variable
-# del elemwise, graph, loss, math, nn, tensor  # type: ignore[name-defined]
+del elemwise, graph, loss, math, nn, quantized, tensor, utils  # type: ignore[name-defined]
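For readers unfamiliar with the pattern being un-commented here: star-importing a submodule in a package ``__init__`` also binds the submodule object itself as a name, so packages often ``del`` those names to keep the public namespace clean. A minimal sketch with a hypothetical package ``mypkg`` (not MegEngine's actual layout):

.. code-block:: python

    # mypkg/__init__.py -- hypothetical package illustrating the pattern
    from .ops import *  # re-exports public functions and also binds the submodule as `ops`

    # Drop the submodule binding so `mypkg.ops` is not part of the public API surface.
    # Static checkers see `ops` as undefined here, hence the suppressions.
    # pylint: disable=undefined-variable
    del ops  # type: ignore[name-defined]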
@@ -127,9 +127,10 @@ def cross_entropy(
    with_logits: bool = True,
    label_smooth: float = 0,
) -> Tensor:
-    r"""Compute the multi-class cross entropy loss (using logits by default).
-    By default, prediction is assumed to be logits, whose softmax gives probabilities.
+    r"""Computes the multi-class cross entropy loss (using logits by default).
+    By default(``with_logitis`` is True), ``pred`` is assumed to be logits,
+    class probabilities are given by softmax.

    It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`.
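To make the ``with_logits`` behaviour concrete: with the default ``with_logits=True`` the softmax is fused into the loss, which should agree (up to rounding) with applying :func:`~.softmax` first and passing probabilities with ``with_logits=False``. A sketch, assuming the function is reachable as ``megengine.functional.loss.cross_entropy`` in your version:

.. code-block:: python

    import numpy as np
    import megengine as mge
    import megengine.functional as F

    logits = mge.tensor(np.random.randn(4, 10).astype("float32"))
    label = mge.tensor(np.random.randint(10, size=(4,)).astype("int32"))

    loss_fused = F.loss.cross_entropy(logits, label)  # with_logits=True (default)
    loss_manual = F.loss.cross_entropy(F.softmax(logits, axis=1), label, with_logits=False)

    # The two agree up to floating-point error; the fused form is more stable.
    np.testing.assert_allclose(loss_fused.numpy(), loss_manual.numpy(), rtol=1e-5)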
@@ -194,9 +195,10 @@ def cross_entropy(
def binary_cross_entropy(
    pred: Tensor, label: Tensor, with_logits: bool = True
) -> Tensor:
-    r"""Compute the binary cross entropy loss (using logits by default).
-    By default, prediction is assumed to be logits, whose sigmoid gives probabilities.
+    r"""Computes the binary cross entropy loss (using logits by default).
+    By default(``with_logitis`` is True), ``pred`` is assumed to be logits,
+    class probabilities are given by sigmoid.

    :param pred: `(N, *)`, where `*` means any number of additional dimensions.
    :param label: `(N, *)`, same shape as the input.
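Analogously for the binary case (same import-path caveat as above): logits with the default ``with_logits=True`` should match sigmoid probabilities passed with ``with_logits=False``:

.. code-block:: python

    import numpy as np
    import megengine as mge
    import megengine.functional as F

    pred = mge.tensor(np.random.randn(4, 5).astype("float32"))  # logits, (N, *)
    label = mge.tensor(np.random.randint(2, size=(4, 5)).astype("float32"))

    a = F.loss.binary_cross_entropy(pred, label)  # logits in, sigmoid fused
    b = F.loss.binary_cross_entropy(F.sigmoid(pred), label, with_logits=False)
    np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-5)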
@@ -335,8 +335,8 @@ def adaptive_max_pool2d(
    Refer to :class:`~.MaxAdaptivePool2d` for more information.

-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
    :return: output tensor.
    """
    assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
@@ -356,8 +356,8 @@ def adaptive_avg_pool2d(
    Refer to :class:`~.AvgAdaptivePool2d` for more information.

-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
    :return: output tensor.
    """
    assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
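A quick usage sketch of the two functional forms documented above, assuming they are exported under ``megengine.functional``: whatever the spatial size of the input, the output is pooled to exactly ``oshp``:

.. code-block:: python

    import numpy as np
    import megengine as mge
    import megengine.functional as F

    inp = mge.tensor(np.random.randn(1, 3, 7, 7).astype("float32"))  # (N, C, IH, IW)

    out_max = F.adaptive_max_pool2d(inp, oshp=(3, 3))
    out_avg = F.adaptive_avg_pool2d(inp, oshp=(3, 3))
    print(out_max.shape, out_avg.shape)  # both (1, 3, 3, 3)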
@@ -40,10 +40,10 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):
            \text{stride[1]} \times w + n)
        \end{aligned}

-    Kernel_size and stride can be inferred from input shape and out shape:
-        padding: (0, 0)
-        stride: (floor(IH / OH), floor(IW / OW))
-        kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+    ``kernel_size`` and ``stride`` can be inferred from input shape and out shape:
+    * padding: (0, 0)
+    * stride: (floor(IH / OH), floor(IW / OW))
+    * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

    Examples:
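The inference rule in the bullet list above can be verified by hand. A small hypothetical helper (not part of MegEngine) reproducing it for a 7x7 input pooled to 3x3, which yields stride (2, 2) and kernel_size (3, 3):

.. code-block:: python

    import math

    def inferred_pool_params(ih, iw, oh, ow):
        """Hypothetical helper mirroring the docstring's inference rule."""
        stride = (math.floor(ih / oh), math.floor(iw / ow))  # floor(IH/OH), floor(IW/OW)
        kernel = (ih - (oh - 1) * stride[0], iw - (ow - 1) * stride[1])
        return (0, 0), stride, kernel  # padding is always (0, 0)

    print(inferred_pool_params(7, 7, 3, 3))  # ((0, 0), (2, 2), (3, 3))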
@@ -83,10 +83,10 @@ class AdaptiveAvgPool2d(_AdaptivePoolNd):
        out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
            input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)

-    Kernel_size and stride can be inferred from input shape and out shape:
-        padding: (0, 0)
-        stride: (floor(IH / OH), floor(IW / OW))
-        kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+    ``kernel_size`` and ``stride`` can be inferred from input shape and out shape:
+    * padding: (0, 0)
+    * stride: (floor(IH / OH), floor(IW / OW))
+    * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)

    Examples:
@@ -351,7 +351,7 @@ class Module(metaclass=ABCMeta):
    def replace_param(
        self, params: dict, start_pos: int, seen: Optional[Set[int]] = None
    ):
-        """Replaces module's parameters with `params`, used by :class:`~.ParamPack` to
+        """Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to
        speedup multimachine training.
        """
        offset = 0
@@ -411,7 +411,7 @@ class Module(metaclass=ABCMeta):
        If ``strict`` is ``True``, the keys of :func:`state_dict` must exactly match the keys
        returned by :func:`state_dict`.

-        Users can also pass a closure: `Function[key: str, var: Tensor] -> Optional[np.ndarray]`
+        Users can also pass a closure: ``Function[key: str, var: Tensor] -> Optional[np.ndarray]``
        as a `state_dict`, in order to handle complex situations. For example, load everything
        except for the final linear classifier:
@@ -423,7 +423,7 @@ class Module(metaclass=ABCMeta):
            for k, v in state_dict.items()
        }, strict=False)

-        Here returning `None` means skipping parameter `k`.
+        Here returning ``None`` means skipping parameter ``k``.

        To prevent shape mismatch (e.g. load PyTorch weights), we can reshape before loading:
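The reshape example itself is collapsed in this diff, so here is a sketch of what such a closure can look like, combining both idioms: skip the final classifier and reshape everything else. The names ``model``, ``state_dict``, and the ``"fc."`` prefix are hypothetical:

.. code-block:: python

    import numpy as np

    # Hypothetical checkpoint: parameter name -> numpy array (e.g. from np.load).
    state_dict = {
        "conv.weight": np.zeros((8, 3, 3, 3), dtype="float32"),
        "fc.weight": np.zeros((10, 128), dtype="float32"),
    }

    def closure(key, var):
        # Returning None skips the parameter, as documented above.
        if key.startswith("fc."):  # hypothetical final-classifier prefix
            return None
        # Reshape to the destination tensor's shape to avoid mismatch errors.
        return np.asarray(state_dict[key]).reshape(tuple(var.shape))

    # `model` is any Module instance; the closure is passed in place of a dict.
    model.load_state_dict(closure, strict=False)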
@@ -485,9 +485,8 @@ class Module(metaclass=ABCMeta):
        )

    def _load_state_dict_with_closure(self, closure):
-        """Advance state_dict load through callable `closure` whose signature is
-
-        `closure(key: str, var: Tensor) -> Union[np.ndarry, None]`
+        """Advance state_dict load through callable ``closure`` whose signature is
+        ``closure(key: str, var: Tensor) -> Union[np.ndarry, None]``
        """
        assert callable(closure), "closure must be a function"