diff --git a/imperative/python/megengine/autodiff/grad_manager.py b/imperative/python/megengine/autodiff/grad_manager.py
index 001c9f9de4eae5e033a64a2993eeb464b63e594b..f63deede49ab155fedd98098ec9b3403cf52ee42 100644
--- a/imperative/python/megengine/autodiff/grad_manager.py
+++ b/imperative/python/megengine/autodiff/grad_manager.py
@@ -20,42 +20,42 @@ class GradManager:
     the forward operations start and when all resources should be released. A typical usage of
     GradManager is as follows:
 
-        .. code-block::
+    .. code-block::
 
-            gm = GradManager()
-            gm.attach(model.parameters())
-            with gm:
-                # forward operations
-                ...
-                # backward gradients
-                gm.backward(loss)
+        gm = GradManager()
+        gm.attach(model.parameters())
+        with gm:
+            # forward operations
+            ...
+            # backward gradients
+            gm.backward(loss)
 
-    You can also use `record()` and `release()` method instead of `with` context:
+    You can also use ``record()`` and ``release()`` methods instead of ``with`` context:
 
-        .. code-block::
+    .. code-block::
 
-            gm = GradManager()
-            gm.attach(model.parameters())
+        gm = GradManager()
+        gm.attach(model.parameters())
 
-            gm.record()
+        gm.record()
 
-            # forward operations
-            ...
-            # backward gradients
-            gm.backward(loss)
+        # forward operations
+        ...
+        # backward gradients
+        gm.backward(loss)
 
-            gm.release()
+        gm.release()
 
     Typically, in data parallel, we would like to average the gradients across
     processes. Users will finally get the averaged gradients if an "AllReduce"
     callback is registered as follows:
 
-        .. code-block::
+    .. code-block::
 
-            import megengine.distributed as dist
+        import megengine.distributed as dist
 
-            gm = GradManager()
-            gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))
+        gm = GradManager()
+        gm.attach(model.parameters(), callback=dist.make_allreduce_cb("MEAN"))
 
     """
 
diff --git a/imperative/python/megengine/data/dataloader.py b/imperative/python/megengine/data/dataloader.py
index a92dff7ac876681147ab32e5894839fa25bfc849..2a818a29781e1f5c579d09e563eccbf22d72fac9 100644
--- a/imperative/python/megengine/data/dataloader.py
+++ b/imperative/python/megengine/data/dataloader.py
@@ -50,7 +50,6 @@ class DataLoader:
     :param dataset: dataset from which to load the minibatch.
     :type sampler: Sampler
     :param sampler: defines the strategy to sample data from the dataset.
-        If specified, :attr:`shuffle` must be ``False``.
     :type transform: Transform
     :param transform: defined the transforming strategy for a sampled batch.
         Default: None
diff --git a/imperative/python/megengine/functional/__init__.py b/imperative/python/megengine/functional/__init__.py
index 37455891a26dd00cb60e6ec3a4482034fe0bbb95..2d3240fab3295e4d0ba33b826d7c162031d205ff 100644
--- a/imperative/python/megengine/functional/__init__.py
+++ b/imperative/python/megengine/functional/__init__.py
@@ -17,4 +17,4 @@ from . import distributed  # isort:skip
 
 # delete namespace
 # pylint: disable=undefined-variable
-# del elemwise, graph, loss, math, nn, tensor  # type: ignore[name-defined]
+del elemwise, graph, loss, math, nn, quantized, tensor, utils  # type: ignore[name-defined]
diff --git a/imperative/python/megengine/functional/loss.py b/imperative/python/megengine/functional/loss.py
index 67a296678a4acf4a72aaf239f99fc10b02d9ae24..0ef622d57d032d315c180e4a141ef0d1a3e99866 100644
--- a/imperative/python/megengine/functional/loss.py
+++ b/imperative/python/megengine/functional/loss.py
@@ -127,9 +127,10 @@ def cross_entropy(
     with_logits: bool = True,
     label_smooth: float = 0,
 ) -> Tensor:
-    r"""Compute the multi-class cross entropy loss (using logits by default).
+    r"""Computes the multi-class cross entropy loss (using logits by default).
 
-    By default, prediction is assumed to be logits, whose softmax gives probabilities.
+    By default (``with_logits`` is True), ``pred`` is assumed to be logits,
+    and class probabilities are given by softmax.
 
     It has better numerical stability compared with sequential calls to :func:`~.softmax` and :func:`~.cross_entropy`.
 
@@ -194,9 +195,10 @@ def cross_entropy(
 def binary_cross_entropy(
     pred: Tensor, label: Tensor, with_logits: bool = True
 ) -> Tensor:
-    r"""Compute the binary cross entropy loss (using logits by default).
+    r"""Computes the binary cross entropy loss (using logits by default).
 
-    By default, prediction is assumed to be logits, whose sigmoid gives probabilities.
+    By default (``with_logits`` is True), ``pred`` is assumed to be logits,
+    and class probabilities are given by sigmoid.
 
     :param pred: `(N, *)`, where `*` means any number of additional dimensions.
     :param label: `(N, *)`, same shape as the input.
diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py
index 7d120b8ea71d5eef88804f1e2a30f3651094afb7..7a5f8829df99ed966e687304efc63b716d73186d 100644
--- a/imperative/python/megengine/functional/nn.py
+++ b/imperative/python/megengine/functional/nn.py
@@ -335,8 +335,8 @@ def adaptive_max_pool2d(
 
     Refer to :class:`~.MaxAdaptivePool2d` for more information.
 
-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
     :return: output tensor.
     """
     assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
@@ -356,8 +356,8 @@ def adaptive_avg_pool2d(
 
     Refer to :class:`~.AvgAdaptivePool2d` for more information.
 
-    :param inp: The input tensor.
-    :param oshp: (OH, OW) size of the output shape.
+    :param inp: input tensor.
+    :param oshp: `(OH, OW)` size of the output shape.
     :return: output tensor.
     """
     assert isinstance(inp, (Tensor, megbrain_graph.VarNode)), "inp must be Tensor type"
diff --git a/imperative/python/megengine/module/adaptive_pooling.py b/imperative/python/megengine/module/adaptive_pooling.py
index 99e7c57d272fdfb231dca5fc3a5f45100b57d83a..c0cbf3b2087a40eea02283ec6412e3693d87e381 100644
--- a/imperative/python/megengine/module/adaptive_pooling.py
+++ b/imperative/python/megengine/module/adaptive_pooling.py
@@ -40,10 +40,10 @@ class AdaptiveMaxPool2d(_AdaptivePoolNd):
                 \text{stride[1]} \times w + n)
         \end{aligned}
 
-    Kernel_size and stride can be inferred from input shape and out shape:
-    padding: (0, 0)
-    stride: (floor(IH / OH), floor(IW / OW))
-    kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+    ``kernel_size`` and ``stride`` can be inferred from input shape and out shape:
+     * padding: (0, 0)
+     * stride: (floor(IH / OH), floor(IW / OW))
+     * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
 
     Examples:
 
@@ -83,10 +83,10 @@ class AdaptiveAvgPool2d(_AdaptivePoolNd):
         out(N_i, C_j, h, w) = \frac{1}{kH * kW} \sum_{m=0}^{kH-1} \sum_{n=0}^{kW-1}
         input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)
 
-    Kernel_size and stride can be inferred from input shape and out shape:
-    padding: (0, 0)
-    stride: (floor(IH / OH), floor(IW / OW))
-    kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
+    ``kernel_size`` and ``stride`` can be inferred from input shape and out shape:
+     * padding: (0, 0)
+     * stride: (floor(IH / OH), floor(IW / OW))
+     * kernel_size: (IH - (OH - 1) * stride_h, IW - (OW - 1) * stride_w)
 
     Examples:
 
diff --git a/imperative/python/megengine/module/module.py b/imperative/python/megengine/module/module.py
index 856c0f01d81cca90360c056aaa06017c0b0b7638..3295d77c7f85c3278e8469ab67b92d78a4407bac 100644
--- a/imperative/python/megengine/module/module.py
+++ b/imperative/python/megengine/module/module.py
@@ -351,7 +351,7 @@ class Module(metaclass=ABCMeta):
     def replace_param(
         self, params: dict, start_pos: int, seen: Optional[Set[int]] = None
     ):
-        """Replaces module's parameters with `params`, used by :class:`~.ParamPack` to
+        """Replaces module's parameters with ``params``, used by :class:`~.ParamPack` to
         speedup multimachine training.
         """
         offset = 0
@@ -411,7 +411,7 @@ class Module(metaclass=ABCMeta):
         If ``strict`` is ``True``, the keys of :func:`state_dict` must exactly match the keys
         returned by :func:`state_dict`.
 
-        Users can also pass a closure: `Function[key: str, var: Tensor] -> Optional[np.ndarray]`
+        Users can also pass a closure: ``Function[key: str, var: Tensor] -> Optional[np.ndarray]``
         as a `state_dict`, in order to handle complex situations. For example, load everything
         except for the final linear classifier:
 
@@ -423,7 +423,7 @@ class Module(metaclass=ABCMeta):
                 for k, v in state_dict.items()
             }, strict=False)
 
-        Here returning `None` means skipping parameter `k`.
+        Here returning ``None`` means skipping parameter ``k``.
 
         To prevent shape mismatch (e.g. load PyTorch weights), we can reshape before loading:
 
@@ -485,9 +485,8 @@ class Module(metaclass=ABCMeta):
         )
 
     def _load_state_dict_with_closure(self, closure):
-        """Advance state_dict load through callable `closure` whose signature is
-
-        `closure(key: str, var: Tensor) -> Union[np.ndarry, None]`
+        """Advance state_dict load through callable ``closure`` whose signature is
+        ``closure(key: str, var: Tensor) -> Union[np.ndarray, None]``
        """
         assert callable(closure), "closure must be a function"