diff --git a/python_module/megengine/module/activation.py b/python_module/megengine/module/activation.py
index a6bba573d7be8840a1b08dda4dbb89cd139dbf20..b80c10a798369bc9186adf1f80a24140f10a82b6 100644
--- a/python_module/megengine/module/activation.py
+++ b/python_module/megengine/module/activation.py
@@ -191,7 +191,7 @@ class LeakyReLU(Module):
     Applies the element-wise function:
 
     .. math::
-        \text{LeakyReLU}(x) = \max(0,x) + 0.01 * \min(0,x)
+        \text{LeakyReLU}(x) = \max(0,x) + negative\_slope \times \min(0,x)
 
     or
 
@@ -199,7 +199,7 @@
         \text{LeakyReLU}(x) =
         \begin{cases}
             x, & \text{ if } x \geq 0 \\
-            0.01x, & \text{ otherwise }
+            negative\_slope \times x, & \text{ otherwise }
         \end{cases}
 
     Examples:
@@ -211,7 +211,7 @@
         import megengine.module as M
 
         data = mge.tensor(np.array([-8, -12, 6, 10]).astype(np.float32))
-        leakyrelu = M.LeakyReLU()
+        leakyrelu = M.LeakyReLU(0.01)
         output = leakyrelu(data)
         print(output.numpy())
 
diff --git a/python_module/megengine/module/conv.py b/python_module/megengine/module/conv.py
index 9029c377cd98fc2404406ad5cb9be07cd0d9d1fc..fbeb50db11cbe9b9b642b6fb55fde6176fb26dbd 100644
--- a/python_module/megengine/module/conv.py
+++ b/python_module/megengine/module/conv.py
@@ -204,7 +204,7 @@ class ConvTranspose2d(_ConvNd):
     with respect to its input.
 
     Convolution usually reduces the size of input, while transposed convolution works
-    the other way, transforming a smaller input to a larger output while preserving the
+    the opposite way, transforming a smaller input to a larger output while preserving the
     connectivity pattern.
 
     :param in_channels: number of input channels.
diff --git a/python_module/megengine/module/dropout.py b/python_module/megengine/module/dropout.py
index 5deb5ea8e68551141ac9dbf92ffacc9a6dbb647b..146eba24544bd713e3c2210a78e1466317012ba6 100644
--- a/python_module/megengine/module/dropout.py
+++ b/python_module/megengine/module/dropout.py
@@ -11,9 +11,9 @@ from .module import Module
 
 
 class Dropout(Module):
-    r"""Randomly set input elements to zeros. Commonly used in large networks to prevent overfitting.
+    r"""Randomly set input elements to zeros with the probability :math:`drop\_prob` during training. Commonly used in large networks to prevent overfitting.
     Note that we perform dropout only during training, we also rescale(multiply) the output tensor
-    by :math:`\frac{1}{1 - p}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`.
+    by :math:`\frac{1}{1 - drop\_prob}`. During inference :class:`~.Dropout` is equal to :class:`~.Identity`.
 
     :param drop_prob: The probability to drop (set to zero) each single element
     """
diff --git a/python_module/megengine/module/identity.py b/python_module/megengine/module/identity.py
index 7d62ae24030bcb2b74dfef264e8949106988e0e4..51b31e505370020a14744e39054979da5c197027 100644
--- a/python_module/megengine/module/identity.py
+++ b/python_module/megengine/module/identity.py
@@ -11,5 +11,7 @@ from .module import Module
 
 
 class Identity(Module):
+    r"""A placeholder identity operator that will ignore any argument."""
+
     def forward(self, x):
         return identity(x)
diff --git a/python_module/megengine/module/init.py b/python_module/megengine/module/init.py
index 01c0bcb82f10392abb9c284298b7fe27144f6f51..8c39443ed77df233251b7ca44b7931e140b57759 100644
--- a/python_module/megengine/module/init.py
+++ b/python_module/megengine/module/init.py
@@ -176,8 +176,8 @@ def xavier_uniform_(tensor: Tensor, gain: float = 1.0) -> None:
         a = \text{gain} \times \sqrt{\frac{6}{\text{fan_in} + \text{fan_out}}}
 
     Also known as Glorot initialization. Detailed information can be retrieved from
-    `Understanding the difficulty of training deep feedforward neural networks` -
-    Glorot, X. & Bengio, Y. (2010).
+    `"Understanding the difficulty of training deep feedforward neural networks" `_.
+
 
     :param tensor: An n-dimentional tensor to be initialized
     :param gain: Scaling factor for :math:`a`.
@@ -196,8 +196,7 @@ def xavier_normal_(tensor: Tensor, gain: float = 1.0) -> None:
         \text{std} = \text{gain} \times \sqrt{\frac{2}{\text{fan_in} + \text{fan_out}}}
 
     Also known as Glorot initialization. Detailed information can be retrieved from
-    `Understanding the difficulty of training deep feedforward neural networks` -
-    Glorot, X. & Bengio, Y. (2010).
+    `"Understanding the difficulty of training deep feedforward neural networks" `_.
 
     :param tensor: An n-dimentional tensor to be initialized
     :param gain: Scaling factor for :math:`std`.
@@ -217,8 +216,9 @@
         \text{bound} = \sqrt{\frac{6}{(1 + a^2) \times \text{fan_in}}}
 
     Detailed information can be retrieved from
-    `Delving deep into rectifiers: Surpassing human-level performance on ImageNet
-    classification`
+    `"Delving deep into rectifiers: Surpassing human-level performance on ImageNet
+    classification" `_.
+
 
     :param tensor: An n-dimentional tensor to be initialized
     :param a: Optional parameter for calculating gain for leaky_relu. See
@@ -246,8 +246,8 @@
         \text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}}
 
     Detailed information can be retrieved from
-    `Delving deep into rectifiers: Surpassing human-level performance on ImageNet
-    classification`
+    `"Delving deep into rectifiers: Surpassing human-level performance on ImageNet
+    classification" `_.
 
     :param tensor: An n-dimentional tensor to be initialized
     :param a: Optional parameter for calculating gain for leaky_relu. See
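
As a quick sanity check of the two LeakyReLU formulations documented in the first hunk above, here is a minimal NumPy-only sketch (illustrative only, not MegEngine API; the variable names are mine) that evaluates both forms on the same sample values as the docstring example and confirms they agree:

import numpy as np

# Sample values from the docstring example, with negative_slope = 0.01.
data = np.array([-8, -12, 6, 10], dtype=np.float32)
negative_slope = 0.01

# Closed form: max(0, x) + negative_slope * min(0, x)
out_closed = np.maximum(0.0, data) + negative_slope * np.minimum(0.0, data)

# Piecewise form: x if x >= 0, else negative_slope * x
out_piecewise = np.where(data >= 0, data, negative_slope * data)

assert np.allclose(out_closed, out_piecewise)
print(out_closed)  # expected: [-0.08 -0.12 6. 10.]

Note that the updated docstring example passes the slope explicitly (M.LeakyReLU(0.01)) instead of relying on the default, matching the parameterized formula above.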