Unverified commit 0f19ac7c, authored by sunzhongkai588, committed by GitHub

paddle/nn fix formula bugs (#34643)

* fix paddle.optimizer test=document_fix

* fix paddle.optimizer test=document_fix

* fix bugs in paddle.nn.functional document test=document_fix

* fix bugs in paddle.nn.functional document test=document_fix

* fix bugs in paddle.nn.functional document test=document_fix

* fix bugs in paddle.nn.functional document test=document_fix

* fix nn formula bugs test=document_fix

* fix nn formula bugs test=document_fix

* fix nn formula bugs test=document_fix
Parent 6e442e6a
@@ -286,18 +286,18 @@ class ClipGradByNorm(ClipGradBase):
 .. math::
 Out =
-\\left \{
-\\begin{aligned}
-& X & & if (norm(X) \\leq clip\_norm) \\\\
-& \\frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\\\
-\\end{aligned}
-\\right.
+\left\{
+\begin{array}{ccl}
+X & & if (norm(X) \leq clip\_norm) \\
+\frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
+\end{array}
+\right.
 where :math:`norm(X)` represents the L2 norm of :math:`X`.
 .. math::
-norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}}
+norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}
 Note:
 ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
@@ -389,7 +389,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
 .. math::
-t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}
+t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}
 where:
...
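For reference, the clipping rule in the ``ClipGradByNorm`` docstring above can be checked with a short NumPy sketch (illustrative only, not Paddle's implementation; the gradient values and ``clip_norm`` are arbitrary examples):

.. code-block:: python

    import numpy as np

    def clip_by_norm(x, clip_norm):
        # Out = X                       if norm(X) <= clip_norm
        #     = clip_norm * X / norm(X) if norm(X) >  clip_norm
        norm = np.sqrt(np.sum(np.square(x)))
        if norm <= clip_norm:
            return x
        return clip_norm * x / norm

    g = np.array([3.0, 4.0])               # L2 norm is 5.0
    print(clip_by_norm(g, clip_norm=1.0))  # [0.6 0.8], rescaled to norm 1.0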
@@ -1151,9 +1151,6 @@ class InstanceNorm(layers.Layer):
 class BatchNorm(layers.Layer):
 r"""
-:alias_main: paddle.nn.BatchNorm
-:alias: paddle.nn.BatchNorm,paddle.nn.layer.BatchNorm,paddle.nn.layer.norm.BatchNorm
-:old_api: paddle.fluid.dygraph.BatchNorm
 This interface is used to construct a callable object of the ``BatchNorm`` class.
 For more details, refer to code examples.
@@ -1164,16 +1161,16 @@ class BatchNorm(layers.Layer):
 Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
 for more details.
-When use_global_stats = False, the :math:`\\mu_{\\beta}`
-and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+When use_global_stats = False, the :math:`\mu_{\beta}`
+and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
 Calculated as follows:
 .. math::
-\\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-\ mini-batch\ mean \\\\
-\\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-\\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+\mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &
+//\ mini-batch\ mean \\
+\sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &
+//\ mini-batch\ variance \\
 - :math:`x` : mini-batch data
 - :math:`m` : the size of the mini-batch data
@@ -1191,13 +1188,14 @@ class BatchNorm(layers.Layer):
 .. math::
-\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-- :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
-- :math:`\\gamma` : trainable proportional parameter
-- :math:`\\beta` : trainable deviation parameter
+\hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
+\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
+y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
+- :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
+- :math:`\gamma` : trainable proportional parameter
+- :math:`\beta` : trainable deviation parameter
 Parameters:
 num_channels(int): Indicate the number of channels of the input ``Tensor``.
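A minimal NumPy sketch of the mini-batch statistics and the normalize/scale/shift step documented above (illustrative only; the ``gamma``, ``beta``, and ``eps`` values are arbitrary, not Paddle's kernel):

.. code-block:: python

    import numpy as np

    def batch_norm_train(x, gamma, beta, eps=1e-5):
        # x: (m, C) mini-batch; statistics are computed per channel
        mu = x.mean(axis=0)                     # mini-batch mean
        var = ((x - mu) ** 2).mean(axis=0)      # mini-batch variance
        x_hat = (x - mu) / np.sqrt(var + eps)   # normalize
        return gamma * x_hat + beta             # scale and shift

    x = np.random.randn(8, 3).astype("float32")
    y = batch_norm_train(x, gamma=np.ones(3), beta=np.zeros(3))
    print(y.mean(axis=0), y.std(axis=0))        # roughly 0 and 1 per channel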
@@ -3011,9 +3009,9 @@ class SpectralNorm(layers.Layer):
 .. math::
-\mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2}
-\mathbf{u} := \\frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2}
+\mathbf{v} := \frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2}
+\mathbf{u} := \frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2}
 Step 3:
 Calculate :math:`\sigma(\mathbf{W})` and normalize weight values.
@@ -3022,7 +3020,7 @@ class SpectralNorm(layers.Layer):
 \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v}
-\mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})}
+\mathbf{W} = \frac{\mathbf{W}}{\sigma(\mathbf{W})}
 Refer to `Spectral Normalization <https://arxiv.org/abs/1802.05957>`_ .
...
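The two updates above are one step of power iteration toward the leading singular value. A NumPy sketch in the standard form of the cited paper (v from W^T u, u from W v); this is illustrative only, and the matrix and iteration count are arbitrary:

.. code-block:: python

    import numpy as np

    def spectral_norm(W, power_iters=20, eps=1e-12):
        # Estimate sigma(W), the largest singular value, by power iteration,
        # then return W / sigma(W).
        h, w = W.shape
        u = np.random.randn(h)
        for _ in range(power_iters):
            v = W.T @ u
            v /= (np.linalg.norm(v) + eps)
            u = W @ v
            u /= (np.linalg.norm(u) + eps)
        sigma = u @ W @ v
        return W / sigma

    W = np.random.randn(4, 6)
    W_sn = spectral_norm(W)
    print(np.linalg.svd(W_sn, compute_uv=False)[0])  # close to 1.0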
@@ -33,7 +33,7 @@ class KaimingNormal(MSRAInitializer):
 .. math::
-\sqrt{\\frac{2.0}{fan\_in}}
+\sqrt{\frac{2.0}{fan\_in}}
 Args:
 fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\
@@ -75,7 +75,7 @@ class KaimingUniform(MSRAInitializer):
 .. math::
-x = \sqrt{\\frac{6.0}{fan\_in}}
+x = \sqrt{\frac{6.0}{fan\_in}}
 Args:
 fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\
...
@@ -28,7 +28,7 @@ class XavierNormal(XavierInitializer):
 .. math::
-\sqrt{\\frac{2.0}{fan\_in + fan\_out}}
+\sqrt{\frac{2.0}{fan\_in + fan\_out}}
 Args:
@@ -83,7 +83,7 @@ class XavierUniform(XavierInitializer):
 .. math::
-x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}}
+x = \sqrt{\frac{6.0}{fan\_in + fan\_out}}
 Args:
 fan_in (float, optional): fan_in for Xavier initialization, it is
...
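The four expressions above are the standard deviation (normal variants) or the uniform bound (uniform variants) of the Kaiming/MSRA and Xavier/Glorot schemes. A small sketch of the scales (illustrative only; the fan values are arbitrary examples):

.. code-block:: python

    import numpy as np

    def kaiming_normal_std(fan_in):
        return np.sqrt(2.0 / fan_in)

    def kaiming_uniform_bound(fan_in):
        return np.sqrt(6.0 / fan_in)

    def xavier_normal_std(fan_in, fan_out):
        return np.sqrt(2.0 / (fan_in + fan_out))

    def xavier_uniform_bound(fan_in, fan_out):
        return np.sqrt(6.0 / (fan_in + fan_out))

    # Example: a 256 -> 128 linear layer
    fan_in, fan_out = 256, 128
    bound = xavier_uniform_bound(fan_in, fan_out)
    W = np.random.uniform(-bound, bound, size=(fan_in, fan_out))
    print(W.std(), xavier_normal_std(fan_in, fan_out))  # similar magnitudes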
@@ -31,7 +31,7 @@ class ELU(Layer):
 .. math::
-ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1))
+ELU(x) = max(0, x) + min(0, \alpha * (e^{x}-1))
 Parameters:
 alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0.
@@ -75,13 +75,13 @@ class GELU(Layer):
 .. math::
-GELU(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
+GELU(x) = 0.5 * x * (1 + tanh(\sqrt{\frac{2}{\pi}} * (x + 0.044715x^{3})))
 else
 .. math::
-GELU(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}}))
+GELU(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}}))
 Parameters:
 approximate (bool, optional): Wether to enable approximation. Default is False.
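Both GELU forms above can be reproduced with a few lines of Python (illustrative only; ``erf`` comes from the standard ``math`` module):

.. code-block:: python

    import numpy as np
    from math import erf

    def gelu_exact(x):
        # GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
        return 0.5 * x * (1.0 + np.vectorize(erf)(x / np.sqrt(2.0)))

    def gelu_tanh_approx(x):
        # GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
        return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

    x = np.linspace(-3, 3, 7)
    print(np.max(np.abs(gelu_exact(x) - gelu_tanh_approx(x))))  # small approximation error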
@@ -127,13 +127,13 @@ class Hardshrink(Layer):
 .. math::
 hardshrink(x)=
-\\left\\{
-\\begin{aligned}
-&x, & & if \\ x > threshold \\\\
-&x, & & if \\ x < -threshold \\\\
-&0, & & if \\ others
-\\end{aligned}
-\\right.
+\left\{
+\begin{array}{rcl}
+x, & & if \ x > threshold \\
+x, & & if \ x < -threshold \\
+0, & & if \ others
+\end{array}
+\right.
 Parameters:
 threshold (float, optional): The value of threshold for hardthrink. Default is 0.5
@@ -179,13 +179,14 @@ class Hardswish(Layer):
 .. math::
 Hardswish(x)=
-\\left\\{
-\\begin{aligned}
-&0, & & \\text{if } x \\leq -3 \\\\
-&x, & & \\text{if } x \\geq 3 \\\\
-&\\frac{x(x+3)}{6}, & & \\text{otherwise}
-\\end{aligned}
-\\right.
+\left\{
+\begin{array}{cll}
+0 &, & \text{if } x \leq -3 \\
+x &, & \text{if } x \geq 3 \\
+\frac{x(x+3)}{6} &, & \text{otherwise}
+\end{array}
+\right.
 Parameters:
 name (str, optional): Name for the operation (optional, default is None).
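Both piecewise definitions above translate directly into vectorized NumPy (illustrative only; the input values are arbitrary):

.. code-block:: python

    import numpy as np

    def hardshrink(x, threshold=0.5):
        # keep x where |x| > threshold, zero elsewhere
        return np.where(np.abs(x) > threshold, x, 0.0)

    def hardswish(x):
        # 0 for x <= -3, x for x >= 3, x*(x+3)/6 otherwise
        return np.where(x <= -3, 0.0, np.where(x >= 3, x, x * (x + 3) / 6.0))

    x = np.array([-4.0, -1.0, -0.3, 0.3, 1.0, 4.0])
    print(hardshrink(x))
    print(hardswish(x))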
@@ -223,7 +224,7 @@ class Tanh(Layer):
 Tanh Activation.
 .. math::
-Tanh(x) = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
+Tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
 Parameters:
 name (str, optional): Name for the operation (optional, default is None).
@@ -265,11 +266,15 @@ class Hardtanh(Layer):
 .. math::
-Hardtanh(x)= \\begin{cases}
-max, \\text{if } x > max \\\\
-min, \\text{if } x < min \\\\
-x, \\text{otherwise}
-\\end{cases}
+Hardtanh(x)=
+\left\{
+\begin{array}{cll}
+max,& & \text{if } x > max \\
+min,& & \text{if } x < min \\
+x,& & \text{otherwise}
+\end{array}
+\right.
 Parameters:
 min (float, optional): The value of min for Hardtanh. Default is -1.
@@ -461,10 +466,12 @@ class SELU(Layer):
 .. math::
 SELU(x)= scale *
-\\begin{cases}
-x, \\text{if } x > 0 \\\\
-alpha * e^{x} - alpha, \\text{if } x <= 0
-\\end{cases}
+\left\{
+\begin{array}{lcl}
+x,& &\text{if } \ x > 0 \\
+alpha * e^{x} - alpha,& &\text{if } \ x <= 0
+\end{array}
+\right.
 Parameters:
 scale (float, optional): The value of scale(must be greater than 1.0) for SELU. Default is 1.0507009873554804934193349852946
@@ -512,12 +519,13 @@ class LeakyReLU(Layer):
 .. math::
 LeakyReLU(x)=
-\\left\\{
-\\begin{aligned}
-&x, & & if \\ x >= 0 \\\\
-&negative\_slope * x, & & otherwise \\\\
-\\end{aligned}
-\\right. \\\\
+\left\{
+\begin{array}{rcl}
+x, & & if \ x >= 0 \\
+negative\_slope * x, & & otherwise \\
+\end{array}
+\right.
 Parameters:
 negative_slope (float, optional): Slope of the activation function at
@@ -604,13 +612,14 @@ class Hardsigmoid(Layer):
 .. math::
 Hardsigmoid(x)=
-\\left\\{
-\\begin{aligned}
-&0, & & \\text{if } x \\leq -3 \\\\
-&1, & & \\text{if } x \\geq 3 \\\\
-&x/6 + 1/2, & & \\text{otherwise}
-\\end{aligned}
-\\right.
+\left\{
+\begin{array}{rcl}
+0, & & \text{if } \ x \leq -3 \\
+1, & & \text{if } \ x \geq 3 \\
+x/6 + 1/2, & & \text{otherwise}
+\end{array}
+\right.
 Parameters:
 name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
@@ -650,8 +659,8 @@ class Softplus(Layer):
 .. math::
-Softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\
-\\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.}
+Softplus(x) = \frac{1}{beta} * \log(1 + e^{beta * x}) \\
+\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.}
 Parameters:
 beta (float, optional): The value of beta for Softplus. Default is 1
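The Softplus note above (reverting to the linear function when ``beta * x > threshold``) can be illustrated with a small NumPy sketch; the ``threshold=20`` default here is an arbitrary choice for the sketch, not Paddle's default:

.. code-block:: python

    import numpy as np

    def softplus(x, beta=1.0, threshold=20.0):
        # softplus(x) = (1/beta) * log(1 + exp(beta * x)),
        # but for beta * x > threshold exp() would overflow, so return x directly
        bx = beta * x
        return np.where(bx > threshold, x,
                        np.log1p(np.exp(np.minimum(bx, threshold))) / beta)

    x = np.array([-100.0, -1.0, 0.0, 1.0, 100.0])
    print(softplus(x))  # roughly [0, 0.31, 0.69, 1.31, 100], with no overflow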
@@ -695,11 +704,15 @@ class Softshrink(Layer):
 .. math::
-Softshrink(x)= \\begin{cases}
-x - threshold, \\text{if } x > threshold \\\\
-x + threshold, \\text{if } x < -threshold \\\\
-0, \\text{otherwise}
-\\end{cases}
+Softshrink(x)=
+\left\{
+\begin{array}{rcl}
+x - threshold,& & \text{if } x > threshold \\
+x + threshold,& & \text{if } x < -threshold \\
+0,& & \text{otherwise}
+\end{array}
+\right.
 Parameters:
 threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5
@@ -740,7 +753,7 @@ class Softsign(Layer):
 .. math::
-Softsign(x) = \\frac{x}{1 + |x|}
+Softsign(x) = \frac{x}{1 + |x|}
 Parameters:
 name (str, optional): Name for the operation (optional, default is None).
@@ -779,7 +792,7 @@ class Swish(Layer):
 .. math::
-Swish(x) = \\frac{x}{1 + e^{-x}}
+Swish(x) = \frac{x}{1 + e^{-x}}
 Parameters:
 name (str, optional): Name for the operation (optional, default is None).
@@ -857,10 +870,14 @@ class ThresholdedReLU(Layer):
 .. math::
-ThresholdedReLU(x) = \\begin{cases}
-x, \\text{if } x > threshold \\\\
-0, \\text{otherwise}
-\\end{cases}
+ThresholdedReLU(x) =
+\left\{
+\begin{array}{rl}
+x,& \text{if } \ x > threshold \\
+0,& \text{otherwise}
+\end{array}
+\right.
 Parameters:
 threshold (float, optional): The value of threshold for ThresholdedReLU. Default is 1.0
@@ -939,7 +956,7 @@ class LogSigmoid(Layer):
 .. math::
-LogSigmoid(x) = log \\frac{1}{1 + e^{-x}}
+LogSigmoid(x) = log \frac{1}{1 + e^{-x}}
 Parameters:
 x (Tensor): The input Tensor with data type float32, or float64.
@@ -1001,7 +1018,7 @@ class Softmax(Layer):
 .. math::
-Softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])}
+Softmax[i, j] = \frac{\exp(x[i, j])}{\sum_j(exp(x[i, j])}
 Example:
@@ -1105,10 +1122,10 @@ class LogSoftmax(Layer):
 .. math::
-\\begin{aligned}
-Out[i, j] &= log(softmax(x)) \\\\
-&= log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j])})
-\\end{aligned}
+\begin{array} {rcl}
+Out[i, j] &= &log(softmax(x)) \\
+&= &log(\frac{\exp(X[i, j])}{\sum_j(\exp(X[i, j])})
+\end{array}
 Parameters:
 axis (int, optional): The axis along which to perform log_softmax
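For the Softmax and LogSoftmax formulas above, a numerically stable NumPy sketch; the max-subtraction trick is an assumption about a sensible implementation, not a claim about Paddle's kernel:

.. code-block:: python

    import numpy as np

    def softmax(x, axis=-1):
        # subtract the row max so exp() cannot overflow; the result is unchanged
        z = x - np.max(x, axis=axis, keepdims=True)
        e = np.exp(z)
        return e / np.sum(e, axis=axis, keepdims=True)

    def log_softmax(x, axis=-1):
        z = x - np.max(x, axis=axis, keepdims=True)
        return z - np.log(np.sum(np.exp(z), axis=axis, keepdims=True))

    x = np.array([[1.0, 2.0, 3.0], [10.0, 10.0, 10.0]])
    print(softmax(x).sum(axis=-1))              # each row sums to 1
    print(np.exp(log_softmax(x)).sum(axis=-1))  # consistent with softmax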
@@ -1167,12 +1184,14 @@ class Maxout(Layer):
 .. math::
-&out_{si+j} = \max_{k} x_{gsi + sk + j} \\\\
-&g = groups \\\\
-&s = \\frac{input.size}{num\\_channels} \\\\
-&0 \\le i < \\frac{num\\_channels}{groups} \\\\
-&0 \\le j < s \\\\
-&0 \\le k < groups
+\begin{array}{l}
+&out_{si+j} = \max_{k} x_{gsi + sk + j} \\
+&g = groups \\
+&s = \frac{input.size}{num\_channels} \\
+&0 \le i < \frac{num\_channels}{groups} \\
+&0 \le j < s \\
+&0 \le k < groups
+\end{array}
 Parameters:
 groups (int, optional): The groups number of maxout. `groups` specifies the
...
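Reading the indexing above as "each output channel is the max over one group of consecutive input channels", a compact NumPy sketch, assuming a channels-first ``(N, C, H, W)`` input (illustrative only):

.. code-block:: python

    import numpy as np

    def maxout(x, groups):
        # x: (N, C, H, W); C must be divisible by groups, and each output
        # channel takes the max over `groups` consecutive input channels.
        n, c, h, w = x.shape
        assert c % groups == 0
        return x.reshape(n, c // groups, groups, h, w).max(axis=2)

    x = np.random.randn(2, 6, 4, 4).astype("float32")
    print(maxout(x, groups=3).shape)  # (2, 2, 4, 4)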
@@ -40,18 +40,18 @@ class BCEWithLogitsLoss(Layer):
 First this operator calculate loss function as follows:
 .. math::
-Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit))
-We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get:
+Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit))
+We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get:
 .. math::
-Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit})
-For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0,
+Out = Logit - Logit * Labels + \log(1 + e^{-Logit})
+For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0,
 we reformulate the loss as follows:
 .. math::
-Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|})
+Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|})
 Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the
 weight tensor on the loss `Out`. The ``weight`` tensor will attach different
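The stable reformulation above maps directly onto NumPy (illustrative only; ``weight`` and ``pos_weight`` handling is omitted):

.. code-block:: python

    import numpy as np

    def bce_with_logits(logit, label):
        # Out = max(Logit, 0) - Logit * Labels + log(1 + exp(-|Logit|))
        return np.maximum(logit, 0) - logit * label + np.log1p(np.exp(-np.abs(logit)))

    logit = np.array([5.0, -2.0, 0.0])
    label = np.array([1.0, 0.0, 1.0])
    sigma = 1.0 / (1.0 + np.exp(-logit))
    naive = -label * np.log(sigma) - (1 - label) * np.log(1 - sigma)
    print(bce_with_logits(logit, label), naive)  # same values; the first form never overflows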
@@ -779,8 +779,6 @@ class BCELoss(Layer):
 class NLLLoss(Layer):
 r"""
-:alias_main: paddle.nn.NLLLoss
-:alias: paddle.nn.NLLLoss,paddle.nn.layer.NLLLoss,paddle.nn.layer.loss.NLLLoss
 This class accepts input and target label and returns negative log likelihood
 cross error. It is useful to train a classification problem with C classes.
@@ -800,20 +798,25 @@ class NLLLoss(Layer):
 The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
 .. math::
-\ell(x, y) = L = \{l_1,\dots,l_N\}^\\top, \quad
+\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
 l_n = - w_{y_n} x_{n,y_n}, \quad
-w_{c} = \\text{weight}[c] \cdot \mathbb{1}\{c \\not= \\text{ignore\\_index}\},
+w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\},
 where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
 (default ``'mean'``), then
 .. math::
-\ell(x, y) = \\begin{cases}
-\\sum_{n=1}^N \\frac{1}{\\sum_{n=1}^N w_{y_n}} l_n, &
-\\text{if reduction} = \\text{'mean';}\\\\
-\\sum_{n=1}^N l_n, &
-\\text{if reduction} = \\text{'sum'.}
-\\end{cases}
+\ell(x, y) =
+\left\{
+\begin{array}{lcl}
+\sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, &
+\text{if reduction} = \text{'mean';}\\
+\sum_{n=1}^N l_n, &
+\text{if reduction} = \text{'sum'.}
+\end{array}
+\right.
 Parameters:
 weight (Tensor, optional): Weight tensor, a manual rescaling weight given
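The per-sample term and the weighted-mean reduction above, sketched in NumPy (illustrative only; ``log_probs`` is assumed to already hold log-probabilities, e.g. the output of a log-softmax, and ``ignore_index`` handling is omitted):

.. code-block:: python

    import numpy as np

    def nll_loss(log_probs, target, weight=None, reduction="mean"):
        # log_probs: (N, C) log-probabilities, target: (N,) class indices
        n, c = log_probs.shape
        w = np.ones(c) if weight is None else weight
        w_y = w[target]                                  # w_{y_n}
        l = -w_y * log_probs[np.arange(n), target]       # l_n = -w_{y_n} x_{n, y_n}
        if reduction == "none":
            return l
        if reduction == "sum":
            return l.sum()
        return l.sum() / w_y.sum()                       # weighted mean

    log_probs = np.log(np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]]))
    print(nll_loss(log_probs, np.array([0, 1])))  # about 0.29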
@@ -1136,16 +1139,16 @@ class SmoothL1Loss(Layer):
 .. math::
-loss(x,y) = \\frac{1}{n}\\sum_{i}z_i
+loss(x,y) = \frac{1}{n}\sum_{i}z_i
 where z_i is given by:
 .. math::
-\\mathop{z_i} = \\left\\{\\begin{array}{rcl}
-0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\
+\mathop{z_i} = \left\{\begin{array}{rcl}
+0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\
 delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise}
-\\end{array} \\right.
+\end{array} \right.
 Parameters:
 reduction (str, optional): Indicate how to average the loss by batch_size,
...
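A NumPy sketch of the smooth-L1 piecewise term and its mean (illustrative only; ``delta=1.0`` is assumed here as an example value):

.. code-block:: python

    import numpy as np

    def smooth_l1_loss(x, y, delta=1.0):
        diff = np.abs(x - y)
        z = np.where(diff < delta,
                     0.5 * diff ** 2,                    # quadratic near zero
                     delta * diff - 0.5 * delta ** 2)    # linear for large errors
        return z.mean()

    x = np.array([0.0, 2.0, 10.0])
    y = np.array([0.2, 2.5, 0.0])
    print(smooth_l1_loss(x, y))  # mixes the quadratic and linear branches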
@@ -115,13 +115,13 @@ class InstanceNorm1D(_InstanceNormBase):
 .. math::
-\\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
-\\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\
-\\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
-\\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
-\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+\mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\
+\ mean\ of\ one\ feature\ map\ in\ mini-batch \\
+\sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \
+\mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\
+\hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
+\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
+y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
 Note:
 `H` means height of feature map, `W` means width of feature map.
@@ -187,13 +187,13 @@ class InstanceNorm2D(_InstanceNormBase):
 .. math::
-\\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
-\\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\
-\\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
-\\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
-\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+\mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\
+\ mean\ of\ one\ feature\ map\ in\ mini-batch \\
+\sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \
+\mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\
+\hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
+\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
+y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
 Note:
 `H` means height of feature map, `W` means width of feature map.
@@ -257,13 +257,13 @@ class InstanceNorm3D(_InstanceNormBase):
 .. math::
-\\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\
-\\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\
-\\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\
-\\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\
-\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+\mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\
+\ mean\ of\ one\ feature\ map\ in\ mini-batch \\
+\sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \
+\mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\
+\hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
+\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
+y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
 Note:
 `H` means height of feature map, `W` means width of feature map.
@@ -450,15 +450,15 @@ class LayerNorm(Layer):
 .. math::
-\\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i
-\\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon}
-y & = f(\\frac{g}{\\sigma}(x - \\mu) + b)
+\mu & = \frac{1}{H}\sum_{i=1}^{H} x_i
+\sigma & = \sqrt{\frac{1}{H}\sum_{i=1}^{H}{(x_i - \mu)^2} + \epsilon}
+y & = f(\frac{g}{\sigma}(x - \mu) + b)
 - :math:`x`: the vector representation of the summed inputs to the neurons in that layer.
 - :math:`H`: the number of hidden units in a layers
-- :math:`\\epsilon`: the small value added to the variance to prevent division by zero.
+- :math:`\epsilon`: the small value added to the variance to prevent division by zero.
 - :math:`g`: the trainable scale parameter.
 - :math:`b`: the trainable bias parameter.
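Instance normalization and layer normalization, as documented above, differ mainly in which axes the statistics are taken over; a compact NumPy sketch of both (illustrative only; the shapes and ``eps`` are arbitrary):

.. code-block:: python

    import numpy as np

    def instance_norm(x, eps=1e-5):
        # x: (N, C, H, W); statistics per sample and per channel, over H*W
        mu = x.mean(axis=(2, 3), keepdims=True)
        var = x.var(axis=(2, 3), keepdims=True)
        return (x - mu) / np.sqrt(var + eps)

    def layer_norm(x, g, b, eps=1e-5):
        # x: (N, H); statistics over the H hidden units of each sample
        mu = x.mean(axis=-1, keepdims=True)
        var = x.var(axis=-1, keepdims=True)
        return g * (x - mu) / np.sqrt(var + eps) + b

    x = np.random.randn(2, 3, 4, 4)
    print(instance_norm(x).mean(axis=(2, 3)))                   # ~0 per (sample, channel)
    h = np.random.randn(2, 8)
    print(layer_norm(h, np.ones(8), np.zeros(8)).std(axis=-1))  # ~1 per sample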
@@ -666,37 +666,36 @@ class BatchNorm1D(_BatchNormBase):
 r"""
 Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
-When use_global_stats = False, the :math:`\\mu_{\\beta}`
-and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+When use_global_stats = False, the :math:`\mu_{\beta}`
+and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
 Calculated as follows:
 .. math::
-\\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-\ mini-batch\ mean \\\\
-\\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-\\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
-When use_global_stats = True, the :math:`\\mu_{\\beta}`
-and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+\mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
+\ mini-batch\ mean \\
+\sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
+\mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\
+When use_global_stats = True, the :math:`\mu_{\beta}`
+and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
 They are global or running statistics (moving_mean and moving_variance). It usually got from the
 pre-trained model. Calculated as follows:
 .. math::
-moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
-moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
+moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\
 The normalization function formula is as follows:
 .. math::
-\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+\hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
+y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
-- :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
-- :math:`\\gamma` : trainable proportional parameter
-- :math:`\\beta` : trainable deviation parameter
+- :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
+- :math:`\gamma` : trainable proportional parameter
+- :math:`\beta` : trainable deviation parameter
 Parameters:
 num_features(int): Indicate the number of channels of the input ``Tensor``.
@@ -770,37 +769,36 @@ class BatchNorm2D(_BatchNormBase):
 r"""
 Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
-When use_global_stats = False, the :math:`\\mu_{\\beta}`
-and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+When use_global_stats = False, the :math:`\mu_{\beta}`
+and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
 Calculated as follows:
 .. math::
-\\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-\ mini-batch\ mean \\\\
-\\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-\\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
-When use_global_stats = True, the :math:`\\mu_{\\beta}`
-and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+\mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//
+\ mini-batch\ mean \\
+\sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i -
+\mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\
+When use_global_stats = True, the :math:`\mu_{\beta}`
+and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
 They are global or running statistics (moving_mean and moving_variance). It usually got from the
 pre-trained model. Calculated as follows:
 .. math::
-moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
-moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
+moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\
 The normalization function formula is as follows:
 .. math::
-\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+\hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
+y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
-- :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
-- :math:`\\gamma` : trainable proportional parameter
-- :math:`\\beta` : trainable deviation parameter
+- :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
+- :math:`\gamma` : trainable proportional parameter
+- :math:`\beta` : trainable deviation parameter
 Parameters:
 num_features(int): Indicate the number of channels of the input ``Tensor``.
@@ -859,16 +857,16 @@ class BatchNorm3D(_BatchNormBase):
 r"""
 Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
-When use_global_stats = False, the :math:`\\mu_{\\beta}`
-and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch.
+When use_global_stats = False, the :math:`\mu_{\beta}`
+and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
 Calculated as follows:
 .. math::
-\\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-\ mini-batch\ mean \\\\
-\\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-\\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+\mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
+\ mini-batch\ mean \\
+\sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
+\mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\
 When use_global_stats = True, the :math:`\\mu_{\\beta}`
 and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
@@ -876,20 +874,19 @@ class BatchNorm3D(_BatchNormBase):
 pre-trained model. Calculated as follows:
 .. math::
-moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
-moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
+moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\
 The normalization function formula is as follows:
 .. math::
-\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+\hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
+y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
-- :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero
-- :math:`\\gamma` : trainable proportional parameter
-- :math:`\\beta` : trainable deviation parameter
+- :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
+- :math:`\gamma` : trainable proportional parameter
+- :math:`\beta` : trainable deviation parameter
 Parameters:
 num_features(int): Indicate the number of channels of the input ``Tensor``.
@@ -976,33 +973,33 @@ class SyncBatchNorm(_BatchNormBase):
 .. math::
-\\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-\ mini-batch\ mean \\\\
-\\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-\\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+\mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
+\ mini-batch\ mean \\
+\sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
+\mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\
 - :math:`x` : whole mini-batch data in all gpus
 - :math:`m` : the size of the whole mini-batch data
 When model in evaluation mode, the :math:`\\mu_{\\beta}`
-and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance,
+and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance,
 which usually got from the pre-trained model). Global statistics calculated as follows:
 .. math::
-moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\
-moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\
+moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\
+moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\
 The formula of normalization is as follows:
 .. math::
-\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-\\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\
-y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-- :math:`\\eps` : add a smaller value to the variance to prevent division by zero
-- :math:`\\gamma` : trainable scale parameter vector
-- :math:`\\beta` : trainable shift parameter vector
+\hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
+\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
+y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift
+- :math:`\epsilon` : add a smaller value to the variance to prevent division by zero
+- :math:`\gamma` : trainable scale parameter vector
+- :math:`\beta` : trainable shift parameter vector
 Note:
 If you want to use container to pack your model and has ``SyncBatchNorm`` in the
...
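The running-statistics update shared by the BatchNorm variants above is a simple exponential moving average; a NumPy sketch (illustrative only; the momentum value and batch are arbitrary examples):

.. code-block:: python

    import numpy as np

    def update_running_stats(moving_mean, moving_var, batch_mean, batch_var, momentum=0.9):
        # moving_mean = moving_mean * momentum + mu_B  * (1 - momentum)
        # moving_var  = moving_var  * momentum + var_B * (1 - momentum)
        new_mean = moving_mean * momentum + batch_mean * (1.0 - momentum)
        new_var = moving_var * momentum + batch_var * (1.0 - momentum)
        return new_mean, new_var

    mm, mv = np.zeros(3), np.ones(3)
    x = np.random.randn(16, 3)
    mm, mv = update_running_stats(mm, mv, x.mean(axis=0), x.var(axis=0))
    print(mm, mv)  # moved slightly toward the batch statistics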