diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cebb5e77ac636f1d7aa9509989080b416afe2a8f..37b67ea993f779bb1aace08384e9f7804b05d091 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7097,9 +7097,9 @@ def dice_loss(input, label, epsilon=0.00001, name=None): .. math:: - dice\_loss &= 1 - \\frac{2 * intersection\_area}{total\_area} \\\\ - &= \\frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\\\ - &= \\frac{(union\_area - intersection\_area)}{total\_area} + dice\_loss &= 1 - \frac{2 * intersection\_area}{total\_area} \\ + &= \frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\ + &= \frac{(union\_area - intersection\_area)}{total\_area} Parameters: @@ -13065,8 +13065,8 @@ def log_loss(input, label, epsilon=1e-4, name=None): .. math:: - Out = -label * \\log{(input + \\epsilon)} - - (1 - label) * \\log{(1 - input + \\epsilon)} + Out = -label * \log{(input + \epsilon)} + - (1 - label) * \log{(1 - input + \epsilon)} Args: input (Tensor|list): A 2-D tensor with shape [N x 1], where N is the @@ -14500,17 +14500,17 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): .. math:: - dkernel[0] &= dilations[0] \\times (kernel\_sizes[0] - 1) + 1 + dkernel[0] &= dilations[0] \times (kernel\_sizes[0] - 1) + 1 - dkernel[1] &= dilations[1] \\times (kernel\_sizes[1] - 1) + 1 + dkernel[1] &= dilations[1] \times (kernel\_sizes[1] - 1) + 1 - hout &= \\frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 + hout &= \frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 - wout &= \\frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 + wout &= \frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 - Cout &= C \\times kernel\_sizes[0] \\times kernel\_sizes[1] + Cout &= C \times kernel\_sizes[0] \times kernel\_sizes[1] - Lout &= hout \\times wout + Lout &= hout \times wout Parameters: diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 64d6910e1f859cf578f2b84e19f86e4355d7af3a..7228c903d6ffa71c7c5830113bb8bbad11847745 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -37,7 +37,7 @@ def elu(x, alpha=1.0, name=None): .. math:: - elu(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) + elu(x) = max(0, x) + min(0, \alpha * (e^{x}-1)) Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -91,13 +91,13 @@ def gelu(x, approximate=False, name=None): .. math:: - gelu(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3}))) + gelu(x) = 0.5 * x * (1 + tanh(\sqrt{\frac{2}{\pi}} * (x + 0.044715x^{3}))) else .. math:: - gelu(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) + gelu(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}})) Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -144,13 +144,13 @@ def hardshrink(x, threshold=0.5, name=None): .. math:: hardshrink(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x > threshold \\\\ - &x, & & if \\ x < -threshold \\\\ - &0, & & if \\ others - \\end{aligned} - \\right. + \left\{ + \begin{array}{rcl} + x,& &if \ {x > threshold} \\ + x,& &if \ {x < -threshold} \\ + 0,& &if \ {others} & + \end{array} + \right. Args: x (Tensor): The input Tensor with data type float32, float64. @@ -192,11 +192,14 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): .. 
math:: - hardtanh(x)= \\begin{cases} - max, \\text{if } x > max \\\\ - min, \\text{if } x < min \\\\ - x, \\text{otherwise} - \\end{cases} + hardtanh(x)= + \left\{ + \begin{array}{cll} + max,& & \text{if } x > max \\ + min,& & \text{if } x < min \\ + x,& & \text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -246,13 +249,13 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): .. math:: hardsigmoid(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &1, & & \\text{if } x \\geq 3 \\\\ - &slope * x + offset, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{lcl} + 0, & &\text{if } \ x \leq -3 \\ + 1, & &\text{if } \ x \geq 3 \\ + slope * x + offset, & &\text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -302,13 +305,13 @@ def hardswish(x, name=None): .. math:: hardswish(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &x, & & \\text{if } x \\geq 3 \\\\ - &\\frac{x(x+3)}{6}, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{cll} + 0 &, & \text{if } x \leq -3 \\ + x &, & \text{if } x \geq 3 \\ + \frac{x(x+3)}{6} &, & \text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -345,13 +348,13 @@ def leaky_relu(x, negative_slope=0.01, name=None): leaky_relu activation .. math:: - leaky\\_relu(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x >= 0 \\\\ - &negative\_slope * x, & & otherwise \\\\ - \\end{aligned} - \\right. \\\\ + leaky\_relu(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + negative\_slope * x, & & otherwise \\ + \end{array} + \right. Args: x (Tensor): The input Tensor with data type float32, float64. @@ -513,7 +516,7 @@ def log_sigmoid(x, name=None): .. math:: - log\\_sigmoid(x) = log \\frac{1}{1 + e^{-x}} + log\_sigmoid(x) = log \frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -554,12 +557,15 @@ def maxout(x, groups, axis=1, name=None): .. math:: - &out_{si+j} = \\max_{k} x_{gsi + sk + j} \\\\ - &g = groups \\\\ - &s = \\frac{input.size}{num\\_channels} \\\\ - &0 \\le i < \\frac{num\\_channels}{groups} \\\\ - &0 \\le j < s \\\\ - &0 \\le k < groups + \begin{array}{l} + &out_{si+j} = \max_{k} x_{gsi + sk + j} \\ + &g = groups \\ + &s = \frac{input.size}{num\_channels} \\ + &0 \le i < \frac{num\_channels}{groups} \\ + &0 \le j < s \\ + &0 \le k < groups + \end{array} + Parameters: x (Tensor): The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C], the data type @@ -670,10 +676,12 @@ def selu(x, .. math:: selu(x)= scale * - \\begin{cases} - x, \\text{if } x > 0 \\\\ - alpha * e^{x} - alpha, \\text{if } x <= 0 - \\end{cases} + \left\{ + \begin{array}{lcl} + x,& &\text{if } \ x > 0 \\ + alpha * e^{x} - alpha,& &\text{if } \ x <= 0 + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -719,9 +727,11 @@ def selu(x, def silu(x, name=None): - """ - silu activation. - .. math: + r""" + silu activation + + .. math:: + silu(x) = \frac{x}{1 + e^{-x}} Parameters: @@ -734,11 +744,12 @@ def silu(x, name=None): Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] """ if in_dygraph_mode(): @@ -778,7 +789,7 @@ def softmax(x, axis=-1, dtype=None, name=None): .. math:: - softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])} + softmax[i, j] = \frac{\exp(x[i, j])}{\sum_j(exp(x[i, j])} Example: @@ -923,8 +934,8 @@ def softplus(x, beta=1, threshold=20, name=None): .. math:: - softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ - \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + softplus(x) = \frac{1}{beta} * \log(1 + e^{beta * x}) \\ + \text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -968,11 +979,14 @@ def softshrink(x, threshold=0.5, name=None): .. math:: - softshrink(x)= \\begin{cases} - x - threshold, \\text{if } x > threshold \\\\ - x + threshold, \\text{if } x < -threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + softshrink(x)= + \left\{ + \begin{array}{rcl} + x - threshold,& & \text{if } x > threshold \\ + x + threshold,& & \text{if } x < -threshold \\ + 0,& & \text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1019,7 +1033,7 @@ def softsign(x, name=None): .. math:: - softsign(x) = \\frac{x}{1 + |x|} + softsign(x) = \frac{x}{1 + |x|} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1056,7 +1070,7 @@ def swish(x, name=None): .. math:: - swish(x) = \\frac{x}{1 + e^{-x}} + swish(x) = \frac{x}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1134,10 +1148,14 @@ def thresholded_relu(x, threshold=1.0, name=None): .. math:: - thresholded\\_relu(x) = \\begin{cases} - x, \\text{if } x > threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + thresholded\_relu(x) = + \left\{ + \begin{array}{rl} + x,& \text{if } \ x > threshold \\ + 0,& \text{otherwise} + \end{array} + \right. + Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1181,10 +1199,10 @@ def log_softmax(x, axis=-1, dtype=None, name=None): .. math:: - \\begin{aligned} - log\\_softmax[i, j] &= log(softmax(x)) \\\\ - &= log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j])}) - \\end{aligned} + \begin{aligned} + log\_softmax[i, j] &= log(softmax(x)) \\ + &= log(\frac{\exp(X[i, j])}{\sum_j(\exp(X[i, j])}) + \end{aligned} Parameters: x (Tensor): The input Tensor with data type float32, float64. diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index cb7a50ade7ac8f3b52934b35c79480e3d5d12cc6..ef2bfb3b8e0d3aeabf296cc8379e49d92c6dd6e3 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -180,18 +180,18 @@ def binary_cross_entropy_with_logits(logit, First this operator calculate loss function as follows: .. math:: - Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit)) + Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit)) - We know that :math:`\\sigma(Logit) = \\frac{1}{1 + e^{-Logit}}`. 
By substituting this we get: + We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get: .. math:: - Out = Logit - Logit * Labels + \\log(1 + e^{-Logit}) + Out = Logit - Logit * Labels + \log(1 + e^{-Logit}) For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, we reformulate the loss as follows: .. math:: - Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + e^{-\|Logit\|}) + Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the weight tensor on the loss `Out`. The ``weight`` tensor will attach different @@ -450,17 +450,17 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): .. math:: - loss(x,y) = \\frac{1}{n}\\sum_{i}z_i + loss(x,y) = \frac{1}{n}\sum_{i}z_i where z_i is given by: .. math:: - \\mathop{z_i} = \\left\\{\\begin{array}{rcl} - 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\ + \mathop{z_i} = \left\{\begin{array}{rcl} + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\ delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} - \\end{array} \\right. + \end{array} \right. Parameters: input (Tensor): Input tensor, the data type is float32 or float64. Shape is @@ -631,17 +631,17 @@ def l1_loss(input, label, reduction='mean', name=None): If `reduction` set to ``'none'``, the loss is: .. math:: - Out = \\lvert input - label \\rvert + Out = \lvert input - label \rvert If `reduction` set to ``'mean'``, the loss is: .. math:: - Out = MEAN(\\lvert input - label \\rvert) + Out = MEAN(\lvert input - label \rvert) If `reduction` set to ``'sum'``, the loss is: .. math:: - Out = SUM(\\lvert input - label\\rvert) + Out = SUM(\lvert input - label \rvert) Parameters: @@ -1563,15 +1563,15 @@ def sigmoid_focal_loss(logit, This operator measures focal loss function as follows: .. math:: - Out = -Labels * alpha * {(1 - \\sigma(Logit))}^{gamma}\\log(\\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\\sigma(Logit)}^{gamma}\\log(1 - \\sigma(Logit)) + Out = -Labels * alpha * {(1 - \sigma(Logit))}^{gamma}\log(\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\sigma(Logit)}^{gamma}\log(1 - \sigma(Logit)) - We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\exp(-Logit)}`. + We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`. Then, if :attr:`normalizer` is not None, this operator divides the normalizer tensor on the loss `Out`: .. math:: - Out = \\frac{Out}{normalizer} + Out = \frac{Out}{normalizer} Finally, this operator applies reduce operation on the loss. If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`. diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 286f8ef167b457d3e24a8f068e08529ba2f82667..db73e56f879a770011d9a095f922d9db9ac8171c 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -34,12 +34,12 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): .. math:: - y = \\frac{x}{ \\max\\left( \\lvert \\lvert x \\rvert \\rvert_p, epsilon\\right) } + y = \frac{x}{ \max\left( \lvert \lvert x \rvert \rvert_p, epsilon\right) } .. math:: - \\lvert \\lvert x \\rvert \\rvert_p = \\left( \\sum_i {\\lvert x_i \\rvert^p} \\right)^{1/p} + \lvert \lvert x \rvert \rvert_p = \left( \sum_i {\lvert x_i \rvert^p} \right)^{1/p} - where, :math:`\\sum_i{\\lvert x_i \\rvert^p}` is calculated along the ``axis`` dimension. + where, :math:`\sum_i{\lvert x_i \rvert^p}` is calculated along the ``axis`` dimension. 
Parameters: @@ -432,7 +432,7 @@ def local_response_norm(x, .. math:: - Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C-1, i + size/2)}_{j = \\max(0, i - size/2)}(Input(j, x, y))^2\\right)^{\\beta} + Output(i, x, y) = Input(i, x, y) / \left(k + \alpha \sum\limits^{\min(C-1, i + size/2)}_{j = \max(0, i - size/2)}(Input(j, x, y))^2\right)^{\beta} In the above equation:
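Note on the ``\\`` to ``\`` conversions above: dropping the doubled backslashes only keeps the rendered math intact when the enclosing docstring is a raw string, which is exactly what the ``silu`` hunk does with its ``"""`` to ``r"""`` change (the other touched docstrings are presumably raw strings already). A minimal sketch of the equivalence, using throwaway function names rather than Paddle APIs:

.. code-block:: python

    # Illustrative only: ``escaped`` and ``raw`` are stand-in names, not Paddle APIs.
    # A raw docstring hands Sphinx the same single backslash that the old
    # doubled escapes produced, so the conversion is purely cosmetic.

    def escaped():
        """elu(x) = max(0, x) + min(0, \\alpha * (e^{x}-1))"""


    def raw():
        r"""elu(x) = max(0, x) + min(0, \alpha * (e^{x}-1))"""


    # Both docstrings contain exactly one backslash before "alpha".
    assert escaped.__doc__ == raw.__doc__
    print(raw.__doc__)  # elu(x) = max(0, x) + min(0, \alpha * (e^{x}-1))

Without the raw-string prefix, sequences such as ``\f`` (form feed) or ``\a`` (bell) would be interpreted as Python escape characters and silently corrupt the math source.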