diff --git a/python/paddle/distributed/auto_parallel/cost_model.py b/python/paddle/distributed/auto_parallel/cost_model.py
index 1155c2817a21cd147ee1012fbaf11376a5183717..b72c044428f6cdde92756ff24d76939b759aebb4 100644
--- a/python/paddle/distributed/auto_parallel/cost_model.py
+++ b/python/paddle/distributed/auto_parallel/cost_model.py
@@ -426,7 +426,7 @@ class CostModel(object):
         return merged_node_id, merged_node
 
     def merge_linear(self):
-        '''
+        r'''
         This method does the following: 
         If X depends on Y only, they must be run sequentially.
             [ e.g. A ->- C ->- D   D and E depends on C only.] 
@@ -442,7 +442,7 @@ class CostModel(object):
         return cnt
 
     def merge_branch(self):
-        '''
+        r'''
         This method does the following:
         If a node has more than one successor, there is *branch*.
             [ e.g. A ->- B ->- D                                       ] 
diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py
index 3569d372fa6dc7ef89b6d1f8e9e0f675ab89dde9..d600cda8454cc696579df7fa7f6e6f4d6ae12600 100644
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -46,7 +46,7 @@ def fused_feedforward(x,
                       training=True,
                       mode='upscale_in_train',
                       name=None):
-    """
+    r"""
     This is a fusion operator to compute feed forward layer in transformer model architecture.
     This operator only supports running on GPU. The function of the operator is consistent with
     the following pseudo code:
@@ -230,7 +230,7 @@ def fused_multi_head_attention(x,
                                training=True,
                                mode='upscale_in_train',
                                name=None):
-    """
+    r"""
     Attention mapps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly attending
     to information from different representation subspaces. This API only
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 94c516f476ede38e60de0bb6d01aed0a61850572..e59ef5ebfb0ab26c16c78933733bc11c0c4148d0 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1113,7 +1113,7 @@ def margin_cross_entropy(logits,
                          group=None,
                          return_softmax=False,
                          reduction='mean'):
-    """
+    r"""
     .. math::
 
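-        L=-\\frac{1}{N}\sum^N_{i=1}\log\\frac{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\\neq y_i} e^{scos\\theta_{y_i}}}
+        L=-\frac{1}{N}\sum^N_{i=1}\log\frac{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\neq y_i} e^{scos\theta_{y_i}}}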
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index a528a72ec5cacaa9d22a46c9b20d05150cb2bc83..34a0159fbb0dc4b16dadcd075d450102648d956b 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -670,7 +670,7 @@ def max_unpool1d(x,
                  data_format="NCL",
                  output_size=None,
                  name=None):
-    """
+    r"""
     This API implements max unpooling 1d opereation.
     `max_unpool1d` accepts the output of `max_pool1d` as input, 
     including the indices of the maximum value and calculate the partial inverse. 
@@ -779,7 +779,7 @@ def max_unpool2d(x,
                  data_format="NCHW",
                  output_size=None,
                  name=None):
-    """
+    r"""
     This API implements max unpooling 2d opereation.
     See more details in :ref:`api_nn_pooling_MaxUnPool2D` .
 
@@ -894,7 +894,7 @@ def max_unpool3d(x,
                  data_format="NCDHW",
                  output_size=None,
                  name=None):
-    """
+    r"""
     This API implements max unpooling 3d opereation.
     `max_unpool3d` accepts the output of `max_pool3d` as input, 
     including the indices of the maximum value and calculate the partial inverse. 
diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py
index 514afb15a8edb30ba5a1511328ecf6796f8f15d5..da3266ab3369480cabd954166f55f69c65febb9c 100644
--- a/python/paddle/nn/initializer/dirac.py
+++ b/python/paddle/nn/initializer/dirac.py
@@ -23,7 +23,7 @@ __all__ = []
 
 
 class Dirac(Initializer):
-    """Initialize the 3D/4D/5D Tensor with Dirac delta function.
+    r"""Initialize the 3D/4D/5D Tensor with Dirac delta function.
     
     It can reserve the feature of convolution layer input, which means that
     as many channels are reserved as possible.
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 9ae9d5bec437e9d4ec74d5696fae7043f0186d97..19fbcd5b6f85691e57530a442d9f72ce7935692d 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -1554,7 +1554,7 @@ class Unfold(Layer):
 
 
 class Fold(Layer):
-    """
+    r"""
 
     This Op is used to combines an array of sliding local blocks into a large containing
     tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each 
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index 96942f5c8500a0a859e2c73b6557b0604258a127..68808c6354afbdd6fad44a0f1cc273679c783afd 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -1131,7 +1131,7 @@ class AdaptiveMaxPool3D(Layer):
 
 
 class MaxUnPool1D(Layer):
-    """
+    r"""
     This API implements max unpooling 1d opereation.
 
     `max_unpool1d` accepts the output of `max_pool1d` as input, 
@@ -1213,7 +1213,7 @@ class MaxUnPool1D(Layer):
 
 
 class MaxUnPool2D(Layer):
-    """
+    r"""
     This API implements max unpooling 2d opereation.
 
     'max_unpool2d' accepts the output of 'max_unpool2d' as input
@@ -1299,7 +1299,7 @@ class MaxUnPool2D(Layer):
 
 
 class MaxUnPool3D(Layer):
-    """
+    r"""
     This API implements max unpooling 3d opereation.
 
     `max_unpool3d` accepts the output of `max_pool3d` as input, 
diff --git a/python/paddle/signal.py b/python/paddle/signal.py
index fc80c7cbc80f36c3afbc4229aaee63a3679b4e2b..cd8ba2b58a8c939acc43a93b0ea6ca5a617b35d1 100644
--- a/python/paddle/signal.py
+++ b/python/paddle/signal.py
@@ -243,7 +243,7 @@ def stft(x,
          normalized=False,
          onesided=True,
          name=None):
-    """
+    r"""
     Short-time Fourier transform (STFT).
 
     The STFT computes the discrete Fourier transforms (DFT) of short overlapping
@@ -398,7 +398,7 @@ def istft(x,
           length=None,
           return_complex=False,
           name=None):
-    """
+    r"""
     Inverse short-time Fourier transform (ISTFT).
 
     Reconstruct time-domain signal from the giving complex input and window tensor when
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index c4e7e96191acff607f5f31651b88867f69100098..660803f9f7475997b19be4635b7e89aa055e9c83 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -81,7 +81,7 @@ def bernoulli(x, name=None):
 
 
 def poisson(x, name=None):
-    """
+    r"""
     This OP returns a tensor filled with random number from a Poisson Distribution.
 
     .. math::
@@ -984,7 +984,7 @@ def rand(shape, dtype=None, name=None):
 
 
 def exponential_(x, lam=1.0, name=None):
-    """
+    r"""
     This inplace OP fill input Tensor ``x`` with random number from a Exponential Distribution.
 
     ``lam`` is :math:`\lambda` parameter of Exponential Distribution. 
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index 68cd3ae72a6aa0a7d830a7fc7e6c590d7bc6a216..03060e92bdb69b1ec6022d887d01c514cb11b45d 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -949,8 +949,8 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
     if isinstance(output_size, int):
         output_size = (output_size, output_size)
     pooled_height, pooled_width = output_size
-    assert (len(x.shape) == 4,
-            "Input features with shape should be (N, C, H, W)")
+    assert len(x.shape) == 4, \
+            "Input features with shape should be (N, C, H, W)"
     output_channels = int(x.shape[1] / (pooled_height * pooled_width))
     if in_dygraph_mode():
         return _C_ops.psroi_pool(x, boxes, boxes_num, "output_channels",