diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py index 0a0736f35a58dbcd2f47bd4c4c9849f7d146fbf4..395a071ed130845e87acbb3c286d02406b0719f2 100644 --- a/paddle/scripts/conda_build.py +++ b/paddle/scripts/conda_build.py @@ -1,4 +1,19 @@ #!/bin/python + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # import platform from sys import argv @@ -120,7 +135,7 @@ python setup.py install self.py_str = ["py27", "py35", "py36", "py37"] self.pip_end = ".whl --no-deps" self.pip_prefix_linux = "pip install /package/paddlepaddle" - self.pip_prefix_windows = "pip install C:\package\paddlepaddle" + self.pip_prefix_windows = r"pip install C:\package\paddlepaddle" self.pip_gpu = "_gpu-" self.pip_cpu = "-" self.mac_pip = [ @@ -216,7 +231,7 @@ package: - matplotlib""" if not (cuda_str == None): meta_str = meta_str + cuda_str - + blt_str = var.blt_const + blt_var if (python_str == var.python27): blt_str = blt_str + """ @@ -224,7 +239,7 @@ package: else: meta_str = meta_str + """ - opencv>=3.4.2""" - + meta_str = meta_str + var.test + var.about meta_filename = "meta.yaml" build_filename = "bld.bat" diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py index e5a3b6074c96d224fd2e120c7508246bd7e8a86b..dab3c964cc6b73c7149d53b6899ded2effc2e97a 100644 --- a/python/paddle/dataset/imdb.py +++ b/python/paddle/dataset/imdb.py @@ -116,8 +116,8 @@ def train(word_idx): :rtype: callable """ return reader_creator( - re.compile("aclImdb/train/pos/.*\.txt$"), - re.compile("aclImdb/train/neg/.*\.txt$"), word_idx) + re.compile(r"aclImdb/train/pos/.*\.txt$"), + re.compile(r"aclImdb/train/neg/.*\.txt$"), word_idx) @deprecated( @@ -137,8 +137,8 @@ def test(word_idx): :rtype: callable """ return reader_creator( - re.compile("aclImdb/test/pos/.*\.txt$"), - re.compile("aclImdb/test/neg/.*\.txt$"), word_idx) + re.compile(r"aclImdb/test/pos/.*\.txt$"), + re.compile(r"aclImdb/test/neg/.*\.txt$"), word_idx) @deprecated( @@ -153,7 +153,7 @@ def word_dict(): :rtype: dict """ return build_dict( - re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) + re.compile(r"aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) @deprecated( diff --git a/python/paddle/dataset/tests/imdb_test.py b/python/paddle/dataset/tests/imdb_test.py index 415947e3477f2e5b9979588528f7cb6f799acf6a..613c5f8edb289ce5d9110adbedbe44d058eaf75d 100644 --- a/python/paddle/dataset/tests/imdb_test.py +++ b/python/paddle/dataset/tests/imdb_test.py @@ -18,13 +18,13 @@ import paddle.dataset.imdb import unittest import re -TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$") -TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$") -TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$") +TRAIN_POS_PATTERN = re.compile(r"aclImdb/train/pos/.*\.txt$") +TRAIN_NEG_PATTERN = re.compile(r"aclImdb/train/neg/.*\.txt$") +TRAIN_PATTERN = re.compile(r"aclImdb/train/.*\.txt$") -TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$") -TEST_NEG_PATTERN = 
re.compile("aclImdb/test/neg/.*\.txt$") -TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$") +TEST_POS_PATTERN = re.compile(r"aclImdb/test/pos/.*\.txt$") +TEST_NEG_PATTERN = re.compile(r"aclImdb/test/neg/.*\.txt$") +TEST_PATTERN = re.compile(r"aclImdb/test/.*\.txt$") class TestIMDB(unittest.TestCase): diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 71eca424fe6500d08526c190e71db2194671b74f..46ccb4663e8b7a41f8cee6608521ebda2feca7a3 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -862,7 +862,7 @@ class DistributedStrategy(object): @property def dgc_configs(self): - """ + r""" Set Deep Gradient Compression training configurations. In general, dgc has serveral configurable settings that can be configured through a dict. diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 00bec671d4b86fa4a0494da3bf9bbd0f7c7e38f3..c48ce1a0f333540768367679ab70278060299a05 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" +r""" fleetrun is a module that spawns multiple distributed process on each training node for gpu training and cpu training. Usage: diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index 10b0c82c0eef9e32634d3c1e2a4acc6879dd98c5..3135b69d00480d5f5add168ea01ecf0f43ca9d7e 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -158,13 +158,13 @@ class ParameterServerOptimizer(MetaOptimizerBase): ['vm_stat'], stdout=subprocess.PIPE).communicate()[0] # Process vm_stat vmLines = vm.split('\n') - sep = re.compile(':[\s]+') + sep = re.compile(r':[\s]+') vmStats = {} for row in range(1, len(vmLines) - 2): rowText = vmLines[row].strip() rowElements = sep.split(rowText) vmStats[(rowElements[0] - )] = int(rowElements[1].strip('\.')) * 4096 + )] = int(rowElements[1].strip(r'\.')) * 4096 return vmStats["Pages free"] elif platform.system() == "Linux": mems = {} diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 9b969cf3002379058b9cff0d604d2db750573028..060e742ad6cc85f5faf32065b70f59257d7d9e58 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" +r""" paddle.distributed.launch is a module that spawns multiple distributed process on each training node for gpu training. Usage: diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index e9a15feb5170f073882dcb0d07ec62cc8d8e5f7a..ad134b4591e8ddd638675e9bb88f2958b4b4648d 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -166,7 +166,7 @@ class Distribution(object): class Uniform(Distribution): - """Uniform distribution with `low` and `high` parameters. + r"""Uniform distribution with `low` and `high` parameters. 
Mathematical Details @@ -374,7 +374,7 @@ class Uniform(Distribution): return elementwise_div((lb * ub), (self.high - self.low), name=name) def entropy(self): - """Shannon entropy in nats. + r"""Shannon entropy in nats. The entropy is @@ -391,7 +391,7 @@ class Uniform(Distribution): class Normal(Distribution): - """The Normal distribution with location `loc` and `scale` parameters. + r"""The Normal distribution with location `loc` and `scale` parameters. Mathematical details @@ -534,7 +534,7 @@ class Normal(Distribution): return output def entropy(self): - """Shannon entropy in nats. + r"""Shannon entropy in nats. The entropy is @@ -599,7 +599,7 @@ class Normal(Distribution): name=name) def kl_divergence(self, other): - """The KL-divergence between two normal distributions. + r"""The KL-divergence between two normal distributions. The probability density function (pdf) is @@ -644,7 +644,7 @@ class Normal(Distribution): class Categorical(Distribution): - """ + r""" Categorical distribution is a discrete probability distribution that describes the possible results of a random variable that can take on one of K possible categories, with the probability of each category diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index f20716c3a1503a615ac8b26c3f54a123f8609486..8fd01509331e207af1aaabde1e40404f1a8c6f74 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -40,7 +40,7 @@ class BaseErrorClipAttr(object): class ErrorClipByValue(BaseErrorClipAttr): - """ + r""" Clips tensor values to the range [min, max]. Given a tensor ``t`` (see Examples below), this operation clips its value \ @@ -241,7 +241,7 @@ class ClipGradByValue(ClipGradBase): class ClipGradByNorm(ClipGradBase): - """ + r""" Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio. @@ -343,7 +343,7 @@ class ClipGradByNorm(ClipGradBase): class ClipGradByGlobalNorm(ClipGradBase): - """ + r""" Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` . diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index d0543bb90dd14b484ce2fb36206885d2de79621e..f3f8c815b004c45d512af100c0d2f49bbe7d34f8 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -137,7 +137,7 @@ def var_conv_2d(input, act=None, dtype='float32', name=None): - """ + r""" The var_conv_2d layer calculates the output base on the :attr:`input` with variable length, row, col, input channel, filter size and strides. Both :attr:`input`, :attr:`row`, and :attr:`col` are 1-level LodTensor. The convolution operation is same as conv2d layer with @@ -477,7 +477,7 @@ def fused_embedding_seq_pool(input, combiner='sum', param_attr=None, dtype='float32'): - """ + r""" **Embedding Sequence pool** This layer is the fusion of lookup table and sequence_pool. @@ -1442,7 +1442,7 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None): def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'): - """ + r""" **Pull Box Extended Sparse Layer** This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in BoxPS lookup table. 
The result of this lookup is the embedding of each ID in the @@ -1640,7 +1640,7 @@ def fused_bn_add_act(x, moving_variance_name=None, act=None, name=None): - """ + r""" This Op performs batch norm on input x, and adds the result to input y. Then it performs activation on the sum. The data format of inputs must be NHWC `[batch, in_height, in_width, in_channels]`. diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 4e304a393f88e38e871a6bf508a353fcaadb6067..a2dd0835b6064c4b1f2f7747268d15bf97d0697a 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -175,7 +175,7 @@ def basic_gru(input, activation=None, dtype='float32', name='basic_gru'): - """ + r""" GRU implementation using basic operator, supports multiple layers and bidirectional gru. .. math:: @@ -418,7 +418,7 @@ def basic_lstm(input, forget_bias=1.0, dtype='float32', name='basic_lstm'): - """ + r""" LSTM implementation using basic operators, supports multiple layers and bidirectional LSTM. .. math:: @@ -697,7 +697,7 @@ def basic_lstm(input, class BasicLSTMUnit(Layer): - """ + r""" **** BasicLSTMUnit class, Using basic operator to build LSTM The algorithm can be described as the code below. diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py index b5d85616cf03c7ed56f1e6f03a359e32aacee36d..24e39d7ac61dbad1ece08927535f899adf8b618e 100644 --- a/python/paddle/fluid/contrib/memory_usage_calc.py +++ b/python/paddle/fluid/contrib/memory_usage_calc.py @@ -44,7 +44,7 @@ DEBUG = False def memory_usage(program, batch_size): - """ + r""" Get the estimate memory usage of program with input batch size. Args: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index cae241772326759b04bd1c0b5b38663d134f1e14..7364655107bd991c4cd98e4c01eb1eece5df850b 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -64,7 +64,7 @@ class ImperativeQuantAware(object): act_preprocess_layer=None, weight_quantize_layer=None, act_quantize_layer=None): - """ + r""" The constructor for ImperativeQuantAware. Args: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 79138febd0ce87d7d006700c9494c30f53691742..5acc4c30bc086f6b0bbababde98e713fe85cca1f 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -30,7 +30,7 @@ __all__ = [ class FakeQuantMovingAverage(layers.Layer): - """ + r""" FakeQuantMovingAverage layer does the moving_average_abs_max quant and then dequant. Its computational formula is described as below: @@ -128,7 +128,7 @@ class FakeQuantMovingAverage(layers.Layer): class FakeQuantAbsMax(layers.Layer): - """ + r""" FakeQuantAbsMax layer does the abs_max quant and then dequant. Its computational formula is described as below: @@ -545,7 +545,7 @@ class QuantizedLinear(layers.Layer): class MovingAverageAbsMaxScale(layers.Layer): def __init__(self, name=None, moving_rate=0.9, dtype='float32'): - """ + r""" MovingAverageMaxScale layer is used to calculating the output quantization scale of Layer. 
Its computational formula is described as below: diff --git a/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py index a25abd9ff09fbab1534f6f4327983af5db52f023..d31dc35d143dec009936edeaa02fcb7975cbfc71 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py @@ -37,7 +37,7 @@ class QuantInt8MkldnnPass(object): """ def __init__(self, _scope=None, _place=None): - """ + r""" Args: scope(fluid.Scope): scope is used to initialize the new parameters. place(fluid.CPUPlace): place is used to initialize the new parameters. diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 68bf9ecd80be4c0d074508ccf1e8821c230dd709..219025269fe97b9ebaf332886f67f9ad8cf3c6ad 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -239,7 +239,7 @@ class QuantizationTransformPass(object): act_preprocess_func=None, optimizer_func=None, executor=None): - """ + r""" Constructor. Args: diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index 2de4f82bd14559a99581c5716523b2a78c2d7998..9572552f0f2be4025973b5720caafae5e6363a51 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -33,7 +33,7 @@ _logger = get_logger( class HDFSClient(object): - """ + r""" A tool of HDFS Args: @@ -376,7 +376,7 @@ class HDFSClient(object): _logger.info("HDFS list path: {} successfully".format(hdfs_path)) ret_lines = [] - regex = re.compile('\s+') + regex = re.compile(r'\s+') out_lines = output.strip().split("\n") for line in out_lines: re_line = regex.split(line) @@ -418,7 +418,7 @@ class HDFSClient(object): _logger.info("HDFS list all files: {} successfully".format( hdfs_path)) lines = [] - regex = re.compile('\s+') + regex = re.compile(r'\s+') out_lines = output.strip().split("\n") for line in out_lines: re_line = regex.split(line) diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index ad116c2597064d0dc3a6c9f4e9cc0362d2d52351..224a021cd6aa5ff1d45cbc4555d2acac4311f0c9 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -224,7 +224,7 @@ def less_than_ver(a, b): import operator def to_list(s): - s = re.sub('(\.0+)+$', '', s) + s = re.sub(r'(\.0+)+$', '', s) return [int(x) for x in s.split('.')] return operator.lt(to_list(a), to_list(b)) diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index ee30484ae9a0fb1d56973280f22069bb40906806..ea89b09d2bf3dcece91b4edf515d70c210f8e154 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -101,10 +101,11 @@ class _DatasetKind(object): ITER = 1 @staticmethod - def create_fetcher(kind, dataset, auto_collate_batch, collate_fn, drop_last): + def create_fetcher(kind, dataset, auto_collate_batch, collate_fn, + drop_last): if kind == _DatasetKind.MAP: - return _MapDatasetFetcher(dataset, auto_collate_batch, - collate_fn, drop_last) + return _MapDatasetFetcher(dataset, auto_collate_batch, collate_fn, + drop_last) elif kind == _DatasetKind.ITER: return _IterableDatasetFetcher(dataset, auto_collate_batch, collate_fn, 
drop_last) @@ -240,7 +241,8 @@ class _DataLoaderIterBase(object): if self._dataset_kind == _DatasetKind.MAP: self._sampler_iter = iter(list(range(len(self._dataset)))) else: - self._sampler_iter = iter(_InfiniteIterableSampler(self._dataset, 1)) + self._sampler_iter = iter( + _InfiniteIterableSampler(self._dataset, 1)) self._collate_fn = loader.collate_fn # LoDTensorBlockingQueue instance for create_py_reader and a thread @@ -380,8 +382,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): # NOTE(chenweihang): _worker_loop must be top level method to be pickled def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, - auto_collate_batch, collate_fn, init_fn, worker_id, num_workers, - use_shared_memory): + auto_collate_batch, collate_fn, init_fn, worker_id, + num_workers, use_shared_memory): try: # NOTE: [ mmap files clear ] When the child process exits unexpectedly, # some shared memory objects may have been applied for but have not yet @@ -400,8 +402,8 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, try: if init_fn is not None: init_fn(worker_id) - fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, - auto_collate_batch, collate_fn, True) + fetcher = _DatasetKind.create_fetcher( + dataset_kind, dataset, auto_collate_batch, collate_fn, True) except: init_exception = Exception("init_fn failed in worker {}: " \ "{}".format(worker_id, sys.exc_info())) diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 61e508ea72e8b529ce98d14c8ec3beff65148275..89e9a6a907632b902ff7f9eb4bb906b8fb86ed8e 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -22,7 +22,7 @@ from google.protobuf import text_format class DownpourSGD(object): - """ + r""" Distributed optimizer of downpour stochastic gradient descent Standard implementation of Google's Downpour SGD in Large Scale Distributed Deep Networks diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 41e0d64e0b788b0e354f7635c3d3e52d6bba7e23..a15f94f4d17fca8f53340a31b9dab1951b72ac1c 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -52,7 +52,7 @@ class DownpourServer(Server): def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_var): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ @@ -84,7 +84,7 @@ class DownpourServer(Server): table.accessor.downpour_accessor_param.delete_threshold = 0.8 def add_dense_table(self, table_id, learning_rate, param_var, grad_var): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ @@ -135,7 +135,7 @@ class DownpourWorker(Worker): def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_vars): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ @@ -153,7 +153,7 @@ class DownpourWorker(Worker): [var.name + "@GRAD" for var in slot_value_vars]) def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. 
\ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index a26b903493a699beea679364fe355f7a4c390660..397f873f961abae5690951b809de61c27e18e036 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -593,7 +593,7 @@ def grad(outputs, @framework.dygraph_only def to_variable(value, name=None, zero_copy=None, dtype=None): - """ + r""" :api_attr: imperative The API will create a ``Variable`` or ``ComplexVariable`` object from diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index cd6af6fd5b575e8188088bde9e8944ab94c7e0f8..a6c1993dbbf03f8fdc3806a1f66ea55008d9628d 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -183,7 +183,7 @@ class PiecewiseDecay(LearningRateDecay): class NaturalExpDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies natural exponential decay to the initial learning rate. @@ -266,7 +266,7 @@ class NaturalExpDecay(LearningRateDecay): class ExponentialDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies exponential decay to the learning rate. @@ -348,7 +348,7 @@ class ExponentialDecay(LearningRateDecay): class InverseTimeDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies inverse time decay to the initial learning rate. @@ -426,7 +426,7 @@ class InverseTimeDecay(LearningRateDecay): class PolynomialDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies polynomial decay to the initial learning rate. @@ -520,7 +520,7 @@ class PolynomialDecay(LearningRateDecay): class CosineDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies cosine decay to the learning rate. @@ -578,7 +578,7 @@ class CosineDecay(LearningRateDecay): class NoamDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies Noam decay to the initial learning rate. diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 3c75b30402897a3b7c4e585ec94f846360562fa8..0f92c32f252cde08248d9d2b9592918ae4da1a3f 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -42,7 +42,7 @@ __all__ = [ class Conv2D(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``Conv2D`` class. For more details, refer to code examples. The convolution2D layer calculates the output based on the input, filter @@ -282,7 +282,7 @@ class Conv2D(layers.Layer): class Conv3D(layers.Layer): - """ + r""" **Convlution3D Layer** The convolution3D layer calculates the output based on the input, filter @@ -484,7 +484,7 @@ class Conv3D(layers.Layer): class Conv3DTranspose(layers.Layer): - """ + r""" **Convlution3D transpose layer** The convolution3D transpose layer calculates the output based on the input, @@ -701,7 +701,7 @@ class Conv3DTranspose(layers.Layer): class Pool2D(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``Pool2D`` class. For more details, refer to code examples. @@ -1009,7 +1009,7 @@ class Linear(layers.Layer): class InstanceNorm(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``InstanceNorm`` class. For more details, refer to code examples. 
@@ -1143,7 +1143,7 @@ class InstanceNorm(layers.Layer): class BatchNorm(layers.Layer): - """ + r""" :alias_main: paddle.nn.BatchNorm :alias: paddle.nn.BatchNorm,paddle.nn.layer.BatchNorm,paddle.nn.layer.norm.BatchNorm :old_api: paddle.fluid.dygraph.BatchNorm @@ -1492,7 +1492,7 @@ class Dropout(layers.Layer): class Embedding(layers.Layer): - """ + r""" :alias_main: paddle.nn.Embedding :alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding :old_api: paddle.fluid.dygraph.Embedding @@ -1652,7 +1652,7 @@ class Embedding(layers.Layer): class LayerNorm(layers.Layer): - """ + r""" :alias_main: paddle.nn.LayerNorm :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm :old_api: paddle.fluid.dygraph.LayerNorm @@ -2242,7 +2242,7 @@ class NCE(layers.Layer): class PRelu(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``PRelu`` class. For more details, refer to code examples. It implements three activation methods of the ``PRelu`` activation function. @@ -2350,7 +2350,7 @@ class PRelu(layers.Layer): class BilinearTensorProduct(layers.Layer): - """ + r""" **Add Bilinear Tensor Product Layer** @@ -2467,7 +2467,7 @@ class BilinearTensorProduct(layers.Layer): class Conv2DTranspose(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``Conv2DTranspose`` class. For more details, refer to code examples. The convolution2D transpose layer calculates the output based on the input, @@ -2979,7 +2979,7 @@ class GroupNorm(layers.Layer): class SpectralNorm(layers.Layer): - """ + r""" :alias_main: paddle.nn.SpectralNorm :alias: paddle.nn.SpectralNorm,paddle.nn.layer.SpectralNorm,paddle.nn.layer.norm.SpectralNorm :old_api: paddle.fluid.dygraph.SpectralNorm diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py index 9df4188fb7eb872d21ac9e6a1f851074a682ca54..05a76a8d125864c3d046e610e33314d77791b1d0 100644 --- a/python/paddle/fluid/dygraph/rnn.py +++ b/python/paddle/fluid/dygraph/rnn.py @@ -20,7 +20,7 @@ __all__ = ['LSTMCell', 'GRUCell'] class LSTMCell(Layer): - """ + r""" LSTMCell implementation using basic operators. There are two LSTMCell version, the default one is compatible with CUDNN LSTM implementation. The algorithm can be described as the equations below. @@ -236,7 +236,7 @@ class LSTMCell(Layer): class GRUCell(Layer): - """ + r""" GRU implementation using basic operators. There are two GRUCell version, the default one is compatible with CUDNN GRU implementation. The algorithm can be described as the equations below. diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 49c5f9f5b8e46e29eba9766ad1f15091324f309f..28891871777d7ef204142a142a3d104ef4a9f9e4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2255,7 +2255,7 @@ class Operator(object): return self.desc.type() def input(self, name): - """ + r""" Get the input arguments according to the input parameter name. Args: @@ -2306,7 +2306,7 @@ class Operator(object): return self.desc.output_arg_names() def output(self, name): - """ + r""" Get output arguments by the output parameter name. 
Args: diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py index 4b600150e0427488c4954d6b00971c034bbf8c32..0853d05ef3bbe64bdbe5d5a40f91435612ba3d1f 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py @@ -527,7 +527,7 @@ class DownpourWorker(Worker): def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars, dense_start_table_id, sparse_table_names): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index c126f06de9d8a793d96b66e70f8f6817e0986821..dd968a70e8a4fca203939854e718684b64ce74ac 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -153,7 +153,7 @@ class FleetUtil(object): stat_pos="_generated_var_2", stat_neg="_generated_var_3", print_prefix=""): - """ + r""" Print global auc of all distributed workers. Args: @@ -1073,7 +1073,7 @@ class FleetUtil(object): hadoop_fs_name, hadoop_fs_ugi, hadoop_home="$HADOOP_HOME"): - """ + r""" get last saved base xbox info from xbox_base_done.txt Args: @@ -1118,7 +1118,7 @@ class FleetUtil(object): hadoop_fs_name, hadoop_fs_ugi, hadoop_home="$HADOOP_HOME"): - """ + r""" get last saved xbox info from xbox_patch_done.txt Args: @@ -1164,7 +1164,7 @@ class FleetUtil(object): hadoop_fs_name, hadoop_fs_ugi, hadoop_home="$HADOOP_HOME"): - """ + r""" get last saved model info from donefile.txt Args: @@ -1279,7 +1279,7 @@ class FleetUtil(object): q_name="q", pos_ins_num_name="pos", total_ins_num_name="total"): - """ + r""" get global metrics, including auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num. @@ -1469,7 +1469,7 @@ class FleetUtil(object): pos_ins_num_name="pos", total_ins_num_name="total", print_prefix=""): - """ + r""" print global metrics, including auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num. diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 30932d0c8b59057a71e5928b06bea4bf95faa75f..86fab9811275fb520646f79e5aa6a70c9fd9e102 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -459,7 +459,7 @@ class TruncatedNormalInitializer(Initializer): class XavierInitializer(Initializer): - """ + r""" This class implements the Xavier weight initializer from the paper `Understanding the difficulty of training deep feedforward neural networks `_ @@ -595,7 +595,7 @@ class XavierInitializer(Initializer): class MSRAInitializer(Initializer): - """Implements the MSRA initializer a.k.a. Kaiming Initializer + r"""Implements the MSRA initializer a.k.a. 
Kaiming Initializer This class implements the weight initialization from the paper `Delving Deep into Rectifiers: Surpassing Human-Level Performance on diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 0e3ee46fa46d174240c4be34f793aa00820064e7..e56d1876e3f01087471953840f88d09e56568af2 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -137,7 +137,7 @@ def embedding(input, padding_idx=None, param_attr=None, dtype='float32'): - """ + r""" :api_attr: Static Graph The operator is used to lookup embeddings vector of ids provided by :attr:`input` . diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 6e38c855562809fa38cddbf6e58eb4eee6b899f3..5ee46a68fb76e1911b0a7f73ba63f2d0d2ee1358 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -59,7 +59,7 @@ class LayerHelperBase(object): return cls.__dtype def to_variable(self, value, name=None): - """ + r""" The API will create a ``Variable`` object from numpy\.ndarray or Variable object. Parameters: diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 65ca5a211e3c89510f75229a8c41e906ce5c226f..b5f66a1308e0fd770d8ab3202560044c90bc0eb8 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -3012,7 +3012,7 @@ class DynamicRNN(object): self.mem_link = [] def step_input(self, x, level=0): - """ + r""" This function is used to set sequence x as DynamicRNN's input. The maximum sequence length in x determines the number of time steps the RNN unit will be executed. DynamicRNN can take multiple inputs. @@ -3144,7 +3144,7 @@ class DynamicRNN(object): return array_read(array=input_array, i=self.step_idx) def static_input(self, x): - """ + r""" This function is used to set x as DynamicRNN's static input. It is optional. - Case 1, set static input with LoD @@ -3348,7 +3348,7 @@ class DynamicRNN(object): value=0.0, need_reorder=False, dtype='float32'): - """ + r""" Create a memory Variable for DynamicRNN to deliver data cross time steps. It can be initialized by an existing Tensor or a constant Tensor of given dtype and shape. 
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index f7e79f79f8bfd23a129e048d75c928e791f9412b..ce29b64ce432a3c221378e5134b778338c098ad7 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -77,7 +77,7 @@ def retinanet_target_assign(bbox_pred, num_classes=1, positive_overlap=0.5, negative_overlap=0.4): - """ + r""" **Target Assign Layer for the detector RetinaNet.** This OP finds out positive and negative samples from all anchors @@ -471,7 +471,7 @@ def rpn_target_assign(bbox_pred, def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25): - """ + r""" :alias_main: paddle.nn.functional.sigmoid_focal_loss :alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss :old_api: paddle.fluid.layers.sigmoid_focal_loss @@ -821,7 +821,7 @@ def box_coder(prior_box, box_normalized=True, name=None, axis=0): - """ + r""" **Box Coder Layer** @@ -1523,7 +1523,7 @@ def ssd_loss(location, mining_type='max_negative', normalize=True, sample_size=None): - """ + r""" :alias_main: paddle.nn.functional.ssd_loss :alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss :old_api: paddle.fluid.layers.ssd_loss @@ -1930,7 +1930,7 @@ def density_prior_box(input, offset=0.5, flatten_to_2d=False, name=None): - """ + r""" This op generates density prior boxes for SSD(Single Shot MultiBox Detector) algorithm. Each position of the input produce N prior boxes, N is @@ -2741,7 +2741,7 @@ def generate_proposal_labels(rpn_rois, def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, labels_int32, num_classes, resolution): - """ + r""" **Generate Mask Labels for Mask-RCNN** @@ -3671,7 +3671,7 @@ def distribute_fpn_proposals(fpn_rois, refer_scale, rois_num=None, name=None): - """ + r""" **This op only takes LoDTensor as input.** In Feature Pyramid Networks (FPN) models, it is needed to distribute all proposals into different FPN diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index 81bea3898bed03e679bb06ebc15d1fa529d33ad3..4e4c8dfd2a010cb1c0c9887c6039ac238676cd75 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -113,7 +113,7 @@ class Distribution(object): class Uniform(Distribution): - """Uniform distribution with `low` and `high` parameters. + r"""Uniform distribution with `low` and `high` parameters. Mathematical Details @@ -258,7 +258,7 @@ class Uniform(Distribution): class Normal(Distribution): - """The Normal distribution with location `loc` and `scale` parameters. + r"""The Normal distribution with location `loc` and `scale` parameters. Mathematical details @@ -423,7 +423,7 @@ class Normal(Distribution): class Categorical(Distribution): - """ + r""" Categorical distribution is a discrete probability distribution that describes the possible results of a random variable that can take on one of K possible categories, with the probability of each category @@ -529,7 +529,7 @@ class Categorical(Distribution): class MultivariateNormalDiag(Distribution): - """ + r""" A multivariate normal (also called Gaussian) distribution parameterized by a mean vector and a covariance matrix. 
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 2710ab12cd3dad4957e48936e54eb742d84dca78..26f08a2356d6cfd72124a11a5405a7381c221fe7 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -440,7 +440,7 @@ Applies piecewise decay to the initial learning rate. def cosine_decay(learning_rate, step_each_epoch, epochs): - """ + r""" Applies cosine decay to the learning rate. diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 99801514f472664912101234af82d251a46d9bc4..45f3de2d99a6b664b2651f04e009020eae11e981 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -57,7 +57,7 @@ def center_loss(input, alpha, param_attr, update_center=True): - """ + r""" :api_attr: Static Graph **Center loss Cost layer** @@ -151,7 +151,7 @@ def center_loss(input, def bpr_loss(input, label, name=None): - """ + r""" **Bayesian Personalized Ranking Loss Operator** @@ -203,7 +203,7 @@ def bpr_loss(input, label, name=None): def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): - """ + r""" :alias_main: paddle.nn.functional.cross_entropy :alias: paddle.nn.functional.cross_entropy,paddle.nn.functional.loss.cross_entropy :old_api: paddle.fluid.layers.cross_entropy @@ -300,7 +300,7 @@ def cross_entropy2(input, label, ignore_index=kIgnoreIndex): def square_error_cost(input, label): - """ + r""" This op accepts input predictions and target label and returns the squared error cost. @@ -1185,7 +1185,7 @@ def softmax_with_cross_entropy(logits, numeric_stable_mode=True, return_softmax=False, axis=-1): - """ + r""" :alias_main: paddle.nn.functional.softmax_with_cross_entropy :alias: paddle.nn.functional.softmax_with_cross_entropy,paddle.nn.functional.loss.softmax_with_cross_entropy :old_api: paddle.fluid.layers.softmax_with_cross_entropy @@ -1312,7 +1312,7 @@ def softmax_with_cross_entropy(logits, def rank_loss(label, left, right, name=None): - """ + r""" This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model with a training sample consisting of a pair of documents (A and B), The label (P) @@ -1375,7 +1375,7 @@ def rank_loss(label, left, right, name=None): def margin_rank_loss(label, left, right, margin=0.1, name=None): - """ + r""" Margin Ranking Loss Layer for ranking problem, which compares left score and right score passed in. The ranking loss can be defined as following equation: @@ -1551,7 +1551,7 @@ def teacher_student_sigmoid_loss(input, def huber_loss(input, label, delta): - """ + r""" This operator computes the Huber loss between input and label. Huber loss is commonly used in regression tasks. Compared to square_error_cost, Huber loss is more robust and less sensitivity to outliers. @@ -1681,7 +1681,7 @@ from .control_flow import equal def npair_loss(anchor, positive, labels, l2_reg=0.002): - ''' + r''' Read `Improved Deep Metric Learning with Multi class N pair Loss Objective\ `_ @@ -14964,7 +14964,7 @@ def mish(x, threshold=20, name=None): def gather_tree(ids, parents): - """ + r""" To be used after beam search. After beam search, we get selected ids at each time step and the corresponding parents in the search tree. Both ids and parents have the layout :attr:`[max_time, batch_size, beam_size]`. 
Then diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index de0fbb16f6241209dfd755a71aab2c101252d17a..72dc4a91608e18cf59169ce3cbfe6cd2a74554cf 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -413,7 +413,7 @@ def softshrink(x, alpha=None): return _softshrink_(**kwargs) -softshrink.__doc__ = """ +softshrink.__doc__ = r""" :alias_main: paddle.nn.functional.softshrink :alias: paddle.nn.functional.softshrink,paddle.nn.functional.activation.softshrink :old_api: paddle.fluid.layers.softshrink @@ -530,7 +530,7 @@ def thresholded_relu(x, threshold=None): return _thresholded_relu_(**kwargs) -thresholded_relu.__doc__ = """ +thresholded_relu.__doc__ = r""" :alias_main: paddle.nn.functional.thresholded_relu :alias: paddle.nn.functional.thresholded_relu,paddle.nn.functional.activation.thresholded_relu :old_api: paddle.fluid.layers.thresholded_relu @@ -617,7 +617,7 @@ def gelu(x, approximate=False): return _gelu_(**kwargs) -gelu.__doc__ = """ +gelu.__doc__ = r""" :strong:`GeLU Activation Operator` For more details, see [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415). @@ -701,7 +701,7 @@ def erf(x, name=None): return _erf_(**kwargs) -erf.__doc__ = """ +erf.__doc__ = r""" :strong:`Erf Operator` For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function). diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 05272a7cefb0827517c22b38b8b85ccdb3036fcc..2f11603d484fa571f736c0e186fd5a3fd02106b7 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -67,7 +67,7 @@ class RNNCell(object): """ def call(self, inputs, states, **kwargs): - """ + r""" Every cell must implement this method to do the calculations mapping the inputs and states to the output and new states. @@ -97,7 +97,7 @@ class RNNCell(object): dtype='float32', init_value=0, batch_dim_idx=0): - """ + r""" Generate initialized states according to provided shape, data type and value. @@ -225,7 +225,7 @@ class RNNCell(object): class GRUCell(RNNCell): - """ + r""" :api_attr: Static Graph Gated Recurrent Unit cell. It is a wrapper for @@ -287,7 +287,7 @@ class GRUCell(RNNCell): activation, dtype) def call(self, inputs, states): - """ + r""" Perform calculations of GRU. Parameters: @@ -323,7 +323,7 @@ class GRUCell(RNNCell): class LSTMCell(RNNCell): - """ + r""" :api_attr: Static Graph Long-Short Term Memory cell. It is a wrapper for @@ -390,7 +390,7 @@ class LSTMCell(RNNCell): activation, forget_bias, dtype) def call(self, inputs, states): - """ + r""" Perform calculations of LSTM. Parameters: @@ -782,7 +782,7 @@ class Decoder(object): """ def initialize(self, inits): - """ + r""" Called once before the decoding iterations. Parameters: @@ -797,7 +797,7 @@ class Decoder(object): raise NotImplementedError def step(self, time, inputs, states, **kwargs): - """ + r""" Called per step of decoding. Parameters: @@ -818,7 +818,7 @@ class Decoder(object): raise NotImplementedError def finalize(self, outputs, final_states, sequence_lengths): - """ + r""" Called once after the decoding iterations if implemented. Parameters: @@ -931,7 +931,7 @@ class BeamSearchDecoder(Decoder): @staticmethod def tile_beam_merge_with_batch(x, beam_size): - """ + r""" Tile the batch dimension of a tensor. 
Specifically, this function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape @@ -966,7 +966,7 @@ class BeamSearchDecoder(Decoder): return x def _split_batch_beams(self, x): - """ + r""" Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new tensor with shape `[batch_size, beam_size, ...]`. @@ -983,7 +983,7 @@ class BeamSearchDecoder(Decoder): return nn.reshape(x, shape=[-1, self.beam_size] + list(x.shape[1:])) def _merge_batch_beams(self, x): - """ + r""" Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new tensor with shape `[batch_size * beam_size, ...]`. @@ -1000,7 +1000,7 @@ class BeamSearchDecoder(Decoder): return nn.reshape(x, shape=[-1] + list(x.shape[2:])) def _expand_to_beam_size(self, x): - """ + r""" This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape `[batch_size, beam_size, s0, s1, ...]` composed of minibatch entries @@ -1023,7 +1023,7 @@ class BeamSearchDecoder(Decoder): return x def _mask_probs(self, probs, finished): - """ + r""" Mask log probabilities. It forces finished beams to allocate all probability mass to eos and unfinished beams to remain unchanged. @@ -1052,7 +1052,7 @@ class BeamSearchDecoder(Decoder): return probs def _gather(self, x, indices, batch_size): - """ + r""" Gather from the tensor `x` using `indices`. Parameters: @@ -1104,7 +1104,7 @@ class BeamSearchDecoder(Decoder): pass def initialize(self, initial_cell_states): - """ + r""" Initialize the BeamSearchDecoder. Parameters: @@ -1162,7 +1162,7 @@ class BeamSearchDecoder(Decoder): init_lengths), init_finished def _beam_search_step(self, time, logits, next_cell_states, beam_state): - """ + r""" Calculate scores and select candidate token ids. Parameters: @@ -1235,7 +1235,7 @@ class BeamSearchDecoder(Decoder): return beam_search_output, beam_search_state def step(self, time, inputs, states, **kwargs): - """ + r""" Perform a beam search decoding step, which uses `cell` to get probabilities, and follows a beam search step to calculate scores and select candidate token ids. @@ -1287,7 +1287,7 @@ class BeamSearchDecoder(Decoder): return (beam_search_output, beam_search_state, next_inputs, finished) def finalize(self, outputs, final_states, sequence_lengths): - """ + r""" Use `gather_tree` to backtrace along the beam search tree and construct the full predicted sequences. @@ -1572,7 +1572,7 @@ def dynamic_decode(decoder, is_test=False, return_length=False, **kwargs): - """ + r""" Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned Tensor indicating finished status contains all True values or the number of decoding step reaches to :attr:`max_step_num`. @@ -1664,7 +1664,7 @@ class DecodeHelper(object): """ def initialize(self): - """ + r""" DecodeHelper initialization to produce inputs for the first decoding step and give the initial status telling whether each sequence in the batch is finished. It is the partial of the initialization of `BasicDecoder`. @@ -1698,7 +1698,7 @@ class DecodeHelper(object): pass def next_inputs(self, time, outputs, states, sample_ids): - """ + r""" Produce the inputs and states for next time step and give status telling whether each minibatch entry is finished. It is called after `sample` in `BasicDecoder.step`. It is the partial of `BasicDecoder.step`. 
@@ -1787,7 +1787,7 @@ class TrainingHelper(DecodeHelper): self.inputs) def initialize(self): - """ + r""" TrainingHelper initialization produces inputs for the first decoding step by slicing at the first time step of full sequence inputs, and it gives initial status telling whether each sequence in the batch is @@ -1809,7 +1809,7 @@ class TrainingHelper(DecodeHelper): return init_inputs, init_finished def sample(self, time, outputs, states): - """ + r""" Perform sampling by using `argmax` according to the `outputs`. Mostly the sampled ids would not be used since the inputs for next decoding step would be got by slicing. @@ -1832,7 +1832,7 @@ class TrainingHelper(DecodeHelper): return sample_ids def next_inputs(self, time, outputs, states, sample_ids): - """ + r""" Generate inputs for the next decoding step by slicing at corresponding step of the full sequence inputs. Simultaneously, produce the states for next time step by directly using the input `states` and emit status @@ -1909,7 +1909,7 @@ class GreedyEmbeddingHelper(DecodeHelper): """ def __init__(self, embedding_fn, start_tokens, end_token): - """ + r""" Constructor of GreedyEmbeddingHelper. Parameters: @@ -1934,7 +1934,7 @@ class GreedyEmbeddingHelper(DecodeHelper): shape=[1], dtype="int64", value=end_token) def initialize(self): - """ + r""" GreedyEmbeddingHelper initialization produces inputs for the first decoding step by using `start_tokens` of the constructor, and gives initial status telling whether each sequence in the batch is finished. @@ -1957,7 +1957,7 @@ class GreedyEmbeddingHelper(DecodeHelper): return init_inputs, init_finished def sample(self, time, outputs, states): - """ + r""" Perform sampling by using `argmax` according to the `outputs`. Parameters: @@ -1978,7 +1978,7 @@ class GreedyEmbeddingHelper(DecodeHelper): return sample_ids def next_inputs(self, time, outputs, states, sample_ids): - """ + r""" Generate inputs for the next decoding step by applying `embedding_fn` to `sample_ids`. Simultaneously, produce the states for next time step by directly using the input `states` and emit status telling whether @@ -2046,7 +2046,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper): end_token, softmax_temperature=None, seed=None): - """ + r""" Constructor of SampleEmbeddingHelper. Parameters: @@ -2080,7 +2080,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper): self.seed = seed def sample(self, time, outputs, states): - """ + r""" Perform sampling from a categorical distribution, and the distribution is computed by `softmax(outputs/softmax_temperature)`. @@ -2165,7 +2165,7 @@ class BasicDecoder(Decoder): self.output_fn = output_fn def initialize(self, initial_cell_states): - """ + r""" BasicDecoder initialization includes helper initialization and cell initialization, and cell initialization uses `initial_cell_states` as the result directly. @@ -2195,7 +2195,7 @@ class BasicDecoder(Decoder): pass def step(self, time, inputs, states, **kwargs): - """ + r""" Perform one decoding step as following steps: 1. 
Perform `cell_outputs, cell_states = cell.call(inputs, states)` @@ -2258,7 +2258,7 @@ def dynamic_lstm(input, candidate_activation='tanh', dtype='float32', name=None): - """ + r""" :api_attr: Static Graph **Note**: @@ -2430,7 +2430,7 @@ def lstm(input, name=None, default_initializer=None, seed=-1): - """ + r""" :api_attr: Static Graph **Note**: @@ -2612,7 +2612,7 @@ def dynamic_lstmp(input, c_0=None, cell_clip=None, proj_clip=None): - """ + r""" :api_attr: Static Graph **Note**: @@ -2823,7 +2823,7 @@ def dynamic_gru(input, candidate_activation='tanh', h_0=None, origin_mode=False): - """ + r""" :api_attr: Static Graph **Note: The input type of this must be LoDTensor. If the input type to be @@ -2985,7 +2985,7 @@ def gru_unit(input, activation='tanh', gate_activation='sigmoid', origin_mode=False): - """ + r""" :api_attr: Static Graph Gated Recurrent Unit (GRU) RNN cell. This operator performs GRU calculations for @@ -3143,7 +3143,7 @@ def beam_search(pre_ids, is_accumulated=True, name=None, return_parent_idx=False): - """ + r""" Beam search is a classical algorithm for selecting candidate words in a machine translation task. @@ -3293,7 +3293,7 @@ def beam_search(pre_ids, def beam_search_decode(ids, scores, beam_size, end_id, name=None): - """ + r""" This operator is used after beam search has completed. It constructs the full predicted sequences for each sample by walking back along the search @@ -3378,7 +3378,7 @@ def lstm_unit(x_t, param_attr=None, bias_attr=None, name=None): - """ + r""" :api_attr: Static Graph Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 80faffd477b6257ff7c769f7f177c56513518f0a..df1113660f7d8d2755d56a377b6770da99afc62a 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -51,7 +51,7 @@ def sequence_conv(input, param_attr=None, act=None, name=None): - """ + r""" :api_attr: Static Graph **Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use conv2d Op.(fluid.layers.** :ref:`api_fluid_layers_conv2d` ). @@ -175,7 +175,7 @@ def sequence_conv(input, def sequence_softmax(input, use_cudnn=False, name=None): - """ + r""" :api_attr: Static Graph **Note**: @@ -259,7 +259,7 @@ def sequence_softmax(input, use_cudnn=False, name=None): def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): - """ + r""" :api_attr: Static Graph **Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use pool2d Op.(fluid.layers.** :ref:`api_fluid_layers_pool2d` ). @@ -636,7 +636,7 @@ def sequence_slice(input, offset, length, name=None): def sequence_expand(x, y, ref_level=-1, name=None): - """ + r""" :api_attr: Static Graph Sequence Expand Layer. This layer will expand the input variable ``x`` \ @@ -772,7 +772,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): def sequence_expand_as(x, y, name=None): - """ + r""" :api_attr: Static Graph Sequence Expand As Layer. 
This OP will expand the input variable ``x`` \ @@ -892,7 +892,7 @@ def sequence_expand_as(x, y, name=None): def sequence_pad(x, pad_value, maxlen=None, name=None): - """ + r""" :api_attr: Static Graph This layer padding the sequences in a same batch to a common length (according \ @@ -1233,7 +1233,7 @@ def sequence_scatter(input, index, updates, name=None): def sequence_enumerate(input, win_size, pad_value=0, name=None): - """ + r""" :api_attr: Static Graph Generate a new sequence for the input index sequence with \ @@ -1301,7 +1301,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): def sequence_mask(x, maxlen=None, dtype='int64', name=None): - """ + r""" **SequenceMask Layer** This layer outputs a mask according to the input :code:`x` and diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index fe3970ce1c10ce8a17f0c59e935ebbd0bdeda45e..6e794874afbc97efff9e56efe3c16329f58927ec 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -343,7 +343,7 @@ def concat(input, axis=0, name=None): def tensor_array_to_tensor(input, axis=1, name=None, use_stack=False): - """ + r""" This function concatenates or stacks all tensors in the input LoDTensorArray along the axis mentioned and returns that as the output. @@ -452,7 +452,7 @@ def tensor_array_to_tensor(input, axis=1, name=None, use_stack=False): def sums(input, out=None): - """ + r""" This function computes the sum of multiple input Tensors elementwisely. - Case 1, sum of 3 Tensors @@ -1391,7 +1391,7 @@ def range(start, end, step, dtype, name=None): def linspace(start, stop, num, dtype=None, name=None): - """ + r""" This OP return fixed number of evenly spaced values within a given interval. Args: @@ -1527,7 +1527,7 @@ def zeros_like(x, out=None): @deprecated(since="2.0.0", update_to="paddle.diag") def diag(diagonal): - """ + r""" :alias_main: paddle.diag :alias: paddle.diag,paddle.tensor.diag,paddle.tensor.creation.diag :old_api: paddle.fluid.layers.diag diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 0c3f6e1673287dd8bf2275eaedfb2e39a32fa133..a3b61f2e9112286c0c939e38857685c90569e9a8 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -475,7 +475,7 @@ class Accuracy(MetricBase): self.weight = .0 def update(self, value, weight): - """ + r""" This function takes the minibatch states (value, weight) as input, to accumulate and update the corresponding status of the Accuracy object. The update method is as follows: @@ -561,7 +561,7 @@ class ChunkEvaluator(MetricBase): self.num_correct_chunks = 0 def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): - """ + r""" This function takes (num_infer_chunks, num_label_chunks, num_correct_chunks) as input, to accumulate and update the corresponding status of the ChunkEvaluator object. The update method is as follows: diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 8df8f6b689146f160b766d1e3b06baf56f174696..c47cce76f89849199767bc0c9f43b3a7ff7b2a49 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -42,7 +42,7 @@ def simple_img_conv_pool(input, bias_attr=None, act=None, use_cudnn=True): - """ + r""" :api_attr: Static Graph The simple_img_conv_pool api is composed of :ref:`api_fluid_layers_conv2d` and :ref:`api_fluid_layers_pool2d` . 
@@ -333,7 +333,7 @@ def sequence_conv_pool(input, def glu(input, dim=-1): - """ + r""" :api_attr: Static Graph The Gated Linear Units(GLU) composed by :ref:`api_fluid_layers_split` , @@ -384,7 +384,7 @@ def scaled_dot_product_attention(queries, values, num_heads=1, dropout_rate=0.): - """ + r""" :api_attr: Static Graph This interface Multi-Head Attention using scaled dot product. diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7f9ade8fcbd24b29c1c36e9ce77b98267c271951..2d95bfa8c541116e81e223d7c91fcf37dd97f286 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -954,7 +954,7 @@ class Optimizer(object): class SGDOptimizer(Optimizer): - """ + r""" Optimizer of the stochastic gradient descent algorithm. .. math:: @@ -1048,7 +1048,7 @@ class SGDOptimizer(Optimizer): class MomentumOptimizer(Optimizer): - """ + r""" Simple Momentum optimizer with velocity state @@ -1183,7 +1183,7 @@ class MomentumOptimizer(Optimizer): class DGCMomentumOptimizer(Optimizer): - """ + r""" :api_attr: Static Graph DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887 @@ -1603,7 +1603,7 @@ class DGCMomentumOptimizer(Optimizer): class LarsMomentumOptimizer(Optimizer): - """ + r""" Momentum optimizer with LARS support The update equations are as follows: @@ -1735,7 +1735,7 @@ class LarsMomentumOptimizer(Optimizer): class AdagradOptimizer(Optimizer): - """ + r""" The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign different learning rates to individual parameters. @@ -1851,7 +1851,7 @@ class AdagradOptimizer(Optimizer): class AdamOptimizer(Optimizer): - """ + r""" The Adam optimizer uses an optimization described at the end of section 2 of `Adam paper `_ , it can dynamically adjusts the learning rate of each parameter using @@ -2117,7 +2117,7 @@ class AdamOptimizer(Optimizer): class AdamaxOptimizer(Optimizer): - """ + r""" The Adamax optimizer is implemented based on the Adamax Optimization in Section 7 of `Adam paper `_. The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, @@ -2289,7 +2289,7 @@ class AdamaxOptimizer(Optimizer): class DpsgdOptimizer(Optimizer): - """ + r""" We implement the Dpsgd optimizer according to CCS16 paper - Deep Learning with Differential Privacy. @@ -2384,7 +2384,7 @@ class DpsgdOptimizer(Optimizer): class DecayedAdagradOptimizer(Optimizer): - """ + r""" The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces the decay rate to solve the problem of a sharp drop in the learning rate during model training when using the AdagradOptimizer. @@ -2494,7 +2494,7 @@ class DecayedAdagradOptimizer(Optimizer): class AdadeltaOptimizer(Optimizer): - """ + r""" **Notes: This API does not support sparse parameter optimization.** Adadelta Optimizer. Please refer to this for details: @@ -2613,7 +2613,7 @@ class AdadeltaOptimizer(Optimizer): class RMSPropOptimizer(Optimizer): - """ + r""" Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. The original slides proposed RMSProp: Slide 29 of http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf . @@ -2801,7 +2801,7 @@ class RMSPropOptimizer(Optimizer): class FtrlOptimizer(Optimizer): - """ + r""" FTRL (Follow The Regularized Leader) Optimizer. 
The paper that proposed Follow The Regularized Leader (FTRL): @@ -2960,7 +2960,7 @@ class FtrlOptimizer(Optimizer): class LambOptimizer(AdamOptimizer): - """ + r""" LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer. LAMB Optimizer is designed to scale up the batch size of training without losing @@ -3132,7 +3132,7 @@ Lamb = LambOptimizer class ModelAverage(Optimizer): - """ + r""" :api_attr: Static Graph The ModelAverage optimizer accumulates specific continuous historical parameters @@ -3441,7 +3441,7 @@ class ModelAverage(Optimizer): class ExponentialMovingAverage(object): - """ + r""" :api_attr: Static Graph Compute the moving average of parameters with exponential decay. @@ -4795,7 +4795,7 @@ class RecomputeOptimizer(Optimizer): class LookaheadOptimizer(object): - """ + r""" :api_attr: Static Graph This implements the Lookahead optimizer of the diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 65f7bd64708125e0c31a9ac3c6a8d1ea3de9e8f6..7d123e7122eeb4a76cb7936511a7bc29575669cf 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -210,7 +210,7 @@ class ParamAttr(object): class WeightNormParamAttr(ParamAttr): - """ + r""" :api_attr: Static Graph Note: diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 09850b3cac90d4de576df0d82180964227921243..1cb76b1f3905938dd8de55498fc8bfd83d95251f 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1325,7 +1325,7 @@ class GeneratorLoader(DataLoaderBase): class PyReader(DataLoaderBase): - """ + r""" Create a reader object for data feeding in Python. Data would be prefetched using Python thread and be pushed into a queue asynchronously. Data in the queue would be extracted diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 9fe24ec2c9d87d1c82f8a3fbd771c714ad376aad..5e0e5f724a889aaf491b2c1bbb67e36cfaf9cd38 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -63,7 +63,7 @@ def _create_regularization_of_grad(param, grad, regularization=None): def append_regularization_ops(parameters_and_grads, regularization=None): - """Create and add backward regularization Operators + r"""Create and add backward regularization Operators Creates and adds backward regularization operators in the BlockDesc. This will add gradients of the regularizer function to the gradients @@ -132,7 +132,7 @@ class WeightDecayRegularizer(object): class L2DecayRegularizer(WeightDecayRegularizer): - """ + r""" Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting. It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ). @@ -239,7 +239,7 @@ class L2DecayRegularizer(WeightDecayRegularizer): class L1DecayRegularizer(WeightDecayRegularizer): - """ + r""" Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse. It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ). 
diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py index 095a474fd3ac056c678f9051ed80ef363ae968c9..21180d7f49f56929df798e01b8d69aff371f2310 100644 --- a/python/paddle/fluid/tests/unittests/dist_text_classification.py +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -204,8 +204,8 @@ def train(word_idx): :rtype: callable """ return reader_creator( - re.compile("train/pos/.*\.txt$"), - re.compile("train/neg/.*\.txt$"), word_idx) + re.compile(r"train/pos/.*\.txt$"), + re.compile(r"train/neg/.*\.txt$"), word_idx) def test(word_idx): @@ -221,8 +221,8 @@ def test(word_idx): :rtype: callable """ return reader_creator( - re.compile("test/pos/.*\.txt$"), - re.compile("test/neg/.*\.txt$"), word_idx) + re.compile(r"test/pos/.*\.txt$"), + re.compile(r"test/neg/.*\.txt$"), word_idx) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index bb7e0ca2a0ca7314ff890f9d1204a60842eec3dd..4f35befda8e2cdd7e238dc22f2cea78b68fc70e6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -230,7 +230,7 @@ class SoftsignLayer(object): class FC(Layer): - """ + r""" This interface is used to construct a callable object of the ``FC`` class. For more details, refer to code examples. It creates a fully connected layer in the network. It can take diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py index ec57057164f61c412493904ded99c018b06de8e6..e0b7e9033dd5e62110dde39e3f8d399b0f3b1662 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -227,7 +227,7 @@ class SoftsignLayer(object): class FC(paddle.nn.Layer): - """ + r""" This interface is used to construct a callable object of the ``FC`` class. For more details, refer to code examples. It creates a fully connected layer in the network. 
It can take diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py index 4ae44365f25dfdb4d87b23f4d1605614eaf2f4df..ef4cbf0b742e15291781688a9bfa2d19fd2bae73 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py @@ -235,7 +235,7 @@ class EagerDeletionRecurrentOpTest1(unittest.TestCase): class EagerDeletionRecurrentOpTest2(EagerDeletionRecurrentOpTest1): - ''' + r''' Test RNNOp equation: h_t = \sigma (W x_t + U h_{t-1}) diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py index 30bc097428c3b452a635999ab4a99c20d32cf20b..3f3b1ee6703643d4d8ae9c683397512462e5d867 100644 --- a/python/paddle/fluid/tests/unittests/test_full_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py @@ -31,7 +31,8 @@ class TestFullOp(unittest.TestCase): train_program = Program() with program_guard(train_program, startup_program): fill_value = 2.0 - input = paddle.fluid.data(name='input', dtype='float32', shape=[2, 3]) + input = paddle.fluid.data( + name='input', dtype='float32', shape=[2, 3]) output = paddle.full_like(input, fill_value) output_dtype = paddle.full_like(input, fill_value, dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index 29e0a8d6f02db323fc6befa9fca588247741ba24..2b632b2437ad64841a48ee96b8de4f54c6889e21 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -25,7 +25,7 @@ from paddle.fluid import compiler, Program, program_guard class TestLRNOp(OpTest): def get_input(self): - ''' TODO(gongweibao): why it's grad diff is so large? + r''' TODO(gongweibao): why it's grad diff is so large? x = np.ndarray( shape=(self.N, self.C, self.H, self.W), dtype=float, order='C') for m in range(0, self.N): diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index b738d4b8efe058c7b1c70f996dae74c15b196636..a8adee742c6127008251654b53c5286a86002c81 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -232,7 +232,7 @@ class RecurrentOpTest1(unittest.TestCase): class RecurrentOpTest2(RecurrentOpTest1): - ''' + r''' Test RNNOp equation: h_t = \sigma (W x_t + U h_{t-1}) @@ -469,7 +469,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): class RecurrentOpSubBlockTest(RecurrentOpTest1): - ''' + r''' Test RNNOp with subblock variable equation: y_ = emb * w1 @@ -608,7 +608,7 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1): class RecurrentOpStopGradientTest(RecurrentOpTest1): - """ + r""" Test RNNOp with stop_gradient = True equation: h_t = \sigma (W x_t + U h_{t-1}) diff --git a/python/paddle/fluid/tests/unittests/test_require_version.py b/python/paddle/fluid/tests/unittests/test_require_version.py index 80d595c1ef1eb7b4346ff257593169a609c0f033..d1cb0aa4d8164c53e6fd0d32305d4ac993211428 100644 --- a/python/paddle/fluid/tests/unittests/test_require_version.py +++ b/python/paddle/fluid/tests/unittests/test_require_version.py @@ -79,7 +79,7 @@ class TestErrors(unittest.TestCase): self.assertRaises(TypeError, test_input_type_1) - # The value of params must be in format '\d+(\.\d+){0,3}', like '1.5.2.0', '1.6' ... 
+ # The value of params must be in format r'\d+(\.\d+){0,3}', like '1.5.2.0', '1.6' ... def test_input_value_1(): fluid.require_version('string') diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 510b99c03008d542cb8324de5a4888a9d18407dc..f1808efe86e433cbfcfedf04bd5e8cf6df6c46a6 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -35,7 +35,7 @@ def _is_numpy_(var): @six.add_metaclass(abc.ABCMeta) class Metric(object): - """ + r""" Base class for metric, encapsulates metric logic and APIs Usage: diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index e7adc7106a4f09c7aef710b93b0976a0ae13fa45..915668de19d3cb5f67dc835211eb3fddd2dff6bf 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -58,7 +58,7 @@ import paddle def elu(x, alpha=1.0, name=None): - """ + r""" elu activation. .. math:: @@ -101,7 +101,7 @@ def elu(x, alpha=1.0, name=None): def gelu(x, approximate=False, name=None): - """ + r""" gelu activation. if approximate is True @@ -155,7 +155,7 @@ def gelu(x, approximate=False, name=None): def hardshrink(x, threshold=0.5, name=None): - """ + r""" hard shrinkage activation .. math:: @@ -204,7 +204,7 @@ def hardshrink(x, threshold=0.5, name=None): def hardtanh(x, min=-1.0, max=1.0, name=None): - """ + r""" hardtanh activation .. math:: @@ -254,7 +254,7 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): def hardsigmoid(x, name=None): - """ + r""" hardsigmoid activation. A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), @@ -308,7 +308,7 @@ def hardsigmoid(x, name=None): def hardswish(x, name=None): - """ + r""" hardswish activation hardswish is proposed in MobileNetV3, and performs better in computational stability @@ -357,7 +357,7 @@ def hardswish(x, name=None): def leaky_relu(x, negative_slope=0.01, name=None): - """ + r""" leaky_relu activation .. math:: @@ -515,7 +515,7 @@ def relu(x, name=None): def log_sigmoid(x, name=None): - """ + r""" log_sigmoid activation. .. math:: @@ -552,7 +552,7 @@ def log_sigmoid(x, name=None): def maxout(x, groups, axis=1, name=None): - """ + r""" maxout activation. Assumed the input shape is (N, Ci, H, W). @@ -671,7 +671,7 @@ def selu(x, scale=1.0507009873554804934193349852946, alpha=1.6732632423543772848170429916717, name=None): - """ + r""" selu activation .. math:: @@ -726,7 +726,7 @@ def selu(x, def softmax(x, axis=-1, dtype=None, name=None): - """ + r""" This operator implements the softmax layer. The calculation process is as follows: 1. The dimension :attr:`axis` of ``x`` will be permuted to the last. @@ -880,7 +880,7 @@ def softmax(x, axis=-1, dtype=None, name=None): def softplus(x, beta=1, threshold=20, name=None): - """ + r""" softplus activation .. math:: @@ -925,7 +925,7 @@ def softplus(x, beta=1, threshold=20, name=None): def softshrink(x, threshold=0.5, name=None): - """ + r""" softshrink activation .. math:: @@ -976,7 +976,7 @@ def softshrink(x, threshold=0.5, name=None): def softsign(x, name=None): - """ + r""" softsign activation .. math:: @@ -1013,7 +1013,7 @@ def softsign(x, name=None): def swish(x, name=None): - """ + r""" swish activation. .. math:: @@ -1091,7 +1091,7 @@ def tanhshrink(x, name=None): def thresholded_relu(x, threshold=1.0, name=None): - """ + r""" thresholded relu activation. .. 
math:: @@ -1137,7 +1137,7 @@ def thresholded_relu(x, threshold=1.0, name=None): def log_softmax(x, axis=-1, dtype=None, name=None): - """ + r""" This operator implements the log_softmax layer. The calculation process is as follows: diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 910a302599fef295b2d49ddc63fb8b73f46fd720..a4c92883e060738c37cf21a48692f132d20cb554 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1413,7 +1413,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): def linear(x, weight, bias=None, name=None): - """ + r""" Fully-connected linear transformation operator. For each input :math:`X` , the equation is: @@ -1500,7 +1500,7 @@ def linear(x, weight, bias=None, name=None): def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): - """ + r""" Label smoothing is a mechanism to regularize the classifier layer and is called label-smoothing regularization (LSR). diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index c4410346ca17d1bed626cf176403dcb43b9668c5..75be8f54cd7de167dd5b83594dd3041b4943d976 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -166,7 +166,7 @@ def conv1d(x, groups=1, data_format='NCL', name=None): - """ + r""" The convolution1D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input and Output are in NCL format, where N is batch size, C is the number of @@ -392,7 +392,7 @@ def conv2d(x, groups=1, data_format="NCHW", name=None): - """ + r""" The convolution2D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input and @@ -568,7 +568,7 @@ def conv1d_transpose(x, output_size=None, data_format="NCL", name=None): - """ + r""" The 1-D convolution transpose layer calculates the output based on the input, filter, and dilation, stride, padding. Input(Input) and output(Output) are in 'NCL' format or 'NLC' where N is batch size, C is the number of channels, @@ -828,7 +828,7 @@ def conv2d_transpose(x, output_size=None, data_format='NCHW', name=None): - """ + r""" The convolution2D transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input(Input) and output(Output) @@ -1068,7 +1068,7 @@ def conv3d(x, groups=1, data_format="NCDHW", name=None): - """ + r""" The convolution3D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and @@ -1233,7 +1233,7 @@ def conv3d_transpose(x, output_size=None, data_format='NCDHW', name=None): - """ + r""" The convolution3d transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input(Input) and output(Output) are in NCDHW or NDHWC format. 
Where N is batch size, C is the number of channels, diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 4ec0f8407fa91ba2a3f11a01c1b2ff5125f4ddf8..5e80f307eeeef8a2a44a505086b88bb3d6a4804b 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -14,10 +14,7 @@ # TODO: define the extention functions -__all__ = [ - 'diag_embed', - 'row_conv' -] +__all__ = ['diag_embed', 'row_conv'] import numpy as np from ...fluid.data_feeder import check_dtype diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index 40b9441c2dc00124c56603dcd75c94b503a320db..5cabc4b67558b159c45477a025c8a7b5814bea5b 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -111,7 +111,7 @@ def one_hot(x, num_classes, name=None): def embedding(x, weight, padding_idx=None, sparse=False, name=None): - """ + r""" The operator is used to lookup embeddings vector of ids provided by :attr:`x` . The shape of output Tensor is generated by appending the last dimension of the input Tensor shape diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 1b19c4c163707559917dad79b92b4fe9a02ae7b2..fb923e056714824283f02c491cca4711b99d9b37 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -184,7 +184,7 @@ def binary_cross_entropy_with_logits(logit, reduction='mean', pos_weight=None, name=None): - """ + r""" This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` layer and some reduce operations. @@ -461,7 +461,7 @@ def hsigmoid_loss(input, def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): - """ + r""" This operator calculates smooth_l1_loss. Creates a criterion that uses a squared term if the absolute element-wise error falls below 1 and an L1 term otherwise. In some cases it can prevent exploding gradients and it is more robust and less @@ -544,7 +544,7 @@ def margin_ranking_loss(input, margin=0.0, reduction='mean', name=None): - """ + r""" This op the calcluate the the margin rank loss between the input, other and label, use the math function as follows. @@ -646,7 +646,7 @@ def margin_ranking_loss(input, def l1_loss(input, label, reduction='mean', name=None): - """ + r""" This operator computes the L1 Loss of Tensor ``input`` and ``label`` as follows. If `reduction` set to ``'none'``, the loss is: @@ -840,7 +840,7 @@ def nll_loss(input, def kl_div(input, label, reduction='mean', name=None): - """ + r""" This operator calculates the Kullback-Leibler divergence loss between Input(X) and Input(Target). Notes that Input(X) is the log-probability and Input(Target) is the probability. @@ -947,7 +947,7 @@ def kl_div(input, label, reduction='mean', name=None): def mse_loss(input, label, reduction='mean', name=None): - """ + r""" This op accepts input predications and label and returns the mean square error. If :attr:`reduction` is set to ``'none'``, loss is calculated as: @@ -1121,7 +1121,7 @@ def cross_entropy(input, weight=None, ignore_index=-100, reduction='mean'): - """ + r""" This operator implements the cross entropy loss function. This OP combines ``LogSoftmax``, and ``NLLLoss`` together. 
@@ -1252,7 +1252,7 @@ def sigmoid_focal_loss(logit, gamma=2.0, reduction='sum', name=None): - """ + r""" `Focal Loss `_ is proposed to address the foreground-background class imbalance for classification tasks. It down-weights easily-classified examples and thus focuses training on hard examples. For example, diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 0a1547bebbb312c27fcd3e2a882779cf6fd361d9..250039b96460ab3914552fa9d76c29ada52049ef 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -35,7 +35,7 @@ __all__ = [ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): - """ + r""" This op normalizes ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes .. math:: @@ -412,7 +412,7 @@ def local_response_norm(x, k=1., data_format="NCHW", name=None): - """ + r""" Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions. For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 5e1cb377bd72b97e7d8cf85544594eacca20fd90..a76bc9e86d226e08217bfafe3f39a71b6b5ed5ef 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -54,11 +54,7 @@ import numpy as np # from ...fluid.layers import roi_perspective_transform #DEFINE_ALIAS # from ...fluid.layers import shuffle_channel #DEFINE_ALIAS -__all__ = [ - 'affine_grid', - 'grid_sample', - 'pixel_shuffle' -] +__all__ = ['affine_grid', 'grid_sample', 'pixel_shuffle'] def affine_grid(theta, out_shape, align_corners=True, name=None): diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index f0c6880e89d8eb6e1a0ecf166a0a926f8a1d87c3..7e2b6f787f85316c9ad4c3bedf91eef3b19cd50d 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -19,7 +19,7 @@ __all__ = ['KaimingUniform', 'KaimingNormal'] class KaimingNormal(MSRAInitializer): - """Implements the Kaiming Normal initializer + r"""Implements the Kaiming Normal initializer This class implements the weight initialization from the paper `Delving Deep into Rectifiers: Surpassing Human-Level Performance on @@ -62,7 +62,7 @@ class KaimingNormal(MSRAInitializer): class KaimingUniform(MSRAInitializer): - """Implements the Kaiming Uniform initializer + r"""Implements the Kaiming Uniform initializer This class implements the weight initialization from the paper `Delving Deep into Rectifiers: Surpassing Human-Level Performance on diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index 5a4e7fec057e71b2ac9285f4c388d6b08c176444..821a6984753105162e878c879cd5b960d2aa80e1 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -18,7 +18,7 @@ __all__ = ['XavierNormal', 'XavierUniform'] class XavierNormal(XavierInitializer): - """ + r""" This class implements the Xavier weight initializer from the paper `Understanding the difficulty of training deep feedforward neural networks `_ @@ -71,7 +71,7 @@ class XavierNormal(XavierInitializer): class XavierUniform(XavierInitializer): - """ + r""" This class implements the Xavier weight initializer from the paper `Understanding the difficulty of training deep feedforward neural networks `_ diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py 
index 520762107db07ec19c2bfb8f8274355e90cbbc68..b002b534625ffb7efbd4828f11772d634b91ab5d 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -50,7 +50,7 @@ from .. import functional as F class ELU(layers.Layer): - """ + r""" ELU Activation. .. math:: @@ -88,7 +88,7 @@ class ELU(layers.Layer): class GELU(layers.Layer): - """ + r""" GELU Activation. If approximate is True @@ -137,7 +137,7 @@ class GELU(layers.Layer): class Hardshrink(layers.Layer): - """ + r""" Hardshrink Activation .. math:: @@ -181,7 +181,7 @@ class Hardshrink(layers.Layer): class Hardswish(layers.Layer): - """ + r""" Hardswish activation Hardswish is proposed in MobileNetV3, and performs better in computational stability @@ -227,7 +227,7 @@ class Hardswish(layers.Layer): class Tanh(layers.Layer): - """ + r""" Tanh Activation. .. math:: @@ -264,7 +264,7 @@ class Tanh(layers.Layer): class Hardtanh(layers.Layer): - """ + r""" Hardtanh Activation .. math:: @@ -442,7 +442,7 @@ class ReLU6(layers.Layer): class SELU(layers.Layer): - """ + r""" SELU Activation .. math:: @@ -488,7 +488,7 @@ class SELU(layers.Layer): class LeakyReLU(layers.Layer): - """ + r""" Leaky ReLU Activation. .. math:: @@ -574,7 +574,7 @@ class Sigmoid(layers.Layer): class Hardsigmoid(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``Hardsigmoid`` class. This layer calcluate the `hardsigmoid` of input x. @@ -621,7 +621,7 @@ class Hardsigmoid(layers.Layer): class Softplus(layers.Layer): - """ + r""" Softplus Activation .. math:: @@ -661,7 +661,7 @@ class Softplus(layers.Layer): class Softshrink(layers.Layer): - """ + r""" Softshrink Activation .. math:: @@ -702,7 +702,7 @@ class Softshrink(layers.Layer): class Softsign(layers.Layer): - """ + r""" Softsign Activation .. math:: @@ -737,7 +737,7 @@ class Softsign(layers.Layer): class Swish(layers.Layer): - """ + r""" Swish Activation. .. math:: @@ -807,7 +807,7 @@ class Tanhshrink(layers.Layer): class ThresholdedReLU(layers.Layer): - """ + r""" Thresholded ReLU Activation .. math:: @@ -847,7 +847,7 @@ class ThresholdedReLU(layers.Layer): class LogSigmoid(layers.Layer): - """ + r""" LogSigmoid Activation. .. math:: @@ -882,7 +882,7 @@ class LogSigmoid(layers.Layer): class Softmax(layers.Layer): - """ + r""" Softmax Activation. This operator implements the softmax layer. The calculation process is as follows: @@ -1005,7 +1005,7 @@ class Softmax(layers.Layer): class LogSoftmax(layers.Layer): - """ + r""" This operator implements the log_softmax layer. The calculation process is as follows: .. math:: @@ -1059,7 +1059,7 @@ class LogSoftmax(layers.Layer): class Maxout(layers.Layer): - """ + r""" Maxout Activation. Assumed the input shape is (N, Ci, H, W). diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 9a3edef5e4cc4bae3fe7f86c8640a3fe717f2039..8558e0f1793bc83d604f3734405add2243908257 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -40,7 +40,7 @@ __all__ = [ class Linear(layers.Layer): - """ + r""" Fully-connected linear transformation layer. For each input :math:`X` , the equation is: @@ -381,7 +381,7 @@ class Upsample(layers.Layer): class Bilinear(layers.Layer): - """ + r""" This layer performs bilinear on two inputs. @@ -988,7 +988,7 @@ class CosineSimilarity(layers.Layer): class Embedding(layers.Layer): - """ + r""" **Embedding Layer** This interface is used to construct a callable object of the ``Embedding`` class. 
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 0b0d0e302b841c525d2059daa4ad45cb609159ca..d554bb0fd96bd0bd5b2ab12da36929e102888c81 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -141,7 +141,7 @@ class _ConvNd(layers.Layer): class Conv1D(_ConvNd): - """ + r""" This interface is used to construct a callable object of the ``Conv1D`` class. For more details, refer to code examples. The convolution1D layer calculates the output based on the input, filter @@ -294,7 +294,7 @@ class Conv1D(_ConvNd): class Conv1DTranspose(_ConvNd): - """ + r""" This interface is used to construct a callable object of the ``Conv1DTranspose`` class. For more details, refer to code examples. The 1-D convolution transpose layer calculates the output based on the input, @@ -469,7 +469,7 @@ class Conv1DTranspose(_ConvNd): class Conv2D(_ConvNd): - """ + r""" This interface is used to construct a callable object of the ``Conv2D`` class. For more details, refer to code examples. The convolution2D layer calculates the output based on the input, filter @@ -626,7 +626,7 @@ class Conv2D(_ConvNd): class Conv2DTranspose(_ConvNd): - """ + r""" This interface is used to construct a callable object of the ``Conv2DTranspose`` class. For more details, refer to code examples. The convolution2D transpose layer calculates the output based on the input, @@ -786,7 +786,7 @@ class Conv2DTranspose(_ConvNd): class Conv3D(_ConvNd): - """ + r""" **Convlution3d Layer** The convolution3d layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and @@ -943,7 +943,7 @@ class Conv3D(_ConvNd): class Conv3DTranspose(_ConvNd): - """ + r""" **Convlution3D transpose layer** The convolution3D transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input(Input) and output(Output) diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index 28b29a583d8a3c0f31c9eb1bb6883ac997394bf2..5a3c611b3c447edcfa34f49c0a38c36b9e7d3ec9 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -24,7 +24,7 @@ from ...fluid.layer_helper import LayerHelper class PairwiseDistance(layers.Layer): - """ + r""" This operator computes the pairwise distance between two vectors. The distance is calculated by p-oreder norm: diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 96db0dde54f6ee32eef8d655f05c6fa2b09ad439..faf1345c7bae3f7a3758300f15d6eadd37daca32 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -36,7 +36,7 @@ __all__ = [ class BCEWithLogitsLoss(fluid.dygraph.Layer): - """ + r""" This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` layer and some reduce operations. @@ -141,7 +141,7 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer): class CrossEntropyLoss(fluid.dygraph.Layer): - """ + r""" :alias_main: paddle.nn.CrossEntropyLoss :alias: paddle.nn.CrossEntropyLoss,paddle.nn.layer.CrossEntropyLoss,paddle.nn.layer.loss.CrossEntropyLoss @@ -375,7 +375,7 @@ class HSigmoidLoss(fluid.dygraph.Layer): class MSELoss(fluid.dygraph.layers.Layer): - """ + r""" **Mean Square Error Loss** Computes the mean square error (squared L2 norm) of given input and label. 
@@ -454,7 +454,7 @@ class MSELoss(fluid.dygraph.layers.Layer): class L1Loss(fluid.dygraph.Layer): - """ + r""" This interface is used to construct a callable object of the ``L1Loss`` class. The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows. @@ -622,7 +622,7 @@ class BCELoss(fluid.dygraph.Layer): class NLLLoss(fluid.dygraph.Layer): - """ + r""" This class accepts input and target label and returns negative log likelihood cross error. It is useful to train a classification problem with C classes. @@ -733,7 +733,7 @@ class NLLLoss(fluid.dygraph.Layer): class KLDivLoss(fluid.dygraph.Layer): - """ + r""" This interface calculates the Kullback-Leibler divergence loss between Input(X) and Input(Target). Notes that Input(X) is the log-probability and Input(Target) is the probability. @@ -806,7 +806,7 @@ class KLDivLoss(fluid.dygraph.Layer): class MarginRankingLoss(fluid.dygraph.Layer): - """ + r""" This interface is used to construct a callable object of the ``MarginRankingLoss`` class. The MarginRankingLoss layer calculates the margin rank loss between the input, other and label @@ -958,7 +958,7 @@ class CTCLoss(fluid.dygraph.Layer): class SmoothL1Loss(fluid.dygraph.Layer): - """ + r""" This operator calculates smooth_l1_loss. Creates a criterion that uses a squared term if the absolute element-wise error falls below 1 and an L1 term otherwise. In some cases it can prevent exploding gradients and it is more robust and less diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 5e2292d40d2bfbd83d6fe37f1b4ea03c82397c31..7f416749c8afbb67a4e0b17e5a349d1eb355b132 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -109,7 +109,7 @@ class _InstanceNormBase(layers.Layer): class InstanceNorm1D(_InstanceNormBase): - """ + r""" Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCL `[batch, in_channels, length]` @@ -181,7 +181,7 @@ class InstanceNorm1D(_InstanceNormBase): class InstanceNorm2D(_InstanceNormBase): - """ + r""" Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCHW `[batch, in_channels, in_height, in_width]` @@ -252,7 +252,7 @@ class InstanceNorm2D(_InstanceNormBase): class InstanceNorm3D(_InstanceNormBase): - """ + r""" Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]` @@ -437,7 +437,7 @@ class GroupNorm(layers.Layer): class LayerNorm(layers.Layer): - """ + r""" :alias_main: paddle.nn.LayerNorm :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm :old_api: paddle.fluid.dygraph.LayerNorm @@ -649,7 +649,7 @@ class _BatchNormBase(layers.Layer): class BatchNorm1D(_BatchNormBase): - """ + r""" Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . 
When track_running_stats = False, the :math:`\\mu_{\\beta}` @@ -740,7 +740,7 @@ class BatchNorm1D(_BatchNormBase): class BatchNorm2D(_BatchNormBase): - """ + r""" Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . When track_running_stats = False, the :math:`\\mu_{\\beta}` @@ -829,7 +829,7 @@ class BatchNorm2D(_BatchNormBase): class BatchNorm3D(_BatchNormBase): - """ + r""" Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . When track_running_stats = False, the :math:`\\mu_{\\beta}` @@ -919,7 +919,7 @@ class BatchNorm3D(_BatchNormBase): class SyncBatchNorm(_BatchNormBase): - """ + r""" This interface is used to construct a callable object of the ``SyncBatchNorm`` class. It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can be used as a normalizer function for other operations, such as conv2d and fully connected diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 7be229bdce09a577a348c45c8dd0af87c3e36da8..dc065918f3d77eddccc417e4746f2293d072f7bd 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -120,7 +120,7 @@ class AvgPool1D(layers.Layer): class AvgPool2D(layers.Layer): - """ + r""" This operation applies 2D average pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are in NCHW format, where N is batch size, C is the number of channels, @@ -401,7 +401,7 @@ class MaxPool1D(layers.Layer): class MaxPool2D(layers.Layer): - """ + r""" This operation applies 2D max pooling over input feature based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are in NCHW format, where N is batch size, C is the number of channels, @@ -595,7 +595,7 @@ class MaxPool3D(layers.Layer): class AdaptiveAvgPool1D(layers.Layer): - """ + r""" This operation applies a 1D adaptive average pooling over an input signal composed of several input planes, based on the input, output_size, return_mask parameters. @@ -663,7 +663,7 @@ class AdaptiveAvgPool1D(layers.Layer): class AdaptiveAvgPool2D(layers.Layer): - """ + r""" This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. @@ -745,7 +745,7 @@ class AdaptiveAvgPool2D(layers.Layer): class AdaptiveAvgPool3D(layers.Layer): - """ + r""" This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index ea4f6970bc686b202ad36bde772e432fd9800c20..0da00735b43a1cbef334bf1848a80bb37f939500 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -157,7 +157,7 @@ class MultiHeadAttention(Layer): embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _prepare_qkv(self, query, key, value, cache=None): - """ + r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. 
@@ -212,7 +212,7 @@ class MultiHeadAttention(Layer): return (q, k, v) if cache is None else (q, k, v, cache) def compute_kv(self, key, value): - """ + r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple @@ -312,7 +312,7 @@ class MultiHeadAttention(Layer): return self.Cache(key, value) def forward(self, query, key, value, attn_mask=None, cache=None): - """ + r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. @@ -499,7 +499,7 @@ class TransformerEncoderLayer(Layer): self.activation = getattr(F, activation) def forward(self, src, src_mask=None): - """ + r""" Applies a Transformer encoder layer on the input. Parameters: @@ -575,7 +575,7 @@ class TransformerEncoder(Layer): self.norm = norm def forward(self, src, src_mask=None): - """ + r""" Applies a stack of N Transformer encoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last encoder layer. @@ -725,7 +725,7 @@ class TransformerDecoderLayer(Layer): self.activation = getattr(F, activation) def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): - """ + r""" Applies a Transformer decoder layer on the input. Parameters: @@ -801,7 +801,7 @@ class TransformerDecoderLayer(Layer): static_cache)) def gen_cache(self, memory): - """ + r""" Generates cache for `forward` usage. The generated cache is a tuple composed of an instance of `MultiHeadAttention.Cache` and an instance of `MultiHeadAttention.StaticCache`. @@ -873,7 +873,7 @@ class TransformerDecoder(Layer): self.norm = norm def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): - """ + r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder layer. @@ -937,7 +937,7 @@ class TransformerDecoder(Layer): return output if cache is None else (output, new_caches) def gen_cache(self, memory, do_zip=False): - """ + r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` @@ -1139,7 +1139,7 @@ class Transformer(Layer): self.nhead = nhead def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): - """ + r""" Applies a Transformer model on the inputs. Parameters: diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index 7a21e7661d4e78d0004996ee67c80ddc35006bc3..b14fb3e21200d5c11ca925b83090eb5ead61c4c4 100644 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -153,7 +153,7 @@ class WeightNorm(object): def weight_norm(layer, name='weight', dim=0): - """ + r""" This weight_norm layer applies weight normalization to a parameter according to the following formula: diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index bba2c11ea07490804573189bac8b315dfc80fd37..91591d23f00c4b159ab5bb18b550478c5b40eb37 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -21,7 +21,7 @@ __all__ = ["Adadelta"] class Adadelta(Optimizer): - """ + r""" **Notes: This API does not support sparse parameter optimization.** Adadelta Optimizer. 
Please refer to this for details: diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py index ed55ebd0bf2a334b8d25cff58fa18977b9547ad5..72a3f8ce99606f500fb4985688a026e54924948d 100644 --- a/python/paddle/optimizer/adagrad.py +++ b/python/paddle/optimizer/adagrad.py @@ -21,7 +21,7 @@ __all__ = ["Adagrad"] class Adagrad(Optimizer): - """ + r""" The Adaptive Gradient optimizer (Adagrad for short) use an optimization described in paper: `Adaptive Subgradient Methods for Online Learning and Stochastic Optimization `_. diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 79caa1583121dff8cdf1781d4ab109f1c9d16878..375102312194e0ffb34c023832e403ecc7a61643 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -24,7 +24,7 @@ __all__ = ["Adam"] class Adam(Optimizer): - """ + r""" The Adam optimizer uses an optimization described at the end of section 2 of `Adam paper `_ , it can dynamically adjusts the learning rate of each parameter using diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index e5d1962d12625c8fe4a74dff331cd948b23e6498..5d164fa76235147c2ec2f0dd72e98cb978129fad 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -21,7 +21,7 @@ __all__ = ["Adamax"] class Adamax(Optimizer): - """ + r""" The Adamax optimizer is implemented based on the Adamax Optimization in Section 7 of `Adam paper `_. The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 0ffff675903573e4bfdd2e7c210e460f4ee903d0..b597109d31457059d0daf6ad93e4579dfcb325c1 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -23,7 +23,7 @@ __all__ = ['AdamW'] class AdamW(Adam): - """ + r""" The AdamW optimizer is implemented based on the AdamW Optimization in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. it can resolves the problem of L2 regularization failure in the Adam optimizer. diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 2d5dc5d998e638eebf573f6d2c3c584052739629..5085911ce927a319c191b91cd6b48af64a50a05d 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -192,7 +192,7 @@ class LRScheduler(object): class NoamDecay(LRScheduler): - """ + r""" Applies Noam Decay to the initial learning rate. @@ -376,7 +376,7 @@ class PiecewiseDecay(LRScheduler): class NaturalExpDecay(LRScheduler): - """ + r""" Applies natural exponential decay to the initial learning rate. @@ -455,7 +455,7 @@ class NaturalExpDecay(LRScheduler): class InverseTimeDecay(LRScheduler): - """ + r""" Applies inverse time decay to the initial learning rate. @@ -536,7 +536,7 @@ class InverseTimeDecay(LRScheduler): class PolynomialDecay(LRScheduler): - """ + r""" Applies polynomial decay to the initial learning rate. @@ -656,7 +656,7 @@ class PolynomialDecay(LRScheduler): class LinearWarmup(LRScheduler): - """ + r""" Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler. For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ @@ -794,7 +794,7 @@ class LinearWarmup(LRScheduler): class ExponentialDecay(LRScheduler): - """ + r""" Update learning rate by `gamma` each epoch. 
@@ -1383,7 +1383,7 @@ class ReduceOnPlateau(LRScheduler): class CosineAnnealingDecay(LRScheduler): - """ + r""" Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 87fa86c17615ef8cc455e95517608a246d677e74..2cfd8deaef7db6c6ef1313b3151a686e52fa7140 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -21,7 +21,7 @@ __all__ = ["Momentum"] class Momentum(Optimizer): - """ + r""" Simple Momentum optimizer with velocity state diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index d0326b4155a1674959e4903c495c3ec99af5af20..030d419de48e023e0229e41c574384323a6878be 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -47,7 +47,7 @@ __all__ = ['Optimizer'] class Optimizer(object): - """Optimizer Base class. + r"""Optimizer Base class. Define the common interface of an optimizer. User should not use this class directly, diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index a664b01595632f7b8dd79edbebd249d672fdff4a..12825bb781381253186bad500c6009b8b8d1db43 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -21,7 +21,7 @@ __all__ = ["RMSProp"] class RMSProp(Optimizer): - """ + r""" Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. The original slides proposed RMSProp: Slide 29 of http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf . diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index 133c3dfb24fed82e4d666321585932d7e58a6f29..44e5695a2cfa8e7778adc40133d66898736012b5 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -21,7 +21,7 @@ __all__ = ["SGD"] class SGD(Optimizer): - """ + r""" Optimizer of the stochastic gradient descent algorithm. .. math:: diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 881cfd813141653fed8e7d9107cdebe54c9df791..1a4d45469235dac16eaa414b8fc6350bbbb040fa 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" +r""" At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py index a1ab329169af2cf4534a46b4e0f5a1cfa3883f29..586ae0f988c2e5d68f4b171944b37db96861c558 100644 --- a/python/paddle/regularizer.py +++ b/python/paddle/regularizer.py @@ -18,7 +18,7 @@ import paddle.fluid as fluid class L1Decay(fluid.regularizer.L1Decay): - """ + r""" Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse. It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). @@ -80,7 +80,7 @@ class L1Decay(fluid.regularizer.L1Decay): class L2Decay(fluid.regularizer.L2Decay): - """ + r""" Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting. 
It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index a25a8fb191bb2db45746ef3c027fc2993fe78ef4..84a5ed9950a0a64e36205edac01d7713f4d7f06f 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -14,7 +14,6 @@ from __future__ import print_function - import errno import inspect import logging @@ -31,7 +30,6 @@ from paddle.fluid.io import prepend_feed_ops, append_fetch_ops, save_persistable from paddle.fluid.io import load_persistables, _endpoints_replacement from paddle.fluid.log_helper import get_logger - __all__ = [ 'save_inference_model', 'load_inference_model', @@ -44,10 +42,13 @@ _logger = get_logger( def _check_args(caller, args, supported_args=[], deprecated_args=[]): for arg in args: if arg in deprecated_args: - raise ValueError("argument '{}' in function '{}' is deprecated, only {} are supported.".format(arg, caller, supported_args)) + raise ValueError( + "argument '{}' in function '{}' is deprecated, only {} are supported.". + format(arg, caller, supported_args)) elif arg not in supported_args: raise ValueError( - "function '{}' doesn't support argument '{}',\n only {} are supported.".format(caller, arg, supported_args)) + "function '{}' doesn't support argument '{}',\n only {} are supported.". + format(caller, arg, supported_args)) @static_only @@ -129,14 +130,18 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor): # verify feed_vars if not isinstance(feed_vars, list): feed_vars = [feed_vars] - if not feed_vars or not all([isinstance(var, Variable) for var in feed_vars]): - raise ValueError("'feed_vars' should be a Variable or a list of Variable.") + if not feed_vars or not all( + [isinstance(var, Variable) for var in feed_vars]): + raise ValueError( + "'feed_vars' should be a Variable or a list of Variable.") # verify fetch_vars if not isinstance(fetch_vars, list): fetch_vars = [fetch_vars] - if not fetch_vars or not all([isinstance(var, Variable) for var in fetch_vars]): - raise ValueError("'fetch_vars' should be a Variable or a list of Variable.") + if not fetch_vars or not all( + [isinstance(var, Variable) for var in fetch_vars]): + raise ValueError( + "'fetch_vars' should be a Variable or a list of Variable.") main_program = _get_valid_program() # remind users to set auc_states to 0 if auc op were found. @@ -145,7 +150,9 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor): device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() op._set_attr(device_attr_name, "") if op.type == 'auc': - warnings.warn("Be sure that you have set auc states to 0 before saving inference model.") + warnings.warn( + "Be sure that you have set auc states to 0 before saving inference model." + ) break # fix the bug that the activation op's output as target will be pruned. 
@@ -154,10 +161,11 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor): with program_guard(main_program): uniq_fetch_vars = [] for i, var in enumerate(fetch_vars): - var = layers.scale(var, 1., name="save_infer_model/scale_{}".format(i)) + var = layers.scale( + var, 1., name="save_infer_model/scale_{}".format(i)) uniq_fetch_vars.append(var) fetch_vars = uniq_fetch_vars - + # save model origin_program = main_program.clone() main_program = main_program.clone() @@ -257,7 +265,7 @@ def load_inference_model(path_prefix, executor, **configs): """ # check configs supported_args = ('model_filename', 'params_filename') - deprecated_args = ('pserver_endpoints',) + deprecated_args = ('pserver_endpoints', ) caller = inspect.currentframe().f_code.co_name _check_args(caller, configs, supported_args, deprecated_args) @@ -268,8 +276,7 @@ def load_inference_model(path_prefix, executor, **configs): params_filename = configs.get('params_filename', None) if params_filename is None: raise ValueError( - "params_filename cannot be None when path_prefix is None." - ) + "params_filename cannot be None when path_prefix is None.") load_dirname = path_prefix program_desc_str = model_filename params_filename = params_filename @@ -297,18 +304,21 @@ def load_inference_model(path_prefix, executor, **configs): if model_filename is None: model_path = os.path.join(path_prefix, "__model__") else: - model_path = os.path.join(path_prefix, model_filename + ".pdmodel") + model_path = os.path.join(path_prefix, + model_filename + ".pdmodel") if not os.path.exists(model_path): model_path = os.path.join(path_prefix, model_filename) # set params_path if params_filename is None: params_path = os.path.join(path_prefix, "") else: - params_path = os.path.join(path_prefix, params_filename + ".pdiparams") + params_path = os.path.join(path_prefix, + params_filename + ".pdiparams") if not os.path.exists(params_path): params_path = os.path.join(path_prefix, params_filename) _logger.warning("The old way to load inference model is deprecated." - " model path: {}, params path: {}".format(model_path, params_path)) + " model path: {}, params path: {}".format( + model_path, params_path)) with open(model_path, "rb") as f: program_desc_str = f.read() load_dirname = os.path.dirname(params_path) @@ -328,4 +338,3 @@ def load_inference_model(path_prefix, executor, **configs): ] return [program, feed_target_names, fetch_targets] - diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 44f0a73fa42cd74f7f0753d2c501ea13ca25209e..0806d2c29148f77abb2fb9c7ab591154be9efd14 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -26,7 +26,7 @@ def fc(x, bias_attr=None, activation=None, name=None): - """ + r""" Fully-Connected layer can take a tensor or a list of tensor as its inputs. It creates a 2-D weight tensor for each input tensor, which represents its @@ -180,7 +180,7 @@ def deform_conv2d(x, weight_attr=None, bias_attr=None, name=None): - """ + r""" Compute 2-D deformable convolution on 4-D input. 
Given input image x, output feature map y, the deformable convolution operation can be expressed as follow:
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index b46e1c79461a2b23519cfa462c7e4435b2e45158..32e86c96b4e2af57d4d53cc7299a99373e888d24 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -54,7 +54,7 @@ __all__ = [
 @dygraph_only
 def to_tensor(data, dtype=None, place=None, stop_gradient=True):
-    """
+    r"""
     Constructs a ``paddle.Tensor`` or ``paddle.ComplexTensor`` from ``data`` ,
     which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor.
@@ -609,7 +609,7 @@ def _tril_triu_op(helper):
 def tril(x, diagonal=0, name=None):
-    """
+    r"""
     :alias_main: paddle.tril
     :alias: paddle.tril,paddle.tensor.tril,paddle.tensor.creation.tril
@@ -680,7 +680,7 @@ def tril(x, diagonal=0, name=None):
 def triu(x, diagonal=0, name=None):
-    """
+    r"""
     :alias_main: paddle.triu
     :alias: paddle.triu,paddle.tensor.triu,paddle.tensor.creation.triu
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 25fb93431796fed63a3788dbf6d7f5bb5ed72881..b1c0f0b446a3c727d29dbe551c38166137804951 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -453,7 +453,7 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
 def dist(x, y, p=2):
-    """
+    r"""
     This OP returns the p-norm of (x - y). It is not a norm in a strict sense, only as a measure
     of distance. The shapes of x and y must be broadcastable. The definition is as follows, for
@@ -740,7 +740,7 @@ def cross(x, y, axis=None, name=None):
 def cholesky(x, upper=False, name=None):
-    """
+    r"""
     Computes the Cholesky decomposition of one symmetric positive-definite
     matrix or batches of symmetric positive-definite matrice.
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 0bda55a1faedffc851535e00c1e55875ad9d549b..7ea8a9286c34e3365d2a03e39fa1825c2fab0780 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -169,7 +169,7 @@ def flip(x, axis, name=None):
 def flatten(x, start_axis=0, stop_axis=-1, name=None):
-    """
+    r"""
     **Flatten op**
     Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis.
@@ -565,7 +565,7 @@ def unique(x,
            axis=None,
            dtype="int64",
            name=None):
-    """
+    r"""
     Returns the unique elements of `x` in ascending order.
     Args:
@@ -946,7 +946,7 @@ def scatter(x, index, updates, overwrite=True, name=None):
 def scatter_nd_add(x, index, updates, name=None):
-    """
+    r"""
     **Scatter_nd_add Layer**
     Output is obtained by applying sparse addition to a single value
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index ccc49c769c270dae644c98ce6eff472fc664118f..e7b72fe95bca6505e734b3e628493d819461bb57 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -379,7 +379,7 @@ def floor_divide(x, y, name=None):
 def remainder(x, y, name=None):
-    """
+    r"""
     Mod two tensors element-wise. The equation is:
     .. math::
@@ -981,7 +981,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
 def logsumexp(x, axis=None, keepdim=False, name=None):
-    """
+    r"""
     This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` .
     .. math::
@@ -1281,7 +1281,7 @@ def min(x, axis=None, keepdim=False, name=None):
 def log1p(x, name=None):
-    """
+    r"""
     Calculates the natural log of the given input tensor, element-wise.
     .. math::
         Out = \\ln(x+1)
@@ -1315,7 +1315,7 @@ def log1p(x, name=None):
     return out
 def log2(x, name=None):
-    """
+    r"""
     Calculates the log to the base 2 of the given input tensor, element-wise.
     .. math::
@@ -1365,7 +1365,7 @@ def log2(x, name=None):
 def log10(x, name=None):
-    """
+    r"""
     Calculates the log to the base 10 of the given input tensor, element-wise.
     .. math::
@@ -1947,7 +1947,7 @@ def sign(x, name=None):
 def tanh(x, name=None):
-    """
+    r"""
     Tanh Activation Operator.
     .. math::
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index f5e0dc4c05bfb62137e1429192dbcee9c201b081..c4a3bf4b1b63b9860d86760fd6569491d2f35442 100644
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -494,7 +494,7 @@ def sort(x, axis=-1, descending=False, name=None):
 def where(condition, x, y, name=None):
-    """
+    r"""
     Return a tensor of elements selected from either $x$ or $y$, depending on $condition$.
     .. math::
diff --git a/python/paddle/text/datasets/imdb.py b/python/paddle/text/datasets/imdb.py
index f1bf247efcaf7591fe8062976d6329898ee15258..f02b59819069571f5ba7935441f541ab0fa7d4c0 100644
--- a/python/paddle/text/datasets/imdb.py
+++ b/python/paddle/text/datasets/imdb.py
@@ -93,7 +93,7 @@ class Imdb(Dataset):
     def _build_work_dict(self, cutoff):
         word_freq = collections.defaultdict(int)
-        pattern = re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$")
+        pattern = re.compile(r"aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$")
         for doc in self._tokenize(pattern):
             for word in doc:
                 word_freq[word] += 1
@@ -123,8 +123,8 @@ class Imdb(Dataset):
         return data
     def _load_anno(self):
-        pos_pattern = re.compile("aclImdb/{}/pos/.*\.txt$".format(self.mode))
-        neg_pattern = re.compile("aclImdb/{}/neg/.*\.txt$".format(self.mode))
+        pos_pattern = re.compile(r"aclImdb/{}/pos/.*\.txt$".format(self.mode))
+        neg_pattern = re.compile(r"aclImdb/{}/neg/.*\.txt$".format(self.mode))
         UNK = self.word_idx['']
diff --git a/r/example/mobilenet.py b/r/example/mobilenet.py
index adb1c330a704f5349316d02d4c02d08e9d7222db..99e755ab69f8d6ae52d635fe4d9f5f6da5b3175b 100755
--- a/r/example/mobilenet.py
+++ b/r/example/mobilenet.py
@@ -1,4 +1,19 @@
 #!/usr/bin/env python3.7
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # pylint: skip-file
 import functools
diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py
index c44690a93ac3c1f1833ee62b4e13d1ae8220fb55..556c8ef60439e531f573bd80fb44bad9c875ed4d 100644
--- a/tools/check_ctest_hung.py
+++ b/tools/check_ctest_hung.py
@@ -42,11 +42,11 @@ Diff: set(['test_parallel_executor_crf'])
     for l in fn.readlines():
         if l.find("Test ") != -1 and \
             l.find("Passed") != -1:
-            m = re.search("Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l))
+            m = re.search(r"Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l))
             passed.add(m.group(1))
         if l.find("Start ") != -1:
             start_parts = escape(l).split(" ")
-            m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l))
+            m = re.search(r"Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l))
             started.add(m.group(1))
     print("Diff: ", started - passed)
diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py
index 8d4b24a0cf6b743b72dca58fd885f927560964bf..823d9470230417676d30852ceae0582eded94be8 100644
--- a/tools/codestyle/docstring_checker.py
+++ b/tools/codestyle/docstring_checker.py
@@ -101,7 +101,7 @@ class Docstring(object):
     def _arg_with_type(self):
         for t in self.d['Args']:
-            m = re.search('([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
+            m = re.search(r'([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
             if m:
                 self.args[m.group(1)] = m.group(2)
diff --git a/tools/coverage/coverage_diff.py b/tools/coverage/coverage_diff.py
index 38f671fe4089d1f94caafaf26640e4df75870f55..6a400d293b27d30cc4dc36779aa1db580c2f6581 100644
--- a/tools/coverage/coverage_diff.py
+++ b/tools/coverage/coverage_diff.py
@@ -1,5 +1,19 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 usage: coverage_diff.py info_file diff_file > > coverage-diff.info
 """
diff --git a/tools/coverage/coverage_diff_list.py b/tools/coverage/coverage_diff_list.py
index 8975185edadfbd567a428bbd90523923f5ab675d..6283430120995a2b87709dd285b918b8214864b1 100644
--- a/tools/coverage/coverage_diff_list.py
+++ b/tools/coverage/coverage_diff_list.py
@@ -1,5 +1,19 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 usage: coverage_diff_list.py list_file max_rate > coverage-diff-list-90.out
 """
diff --git a/tools/coverage/coverage_lines.py b/tools/coverage/coverage_lines.py
index cdec5b8b1bb1873f8b9ef761e9d8575c89fee234..553cd691e4520006c56ce675d24301277d03984e 100644
--- a/tools/coverage/coverage_lines.py
+++ b/tools/coverage/coverage_lines.py
@@ -1,5 +1,19 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 usage: coverage_lines.py info_file expected
 """
diff --git a/tools/coverage/cuda_clean.py b/tools/coverage/cuda_clean.py
index c71ff375fd59efa23b5d03e0fb53b0257c3a6c8c..8c03edd07854941799a3ac4b15eca357b0461477 100644
--- a/tools/coverage/cuda_clean.py
+++ b/tools/coverage/cuda_clean.py
@@ -1,5 +1,19 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """ usage: cuda_clean.py pull_id. """
 import os
diff --git a/tools/coverage/gcda_clean.py b/tools/coverage/gcda_clean.py
index f5726db005efaa31ff684078f8fd60f1fa0df064..39fa3509cb86eb51d16abbcabcf5b12cb99df1bb 100644
--- a/tools/coverage/gcda_clean.py
+++ b/tools/coverage/gcda_clean.py
@@ -1,5 +1,19 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """ usage: gcda_clean.py pull_id. """
 import os
diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py
index 105460032f7db538eaf7a193776bf8085e2837a1..f3e88286ca965d68d00dd48b4ad590587210a9bf 100644
--- a/tools/coverage/pull_request.py
+++ b/tools/coverage/pull_request.py
@@ -1,5 +1,19 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 usage: pull_request.py files pull_id
        pull_request.py diff pull_id
diff --git a/tools/coverage/python_coverage.py b/tools/coverage/python_coverage.py
index 8ad9d85c1bf6b5ed542fb8469173c4f1815050a4..f2e52b5e23b3a8b7e4f463b53caa0c6f5a7d1238 100644
--- a/tools/coverage/python_coverage.py
+++ b/tools/coverage/python_coverage.py
@@ -1,5 +1,19 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 usage: python_coverage.py > python-coverage.info
 """
diff --git a/tools/get_quick_disable_lt.py b/tools/get_quick_disable_lt.py
index 1e3d71789227285b8ad713b20a03a6ccb394f5ea..18ebdb0031747fb957fb77e078f7c4cd81142328 100644
--- a/tools/get_quick_disable_lt.py
+++ b/tools/get_quick_disable_lt.py
@@ -20,7 +20,7 @@ import requests
 def download_file():
     """Get disabled unit tests"""
     ssl._create_default_https_context = ssl._create_unverified_context
-    sysstr=sys.platform
+    sysstr = sys.platform
     if sysstr == 'win32':
         url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_win')
     else:
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index d23c18a44e936628f8a63fe9ebca86c1f61a5cd0..ce0490d487fbe7798cba06e7ff0c11b457a18979 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -236,20 +236,24 @@ def single_defcom_extract(start_from, srcls, is_class_begin=False):
         if srcls[x].startswith('def ') or srcls[x].startswith('class '):
             break
         else:
-            if (comstart == -1 and srcls[x].replace(" ", '').replace(
-                    "\t", '').replace("\n", '').startswith("\"\"\"")):
-                comstart = x
-                comstyle = 2
-                continue
+            if comstart == -1:
+                s = srcls[x].replace(" ", '').replace("\t",
+                                                      '').replace("\n", '')
+                if s.startswith("\"\"\"") or s.startswith("r\"\"\""):
+                    comstart = x
+                    comstyle = 2
+                    continue
             if (comstyle == 2 and comstart != -1 and
                     srcls[x].replace(" ", '').replace("\t", '').replace(
                         "\n", '').startswith("\"\"\"")):
                 break
-            if (comstart == -1 and srcls[x].replace(" ", '').replace(
-                    "\t", '').replace("\n", '').startswith("\'\'\'")):
-                comstart = x
-                comstyle = 1
-                continue
+            if comstart == -1:
+                s = srcls[x].replace(" ", '').replace("\t",
+                                                      '').replace("\n", '')
+                if s.startswith("\'\'\'") or s.startswith("r\'\'\'"):
+                    comstart = x
+                    comstyle = 1
+                    continue
             if (comstyle == 1 and comstart != -1 and
                     srcls[x].replace(" ", '').replace("\t", '').replace(
                         "\n", '').startswith("\'\'\'")):
diff --git a/tools/summary_env.py b/tools/summary_env.py
index 39d6acaf536c533a218d3d53b596c469ab19922d..38bae87651d4b24fc7377c65f371995c893fda42 100644
--- a/tools/summary_env.py
+++ b/tools/summary_env.py
@@ -92,7 +92,7 @@ def get_cudnn_info():
         cudnn_dll_path = run_shell_command('where cudnn*')
         if cudnn_dll_path:
             cudnn_header_path = cudnn_dll_path.split('bin')[
-                0] + 'include\cudnn.h'
+                0] + r'include\cudnn.h'
             cmd = 'type "{0}" | findstr "{1}" | findstr /v "CUDNN_VERSION"'
         else:
             envs['cudnn_version'] = None