From 3815d7aa4012d00fdf38292fb1e14dde5d26945b Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 24 Nov 2020 14:53:51 +0800 Subject: [PATCH] Upgrade string literals to raw string (#28989) * upgrade comment string to raw string * fix string in * fix string with ' ' * revert update on comments * upgrade only necessary * fix sample code checker * fix comments with ''' --- paddle/scripts/conda_build.py | 21 +++- python/paddle/dataset/imdb.py | 10 +- python/paddle/dataset/tests/imdb_test.py | 12 +- .../fleet/base/distributed_strategy.py | 2 +- python/paddle/distributed/fleet/launch.py | 2 +- .../parameter_server_optimizer.py | 4 +- python/paddle/distributed/launch.py | 2 +- python/paddle/distribution.py | 12 +- python/paddle/fluid/clip.py | 6 +- python/paddle/fluid/contrib/layers/nn.py | 8 +- .../paddle/fluid/contrib/layers/rnn_impl.py | 6 +- .../paddle/fluid/contrib/memory_usage_calc.py | 2 +- .../slim/quantization/imperative/qat.py | 2 +- .../slim/quantization/imperative/quant_nn.py | 6 +- .../quantization/quant_int8_mkldnn_pass.py | 2 +- .../slim/quantization/quantization_pass.py | 2 +- .../paddle/fluid/contrib/utils/hdfs_utils.py | 6 +- python/paddle/fluid/core.py | 2 +- .../fluid/dataloader/dataloader_iter.py | 18 +-- python/paddle/fluid/distributed/downpour.py | 2 +- python/paddle/fluid/distributed/node.py | 8 +- python/paddle/fluid/dygraph/base.py | 2 +- .../fluid/dygraph/learning_rate_scheduler.py | 12 +- python/paddle/fluid/dygraph/nn.py | 24 ++-- python/paddle/fluid/dygraph/rnn.py | 4 +- python/paddle/fluid/framework.py | 4 +- .../fleet/parameter_server/pslib/node.py | 2 +- .../fluid/incubate/fleet/utils/fleet_util.py | 12 +- python/paddle/fluid/initializer.py | 4 +- python/paddle/fluid/input.py | 2 +- python/paddle/fluid/layer_helper_base.py | 2 +- python/paddle/fluid/layers/control_flow.py | 6 +- python/paddle/fluid/layers/detection.py | 14 +-- python/paddle/fluid/layers/distributions.py | 8 +- .../fluid/layers/learning_rate_scheduler.py | 2 +- python/paddle/fluid/layers/loss.py | 18 +-- python/paddle/fluid/layers/metric_op.py | 2 +- python/paddle/fluid/layers/nn.py | 106 +++++++++--------- python/paddle/fluid/layers/ops.py | 8 +- python/paddle/fluid/layers/rnn.py | 82 +++++++------- python/paddle/fluid/layers/sequence_lod.py | 16 +-- python/paddle/fluid/layers/tensor.py | 8 +- python/paddle/fluid/metrics.py | 4 +- python/paddle/fluid/nets.py | 6 +- python/paddle/fluid/optimizer.py | 32 +++--- python/paddle/fluid/param_attr.py | 2 +- python/paddle/fluid/reader.py | 2 +- python/paddle/fluid/regularizer.py | 6 +- .../unittests/dist_text_classification.py | 8 +- .../dygraph_to_static/simnet_dygraph_model.py | 2 +- .../simnet_dygraph_model_v2.py | 2 +- .../test_eager_deletion_recurrent_op.py | 2 +- .../tests/unittests/test_full_like_op.py | 3 +- .../fluid/tests/unittests/test_lrn_op.py | 2 +- .../tests/unittests/test_recurrent_op.py | 6 +- .../tests/unittests/test_require_version.py | 2 +- python/paddle/metric/metrics.py | 2 +- python/paddle/nn/functional/activation.py | 34 +++--- python/paddle/nn/functional/common.py | 4 +- python/paddle/nn/functional/conv.py | 12 +- python/paddle/nn/functional/extension.py | 5 +- python/paddle/nn/functional/input.py | 2 +- python/paddle/nn/functional/loss.py | 16 +-- python/paddle/nn/functional/norm.py | 4 +- python/paddle/nn/functional/vision.py | 6 +- python/paddle/nn/initializer/kaiming.py | 4 +- python/paddle/nn/initializer/xavier.py | 4 +- python/paddle/nn/layer/activation.py | 36 +++--- python/paddle/nn/layer/common.py | 6 +- 
python/paddle/nn/layer/conv.py | 12 +- python/paddle/nn/layer/distance.py | 2 +- python/paddle/nn/layer/loss.py | 16 +-- python/paddle/nn/layer/norm.py | 16 +-- python/paddle/nn/layer/pooling.py | 10 +- python/paddle/nn/layer/transformer.py | 20 ++-- python/paddle/nn/utils/weight_norm_hook.py | 2 +- python/paddle/optimizer/adadelta.py | 2 +- python/paddle/optimizer/adagrad.py | 2 +- python/paddle/optimizer/adam.py | 2 +- python/paddle/optimizer/adamax.py | 2 +- python/paddle/optimizer/adamw.py | 2 +- python/paddle/optimizer/lr.py | 14 +-- python/paddle/optimizer/momentum.py | 2 +- python/paddle/optimizer/optimizer.py | 2 +- python/paddle/optimizer/rmsprop.py | 2 +- python/paddle/optimizer/sgd.py | 2 +- python/paddle/reader/__init__.py | 2 +- python/paddle/regularizer.py | 4 +- python/paddle/static/io.py | 45 +++++--- python/paddle/static/nn/common.py | 4 +- python/paddle/tensor/creation.py | 6 +- python/paddle/tensor/linalg.py | 4 +- python/paddle/tensor/manipulation.py | 6 +- python/paddle/tensor/math.py | 12 +- python/paddle/tensor/search.py | 2 +- python/paddle/text/datasets/imdb.py | 6 +- r/example/mobilenet.py | 15 +++ tools/check_ctest_hung.py | 4 +- tools/codestyle/docstring_checker.py | 2 +- tools/coverage/coverage_diff.py | 14 +++ tools/coverage/coverage_diff_list.py | 14 +++ tools/coverage/coverage_lines.py | 14 +++ tools/coverage/cuda_clean.py | 14 +++ tools/coverage/gcda_clean.py | 14 +++ tools/coverage/pull_request.py | 14 +++ tools/coverage/python_coverage.py | 14 +++ tools/get_quick_disable_lt.py | 2 +- tools/sampcd_processor.py | 24 ++-- tools/summary_env.py | 2 +- 109 files changed, 586 insertions(+), 449 deletions(-) diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py index 0a0736f35a5..395a071ed13 100644 --- a/paddle/scripts/conda_build.py +++ b/paddle/scripts/conda_build.py @@ -1,4 +1,19 @@ #!/bin/python + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # import platform from sys import argv @@ -120,7 +135,7 @@ python setup.py install self.py_str = ["py27", "py35", "py36", "py37"] self.pip_end = ".whl --no-deps" self.pip_prefix_linux = "pip install /package/paddlepaddle" - self.pip_prefix_windows = "pip install C:\package\paddlepaddle" + self.pip_prefix_windows = r"pip install C:\package\paddlepaddle" self.pip_gpu = "_gpu-" self.pip_cpu = "-" self.mac_pip = [ @@ -216,7 +231,7 @@ package: - matplotlib""" if not (cuda_str == None): meta_str = meta_str + cuda_str - + blt_str = var.blt_const + blt_var if (python_str == var.python27): blt_str = blt_str + """ @@ -224,7 +239,7 @@ package: else: meta_str = meta_str + """ - opencv>=3.4.2""" - + meta_str = meta_str + var.test + var.about meta_filename = "meta.yaml" build_filename = "bld.bat" diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py index e5a3b6074c9..dab3c964cc6 100644 --- a/python/paddle/dataset/imdb.py +++ b/python/paddle/dataset/imdb.py @@ -116,8 +116,8 @@ def train(word_idx): :rtype: callable """ return reader_creator( - re.compile("aclImdb/train/pos/.*\.txt$"), - re.compile("aclImdb/train/neg/.*\.txt$"), word_idx) + re.compile(r"aclImdb/train/pos/.*\.txt$"), + re.compile(r"aclImdb/train/neg/.*\.txt$"), word_idx) @deprecated( @@ -137,8 +137,8 @@ def test(word_idx): :rtype: callable """ return reader_creator( - re.compile("aclImdb/test/pos/.*\.txt$"), - re.compile("aclImdb/test/neg/.*\.txt$"), word_idx) + re.compile(r"aclImdb/test/pos/.*\.txt$"), + re.compile(r"aclImdb/test/neg/.*\.txt$"), word_idx) @deprecated( @@ -153,7 +153,7 @@ def word_dict(): :rtype: dict """ return build_dict( - re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) + re.compile(r"aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) @deprecated( diff --git a/python/paddle/dataset/tests/imdb_test.py b/python/paddle/dataset/tests/imdb_test.py index 415947e3477..613c5f8edb2 100644 --- a/python/paddle/dataset/tests/imdb_test.py +++ b/python/paddle/dataset/tests/imdb_test.py @@ -18,13 +18,13 @@ import paddle.dataset.imdb import unittest import re -TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$") -TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$") -TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$") +TRAIN_POS_PATTERN = re.compile(r"aclImdb/train/pos/.*\.txt$") +TRAIN_NEG_PATTERN = re.compile(r"aclImdb/train/neg/.*\.txt$") +TRAIN_PATTERN = re.compile(r"aclImdb/train/.*\.txt$") -TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$") -TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$") -TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$") +TEST_POS_PATTERN = re.compile(r"aclImdb/test/pos/.*\.txt$") +TEST_NEG_PATTERN = re.compile(r"aclImdb/test/neg/.*\.txt$") +TEST_PATTERN = re.compile(r"aclImdb/test/.*\.txt$") class TestIMDB(unittest.TestCase): diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 71eca424fe6..46ccb4663e8 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -862,7 +862,7 @@ class DistributedStrategy(object): @property def dgc_configs(self): - """ + r""" Set Deep Gradient Compression training configurations. In general, dgc has serveral configurable settings that can be configured through a dict. 
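The re.compile() changes above (for example in python/paddle/dataset/imdb.py) are needed because sequences such as \. and \s are not valid Python string escapes: the compiled regex still works, since CPython keeps the backslash, but every such literal triggers an invalid-escape DeprecationWarning (a SyntaxWarning from Python 3.8 on). The snippet below is an illustrative sketch, not part of the patch; it routes the literal through the builtin compile() so the warning is raised at run time, where it can be captured. The variable names are made up for the demonstration.

    import re
    import warnings

    old_src = r'PATTERN = "aclImdb/train/pos/.*\.txt$"'   # pre-patch spelling: "\." is an invalid escape
    new_src = r'PATTERN = r"aclImdb/train/pos/.*\.txt$"'  # raw-string spelling introduced by this patch

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        compile(old_src, "<before>", "exec")  # warns about the invalid escape sequence (warning category depends on the Python version)
        compile(new_src, "<after>", "exec")   # emits nothing

    print([str(w.message) for w in caught])

    # Both spellings denote the same pattern, so the behaviour of the dataset readers is unchanged.
    assert re.compile("aclImdb/train/pos/.*\\.txt$").pattern == re.compile(r"aclImdb/train/pos/.*\.txt$").pattern
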
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 00bec671d4b..c48ce1a0f33 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" +r""" fleetrun is a module that spawns multiple distributed process on each training node for gpu training and cpu training. Usage: diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index 10b0c82c0ee..3135b69d004 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -158,13 +158,13 @@ class ParameterServerOptimizer(MetaOptimizerBase): ['vm_stat'], stdout=subprocess.PIPE).communicate()[0] # Process vm_stat vmLines = vm.split('\n') - sep = re.compile(':[\s]+') + sep = re.compile(r':[\s]+') vmStats = {} for row in range(1, len(vmLines) - 2): rowText = vmLines[row].strip() rowElements = sep.split(rowText) vmStats[(rowElements[0] - )] = int(rowElements[1].strip('\.')) * 4096 + )] = int(rowElements[1].strip(r'\.')) * 4096 return vmStats["Pages free"] elif platform.system() == "Linux": mems = {} diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 9b969cf3002..060e742ad6c 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" +r""" paddle.distributed.launch is a module that spawns multiple distributed process on each training node for gpu training. Usage: diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index e9a15feb517..ad134b4591e 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -166,7 +166,7 @@ class Distribution(object): class Uniform(Distribution): - """Uniform distribution with `low` and `high` parameters. + r"""Uniform distribution with `low` and `high` parameters. Mathematical Details @@ -374,7 +374,7 @@ class Uniform(Distribution): return elementwise_div((lb * ub), (self.high - self.low), name=name) def entropy(self): - """Shannon entropy in nats. + r"""Shannon entropy in nats. The entropy is @@ -391,7 +391,7 @@ class Uniform(Distribution): class Normal(Distribution): - """The Normal distribution with location `loc` and `scale` parameters. + r"""The Normal distribution with location `loc` and `scale` parameters. Mathematical details @@ -534,7 +534,7 @@ class Normal(Distribution): return output def entropy(self): - """Shannon entropy in nats. + r"""Shannon entropy in nats. The entropy is @@ -599,7 +599,7 @@ class Normal(Distribution): name=name) def kl_divergence(self, other): - """The KL-divergence between two normal distributions. + r"""The KL-divergence between two normal distributions. 
The probability density function (pdf) is @@ -644,7 +644,7 @@ class Normal(Distribution): class Categorical(Distribution): - """ + r""" Categorical distribution is a discrete probability distribution that describes the possible results of a random variable that can take on one of K possible categories, with the probability of each category diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index f20716c3a15..8fd01509331 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -40,7 +40,7 @@ class BaseErrorClipAttr(object): class ErrorClipByValue(BaseErrorClipAttr): - """ + r""" Clips tensor values to the range [min, max]. Given a tensor ``t`` (see Examples below), this operation clips its value \ @@ -241,7 +241,7 @@ class ClipGradByValue(ClipGradBase): class ClipGradByNorm(ClipGradBase): - """ + r""" Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio. @@ -343,7 +343,7 @@ class ClipGradByNorm(ClipGradBase): class ClipGradByGlobalNorm(ClipGradBase): - """ + r""" Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` . diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index d0543bb90dd..f3f8c815b00 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -137,7 +137,7 @@ def var_conv_2d(input, act=None, dtype='float32', name=None): - """ + r""" The var_conv_2d layer calculates the output base on the :attr:`input` with variable length, row, col, input channel, filter size and strides. Both :attr:`input`, :attr:`row`, and :attr:`col` are 1-level LodTensor. The convolution operation is same as conv2d layer with @@ -477,7 +477,7 @@ def fused_embedding_seq_pool(input, combiner='sum', param_attr=None, dtype='float32'): - """ + r""" **Embedding Sequence pool** This layer is the fusion of lookup table and sequence_pool. @@ -1442,7 +1442,7 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None): def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'): - """ + r""" **Pull Box Extended Sparse Layer** This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in BoxPS lookup table. The result of this lookup is the embedding of each ID in the @@ -1640,7 +1640,7 @@ def fused_bn_add_act(x, moving_variance_name=None, act=None, name=None): - """ + r""" This Op performs batch norm on input x, and adds the result to input y. Then it performs activation on the sum. The data format of inputs must be NHWC `[batch, in_height, in_width, in_channels]`. diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 4e304a393f8..a2dd0835b60 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -175,7 +175,7 @@ def basic_gru(input, activation=None, dtype='float32', name='basic_gru'): - """ + r""" GRU implementation using basic operator, supports multiple layers and bidirectional gru. .. math:: @@ -418,7 +418,7 @@ def basic_lstm(input, forget_bias=1.0, dtype='float32', name='basic_lstm'): - """ + r""" LSTM implementation using basic operators, supports multiple layers and bidirectional LSTM. .. 
math:: @@ -697,7 +697,7 @@ def basic_lstm(input, class BasicLSTMUnit(Layer): - """ + r""" **** BasicLSTMUnit class, Using basic operator to build LSTM The algorithm can be described as the code below. diff --git a/python/paddle/fluid/contrib/memory_usage_calc.py b/python/paddle/fluid/contrib/memory_usage_calc.py index b5d85616cf0..24e39d7ac61 100644 --- a/python/paddle/fluid/contrib/memory_usage_calc.py +++ b/python/paddle/fluid/contrib/memory_usage_calc.py @@ -44,7 +44,7 @@ DEBUG = False def memory_usage(program, batch_size): - """ + r""" Get the estimate memory usage of program with input batch size. Args: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index cae24177232..7364655107b 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -64,7 +64,7 @@ class ImperativeQuantAware(object): act_preprocess_layer=None, weight_quantize_layer=None, act_quantize_layer=None): - """ + r""" The constructor for ImperativeQuantAware. Args: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 79138febd0c..5acc4c30bc0 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -30,7 +30,7 @@ __all__ = [ class FakeQuantMovingAverage(layers.Layer): - """ + r""" FakeQuantMovingAverage layer does the moving_average_abs_max quant and then dequant. Its computational formula is described as below: @@ -128,7 +128,7 @@ class FakeQuantMovingAverage(layers.Layer): class FakeQuantAbsMax(layers.Layer): - """ + r""" FakeQuantAbsMax layer does the abs_max quant and then dequant. Its computational formula is described as below: @@ -545,7 +545,7 @@ class QuantizedLinear(layers.Layer): class MovingAverageAbsMaxScale(layers.Layer): def __init__(self, name=None, moving_rate=0.9, dtype='float32'): - """ + r""" MovingAverageMaxScale layer is used to calculating the output quantization scale of Layer. Its computational formula is described as below: diff --git a/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py index a25abd9ff09..d31dc35d143 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py @@ -37,7 +37,7 @@ class QuantInt8MkldnnPass(object): """ def __init__(self, _scope=None, _place=None): - """ + r""" Args: scope(fluid.Scope): scope is used to initialize the new parameters. place(fluid.CPUPlace): place is used to initialize the new parameters. diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 68bf9ecd80b..219025269fe 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -239,7 +239,7 @@ class QuantizationTransformPass(object): act_preprocess_func=None, optimizer_func=None, executor=None): - """ + r""" Constructor. 
Args: diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index 2de4f82bd14..9572552f0f2 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -33,7 +33,7 @@ _logger = get_logger( class HDFSClient(object): - """ + r""" A tool of HDFS Args: @@ -376,7 +376,7 @@ class HDFSClient(object): _logger.info("HDFS list path: {} successfully".format(hdfs_path)) ret_lines = [] - regex = re.compile('\s+') + regex = re.compile(r'\s+') out_lines = output.strip().split("\n") for line in out_lines: re_line = regex.split(line) @@ -418,7 +418,7 @@ class HDFSClient(object): _logger.info("HDFS list all files: {} successfully".format( hdfs_path)) lines = [] - regex = re.compile('\s+') + regex = re.compile(r'\s+') out_lines = output.strip().split("\n") for line in out_lines: re_line = regex.split(line) diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index ad116c25970..224a021cd6a 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -224,7 +224,7 @@ def less_than_ver(a, b): import operator def to_list(s): - s = re.sub('(\.0+)+$', '', s) + s = re.sub(r'(\.0+)+$', '', s) return [int(x) for x in s.split('.')] return operator.lt(to_list(a), to_list(b)) diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index ee30484ae9a..ea89b09d2bf 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -101,10 +101,11 @@ class _DatasetKind(object): ITER = 1 @staticmethod - def create_fetcher(kind, dataset, auto_collate_batch, collate_fn, drop_last): + def create_fetcher(kind, dataset, auto_collate_batch, collate_fn, + drop_last): if kind == _DatasetKind.MAP: - return _MapDatasetFetcher(dataset, auto_collate_batch, - collate_fn, drop_last) + return _MapDatasetFetcher(dataset, auto_collate_batch, collate_fn, + drop_last) elif kind == _DatasetKind.ITER: return _IterableDatasetFetcher(dataset, auto_collate_batch, collate_fn, drop_last) @@ -240,7 +241,8 @@ class _DataLoaderIterBase(object): if self._dataset_kind == _DatasetKind.MAP: self._sampler_iter = iter(list(range(len(self._dataset)))) else: - self._sampler_iter = iter(_InfiniteIterableSampler(self._dataset, 1)) + self._sampler_iter = iter( + _InfiniteIterableSampler(self._dataset, 1)) self._collate_fn = loader.collate_fn # LoDTensorBlockingQueue instance for create_py_reader and a thread @@ -380,8 +382,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): # NOTE(chenweihang): _worker_loop must be top level method to be pickled def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, - auto_collate_batch, collate_fn, init_fn, worker_id, num_workers, - use_shared_memory): + auto_collate_batch, collate_fn, init_fn, worker_id, + num_workers, use_shared_memory): try: # NOTE: [ mmap files clear ] When the child process exits unexpectedly, # some shared memory objects may have been applied for but have not yet @@ -400,8 +402,8 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, try: if init_fn is not None: init_fn(worker_id) - fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, - auto_collate_batch, collate_fn, True) + fetcher = _DatasetKind.create_fetcher( + dataset_kind, dataset, auto_collate_batch, collate_fn, True) except: init_exception = Exception("init_fn failed in worker {}: " \ "{}".format(worker_id, sys.exc_info())) 
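Most hunks in this patch do nothing except prepend r to a docstring opener. The prefix matters whenever the docstring embeds Sphinx markup or LaTeX math with backslashes: in a plain literal, a sequence like \f is a valid escape (form feed), so text such as \frac is corrupted silently, without even a warning. The strings below are made up for illustration and are not taken from the repository.

    plain = "loss = \frac{(x - y)^{2}}{2}"    # "\f" is parsed as a form feed: no warning, silent corruption
    raw   = r"loss = \frac{(x - y)^{2}}{2}"   # the r prefix keeps the backslash for the Sphinx :math: role

    print("\f" in plain)           # True  -> "\frac" has already lost its backslash
    print(raw.count("\\"))         # 1     -> the backslash survives in the raw literal
    print(len(raw) - len(plain))   # 1     -> exactly one character (the backslash) differs

This is why docstrings that only carry formulas or escaped markup, such as the distribution and quantization classes above, get the prefix even though they contain no regular expressions.
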
diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 61e508ea72e..89e9a6a9076 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -22,7 +22,7 @@ from google.protobuf import text_format class DownpourSGD(object): - """ + r""" Distributed optimizer of downpour stochastic gradient descent Standard implementation of Google's Downpour SGD in Large Scale Distributed Deep Networks diff --git a/python/paddle/fluid/distributed/node.py b/python/paddle/fluid/distributed/node.py index 41e0d64e0b7..a15f94f4d17 100644 --- a/python/paddle/fluid/distributed/node.py +++ b/python/paddle/fluid/distributed/node.py @@ -52,7 +52,7 @@ class DownpourServer(Server): def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_var): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ @@ -84,7 +84,7 @@ class DownpourServer(Server): table.accessor.downpour_accessor_param.delete_threshold = 0.8 def add_dense_table(self, table_id, learning_rate, param_var, grad_var): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ @@ -135,7 +135,7 @@ class DownpourWorker(Worker): def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_vars): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ @@ -153,7 +153,7 @@ class DownpourWorker(Worker): [var.name + "@GRAD" for var in slot_value_vars]) def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index a26b903493a..397f873f961 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -593,7 +593,7 @@ def grad(outputs, @framework.dygraph_only def to_variable(value, name=None, zero_copy=None, dtype=None): - """ + r""" :api_attr: imperative The API will create a ``Variable`` or ``ComplexVariable`` object from diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index cd6af6fd5b5..a6c1993dbbf 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -183,7 +183,7 @@ class PiecewiseDecay(LearningRateDecay): class NaturalExpDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies natural exponential decay to the initial learning rate. @@ -266,7 +266,7 @@ class NaturalExpDecay(LearningRateDecay): class ExponentialDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies exponential decay to the learning rate. @@ -348,7 +348,7 @@ class ExponentialDecay(LearningRateDecay): class InverseTimeDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies inverse time decay to the initial learning rate. @@ -426,7 +426,7 @@ class InverseTimeDecay(LearningRateDecay): class PolynomialDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies polynomial decay to the initial learning rate. @@ -520,7 +520,7 @@ class PolynomialDecay(LearningRateDecay): class CosineDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies cosine decay to the learning rate. 
@@ -578,7 +578,7 @@ class CosineDecay(LearningRateDecay): class NoamDecay(LearningRateDecay): - """ + r""" :api_attr: imperative Applies Noam decay to the initial learning rate. diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 3c75b304028..0f92c32f252 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -42,7 +42,7 @@ __all__ = [ class Conv2D(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``Conv2D`` class. For more details, refer to code examples. The convolution2D layer calculates the output based on the input, filter @@ -282,7 +282,7 @@ class Conv2D(layers.Layer): class Conv3D(layers.Layer): - """ + r""" **Convlution3D Layer** The convolution3D layer calculates the output based on the input, filter @@ -484,7 +484,7 @@ class Conv3D(layers.Layer): class Conv3DTranspose(layers.Layer): - """ + r""" **Convlution3D transpose layer** The convolution3D transpose layer calculates the output based on the input, @@ -701,7 +701,7 @@ class Conv3DTranspose(layers.Layer): class Pool2D(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``Pool2D`` class. For more details, refer to code examples. @@ -1009,7 +1009,7 @@ class Linear(layers.Layer): class InstanceNorm(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``InstanceNorm`` class. For more details, refer to code examples. @@ -1143,7 +1143,7 @@ class InstanceNorm(layers.Layer): class BatchNorm(layers.Layer): - """ + r""" :alias_main: paddle.nn.BatchNorm :alias: paddle.nn.BatchNorm,paddle.nn.layer.BatchNorm,paddle.nn.layer.norm.BatchNorm :old_api: paddle.fluid.dygraph.BatchNorm @@ -1492,7 +1492,7 @@ class Dropout(layers.Layer): class Embedding(layers.Layer): - """ + r""" :alias_main: paddle.nn.Embedding :alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding :old_api: paddle.fluid.dygraph.Embedding @@ -1652,7 +1652,7 @@ class Embedding(layers.Layer): class LayerNorm(layers.Layer): - """ + r""" :alias_main: paddle.nn.LayerNorm :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm :old_api: paddle.fluid.dygraph.LayerNorm @@ -2242,7 +2242,7 @@ class NCE(layers.Layer): class PRelu(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``PRelu`` class. For more details, refer to code examples. It implements three activation methods of the ``PRelu`` activation function. @@ -2350,7 +2350,7 @@ class PRelu(layers.Layer): class BilinearTensorProduct(layers.Layer): - """ + r""" **Add Bilinear Tensor Product Layer** @@ -2467,7 +2467,7 @@ class BilinearTensorProduct(layers.Layer): class Conv2DTranspose(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``Conv2DTranspose`` class. For more details, refer to code examples. 
The convolution2D transpose layer calculates the output based on the input, @@ -2979,7 +2979,7 @@ class GroupNorm(layers.Layer): class SpectralNorm(layers.Layer): - """ + r""" :alias_main: paddle.nn.SpectralNorm :alias: paddle.nn.SpectralNorm,paddle.nn.layer.SpectralNorm,paddle.nn.layer.norm.SpectralNorm :old_api: paddle.fluid.dygraph.SpectralNorm diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py index 9df4188fb7e..05a76a8d125 100644 --- a/python/paddle/fluid/dygraph/rnn.py +++ b/python/paddle/fluid/dygraph/rnn.py @@ -20,7 +20,7 @@ __all__ = ['LSTMCell', 'GRUCell'] class LSTMCell(Layer): - """ + r""" LSTMCell implementation using basic operators. There are two LSTMCell version, the default one is compatible with CUDNN LSTM implementation. The algorithm can be described as the equations below. @@ -236,7 +236,7 @@ class LSTMCell(Layer): class GRUCell(Layer): - """ + r""" GRU implementation using basic operators. There are two GRUCell version, the default one is compatible with CUDNN GRU implementation. The algorithm can be described as the equations below. diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 49c5f9f5b8e..28891871777 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2255,7 +2255,7 @@ class Operator(object): return self.desc.type() def input(self, name): - """ + r""" Get the input arguments according to the input parameter name. Args: @@ -2306,7 +2306,7 @@ class Operator(object): return self.desc.output_arg_names() def output(self, name): - """ + r""" Get output arguments by the output parameter name. Args: diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py index 4b600150e04..0853d05ef3b 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py @@ -527,7 +527,7 @@ class DownpourWorker(Worker): def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars, dense_start_table_id, sparse_table_names): - """ + r""" Args: table_id(int): id of sparse params table learning_rate(float): the learning rate used to update parameters. \ diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index c126f06de9d..dd968a70e8a 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -153,7 +153,7 @@ class FleetUtil(object): stat_pos="_generated_var_2", stat_neg="_generated_var_3", print_prefix=""): - """ + r""" Print global auc of all distributed workers. Args: @@ -1073,7 +1073,7 @@ class FleetUtil(object): hadoop_fs_name, hadoop_fs_ugi, hadoop_home="$HADOOP_HOME"): - """ + r""" get last saved base xbox info from xbox_base_done.txt Args: @@ -1118,7 +1118,7 @@ class FleetUtil(object): hadoop_fs_name, hadoop_fs_ugi, hadoop_home="$HADOOP_HOME"): - """ + r""" get last saved xbox info from xbox_patch_done.txt Args: @@ -1164,7 +1164,7 @@ class FleetUtil(object): hadoop_fs_name, hadoop_fs_ugi, hadoop_home="$HADOOP_HOME"): - """ + r""" get last saved model info from donefile.txt Args: @@ -1279,7 +1279,7 @@ class FleetUtil(object): q_name="q", pos_ins_num_name="pos", total_ins_num_name="total"): - """ + r""" get global metrics, including auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num. 
@@ -1469,7 +1469,7 @@ class FleetUtil(object): pos_ins_num_name="pos", total_ins_num_name="total", print_prefix=""): - """ + r""" print global metrics, including auc, bucket_error, mae, rmse, actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num. diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 30932d0c8b5..86fab981127 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -459,7 +459,7 @@ class TruncatedNormalInitializer(Initializer): class XavierInitializer(Initializer): - """ + r""" This class implements the Xavier weight initializer from the paper `Understanding the difficulty of training deep feedforward neural networks `_ @@ -595,7 +595,7 @@ class XavierInitializer(Initializer): class MSRAInitializer(Initializer): - """Implements the MSRA initializer a.k.a. Kaiming Initializer + r"""Implements the MSRA initializer a.k.a. Kaiming Initializer This class implements the weight initialization from the paper `Delving Deep into Rectifiers: Surpassing Human-Level Performance on diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 0e3ee46fa46..e56d1876e3f 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -137,7 +137,7 @@ def embedding(input, padding_idx=None, param_attr=None, dtype='float32'): - """ + r""" :api_attr: Static Graph The operator is used to lookup embeddings vector of ids provided by :attr:`input` . diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 6e38c855562..5ee46a68fb7 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -59,7 +59,7 @@ class LayerHelperBase(object): return cls.__dtype def to_variable(self, value, name=None): - """ + r""" The API will create a ``Variable`` object from numpy\.ndarray or Variable object. Parameters: diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 65ca5a211e3..b5f66a1308e 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -3012,7 +3012,7 @@ class DynamicRNN(object): self.mem_link = [] def step_input(self, x, level=0): - """ + r""" This function is used to set sequence x as DynamicRNN's input. The maximum sequence length in x determines the number of time steps the RNN unit will be executed. DynamicRNN can take multiple inputs. @@ -3144,7 +3144,7 @@ class DynamicRNN(object): return array_read(array=input_array, i=self.step_idx) def static_input(self, x): - """ + r""" This function is used to set x as DynamicRNN's static input. It is optional. - Case 1, set static input with LoD @@ -3348,7 +3348,7 @@ class DynamicRNN(object): value=0.0, need_reorder=False, dtype='float32'): - """ + r""" Create a memory Variable for DynamicRNN to deliver data cross time steps. It can be initialized by an existing Tensor or a constant Tensor of given dtype and shape. 
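The commit message notes "upgrade only necessary" and "fix sample code checker": the r prefix is added only where a literal actually contains a backslash. One way to locate such literals is to recompile every source file and collect the interpreter's invalid-escape warnings. The helper below is a hypothetical sketch (the function name, glob pattern and message matching are assumptions, not code from this repository).

    import glob
    import warnings

    def files_with_invalid_escapes(root="python/paddle"):
        """Return source files whose compilation emits an 'invalid escape sequence' warning."""
        offenders = []
        for path in glob.glob(root + "/**/*.py", recursive=True):
            with open(path, encoding="utf-8") as handle:
                source = handle.read()
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always")
                try:
                    compile(source, path, "exec")
                except SyntaxError:
                    continue  # skip files that do not parse under the running interpreter
            if any("invalid escape sequence" in str(w.message) for w in caught):
                offenders.append(path)
        return offenders

    if __name__ == "__main__":
        for name in files_with_invalid_escapes():
            print(name)

A check like this only reports invalid escapes; valid but unintended ones such as \f still require the kind of manual docstring review that this patch performs.
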
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index f7e79f79f8b..ce29b64ce43 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -77,7 +77,7 @@ def retinanet_target_assign(bbox_pred, num_classes=1, positive_overlap=0.5, negative_overlap=0.4): - """ + r""" **Target Assign Layer for the detector RetinaNet.** This OP finds out positive and negative samples from all anchors @@ -471,7 +471,7 @@ def rpn_target_assign(bbox_pred, def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25): - """ + r""" :alias_main: paddle.nn.functional.sigmoid_focal_loss :alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss :old_api: paddle.fluid.layers.sigmoid_focal_loss @@ -821,7 +821,7 @@ def box_coder(prior_box, box_normalized=True, name=None, axis=0): - """ + r""" **Box Coder Layer** @@ -1523,7 +1523,7 @@ def ssd_loss(location, mining_type='max_negative', normalize=True, sample_size=None): - """ + r""" :alias_main: paddle.nn.functional.ssd_loss :alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss :old_api: paddle.fluid.layers.ssd_loss @@ -1930,7 +1930,7 @@ def density_prior_box(input, offset=0.5, flatten_to_2d=False, name=None): - """ + r""" This op generates density prior boxes for SSD(Single Shot MultiBox Detector) algorithm. Each position of the input produce N prior boxes, N is @@ -2741,7 +2741,7 @@ def generate_proposal_labels(rpn_rois, def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, labels_int32, num_classes, resolution): - """ + r""" **Generate Mask Labels for Mask-RCNN** @@ -3671,7 +3671,7 @@ def distribute_fpn_proposals(fpn_rois, refer_scale, rois_num=None, name=None): - """ + r""" **This op only takes LoDTensor as input.** In Feature Pyramid Networks (FPN) models, it is needed to distribute all proposals into different FPN diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index 81bea3898be..4e4c8dfd2a0 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -113,7 +113,7 @@ class Distribution(object): class Uniform(Distribution): - """Uniform distribution with `low` and `high` parameters. + r"""Uniform distribution with `low` and `high` parameters. Mathematical Details @@ -258,7 +258,7 @@ class Uniform(Distribution): class Normal(Distribution): - """The Normal distribution with location `loc` and `scale` parameters. + r"""The Normal distribution with location `loc` and `scale` parameters. Mathematical details @@ -423,7 +423,7 @@ class Normal(Distribution): class Categorical(Distribution): - """ + r""" Categorical distribution is a discrete probability distribution that describes the possible results of a random variable that can take on one of K possible categories, with the probability of each category @@ -529,7 +529,7 @@ class Categorical(Distribution): class MultivariateNormalDiag(Distribution): - """ + r""" A multivariate normal (also called Gaussian) distribution parameterized by a mean vector and a covariance matrix. diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 2710ab12cd3..26f08a2356d 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -440,7 +440,7 @@ Applies piecewise decay to the initial learning rate. 
def cosine_decay(learning_rate, step_each_epoch, epochs): - """ + r""" Applies cosine decay to the learning rate. diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 99801514f47..45f3de2d99a 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -57,7 +57,7 @@ def center_loss(input, alpha, param_attr, update_center=True): - """ + r""" :api_attr: Static Graph **Center loss Cost layer** @@ -151,7 +151,7 @@ def center_loss(input, def bpr_loss(input, label, name=None): - """ + r""" **Bayesian Personalized Ranking Loss Operator** @@ -203,7 +203,7 @@ def bpr_loss(input, label, name=None): def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): - """ + r""" :alias_main: paddle.nn.functional.cross_entropy :alias: paddle.nn.functional.cross_entropy,paddle.nn.functional.loss.cross_entropy :old_api: paddle.fluid.layers.cross_entropy @@ -300,7 +300,7 @@ def cross_entropy2(input, label, ignore_index=kIgnoreIndex): def square_error_cost(input, label): - """ + r""" This op accepts input predictions and target label and returns the squared error cost. @@ -1185,7 +1185,7 @@ def softmax_with_cross_entropy(logits, numeric_stable_mode=True, return_softmax=False, axis=-1): - """ + r""" :alias_main: paddle.nn.functional.softmax_with_cross_entropy :alias: paddle.nn.functional.softmax_with_cross_entropy,paddle.nn.functional.loss.softmax_with_cross_entropy :old_api: paddle.fluid.layers.softmax_with_cross_entropy @@ -1312,7 +1312,7 @@ def softmax_with_cross_entropy(logits, def rank_loss(label, left, right, name=None): - """ + r""" This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model with a training sample consisting of a pair of documents (A and B), The label (P) @@ -1375,7 +1375,7 @@ def rank_loss(label, left, right, name=None): def margin_rank_loss(label, left, right, margin=0.1, name=None): - """ + r""" Margin Ranking Loss Layer for ranking problem, which compares left score and right score passed in. The ranking loss can be defined as following equation: @@ -1551,7 +1551,7 @@ def teacher_student_sigmoid_loss(input, def huber_loss(input, label, delta): - """ + r""" This operator computes the Huber loss between input and label. Huber loss is commonly used in regression tasks. Compared to square_error_cost, Huber loss is more robust and less sensitivity to outliers. @@ -1681,7 +1681,7 @@ from .control_flow import equal def npair_loss(anchor, positive, labels, l2_reg=0.002): - ''' + r''' Read `Improved Deep Metric Learning with Multi class N pair Loss Objective\ `_ @@ -14964,7 +14964,7 @@ def mish(x, threshold=20, name=None): def gather_tree(ids, parents): - """ + r""" To be used after beam search. After beam search, we get selected ids at each time step and the corresponding parents in the search tree. Both ids and parents have the layout :attr:`[max_time, batch_size, beam_size]`. 
Then diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index de0fbb16f62..72dc4a91608 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -413,7 +413,7 @@ def softshrink(x, alpha=None): return _softshrink_(**kwargs) -softshrink.__doc__ = """ +softshrink.__doc__ = r""" :alias_main: paddle.nn.functional.softshrink :alias: paddle.nn.functional.softshrink,paddle.nn.functional.activation.softshrink :old_api: paddle.fluid.layers.softshrink @@ -530,7 +530,7 @@ def thresholded_relu(x, threshold=None): return _thresholded_relu_(**kwargs) -thresholded_relu.__doc__ = """ +thresholded_relu.__doc__ = r""" :alias_main: paddle.nn.functional.thresholded_relu :alias: paddle.nn.functional.thresholded_relu,paddle.nn.functional.activation.thresholded_relu :old_api: paddle.fluid.layers.thresholded_relu @@ -617,7 +617,7 @@ def gelu(x, approximate=False): return _gelu_(**kwargs) -gelu.__doc__ = """ +gelu.__doc__ = r""" :strong:`GeLU Activation Operator` For more details, see [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415). @@ -701,7 +701,7 @@ def erf(x, name=None): return _erf_(**kwargs) -erf.__doc__ = """ +erf.__doc__ = r""" :strong:`Erf Operator` For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function). diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 05272a7cefb..2f11603d484 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -67,7 +67,7 @@ class RNNCell(object): """ def call(self, inputs, states, **kwargs): - """ + r""" Every cell must implement this method to do the calculations mapping the inputs and states to the output and new states. @@ -97,7 +97,7 @@ class RNNCell(object): dtype='float32', init_value=0, batch_dim_idx=0): - """ + r""" Generate initialized states according to provided shape, data type and value. @@ -225,7 +225,7 @@ class RNNCell(object): class GRUCell(RNNCell): - """ + r""" :api_attr: Static Graph Gated Recurrent Unit cell. It is a wrapper for @@ -287,7 +287,7 @@ class GRUCell(RNNCell): activation, dtype) def call(self, inputs, states): - """ + r""" Perform calculations of GRU. Parameters: @@ -323,7 +323,7 @@ class GRUCell(RNNCell): class LSTMCell(RNNCell): - """ + r""" :api_attr: Static Graph Long-Short Term Memory cell. It is a wrapper for @@ -390,7 +390,7 @@ class LSTMCell(RNNCell): activation, forget_bias, dtype) def call(self, inputs, states): - """ + r""" Perform calculations of LSTM. Parameters: @@ -782,7 +782,7 @@ class Decoder(object): """ def initialize(self, inits): - """ + r""" Called once before the decoding iterations. Parameters: @@ -797,7 +797,7 @@ class Decoder(object): raise NotImplementedError def step(self, time, inputs, states, **kwargs): - """ + r""" Called per step of decoding. Parameters: @@ -818,7 +818,7 @@ class Decoder(object): raise NotImplementedError def finalize(self, outputs, final_states, sequence_lengths): - """ + r""" Called once after the decoding iterations if implemented. Parameters: @@ -931,7 +931,7 @@ class BeamSearchDecoder(Decoder): @staticmethod def tile_beam_merge_with_batch(x, beam_size): - """ + r""" Tile the batch dimension of a tensor. 
Specifically, this function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape @@ -966,7 +966,7 @@ class BeamSearchDecoder(Decoder): return x def _split_batch_beams(self, x): - """ + r""" Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new tensor with shape `[batch_size, beam_size, ...]`. @@ -983,7 +983,7 @@ class BeamSearchDecoder(Decoder): return nn.reshape(x, shape=[-1, self.beam_size] + list(x.shape[1:])) def _merge_batch_beams(self, x): - """ + r""" Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new tensor with shape `[batch_size * beam_size, ...]`. @@ -1000,7 +1000,7 @@ class BeamSearchDecoder(Decoder): return nn.reshape(x, shape=[-1] + list(x.shape[2:])) def _expand_to_beam_size(self, x): - """ + r""" This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape `[batch_size, beam_size, s0, s1, ...]` composed of minibatch entries @@ -1023,7 +1023,7 @@ class BeamSearchDecoder(Decoder): return x def _mask_probs(self, probs, finished): - """ + r""" Mask log probabilities. It forces finished beams to allocate all probability mass to eos and unfinished beams to remain unchanged. @@ -1052,7 +1052,7 @@ class BeamSearchDecoder(Decoder): return probs def _gather(self, x, indices, batch_size): - """ + r""" Gather from the tensor `x` using `indices`. Parameters: @@ -1104,7 +1104,7 @@ class BeamSearchDecoder(Decoder): pass def initialize(self, initial_cell_states): - """ + r""" Initialize the BeamSearchDecoder. Parameters: @@ -1162,7 +1162,7 @@ class BeamSearchDecoder(Decoder): init_lengths), init_finished def _beam_search_step(self, time, logits, next_cell_states, beam_state): - """ + r""" Calculate scores and select candidate token ids. Parameters: @@ -1235,7 +1235,7 @@ class BeamSearchDecoder(Decoder): return beam_search_output, beam_search_state def step(self, time, inputs, states, **kwargs): - """ + r""" Perform a beam search decoding step, which uses `cell` to get probabilities, and follows a beam search step to calculate scores and select candidate token ids. @@ -1287,7 +1287,7 @@ class BeamSearchDecoder(Decoder): return (beam_search_output, beam_search_state, next_inputs, finished) def finalize(self, outputs, final_states, sequence_lengths): - """ + r""" Use `gather_tree` to backtrace along the beam search tree and construct the full predicted sequences. @@ -1572,7 +1572,7 @@ def dynamic_decode(decoder, is_test=False, return_length=False, **kwargs): - """ + r""" Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned Tensor indicating finished status contains all True values or the number of decoding step reaches to :attr:`max_step_num`. @@ -1664,7 +1664,7 @@ class DecodeHelper(object): """ def initialize(self): - """ + r""" DecodeHelper initialization to produce inputs for the first decoding step and give the initial status telling whether each sequence in the batch is finished. It is the partial of the initialization of `BasicDecoder`. @@ -1698,7 +1698,7 @@ class DecodeHelper(object): pass def next_inputs(self, time, outputs, states, sample_ids): - """ + r""" Produce the inputs and states for next time step and give status telling whether each minibatch entry is finished. It is called after `sample` in `BasicDecoder.step`. It is the partial of `BasicDecoder.step`. 
@@ -1787,7 +1787,7 @@ class TrainingHelper(DecodeHelper): self.inputs) def initialize(self): - """ + r""" TrainingHelper initialization produces inputs for the first decoding step by slicing at the first time step of full sequence inputs, and it gives initial status telling whether each sequence in the batch is @@ -1809,7 +1809,7 @@ class TrainingHelper(DecodeHelper): return init_inputs, init_finished def sample(self, time, outputs, states): - """ + r""" Perform sampling by using `argmax` according to the `outputs`. Mostly the sampled ids would not be used since the inputs for next decoding step would be got by slicing. @@ -1832,7 +1832,7 @@ class TrainingHelper(DecodeHelper): return sample_ids def next_inputs(self, time, outputs, states, sample_ids): - """ + r""" Generate inputs for the next decoding step by slicing at corresponding step of the full sequence inputs. Simultaneously, produce the states for next time step by directly using the input `states` and emit status @@ -1909,7 +1909,7 @@ class GreedyEmbeddingHelper(DecodeHelper): """ def __init__(self, embedding_fn, start_tokens, end_token): - """ + r""" Constructor of GreedyEmbeddingHelper. Parameters: @@ -1934,7 +1934,7 @@ class GreedyEmbeddingHelper(DecodeHelper): shape=[1], dtype="int64", value=end_token) def initialize(self): - """ + r""" GreedyEmbeddingHelper initialization produces inputs for the first decoding step by using `start_tokens` of the constructor, and gives initial status telling whether each sequence in the batch is finished. @@ -1957,7 +1957,7 @@ class GreedyEmbeddingHelper(DecodeHelper): return init_inputs, init_finished def sample(self, time, outputs, states): - """ + r""" Perform sampling by using `argmax` according to the `outputs`. Parameters: @@ -1978,7 +1978,7 @@ class GreedyEmbeddingHelper(DecodeHelper): return sample_ids def next_inputs(self, time, outputs, states, sample_ids): - """ + r""" Generate inputs for the next decoding step by applying `embedding_fn` to `sample_ids`. Simultaneously, produce the states for next time step by directly using the input `states` and emit status telling whether @@ -2046,7 +2046,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper): end_token, softmax_temperature=None, seed=None): - """ + r""" Constructor of SampleEmbeddingHelper. Parameters: @@ -2080,7 +2080,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper): self.seed = seed def sample(self, time, outputs, states): - """ + r""" Perform sampling from a categorical distribution, and the distribution is computed by `softmax(outputs/softmax_temperature)`. @@ -2165,7 +2165,7 @@ class BasicDecoder(Decoder): self.output_fn = output_fn def initialize(self, initial_cell_states): - """ + r""" BasicDecoder initialization includes helper initialization and cell initialization, and cell initialization uses `initial_cell_states` as the result directly. @@ -2195,7 +2195,7 @@ class BasicDecoder(Decoder): pass def step(self, time, inputs, states, **kwargs): - """ + r""" Perform one decoding step as following steps: 1. 
Perform `cell_outputs, cell_states = cell.call(inputs, states)` @@ -2258,7 +2258,7 @@ def dynamic_lstm(input, candidate_activation='tanh', dtype='float32', name=None): - """ + r""" :api_attr: Static Graph **Note**: @@ -2430,7 +2430,7 @@ def lstm(input, name=None, default_initializer=None, seed=-1): - """ + r""" :api_attr: Static Graph **Note**: @@ -2612,7 +2612,7 @@ def dynamic_lstmp(input, c_0=None, cell_clip=None, proj_clip=None): - """ + r""" :api_attr: Static Graph **Note**: @@ -2823,7 +2823,7 @@ def dynamic_gru(input, candidate_activation='tanh', h_0=None, origin_mode=False): - """ + r""" :api_attr: Static Graph **Note: The input type of this must be LoDTensor. If the input type to be @@ -2985,7 +2985,7 @@ def gru_unit(input, activation='tanh', gate_activation='sigmoid', origin_mode=False): - """ + r""" :api_attr: Static Graph Gated Recurrent Unit (GRU) RNN cell. This operator performs GRU calculations for @@ -3143,7 +3143,7 @@ def beam_search(pre_ids, is_accumulated=True, name=None, return_parent_idx=False): - """ + r""" Beam search is a classical algorithm for selecting candidate words in a machine translation task. @@ -3293,7 +3293,7 @@ def beam_search(pre_ids, def beam_search_decode(ids, scores, beam_size, end_id, name=None): - """ + r""" This operator is used after beam search has completed. It constructs the full predicted sequences for each sample by walking back along the search @@ -3378,7 +3378,7 @@ def lstm_unit(x_t, param_attr=None, bias_attr=None, name=None): - """ + r""" :api_attr: Static Graph Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 80faffd477b..df1113660f7 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -51,7 +51,7 @@ def sequence_conv(input, param_attr=None, act=None, name=None): - """ + r""" :api_attr: Static Graph **Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use conv2d Op.(fluid.layers.** :ref:`api_fluid_layers_conv2d` ). @@ -175,7 +175,7 @@ def sequence_conv(input, def sequence_softmax(input, use_cudnn=False, name=None): - """ + r""" :api_attr: Static Graph **Note**: @@ -259,7 +259,7 @@ def sequence_softmax(input, use_cudnn=False, name=None): def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): - """ + r""" :api_attr: Static Graph **Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use pool2d Op.(fluid.layers.** :ref:`api_fluid_layers_pool2d` ). @@ -636,7 +636,7 @@ def sequence_slice(input, offset, length, name=None): def sequence_expand(x, y, ref_level=-1, name=None): - """ + r""" :api_attr: Static Graph Sequence Expand Layer. This layer will expand the input variable ``x`` \ @@ -772,7 +772,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): def sequence_expand_as(x, y, name=None): - """ + r""" :api_attr: Static Graph Sequence Expand As Layer. 
This OP will expand the input variable ``x`` \ @@ -892,7 +892,7 @@ def sequence_expand_as(x, y, name=None): def sequence_pad(x, pad_value, maxlen=None, name=None): - """ + r""" :api_attr: Static Graph This layer padding the sequences in a same batch to a common length (according \ @@ -1233,7 +1233,7 @@ def sequence_scatter(input, index, updates, name=None): def sequence_enumerate(input, win_size, pad_value=0, name=None): - """ + r""" :api_attr: Static Graph Generate a new sequence for the input index sequence with \ @@ -1301,7 +1301,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): def sequence_mask(x, maxlen=None, dtype='int64', name=None): - """ + r""" **SequenceMask Layer** This layer outputs a mask according to the input :code:`x` and diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index fe3970ce1c1..6e794874afb 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -343,7 +343,7 @@ def concat(input, axis=0, name=None): def tensor_array_to_tensor(input, axis=1, name=None, use_stack=False): - """ + r""" This function concatenates or stacks all tensors in the input LoDTensorArray along the axis mentioned and returns that as the output. @@ -452,7 +452,7 @@ def tensor_array_to_tensor(input, axis=1, name=None, use_stack=False): def sums(input, out=None): - """ + r""" This function computes the sum of multiple input Tensors elementwisely. - Case 1, sum of 3 Tensors @@ -1391,7 +1391,7 @@ def range(start, end, step, dtype, name=None): def linspace(start, stop, num, dtype=None, name=None): - """ + r""" This OP return fixed number of evenly spaced values within a given interval. Args: @@ -1527,7 +1527,7 @@ def zeros_like(x, out=None): @deprecated(since="2.0.0", update_to="paddle.diag") def diag(diagonal): - """ + r""" :alias_main: paddle.diag :alias: paddle.diag,paddle.tensor.diag,paddle.tensor.creation.diag :old_api: paddle.fluid.layers.diag diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 0c3f6e16732..a3b61f2e911 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -475,7 +475,7 @@ class Accuracy(MetricBase): self.weight = .0 def update(self, value, weight): - """ + r""" This function takes the minibatch states (value, weight) as input, to accumulate and update the corresponding status of the Accuracy object. The update method is as follows: @@ -561,7 +561,7 @@ class ChunkEvaluator(MetricBase): self.num_correct_chunks = 0 def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): - """ + r""" This function takes (num_infer_chunks, num_label_chunks, num_correct_chunks) as input, to accumulate and update the corresponding status of the ChunkEvaluator object. The update method is as follows: diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 8df8f6b6891..c47cce76f89 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -42,7 +42,7 @@ def simple_img_conv_pool(input, bias_attr=None, act=None, use_cudnn=True): - """ + r""" :api_attr: Static Graph The simple_img_conv_pool api is composed of :ref:`api_fluid_layers_conv2d` and :ref:`api_fluid_layers_pool2d` . 
@@ -333,7 +333,7 @@ def sequence_conv_pool(input, def glu(input, dim=-1): - """ + r""" :api_attr: Static Graph The Gated Linear Units(GLU) composed by :ref:`api_fluid_layers_split` , @@ -384,7 +384,7 @@ def scaled_dot_product_attention(queries, values, num_heads=1, dropout_rate=0.): - """ + r""" :api_attr: Static Graph This interface Multi-Head Attention using scaled dot product. diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7f9ade8fcbd..2d95bfa8c54 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -954,7 +954,7 @@ class Optimizer(object): class SGDOptimizer(Optimizer): - """ + r""" Optimizer of the stochastic gradient descent algorithm. .. math:: @@ -1048,7 +1048,7 @@ class SGDOptimizer(Optimizer): class MomentumOptimizer(Optimizer): - """ + r""" Simple Momentum optimizer with velocity state @@ -1183,7 +1183,7 @@ class MomentumOptimizer(Optimizer): class DGCMomentumOptimizer(Optimizer): - """ + r""" :api_attr: Static Graph DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887 @@ -1603,7 +1603,7 @@ class DGCMomentumOptimizer(Optimizer): class LarsMomentumOptimizer(Optimizer): - """ + r""" Momentum optimizer with LARS support The update equations are as follows: @@ -1735,7 +1735,7 @@ class LarsMomentumOptimizer(Optimizer): class AdagradOptimizer(Optimizer): - """ + r""" The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign different learning rates to individual parameters. @@ -1851,7 +1851,7 @@ class AdagradOptimizer(Optimizer): class AdamOptimizer(Optimizer): - """ + r""" The Adam optimizer uses an optimization described at the end of section 2 of `Adam paper `_ , it can dynamically adjusts the learning rate of each parameter using @@ -2117,7 +2117,7 @@ class AdamOptimizer(Optimizer): class AdamaxOptimizer(Optimizer): - """ + r""" The Adamax optimizer is implemented based on the Adamax Optimization in Section 7 of `Adam paper `_. The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, @@ -2289,7 +2289,7 @@ class AdamaxOptimizer(Optimizer): class DpsgdOptimizer(Optimizer): - """ + r""" We implement the Dpsgd optimizer according to CCS16 paper - Deep Learning with Differential Privacy. @@ -2384,7 +2384,7 @@ class DpsgdOptimizer(Optimizer): class DecayedAdagradOptimizer(Optimizer): - """ + r""" The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces the decay rate to solve the problem of a sharp drop in the learning rate during model training when using the AdagradOptimizer. @@ -2494,7 +2494,7 @@ class DecayedAdagradOptimizer(Optimizer): class AdadeltaOptimizer(Optimizer): - """ + r""" **Notes: This API does not support sparse parameter optimization.** Adadelta Optimizer. Please refer to this for details: @@ -2613,7 +2613,7 @@ class AdadeltaOptimizer(Optimizer): class RMSPropOptimizer(Optimizer): - """ + r""" Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. The original slides proposed RMSProp: Slide 29 of http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf . @@ -2801,7 +2801,7 @@ class RMSPropOptimizer(Optimizer): class FtrlOptimizer(Optimizer): - """ + r""" FTRL (Follow The Regularized Leader) Optimizer. 
The paper that proposed Follow The Regularized Leader (FTRL): @@ -2960,7 +2960,7 @@ class FtrlOptimizer(Optimizer): class LambOptimizer(AdamOptimizer): - """ + r""" LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer. LAMB Optimizer is designed to scale up the batch size of training without losing @@ -3132,7 +3132,7 @@ Lamb = LambOptimizer class ModelAverage(Optimizer): - """ + r""" :api_attr: Static Graph The ModelAverage optimizer accumulates specific continuous historical parameters @@ -3441,7 +3441,7 @@ class ModelAverage(Optimizer): class ExponentialMovingAverage(object): - """ + r""" :api_attr: Static Graph Compute the moving average of parameters with exponential decay. @@ -4795,7 +4795,7 @@ class RecomputeOptimizer(Optimizer): class LookaheadOptimizer(object): - """ + r""" :api_attr: Static Graph This implements the Lookahead optimizer of the diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 65f7bd64708..7d123e7122e 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -210,7 +210,7 @@ class ParamAttr(object): class WeightNormParamAttr(ParamAttr): - """ + r""" :api_attr: Static Graph Note: diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 09850b3cac9..1cb76b1f390 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1325,7 +1325,7 @@ class GeneratorLoader(DataLoaderBase): class PyReader(DataLoaderBase): - """ + r""" Create a reader object for data feeding in Python. Data would be prefetched using Python thread and be pushed into a queue asynchronously. Data in the queue would be extracted diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 9fe24ec2c9d..5e0e5f724a8 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -63,7 +63,7 @@ def _create_regularization_of_grad(param, grad, regularization=None): def append_regularization_ops(parameters_and_grads, regularization=None): - """Create and add backward regularization Operators + r"""Create and add backward regularization Operators Creates and adds backward regularization operators in the BlockDesc. This will add gradients of the regularizer function to the gradients @@ -132,7 +132,7 @@ class WeightDecayRegularizer(object): class L2DecayRegularizer(WeightDecayRegularizer): - """ + r""" Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting. It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ). @@ -239,7 +239,7 @@ class L2DecayRegularizer(WeightDecayRegularizer): class L1DecayRegularizer(WeightDecayRegularizer): - """ + r""" Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse. It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ). 
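Most of the `"""` -> `r"""` conversions above target docstrings that carry `.. math::` or `:math:` blocks (SGDOptimizer, MomentumOptimizer, the regularizers, and so on). In a plain string literal Python rewrites recognised escapes and warns about unrecognised ones, so the backslash commands inside those formulas are silently corrupted or flagged; the raw prefix keeps every backslash verbatim. A minimal sketch of the difference, using a made-up formula rather than one copied from this patch:

    # Plain literal: "\f" turns into a form feed, and "\s" keeps its backslash but
    # emits an "invalid escape sequence" DeprecationWarning (a SyntaxWarning on
    # newer interpreters).
    plain = "loss = \frac{1}{2}, h_t = \sigma(W x_t)"
    print(repr(plain))           # '\x0crac' appears where '\frac' was meant

    # Raw literal: the backslashes survive exactly as written, which is what the
    # Sphinx math renderer needs to receive.
    raw = r"loss = \frac{1}{2}, h_t = \sigma(W x_t)"
    print(repr(raw))             # '\\frac' and '\\sigma' are intact
    assert "\\frac" in raw and "\\frac" not in plain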
diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py index 095a474fd3a..21180d7f49f 100644 --- a/python/paddle/fluid/tests/unittests/dist_text_classification.py +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -204,8 +204,8 @@ def train(word_idx): :rtype: callable """ return reader_creator( - re.compile("train/pos/.*\.txt$"), - re.compile("train/neg/.*\.txt$"), word_idx) + re.compile(r"train/pos/.*\.txt$"), + re.compile(r"train/neg/.*\.txt$"), word_idx) def test(word_idx): @@ -221,8 +221,8 @@ def test(word_idx): :rtype: callable """ return reader_creator( - re.compile("test/pos/.*\.txt$"), - re.compile("test/neg/.*\.txt$"), word_idx) + re.compile(r"test/pos/.*\.txt$"), + re.compile(r"test/neg/.*\.txt$"), word_idx) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index bb7e0ca2a0c..4f35befda8e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -230,7 +230,7 @@ class SoftsignLayer(object): class FC(Layer): - """ + r""" This interface is used to construct a callable object of the ``FC`` class. For more details, refer to code examples. It creates a fully connected layer in the network. It can take diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py index ec57057164f..e0b7e9033dd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -227,7 +227,7 @@ class SoftsignLayer(object): class FC(paddle.nn.Layer): - """ + r""" This interface is used to construct a callable object of the ``FC`` class. For more details, refer to code examples. It creates a fully connected layer in the network. 
It can take diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py index 4ae44365f25..ef4cbf0b742 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py @@ -235,7 +235,7 @@ class EagerDeletionRecurrentOpTest1(unittest.TestCase): class EagerDeletionRecurrentOpTest2(EagerDeletionRecurrentOpTest1): - ''' + r''' Test RNNOp equation: h_t = \sigma (W x_t + U h_{t-1}) diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py index 30bc097428c..3f3b1ee6703 100644 --- a/python/paddle/fluid/tests/unittests/test_full_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py @@ -31,7 +31,8 @@ class TestFullOp(unittest.TestCase): train_program = Program() with program_guard(train_program, startup_program): fill_value = 2.0 - input = paddle.fluid.data(name='input', dtype='float32', shape=[2, 3]) + input = paddle.fluid.data( + name='input', dtype='float32', shape=[2, 3]) output = paddle.full_like(input, fill_value) output_dtype = paddle.full_like(input, fill_value, dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index 29e0a8d6f02..2b632b2437a 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -25,7 +25,7 @@ from paddle.fluid import compiler, Program, program_guard class TestLRNOp(OpTest): def get_input(self): - ''' TODO(gongweibao): why it's grad diff is so large? + r''' TODO(gongweibao): why it's grad diff is so large? x = np.ndarray( shape=(self.N, self.C, self.H, self.W), dtype=float, order='C') for m in range(0, self.N): diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index b738d4b8efe..a8adee742c6 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -232,7 +232,7 @@ class RecurrentOpTest1(unittest.TestCase): class RecurrentOpTest2(RecurrentOpTest1): - ''' + r''' Test RNNOp equation: h_t = \sigma (W x_t + U h_{t-1}) @@ -469,7 +469,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): class RecurrentOpSubBlockTest(RecurrentOpTest1): - ''' + r''' Test RNNOp with subblock variable equation: y_ = emb * w1 @@ -608,7 +608,7 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1): class RecurrentOpStopGradientTest(RecurrentOpTest1): - """ + r""" Test RNNOp with stop_gradient = True equation: h_t = \sigma (W x_t + U h_{t-1}) diff --git a/python/paddle/fluid/tests/unittests/test_require_version.py b/python/paddle/fluid/tests/unittests/test_require_version.py index 80d595c1ef1..d1cb0aa4d81 100644 --- a/python/paddle/fluid/tests/unittests/test_require_version.py +++ b/python/paddle/fluid/tests/unittests/test_require_version.py @@ -79,7 +79,7 @@ class TestErrors(unittest.TestCase): self.assertRaises(TypeError, test_input_type_1) - # The value of params must be in format '\d+(\.\d+){0,3}', like '1.5.2.0', '1.6' ... + # The value of params must be in format r'\d+(\.\d+){0,3}', like '1.5.2.0', '1.6' ... 
def test_input_value_1(): fluid.require_version('string') diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 510b99c0300..f1808efe86e 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -35,7 +35,7 @@ def _is_numpy_(var): @six.add_metaclass(abc.ABCMeta) class Metric(object): - """ + r""" Base class for metric, encapsulates metric logic and APIs Usage: diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index e7adc7106a4..915668de19d 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -58,7 +58,7 @@ import paddle def elu(x, alpha=1.0, name=None): - """ + r""" elu activation. .. math:: @@ -101,7 +101,7 @@ def elu(x, alpha=1.0, name=None): def gelu(x, approximate=False, name=None): - """ + r""" gelu activation. if approximate is True @@ -155,7 +155,7 @@ def gelu(x, approximate=False, name=None): def hardshrink(x, threshold=0.5, name=None): - """ + r""" hard shrinkage activation .. math:: @@ -204,7 +204,7 @@ def hardshrink(x, threshold=0.5, name=None): def hardtanh(x, min=-1.0, max=1.0, name=None): - """ + r""" hardtanh activation .. math:: @@ -254,7 +254,7 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): def hardsigmoid(x, name=None): - """ + r""" hardsigmoid activation. A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), @@ -308,7 +308,7 @@ def hardsigmoid(x, name=None): def hardswish(x, name=None): - """ + r""" hardswish activation hardswish is proposed in MobileNetV3, and performs better in computational stability @@ -357,7 +357,7 @@ def hardswish(x, name=None): def leaky_relu(x, negative_slope=0.01, name=None): - """ + r""" leaky_relu activation .. math:: @@ -515,7 +515,7 @@ def relu(x, name=None): def log_sigmoid(x, name=None): - """ + r""" log_sigmoid activation. .. math:: @@ -552,7 +552,7 @@ def log_sigmoid(x, name=None): def maxout(x, groups, axis=1, name=None): - """ + r""" maxout activation. Assumed the input shape is (N, Ci, H, W). @@ -671,7 +671,7 @@ def selu(x, scale=1.0507009873554804934193349852946, alpha=1.6732632423543772848170429916717, name=None): - """ + r""" selu activation .. math:: @@ -726,7 +726,7 @@ def selu(x, def softmax(x, axis=-1, dtype=None, name=None): - """ + r""" This operator implements the softmax layer. The calculation process is as follows: 1. The dimension :attr:`axis` of ``x`` will be permuted to the last. @@ -880,7 +880,7 @@ def softmax(x, axis=-1, dtype=None, name=None): def softplus(x, beta=1, threshold=20, name=None): - """ + r""" softplus activation .. math:: @@ -925,7 +925,7 @@ def softplus(x, beta=1, threshold=20, name=None): def softshrink(x, threshold=0.5, name=None): - """ + r""" softshrink activation .. math:: @@ -976,7 +976,7 @@ def softshrink(x, threshold=0.5, name=None): def softsign(x, name=None): - """ + r""" softsign activation .. math:: @@ -1013,7 +1013,7 @@ def softsign(x, name=None): def swish(x, name=None): - """ + r""" swish activation. .. math:: @@ -1091,7 +1091,7 @@ def tanhshrink(x, name=None): def thresholded_relu(x, threshold=1.0, name=None): - """ + r""" thresholded relu activation. .. math:: @@ -1137,7 +1137,7 @@ def thresholded_relu(x, threshold=1.0, name=None): def log_softmax(x, axis=-1, dtype=None, name=None): - """ + r""" This operator implements the log_softmax layer. 
The calculation process is as follows: diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 910a302599f..a4c92883e06 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1413,7 +1413,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): def linear(x, weight, bias=None, name=None): - """ + r""" Fully-connected linear transformation operator. For each input :math:`X` , the equation is: @@ -1500,7 +1500,7 @@ def linear(x, weight, bias=None, name=None): def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): - """ + r""" Label smoothing is a mechanism to regularize the classifier layer and is called label-smoothing regularization (LSR). diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index c4410346ca1..75be8f54cd7 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -166,7 +166,7 @@ def conv1d(x, groups=1, data_format='NCL', name=None): - """ + r""" The convolution1D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input and Output are in NCL format, where N is batch size, C is the number of @@ -392,7 +392,7 @@ def conv2d(x, groups=1, data_format="NCHW", name=None): - """ + r""" The convolution2D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input and @@ -568,7 +568,7 @@ def conv1d_transpose(x, output_size=None, data_format="NCL", name=None): - """ + r""" The 1-D convolution transpose layer calculates the output based on the input, filter, and dilation, stride, padding. Input(Input) and output(Output) are in 'NCL' format or 'NLC' where N is batch size, C is the number of channels, @@ -828,7 +828,7 @@ def conv2d_transpose(x, output_size=None, data_format='NCHW', name=None): - """ + r""" The convolution2D transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input(Input) and output(Output) @@ -1068,7 +1068,7 @@ def conv3d(x, groups=1, data_format="NCDHW", name=None): - """ + r""" The convolution3D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and @@ -1233,7 +1233,7 @@ def conv3d_transpose(x, output_size=None, data_format='NCDHW', name=None): - """ + r""" The convolution3d transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input(Input) and output(Output) are in NCDHW or NDHWC format. Where N is batch size, C is the number of channels, diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 4ec0f8407fa..5e80f307eee 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -14,10 +14,7 @@ # TODO: define the extention functions -__all__ = [ - 'diag_embed', - 'row_conv' -] +__all__ = ['diag_embed', 'row_conv'] import numpy as np from ...fluid.data_feeder import check_dtype diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index 40b9441c2dc..5cabc4b6755 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -111,7 +111,7 @@ def one_hot(x, num_classes, name=None): def embedding(x, weight, padding_idx=None, sparse=False, name=None): - """ + r""" The operator is used to lookup embeddings vector of ids provided by :attr:`x` . 
The shape of output Tensor is generated by appending the last dimension of the input Tensor shape diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 1b19c4c1637..fb923e05671 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -184,7 +184,7 @@ def binary_cross_entropy_with_logits(logit, reduction='mean', pos_weight=None, name=None): - """ + r""" This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` layer and some reduce operations. @@ -461,7 +461,7 @@ def hsigmoid_loss(input, def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): - """ + r""" This operator calculates smooth_l1_loss. Creates a criterion that uses a squared term if the absolute element-wise error falls below 1 and an L1 term otherwise. In some cases it can prevent exploding gradients and it is more robust and less @@ -544,7 +544,7 @@ def margin_ranking_loss(input, margin=0.0, reduction='mean', name=None): - """ + r""" This op the calcluate the the margin rank loss between the input, other and label, use the math function as follows. @@ -646,7 +646,7 @@ def margin_ranking_loss(input, def l1_loss(input, label, reduction='mean', name=None): - """ + r""" This operator computes the L1 Loss of Tensor ``input`` and ``label`` as follows. If `reduction` set to ``'none'``, the loss is: @@ -840,7 +840,7 @@ def nll_loss(input, def kl_div(input, label, reduction='mean', name=None): - """ + r""" This operator calculates the Kullback-Leibler divergence loss between Input(X) and Input(Target). Notes that Input(X) is the log-probability and Input(Target) is the probability. @@ -947,7 +947,7 @@ def kl_div(input, label, reduction='mean', name=None): def mse_loss(input, label, reduction='mean', name=None): - """ + r""" This op accepts input predications and label and returns the mean square error. If :attr:`reduction` is set to ``'none'``, loss is calculated as: @@ -1121,7 +1121,7 @@ def cross_entropy(input, weight=None, ignore_index=-100, reduction='mean'): - """ + r""" This operator implements the cross entropy loss function. This OP combines ``LogSoftmax``, and ``NLLLoss`` together. @@ -1252,7 +1252,7 @@ def sigmoid_focal_loss(logit, gamma=2.0, reduction='sum', name=None): - """ + r""" `Focal Loss `_ is proposed to address the foreground-background class imbalance for classification tasks. It down-weights easily-classified examples and thus focuses training on hard examples. For example, diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 0a1547bebbb..250039b9646 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -35,7 +35,7 @@ __all__ = [ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): - """ + r""" This op normalizes ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes .. math:: @@ -412,7 +412,7 @@ def local_response_norm(x, k=1., data_format="NCHW", name=None): - """ + r""" Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions. 
For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 5e1cb377bd7..a76bc9e86d2 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -54,11 +54,7 @@ import numpy as np # from ...fluid.layers import roi_perspective_transform #DEFINE_ALIAS # from ...fluid.layers import shuffle_channel #DEFINE_ALIAS -__all__ = [ - 'affine_grid', - 'grid_sample', - 'pixel_shuffle' -] +__all__ = ['affine_grid', 'grid_sample', 'pixel_shuffle'] def affine_grid(theta, out_shape, align_corners=True, name=None): diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index f0c6880e89d..7e2b6f787f8 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -19,7 +19,7 @@ __all__ = ['KaimingUniform', 'KaimingNormal'] class KaimingNormal(MSRAInitializer): - """Implements the Kaiming Normal initializer + r"""Implements the Kaiming Normal initializer This class implements the weight initialization from the paper `Delving Deep into Rectifiers: Surpassing Human-Level Performance on @@ -62,7 +62,7 @@ class KaimingNormal(MSRAInitializer): class KaimingUniform(MSRAInitializer): - """Implements the Kaiming Uniform initializer + r"""Implements the Kaiming Uniform initializer This class implements the weight initialization from the paper `Delving Deep into Rectifiers: Surpassing Human-Level Performance on diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index 5a4e7fec057..821a6984753 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -18,7 +18,7 @@ __all__ = ['XavierNormal', 'XavierUniform'] class XavierNormal(XavierInitializer): - """ + r""" This class implements the Xavier weight initializer from the paper `Understanding the difficulty of training deep feedforward neural networks `_ @@ -71,7 +71,7 @@ class XavierNormal(XavierInitializer): class XavierUniform(XavierInitializer): - """ + r""" This class implements the Xavier weight initializer from the paper `Understanding the difficulty of training deep feedforward neural networks `_ diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 520762107db..b002b534625 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -50,7 +50,7 @@ from .. import functional as F class ELU(layers.Layer): - """ + r""" ELU Activation. .. math:: @@ -88,7 +88,7 @@ class ELU(layers.Layer): class GELU(layers.Layer): - """ + r""" GELU Activation. If approximate is True @@ -137,7 +137,7 @@ class GELU(layers.Layer): class Hardshrink(layers.Layer): - """ + r""" Hardshrink Activation .. math:: @@ -181,7 +181,7 @@ class Hardshrink(layers.Layer): class Hardswish(layers.Layer): - """ + r""" Hardswish activation Hardswish is proposed in MobileNetV3, and performs better in computational stability @@ -227,7 +227,7 @@ class Hardswish(layers.Layer): class Tanh(layers.Layer): - """ + r""" Tanh Activation. .. math:: @@ -264,7 +264,7 @@ class Tanh(layers.Layer): class Hardtanh(layers.Layer): - """ + r""" Hardtanh Activation .. math:: @@ -442,7 +442,7 @@ class ReLU6(layers.Layer): class SELU(layers.Layer): - """ + r""" SELU Activation .. math:: @@ -488,7 +488,7 @@ class SELU(layers.Layer): class LeakyReLU(layers.Layer): - """ + r""" Leaky ReLU Activation. .. 
math:: @@ -574,7 +574,7 @@ class Sigmoid(layers.Layer): class Hardsigmoid(layers.Layer): - """ + r""" This interface is used to construct a callable object of the ``Hardsigmoid`` class. This layer calcluate the `hardsigmoid` of input x. @@ -621,7 +621,7 @@ class Hardsigmoid(layers.Layer): class Softplus(layers.Layer): - """ + r""" Softplus Activation .. math:: @@ -661,7 +661,7 @@ class Softplus(layers.Layer): class Softshrink(layers.Layer): - """ + r""" Softshrink Activation .. math:: @@ -702,7 +702,7 @@ class Softshrink(layers.Layer): class Softsign(layers.Layer): - """ + r""" Softsign Activation .. math:: @@ -737,7 +737,7 @@ class Softsign(layers.Layer): class Swish(layers.Layer): - """ + r""" Swish Activation. .. math:: @@ -807,7 +807,7 @@ class Tanhshrink(layers.Layer): class ThresholdedReLU(layers.Layer): - """ + r""" Thresholded ReLU Activation .. math:: @@ -847,7 +847,7 @@ class ThresholdedReLU(layers.Layer): class LogSigmoid(layers.Layer): - """ + r""" LogSigmoid Activation. .. math:: @@ -882,7 +882,7 @@ class LogSigmoid(layers.Layer): class Softmax(layers.Layer): - """ + r""" Softmax Activation. This operator implements the softmax layer. The calculation process is as follows: @@ -1005,7 +1005,7 @@ class Softmax(layers.Layer): class LogSoftmax(layers.Layer): - """ + r""" This operator implements the log_softmax layer. The calculation process is as follows: .. math:: @@ -1059,7 +1059,7 @@ class LogSoftmax(layers.Layer): class Maxout(layers.Layer): - """ + r""" Maxout Activation. Assumed the input shape is (N, Ci, H, W). diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 9a3edef5e4c..8558e0f1793 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -40,7 +40,7 @@ __all__ = [ class Linear(layers.Layer): - """ + r""" Fully-connected linear transformation layer. For each input :math:`X` , the equation is: @@ -381,7 +381,7 @@ class Upsample(layers.Layer): class Bilinear(layers.Layer): - """ + r""" This layer performs bilinear on two inputs. @@ -988,7 +988,7 @@ class CosineSimilarity(layers.Layer): class Embedding(layers.Layer): - """ + r""" **Embedding Layer** This interface is used to construct a callable object of the ``Embedding`` class. diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 0b0d0e302b8..d554bb0fd96 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -141,7 +141,7 @@ class _ConvNd(layers.Layer): class Conv1D(_ConvNd): - """ + r""" This interface is used to construct a callable object of the ``Conv1D`` class. For more details, refer to code examples. The convolution1D layer calculates the output based on the input, filter @@ -294,7 +294,7 @@ class Conv1D(_ConvNd): class Conv1DTranspose(_ConvNd): - """ + r""" This interface is used to construct a callable object of the ``Conv1DTranspose`` class. For more details, refer to code examples. The 1-D convolution transpose layer calculates the output based on the input, @@ -469,7 +469,7 @@ class Conv1DTranspose(_ConvNd): class Conv2D(_ConvNd): - """ + r""" This interface is used to construct a callable object of the ``Conv2D`` class. For more details, refer to code examples. The convolution2D layer calculates the output based on the input, filter @@ -626,7 +626,7 @@ class Conv2D(_ConvNd): class Conv2DTranspose(_ConvNd): - """ + r""" This interface is used to construct a callable object of the ``Conv2DTranspose`` class. For more details, refer to code examples. 
The convolution2D transpose layer calculates the output based on the input, @@ -786,7 +786,7 @@ class Conv2DTranspose(_ConvNd): class Conv3D(_ConvNd): - """ + r""" **Convlution3d Layer** The convolution3d layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and @@ -943,7 +943,7 @@ class Conv3D(_ConvNd): class Conv3DTranspose(_ConvNd): - """ + r""" **Convlution3D transpose layer** The convolution3D transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input(Input) and output(Output) diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index 28b29a583d8..5a3c611b3c4 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -24,7 +24,7 @@ from ...fluid.layer_helper import LayerHelper class PairwiseDistance(layers.Layer): - """ + r""" This operator computes the pairwise distance between two vectors. The distance is calculated by p-oreder norm: diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 96db0dde54f..faf1345c7ba 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -36,7 +36,7 @@ __all__ = [ class BCEWithLogitsLoss(fluid.dygraph.Layer): - """ + r""" This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` layer and some reduce operations. @@ -141,7 +141,7 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer): class CrossEntropyLoss(fluid.dygraph.Layer): - """ + r""" :alias_main: paddle.nn.CrossEntropyLoss :alias: paddle.nn.CrossEntropyLoss,paddle.nn.layer.CrossEntropyLoss,paddle.nn.layer.loss.CrossEntropyLoss @@ -375,7 +375,7 @@ class HSigmoidLoss(fluid.dygraph.Layer): class MSELoss(fluid.dygraph.layers.Layer): - """ + r""" **Mean Square Error Loss** Computes the mean square error (squared L2 norm) of given input and label. @@ -454,7 +454,7 @@ class MSELoss(fluid.dygraph.layers.Layer): class L1Loss(fluid.dygraph.Layer): - """ + r""" This interface is used to construct a callable object of the ``L1Loss`` class. The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows. @@ -622,7 +622,7 @@ class BCELoss(fluid.dygraph.Layer): class NLLLoss(fluid.dygraph.Layer): - """ + r""" This class accepts input and target label and returns negative log likelihood cross error. It is useful to train a classification problem with C classes. @@ -733,7 +733,7 @@ class NLLLoss(fluid.dygraph.Layer): class KLDivLoss(fluid.dygraph.Layer): - """ + r""" This interface calculates the Kullback-Leibler divergence loss between Input(X) and Input(Target). Notes that Input(X) is the log-probability and Input(Target) is the probability. @@ -806,7 +806,7 @@ class KLDivLoss(fluid.dygraph.Layer): class MarginRankingLoss(fluid.dygraph.Layer): - """ + r""" This interface is used to construct a callable object of the ``MarginRankingLoss`` class. The MarginRankingLoss layer calculates the margin rank loss between the input, other and label @@ -958,7 +958,7 @@ class CTCLoss(fluid.dygraph.Layer): class SmoothL1Loss(fluid.dygraph.Layer): - """ + r""" This operator calculates smooth_l1_loss. Creates a criterion that uses a squared term if the absolute element-wise error falls below 1 and an L1 term otherwise. 
In some cases it can prevent exploding gradients and it is more robust and less diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 5e2292d40d2..7f416749c8a 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -109,7 +109,7 @@ class _InstanceNormBase(layers.Layer): class InstanceNorm1D(_InstanceNormBase): - """ + r""" Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCL `[batch, in_channels, length]` @@ -181,7 +181,7 @@ class InstanceNorm1D(_InstanceNormBase): class InstanceNorm2D(_InstanceNormBase): - """ + r""" Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCHW `[batch, in_channels, in_height, in_width]` @@ -252,7 +252,7 @@ class InstanceNorm2D(_InstanceNormBase): class InstanceNorm3D(_InstanceNormBase): - """ + r""" Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]` @@ -437,7 +437,7 @@ class GroupNorm(layers.Layer): class LayerNorm(layers.Layer): - """ + r""" :alias_main: paddle.nn.LayerNorm :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm :old_api: paddle.fluid.dygraph.LayerNorm @@ -649,7 +649,7 @@ class _BatchNormBase(layers.Layer): class BatchNorm1D(_BatchNormBase): - """ + r""" Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . When track_running_stats = False, the :math:`\\mu_{\\beta}` @@ -740,7 +740,7 @@ class BatchNorm1D(_BatchNormBase): class BatchNorm2D(_BatchNormBase): - """ + r""" Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . When track_running_stats = False, the :math:`\\mu_{\\beta}` @@ -829,7 +829,7 @@ class BatchNorm2D(_BatchNormBase): class BatchNorm3D(_BatchNormBase): - """ + r""" Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . When track_running_stats = False, the :math:`\\mu_{\\beta}` @@ -919,7 +919,7 @@ class BatchNorm3D(_BatchNormBase): class SyncBatchNorm(_BatchNormBase): - """ + r""" This interface is used to construct a callable object of the ``SyncBatchNorm`` class. 
It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can be used as a normalizer function for other operations, such as conv2d and fully connected diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 7be229bdce0..dc065918f3d 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -120,7 +120,7 @@ class AvgPool1D(layers.Layer): class AvgPool2D(layers.Layer): - """ + r""" This operation applies 2D average pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are in NCHW format, where N is batch size, C is the number of channels, @@ -401,7 +401,7 @@ class MaxPool1D(layers.Layer): class MaxPool2D(layers.Layer): - """ + r""" This operation applies 2D max pooling over input feature based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are in NCHW format, where N is batch size, C is the number of channels, @@ -595,7 +595,7 @@ class MaxPool3D(layers.Layer): class AdaptiveAvgPool1D(layers.Layer): - """ + r""" This operation applies a 1D adaptive average pooling over an input signal composed of several input planes, based on the input, output_size, return_mask parameters. @@ -663,7 +663,7 @@ class AdaptiveAvgPool1D(layers.Layer): class AdaptiveAvgPool2D(layers.Layer): - """ + r""" This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. @@ -745,7 +745,7 @@ class AdaptiveAvgPool2D(layers.Layer): class AdaptiveAvgPool3D(layers.Layer): - """ + r""" This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index ea4f6970bc6..0da00735b43 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -157,7 +157,7 @@ class MultiHeadAttention(Layer): embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _prepare_qkv(self, query, key, value, cache=None): - """ + r""" Prapares linear projected queries, keys and values for usage of subsequnt multiple parallel attention. If `cache` is not None, using cached results to reduce redundant calculations. @@ -212,7 +212,7 @@ class MultiHeadAttention(Layer): return (q, k, v) if cache is None else (q, k, v, cache) def compute_kv(self, key, value): - """ + r""" Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation subspaces. The results are used as key-values pairs for subsequent multiple @@ -312,7 +312,7 @@ class MultiHeadAttention(Layer): return self.Cache(key, value) def forward(self, query, key, value, attn_mask=None, cache=None): - """ + r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. @@ -499,7 +499,7 @@ class TransformerEncoderLayer(Layer): self.activation = getattr(F, activation) def forward(self, src, src_mask=None): - """ + r""" Applies a Transformer encoder layer on the input. Parameters: @@ -575,7 +575,7 @@ class TransformerEncoder(Layer): self.norm = norm def forward(self, src, src_mask=None): - """ + r""" Applies a stack of N Transformer encoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last encoder layer. 
@@ -725,7 +725,7 @@ class TransformerDecoderLayer(Layer): self.activation = getattr(F, activation) def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): - """ + r""" Applies a Transformer decoder layer on the input. Parameters: @@ -801,7 +801,7 @@ class TransformerDecoderLayer(Layer): static_cache)) def gen_cache(self, memory): - """ + r""" Generates cache for `forward` usage. The generated cache is a tuple composed of an instance of `MultiHeadAttention.Cache` and an instance of `MultiHeadAttention.StaticCache`. @@ -873,7 +873,7 @@ class TransformerDecoder(Layer): self.norm = norm def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): - """ + r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder layer. @@ -937,7 +937,7 @@ class TransformerDecoder(Layer): return output if cache is None else (output, new_caches) def gen_cache(self, memory, do_zip=False): - """ + r""" Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` @@ -1139,7 +1139,7 @@ class Transformer(Layer): self.nhead = nhead def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): - """ + r""" Applies a Transformer model on the inputs. Parameters: diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index 7a21e7661d4..b14fb3e2120 100644 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -153,7 +153,7 @@ class WeightNorm(object): def weight_norm(layer, name='weight', dim=0): - """ + r""" This weight_norm layer applies weight normalization to a parameter according to the following formula: diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index bba2c11ea07..91591d23f00 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -21,7 +21,7 @@ __all__ = ["Adadelta"] class Adadelta(Optimizer): - """ + r""" **Notes: This API does not support sparse parameter optimization.** Adadelta Optimizer. Please refer to this for details: diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py index ed55ebd0bf2..72a3f8ce996 100644 --- a/python/paddle/optimizer/adagrad.py +++ b/python/paddle/optimizer/adagrad.py @@ -21,7 +21,7 @@ __all__ = ["Adagrad"] class Adagrad(Optimizer): - """ + r""" The Adaptive Gradient optimizer (Adagrad for short) use an optimization described in paper: `Adaptive Subgradient Methods for Online Learning and Stochastic Optimization `_. 
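One detail worth keeping in mind when adding the r prefix: it also changes what a doubled backslash denotes. Several docstrings above, such as the BatchNorm family, spell the math as `\\mu_{\\beta}`; in a plain literal that pair collapses to a single backslash, while under the r prefix both characters are kept. A tiny illustration, using nothing beyond the string literals shown here:

    # In a plain literal "\\" denotes one backslash; in a raw literal it stays as two.
    assert len("\\mu") == 3      # backslash, 'm', 'u'
    assert len(r"\\mu") == 4     # backslash, backslash, 'm', 'u'
    assert "\\mu" == r"\mu"      # the single-backslash spellings are equivalent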
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 79caa158312..37510231219 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -24,7 +24,7 @@ __all__ = ["Adam"] class Adam(Optimizer): - """ + r""" The Adam optimizer uses an optimization described at the end of section 2 of `Adam paper `_ , it can dynamically adjusts the learning rate of each parameter using diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index e5d1962d126..5d164fa7623 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -21,7 +21,7 @@ __all__ = ["Adamax"] class Adamax(Optimizer): - """ + r""" The Adamax optimizer is implemented based on the Adamax Optimization in Section 7 of `Adam paper `_. The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm, diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 0ffff675903..b597109d314 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -23,7 +23,7 @@ __all__ = ['AdamW'] class AdamW(Adam): - """ + r""" The AdamW optimizer is implemented based on the AdamW Optimization in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. it can resolves the problem of L2 regularization failure in the Adam optimizer. diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 2d5dc5d998e..5085911ce92 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -192,7 +192,7 @@ class LRScheduler(object): class NoamDecay(LRScheduler): - """ + r""" Applies Noam Decay to the initial learning rate. @@ -376,7 +376,7 @@ class PiecewiseDecay(LRScheduler): class NaturalExpDecay(LRScheduler): - """ + r""" Applies natural exponential decay to the initial learning rate. @@ -455,7 +455,7 @@ class NaturalExpDecay(LRScheduler): class InverseTimeDecay(LRScheduler): - """ + r""" Applies inverse time decay to the initial learning rate. @@ -536,7 +536,7 @@ class InverseTimeDecay(LRScheduler): class PolynomialDecay(LRScheduler): - """ + r""" Applies polynomial decay to the initial learning rate. @@ -656,7 +656,7 @@ class PolynomialDecay(LRScheduler): class LinearWarmup(LRScheduler): - """ + r""" Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler. For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks `_ @@ -794,7 +794,7 @@ class LinearWarmup(LRScheduler): class ExponentialDecay(LRScheduler): - """ + r""" Update learning rate by `gamma` each epoch. @@ -1383,7 +1383,7 @@ class ReduceOnPlateau(LRScheduler): class CosineAnnealingDecay(LRScheduler): - """ + r""" Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to the initial learning_rate. 
:math:`T_{cur}` is the number of epochs since the last restart in diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 87fa86c1761..2cfd8deaef7 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -21,7 +21,7 @@ __all__ = ["Momentum"] class Momentum(Optimizer): - """ + r""" Simple Momentum optimizer with velocity state diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index d0326b4155a..030d419de48 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -47,7 +47,7 @@ __all__ = ['Optimizer'] class Optimizer(object): - """Optimizer Base class. + r"""Optimizer Base class. Define the common interface of an optimizer. User should not use this class directly, diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index a664b015956..12825bb7813 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -21,7 +21,7 @@ __all__ = ["RMSProp"] class RMSProp(Optimizer): - """ + r""" Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. The original slides proposed RMSProp: Slide 29 of http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf . diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index 133c3dfb24f..44e5695a2cf 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -21,7 +21,7 @@ __all__ = ["SGD"] class SGD(Optimizer): - """ + r""" Optimizer of the stochastic gradient descent algorithm. .. math:: diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 881cfd81314..1a4d4546923 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" +r""" At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py index a1ab329169a..586ae0f988c 100644 --- a/python/paddle/regularizer.py +++ b/python/paddle/regularizer.py @@ -18,7 +18,7 @@ import paddle.fluid as fluid class L1Decay(fluid.regularizer.L1Decay): - """ + r""" Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse. It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). @@ -80,7 +80,7 @@ class L1Decay(fluid.regularizer.L1Decay): class L2Decay(fluid.regularizer.L2Decay): - """ + r""" Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting. It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). 
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index a25a8fb191b..84a5ed9950a 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -14,7 +14,6 @@ from __future__ import print_function - import errno import inspect import logging @@ -31,7 +30,6 @@ from paddle.fluid.io import prepend_feed_ops, append_fetch_ops, save_persistable from paddle.fluid.io import load_persistables, _endpoints_replacement from paddle.fluid.log_helper import get_logger - __all__ = [ 'save_inference_model', 'load_inference_model', @@ -44,10 +42,13 @@ _logger = get_logger( def _check_args(caller, args, supported_args=[], deprecated_args=[]): for arg in args: if arg in deprecated_args: - raise ValueError("argument '{}' in function '{}' is deprecated, only {} are supported.".format(arg, caller, supported_args)) + raise ValueError( + "argument '{}' in function '{}' is deprecated, only {} are supported.". + format(arg, caller, supported_args)) elif arg not in supported_args: raise ValueError( - "function '{}' doesn't support argument '{}',\n only {} are supported.".format(caller, arg, supported_args)) + "function '{}' doesn't support argument '{}',\n only {} are supported.". + format(caller, arg, supported_args)) @static_only @@ -129,14 +130,18 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor): # verify feed_vars if not isinstance(feed_vars, list): feed_vars = [feed_vars] - if not feed_vars or not all([isinstance(var, Variable) for var in feed_vars]): - raise ValueError("'feed_vars' should be a Variable or a list of Variable.") + if not feed_vars or not all( + [isinstance(var, Variable) for var in feed_vars]): + raise ValueError( + "'feed_vars' should be a Variable or a list of Variable.") # verify fetch_vars if not isinstance(fetch_vars, list): fetch_vars = [fetch_vars] - if not fetch_vars or not all([isinstance(var, Variable) for var in fetch_vars]): - raise ValueError("'fetch_vars' should be a Variable or a list of Variable.") + if not fetch_vars or not all( + [isinstance(var, Variable) for var in fetch_vars]): + raise ValueError( + "'fetch_vars' should be a Variable or a list of Variable.") main_program = _get_valid_program() # remind users to set auc_states to 0 if auc op were found. @@ -145,7 +150,9 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor): device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() op._set_attr(device_attr_name, "") if op.type == 'auc': - warnings.warn("Be sure that you have set auc states to 0 before saving inference model.") + warnings.warn( + "Be sure that you have set auc states to 0 before saving inference model." + ) break # fix the bug that the activation op's output as target will be pruned. 
@@ -154,10 +161,11 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor): with program_guard(main_program): uniq_fetch_vars = [] for i, var in enumerate(fetch_vars): - var = layers.scale(var, 1., name="save_infer_model/scale_{}".format(i)) + var = layers.scale( + var, 1., name="save_infer_model/scale_{}".format(i)) uniq_fetch_vars.append(var) fetch_vars = uniq_fetch_vars - + # save model origin_program = main_program.clone() main_program = main_program.clone() @@ -257,7 +265,7 @@ def load_inference_model(path_prefix, executor, **configs): """ # check configs supported_args = ('model_filename', 'params_filename') - deprecated_args = ('pserver_endpoints',) + deprecated_args = ('pserver_endpoints', ) caller = inspect.currentframe().f_code.co_name _check_args(caller, configs, supported_args, deprecated_args) @@ -268,8 +276,7 @@ def load_inference_model(path_prefix, executor, **configs): params_filename = configs.get('params_filename', None) if params_filename is None: raise ValueError( - "params_filename cannot be None when path_prefix is None." - ) + "params_filename cannot be None when path_prefix is None.") load_dirname = path_prefix program_desc_str = model_filename params_filename = params_filename @@ -297,18 +304,21 @@ def load_inference_model(path_prefix, executor, **configs): if model_filename is None: model_path = os.path.join(path_prefix, "__model__") else: - model_path = os.path.join(path_prefix, model_filename + ".pdmodel") + model_path = os.path.join(path_prefix, + model_filename + ".pdmodel") if not os.path.exists(model_path): model_path = os.path.join(path_prefix, model_filename) # set params_path if params_filename is None: params_path = os.path.join(path_prefix, "") else: - params_path = os.path.join(path_prefix, params_filename + ".pdiparams") + params_path = os.path.join(path_prefix, + params_filename + ".pdiparams") if not os.path.exists(params_path): params_path = os.path.join(path_prefix, params_filename) _logger.warning("The old way to load inference model is deprecated." - " model path: {}, params path: {}".format(model_path, params_path)) + " model path: {}, params path: {}".format( + model_path, params_path)) with open(model_path, "rb") as f: program_desc_str = f.read() load_dirname = os.path.dirname(params_path) @@ -328,4 +338,3 @@ def load_inference_model(path_prefix, executor, **configs): ] return [program, feed_target_names, fetch_targets] - diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 44f0a73fa42..0806d2c2914 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -26,7 +26,7 @@ def fc(x, bias_attr=None, activation=None, name=None): - """ + r""" Fully-Connected layer can take a tensor or a list of tensor as its inputs. It creates a 2-D weight tensor for each input tensor, which represents its @@ -180,7 +180,7 @@ def deform_conv2d(x, weight_attr=None, bias_attr=None, name=None): - """ + r""" Compute 2-D deformable convolution on 4-D input. 
Given input image x, output feature map y, the deformable convolution operation can be expressed as follow: diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index b46e1c79461..32e86c96b4e 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -54,7 +54,7 @@ __all__ = [ @dygraph_only def to_tensor(data, dtype=None, place=None, stop_gradient=True): - """ + r""" Constructs a ``paddle.Tensor`` or ``paddle.ComplexTensor`` from ``data`` , which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor. @@ -609,7 +609,7 @@ def _tril_triu_op(helper): def tril(x, diagonal=0, name=None): - """ + r""" :alias_main: paddle.tril :alias: paddle.tril,paddle.tensor.tril,paddle.tensor.creation.tril @@ -680,7 +680,7 @@ def tril(x, diagonal=0, name=None): def triu(x, diagonal=0, name=None): - """ + r""" :alias_main: paddle.triu :alias: paddle.triu,paddle.tensor.triu,paddle.tensor.creation.triu diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 25fb9343179..b1c0f0b446a 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -453,7 +453,7 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): def dist(x, y, p=2): - """ + r""" This OP returns the p-norm of (x - y). It is not a norm in a strict sense, only as a measure of distance. The shapes of x and y must be broadcastable. The definition is as follows, for @@ -740,7 +740,7 @@ def cross(x, y, axis=None, name=None): def cholesky(x, upper=False, name=None): - """ + r""" Computes the Cholesky decomposition of one symmetric positive-definite matrix or batches of symmetric positive-definite matrice. diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 0bda55a1fae..7ea8a9286c3 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -169,7 +169,7 @@ def flip(x, axis, name=None): def flatten(x, start_axis=0, stop_axis=-1, name=None): - """ + r""" **Flatten op** Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis. @@ -565,7 +565,7 @@ def unique(x, axis=None, dtype="int64", name=None): - """ + r""" Returns the unique elements of `x` in ascending order. Args: @@ -946,7 +946,7 @@ def scatter(x, index, updates, overwrite=True, name=None): def scatter_nd_add(x, index, updates, name=None): - """ + r""" **Scatter_nd_add Layer** Output is obtained by applying sparse addition to a single value diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index ccc49c769c2..e7b72fe95bc 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -379,7 +379,7 @@ def floor_divide(x, y, name=None): def remainder(x, y, name=None): - """ + r""" Mod two tensors element-wise. The equation is: .. math:: @@ -981,7 +981,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): def logsumexp(x, axis=None, keepdim=False, name=None): - """ + r""" This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` . .. math:: @@ -1281,7 +1281,7 @@ def min(x, axis=None, keepdim=False, name=None): def log1p(x, name=None): - """ + r""" Calculates the natural log of the given input tensor, element-wise. .. math:: Out = \\ln(x+1) @@ -1315,7 +1315,7 @@ def log1p(x, name=None): return out def log2(x, name=None): - """ + r""" Calculates the log to the base 2 of the given input tensor, element-wise. .. 
math:: @@ -1365,7 +1365,7 @@ def log2(x, name=None): def log10(x, name=None): - """ + r""" Calculates the log to the base 10 of the given input tensor, element-wise. .. math:: @@ -1947,7 +1947,7 @@ def sign(x, name=None): def tanh(x, name=None): - """ + r""" Tanh Activation Operator. .. math:: diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index f5e0dc4c05b..c4a3bf4b1b6 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -494,7 +494,7 @@ def sort(x, axis=-1, descending=False, name=None): def where(condition, x, y, name=None): - """ + r""" Return a tensor of elements selected from either $x$ or $y$, depending on $condition$. .. math:: diff --git a/python/paddle/text/datasets/imdb.py b/python/paddle/text/datasets/imdb.py index f1bf247efca..f02b5981906 100644 --- a/python/paddle/text/datasets/imdb.py +++ b/python/paddle/text/datasets/imdb.py @@ -93,7 +93,7 @@ class Imdb(Dataset): def _build_work_dict(self, cutoff): word_freq = collections.defaultdict(int) - pattern = re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$") + pattern = re.compile(r"aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$") for doc in self._tokenize(pattern): for word in doc: word_freq[word] += 1 @@ -123,8 +123,8 @@ class Imdb(Dataset): return data def _load_anno(self): - pos_pattern = re.compile("aclImdb/{}/pos/.*\.txt$".format(self.mode)) - neg_pattern = re.compile("aclImdb/{}/neg/.*\.txt$".format(self.mode)) + pos_pattern = re.compile(r"aclImdb/{}/pos/.*\.txt$".format(self.mode)) + neg_pattern = re.compile(r"aclImdb/{}/neg/.*\.txt$".format(self.mode)) UNK = self.word_idx[''] diff --git a/r/example/mobilenet.py b/r/example/mobilenet.py index adb1c330a70..99e755ab69f 100755 --- a/r/example/mobilenet.py +++ b/r/example/mobilenet.py @@ -1,4 +1,19 @@ #!/usr/bin/env python3.7 + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # pylint: skip-file import functools diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py index c44690a93ac..556c8ef6043 100644 --- a/tools/check_ctest_hung.py +++ b/tools/check_ctest_hung.py @@ -42,11 +42,11 @@ Diff: set(['test_parallel_executor_crf']) for l in fn.readlines(): if l.find("Test ") != -1 and \ l.find("Passed") != -1: - m = re.search("Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l)) + m = re.search(r"Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l)) passed.add(m.group(1)) if l.find("Start ") != -1: start_parts = escape(l).split(" ") - m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l)) + m = re.search(r"Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l)) started.add(m.group(1)) print("Diff: ", started - passed) diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py index 8d4b24a0cf6..823d9470230 100644 --- a/tools/codestyle/docstring_checker.py +++ b/tools/codestyle/docstring_checker.py @@ -101,7 +101,7 @@ class Docstring(object): def _arg_with_type(self): for t in self.d['Args']: - m = re.search('([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t) + m = re.search(r'([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t) if m: self.args[m.group(1)] = m.group(2) diff --git a/tools/coverage/coverage_diff.py b/tools/coverage/coverage_diff.py index 38f671fe408..6a400d293b2 100644 --- a/tools/coverage/coverage_diff.py +++ b/tools/coverage/coverage_diff.py @@ -1,5 +1,19 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ usage: coverage_diff.py info_file diff_file > > coverage-diff.info """ diff --git a/tools/coverage/coverage_diff_list.py b/tools/coverage/coverage_diff_list.py index 8975185edad..62834301209 100644 --- a/tools/coverage/coverage_diff_list.py +++ b/tools/coverage/coverage_diff_list.py @@ -1,5 +1,19 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ usage: coverage_diff_list.py list_file max_rate > coverage-diff-list-90.out """ diff --git a/tools/coverage/coverage_lines.py b/tools/coverage/coverage_lines.py index cdec5b8b1bb..553cd691e45 100644 --- a/tools/coverage/coverage_lines.py +++ b/tools/coverage/coverage_lines.py @@ -1,5 +1,19 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ usage: coverage_lines.py info_file expected """ diff --git a/tools/coverage/cuda_clean.py b/tools/coverage/cuda_clean.py index c71ff375fd5..8c03edd0785 100644 --- a/tools/coverage/cuda_clean.py +++ b/tools/coverage/cuda_clean.py @@ -1,5 +1,19 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ usage: cuda_clean.py pull_id. """ import os diff --git a/tools/coverage/gcda_clean.py b/tools/coverage/gcda_clean.py index f5726db005e..39fa3509cb8 100644 --- a/tools/coverage/gcda_clean.py +++ b/tools/coverage/gcda_clean.py @@ -1,5 +1,19 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ usage: gcda_clean.py pull_id. """ import os diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py index 105460032f7..f3e88286ca9 100644 --- a/tools/coverage/pull_request.py +++ b/tools/coverage/pull_request.py @@ -1,5 +1,19 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
""" usage: pull_request.py files pull_id pull_request.py diff pull_id diff --git a/tools/coverage/python_coverage.py b/tools/coverage/python_coverage.py index 8ad9d85c1bf..f2e52b5e23b 100644 --- a/tools/coverage/python_coverage.py +++ b/tools/coverage/python_coverage.py @@ -1,5 +1,19 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ usage: python_coverage.py > python-coverage.info """ diff --git a/tools/get_quick_disable_lt.py b/tools/get_quick_disable_lt.py index 1e3d7178922..18ebdb00317 100644 --- a/tools/get_quick_disable_lt.py +++ b/tools/get_quick_disable_lt.py @@ -20,7 +20,7 @@ import requests def download_file(): """Get disabled unit tests""" ssl._create_default_https_context = ssl._create_unverified_context - sysstr=sys.platform + sysstr = sys.platform if sysstr == 'win32': url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_win') else: diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index d23c18a44e9..ce0490d487f 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -236,20 +236,24 @@ def single_defcom_extract(start_from, srcls, is_class_begin=False): if srcls[x].startswith('def ') or srcls[x].startswith('class '): break else: - if (comstart == -1 and srcls[x].replace(" ", '').replace( - "\t", '').replace("\n", '').startswith("\"\"\"")): - comstart = x - comstyle = 2 - continue + if comstart == -1: + s = srcls[x].replace(" ", '').replace("\t", + '').replace("\n", '') + if s.startswith("\"\"\"") or s.startswith("r\"\"\""): + comstart = x + comstyle = 2 + continue if (comstyle == 2 and comstart != -1 and srcls[x].replace(" ", '').replace("\t", '').replace( "\n", '').startswith("\"\"\"")): break - if (comstart == -1 and srcls[x].replace(" ", '').replace( - "\t", '').replace("\n", '').startswith("\'\'\'")): - comstart = x - comstyle = 1 - continue + if comstart == -1: + s = srcls[x].replace(" ", '').replace("\t", + '').replace("\n", '') + if s.startswith("\'\'\'") or s.startswith("r\'\'\'"): + comstart = x + comstyle = 1 + continue if (comstyle == 1 and comstart != -1 and srcls[x].replace(" ", '').replace("\t", '').replace( "\n", '').startswith("\'\'\'")): diff --git a/tools/summary_env.py b/tools/summary_env.py index 39d6acaf536..38bae87651d 100644 --- a/tools/summary_env.py +++ b/tools/summary_env.py @@ -92,7 +92,7 @@ def get_cudnn_info(): cudnn_dll_path = run_shell_command('where cudnn*') if cudnn_dll_path: cudnn_header_path = cudnn_dll_path.split('bin')[ - 0] + 'include\cudnn.h' + 0] + r'include\cudnn.h' cmd = 'type "{0}" | findstr "{1}" | findstr /v "CUDNN_VERSION"' else: envs['cudnn_version'] = None -- GitLab