Unverified commit 3815d7aa authored by Leo Chen, committed by GitHub

Upgrade string literals to raw string (#28989)

* upgrade comment string to raw string

* fix string in

* fix string with ' '

* revert update on comments

* upgrade only necessary

* fix sample code checker

* fix comments with '''
Parent 767d0ba2
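For context: backslash sequences such as `\.`, `\s`, or `\_` inside an ordinary string literal are invalid escape sequences in Python 3; CPython keeps the backslash but warns at compile time (DeprecationWarning, later SyntaxWarning), so prefixing the literal with `r` is the usual fix. Below is a minimal sketch (not part of this commit) of the two cases the diff touches, regex patterns and docstrings that embed backslashes:

    # Minimal sketch, assuming only the standard library; illustrates the r"" prefix,
    # it is not code from the PaddlePaddle repository.
    import re

    # 1. Regex patterns: with a plain string, "\." would trigger an
    #    invalid-escape-sequence warning; the r prefix keeps the backslash verbatim.
    pattern = re.compile(r"aclImdb/train/pos/.*\.txt$")
    assert pattern.search("aclImdb/train/pos/0_9.txt")

    # 2. Docstrings with Sphinx/LaTeX markup such as :math:`t\_list`: the r prefix
    #    preserves every backslash so the rendered documentation is unchanged.
    def clip_example():
        r"""Limit the global norm of :math:`t\_list` to ``clip_norm``."""

    assert "\\_list" in clip_example.__doc__

In both cases the runtime value is usually identical to what the old literal produced; the raw prefix mainly makes the intent explicit and silences the compile-time warning, which is why the commit only upgrades literals that actually contain backslashes.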
#!/bin/python
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import platform
from sys import argv
......@@ -120,7 +135,7 @@ python setup.py install
self.py_str = ["py27", "py35", "py36", "py37"]
self.pip_end = ".whl --no-deps"
self.pip_prefix_linux = "pip install /package/paddlepaddle"
self.pip_prefix_windows = "pip install C:\package\paddlepaddle"
self.pip_prefix_windows = r"pip install C:\package\paddlepaddle"
self.pip_gpu = "_gpu-"
self.pip_cpu = "-"
self.mac_pip = [
......@@ -216,7 +231,7 @@ package:
- matplotlib"""
if not (cuda_str == None):
meta_str = meta_str + cuda_str
blt_str = var.blt_const + blt_var
if (python_str == var.python27):
blt_str = blt_str + """
......@@ -224,7 +239,7 @@ package:
else:
meta_str = meta_str + """
- opencv>=3.4.2"""
meta_str = meta_str + var.test + var.about
meta_filename = "meta.yaml"
build_filename = "bld.bat"
......
......@@ -116,8 +116,8 @@ def train(word_idx):
:rtype: callable
"""
return reader_creator(
re.compile("aclImdb/train/pos/.*\.txt$"),
re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
re.compile(r"aclImdb/train/pos/.*\.txt$"),
re.compile(r"aclImdb/train/neg/.*\.txt$"), word_idx)
@deprecated(
......@@ -137,8 +137,8 @@ def test(word_idx):
:rtype: callable
"""
return reader_creator(
re.compile("aclImdb/test/pos/.*\.txt$"),
re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
re.compile(r"aclImdb/test/pos/.*\.txt$"),
re.compile(r"aclImdb/test/neg/.*\.txt$"), word_idx)
@deprecated(
......@@ -153,7 +153,7 @@ def word_dict():
:rtype: dict
"""
return build_dict(
re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
re.compile(r"aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
@deprecated(
......
......@@ -18,13 +18,13 @@ import paddle.dataset.imdb
import unittest
import re
TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
TRAIN_POS_PATTERN = re.compile(r"aclImdb/train/pos/.*\.txt$")
TRAIN_NEG_PATTERN = re.compile(r"aclImdb/train/neg/.*\.txt$")
TRAIN_PATTERN = re.compile(r"aclImdb/train/.*\.txt$")
TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
TEST_POS_PATTERN = re.compile(r"aclImdb/test/pos/.*\.txt$")
TEST_NEG_PATTERN = re.compile(r"aclImdb/test/neg/.*\.txt$")
TEST_PATTERN = re.compile(r"aclImdb/test/.*\.txt$")
class TestIMDB(unittest.TestCase):
......
......@@ -862,7 +862,7 @@ class DistributedStrategy(object):
@property
def dgc_configs(self):
"""
r"""
Set Deep Gradient Compression training configurations. In general, dgc has serveral configurable
settings that can be configured through a dict.
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
r"""
fleetrun is a module that spawns multiple distributed
process on each training node for gpu training and cpu training.
Usage:
......
......@@ -158,13 +158,13 @@ class ParameterServerOptimizer(MetaOptimizerBase):
['vm_stat'], stdout=subprocess.PIPE).communicate()[0]
# Process vm_stat
vmLines = vm.split('\n')
sep = re.compile(':[\s]+')
sep = re.compile(r':[\s]+')
vmStats = {}
for row in range(1, len(vmLines) - 2):
rowText = vmLines[row].strip()
rowElements = sep.split(rowText)
vmStats[(rowElements[0]
)] = int(rowElements[1].strip('\.')) * 4096
)] = int(rowElements[1].strip(r'\.')) * 4096
return vmStats["Pages free"]
elif platform.system() == "Linux":
mems = {}
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
r"""
paddle.distributed.launch is a module that spawns multiple distributed
process on each training node for gpu training.
Usage:
......
......@@ -166,7 +166,7 @@ class Distribution(object):
class Uniform(Distribution):
"""Uniform distribution with `low` and `high` parameters.
r"""Uniform distribution with `low` and `high` parameters.
Mathematical Details
......@@ -374,7 +374,7 @@ class Uniform(Distribution):
return elementwise_div((lb * ub), (self.high - self.low), name=name)
def entropy(self):
"""Shannon entropy in nats.
r"""Shannon entropy in nats.
The entropy is
......@@ -391,7 +391,7 @@ class Uniform(Distribution):
class Normal(Distribution):
"""The Normal distribution with location `loc` and `scale` parameters.
r"""The Normal distribution with location `loc` and `scale` parameters.
Mathematical details
......@@ -534,7 +534,7 @@ class Normal(Distribution):
return output
def entropy(self):
"""Shannon entropy in nats.
r"""Shannon entropy in nats.
The entropy is
......@@ -599,7 +599,7 @@ class Normal(Distribution):
name=name)
def kl_divergence(self, other):
"""The KL-divergence between two normal distributions.
r"""The KL-divergence between two normal distributions.
The probability density function (pdf) is
......@@ -644,7 +644,7 @@ class Normal(Distribution):
class Categorical(Distribution):
"""
r"""
Categorical distribution is a discrete probability distribution that
describes the possible results of a random variable that can take on
one of K possible categories, with the probability of each category
......
......@@ -40,7 +40,7 @@ class BaseErrorClipAttr(object):
class ErrorClipByValue(BaseErrorClipAttr):
"""
r"""
Clips tensor values to the range [min, max].
Given a tensor ``t`` (see Examples below), this operation clips its value \
......@@ -241,7 +241,7 @@ class ClipGradByValue(ClipGradBase):
class ClipGradByNorm(ClipGradBase):
"""
r"""
Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
- If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.
......@@ -343,7 +343,7 @@ class ClipGradByNorm(ClipGradBase):
class ClipGradByGlobalNorm(ClipGradBase):
"""
r"""
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
:math:`t\_list` , and limit it to ``clip_norm`` .
......
......@@ -137,7 +137,7 @@ def var_conv_2d(input,
act=None,
dtype='float32',
name=None):
"""
r"""
The var_conv_2d layer calculates the output base on the :attr:`input` with variable length,
row, col, input channel, filter size and strides. Both :attr:`input`, :attr:`row`,
and :attr:`col` are 1-level LodTensor. The convolution operation is same as conv2d layer with
......@@ -477,7 +477,7 @@ def fused_embedding_seq_pool(input,
combiner='sum',
param_attr=None,
dtype='float32'):
"""
r"""
**Embedding Sequence pool**
This layer is the fusion of lookup table and sequence_pool.
......@@ -1442,7 +1442,7 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None):
def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'):
"""
r"""
**Pull Box Extended Sparse Layer**
This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
BoxPS lookup table. The result of this lookup is the embedding of each ID in the
......@@ -1640,7 +1640,7 @@ def fused_bn_add_act(x,
moving_variance_name=None,
act=None,
name=None):
"""
r"""
This Op performs batch norm on input x, and adds the result to input y. Then
it performs activation on the sum. The data format of inputs must be NHWC
`[batch, in_height, in_width, in_channels]`.
......
......@@ -175,7 +175,7 @@ def basic_gru(input,
activation=None,
dtype='float32',
name='basic_gru'):
"""
r"""
GRU implementation using basic operator, supports multiple layers and bidirectional gru.
.. math::
......@@ -418,7 +418,7 @@ def basic_lstm(input,
forget_bias=1.0,
dtype='float32',
name='basic_lstm'):
"""
r"""
LSTM implementation using basic operators, supports multiple layers and bidirectional LSTM.
.. math::
......@@ -697,7 +697,7 @@ def basic_lstm(input,
class BasicLSTMUnit(Layer):
"""
r"""
****
BasicLSTMUnit class, Using basic operator to build LSTM
The algorithm can be described as the code below.
......
......@@ -44,7 +44,7 @@ DEBUG = False
def memory_usage(program, batch_size):
"""
r"""
Get the estimate memory usage of program with input batch size.
Args:
......
......@@ -64,7 +64,7 @@ class ImperativeQuantAware(object):
act_preprocess_layer=None,
weight_quantize_layer=None,
act_quantize_layer=None):
"""
r"""
The constructor for ImperativeQuantAware.
Args:
......
......@@ -30,7 +30,7 @@ __all__ = [
class FakeQuantMovingAverage(layers.Layer):
"""
r"""
FakeQuantMovingAverage layer does the moving_average_abs_max quant and then dequant.
Its computational formula is described as below:
......@@ -128,7 +128,7 @@ class FakeQuantMovingAverage(layers.Layer):
class FakeQuantAbsMax(layers.Layer):
"""
r"""
FakeQuantAbsMax layer does the abs_max quant and then dequant.
Its computational formula is described as below:
......@@ -545,7 +545,7 @@ class QuantizedLinear(layers.Layer):
class MovingAverageAbsMaxScale(layers.Layer):
def __init__(self, name=None, moving_rate=0.9, dtype='float32'):
"""
r"""
MovingAverageMaxScale layer is used to calculating the output quantization scale of Layer.
Its computational formula is described as below:
......
......@@ -37,7 +37,7 @@ class QuantInt8MkldnnPass(object):
"""
def __init__(self, _scope=None, _place=None):
"""
r"""
Args:
scope(fluid.Scope): scope is used to initialize the new parameters.
place(fluid.CPUPlace): place is used to initialize the new parameters.
......
......@@ -239,7 +239,7 @@ class QuantizationTransformPass(object):
act_preprocess_func=None,
optimizer_func=None,
executor=None):
"""
r"""
Constructor.
Args:
......
......@@ -33,7 +33,7 @@ _logger = get_logger(
class HDFSClient(object):
"""
r"""
A tool of HDFS
Args:
......@@ -376,7 +376,7 @@ class HDFSClient(object):
_logger.info("HDFS list path: {} successfully".format(hdfs_path))
ret_lines = []
regex = re.compile('\s+')
regex = re.compile(r'\s+')
out_lines = output.strip().split("\n")
for line in out_lines:
re_line = regex.split(line)
......@@ -418,7 +418,7 @@ class HDFSClient(object):
_logger.info("HDFS list all files: {} successfully".format(
hdfs_path))
lines = []
regex = re.compile('\s+')
regex = re.compile(r'\s+')
out_lines = output.strip().split("\n")
for line in out_lines:
re_line = regex.split(line)
......
......@@ -224,7 +224,7 @@ def less_than_ver(a, b):
import operator
def to_list(s):
s = re.sub('(\.0+)+$', '', s)
s = re.sub(r'(\.0+)+$', '', s)
return [int(x) for x in s.split('.')]
return operator.lt(to_list(a), to_list(b))
......
......@@ -101,10 +101,11 @@ class _DatasetKind(object):
ITER = 1
@staticmethod
def create_fetcher(kind, dataset, auto_collate_batch, collate_fn, drop_last):
def create_fetcher(kind, dataset, auto_collate_batch, collate_fn,
drop_last):
if kind == _DatasetKind.MAP:
return _MapDatasetFetcher(dataset, auto_collate_batch,
collate_fn, drop_last)
return _MapDatasetFetcher(dataset, auto_collate_batch, collate_fn,
drop_last)
elif kind == _DatasetKind.ITER:
return _IterableDatasetFetcher(dataset, auto_collate_batch,
collate_fn, drop_last)
......@@ -240,7 +241,8 @@ class _DataLoaderIterBase(object):
if self._dataset_kind == _DatasetKind.MAP:
self._sampler_iter = iter(list(range(len(self._dataset))))
else:
self._sampler_iter = iter(_InfiniteIterableSampler(self._dataset, 1))
self._sampler_iter = iter(
_InfiniteIterableSampler(self._dataset, 1))
self._collate_fn = loader.collate_fn
# LoDTensorBlockingQueue instance for create_py_reader and a thread
......@@ -380,8 +382,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
# NOTE(chenweihang): _worker_loop must be top level method to be pickled
def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event,
auto_collate_batch, collate_fn, init_fn, worker_id, num_workers,
use_shared_memory):
auto_collate_batch, collate_fn, init_fn, worker_id,
num_workers, use_shared_memory):
try:
# NOTE: [ mmap files clear ] When the child process exits unexpectedly,
# some shared memory objects may have been applied for but have not yet
......@@ -400,8 +402,8 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event,
try:
if init_fn is not None:
init_fn(worker_id)
fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset,
auto_collate_batch, collate_fn, True)
fetcher = _DatasetKind.create_fetcher(
dataset_kind, dataset, auto_collate_batch, collate_fn, True)
except:
init_exception = Exception("init_fn failed in worker {}: " \
"{}".format(worker_id, sys.exc_info()))
......
......@@ -22,7 +22,7 @@ from google.protobuf import text_format
class DownpourSGD(object):
"""
r"""
Distributed optimizer of downpour stochastic gradient descent
Standard implementation of Google's Downpour SGD
in Large Scale Distributed Deep Networks
......
......@@ -52,7 +52,7 @@ class DownpourServer(Server):
def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
slot_value_var):
"""
r"""
Args:
table_id(int): id of sparse params table
learning_rate(float): the learning rate used to update parameters. \
......@@ -84,7 +84,7 @@ class DownpourServer(Server):
table.accessor.downpour_accessor_param.delete_threshold = 0.8
def add_dense_table(self, table_id, learning_rate, param_var, grad_var):
"""
r"""
Args:
table_id(int): id of sparse params table
learning_rate(float): the learning rate used to update parameters. \
......@@ -135,7 +135,7 @@ class DownpourWorker(Worker):
def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
slot_value_vars):
"""
r"""
Args:
table_id(int): id of sparse params table
learning_rate(float): the learning rate used to update parameters. \
......@@ -153,7 +153,7 @@ class DownpourWorker(Worker):
[var.name + "@GRAD" for var in slot_value_vars])
def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars):
"""
r"""
Args:
table_id(int): id of sparse params table
learning_rate(float): the learning rate used to update parameters. \
......
......@@ -593,7 +593,7 @@ def grad(outputs,
@framework.dygraph_only
def to_variable(value, name=None, zero_copy=None, dtype=None):
"""
r"""
:api_attr: imperative
The API will create a ``Variable`` or ``ComplexVariable`` object from
......
......@@ -183,7 +183,7 @@ class PiecewiseDecay(LearningRateDecay):
class NaturalExpDecay(LearningRateDecay):
"""
r"""
:api_attr: imperative
Applies natural exponential decay to the initial learning rate.
......@@ -266,7 +266,7 @@ class NaturalExpDecay(LearningRateDecay):
class ExponentialDecay(LearningRateDecay):
"""
r"""
:api_attr: imperative
Applies exponential decay to the learning rate.
......@@ -348,7 +348,7 @@ class ExponentialDecay(LearningRateDecay):
class InverseTimeDecay(LearningRateDecay):
"""
r"""
:api_attr: imperative
Applies inverse time decay to the initial learning rate.
......@@ -426,7 +426,7 @@ class InverseTimeDecay(LearningRateDecay):
class PolynomialDecay(LearningRateDecay):
"""
r"""
:api_attr: imperative
Applies polynomial decay to the initial learning rate.
......@@ -520,7 +520,7 @@ class PolynomialDecay(LearningRateDecay):
class CosineDecay(LearningRateDecay):
"""
r"""
:api_attr: imperative
Applies cosine decay to the learning rate.
......@@ -578,7 +578,7 @@ class CosineDecay(LearningRateDecay):
class NoamDecay(LearningRateDecay):
"""
r"""
:api_attr: imperative
Applies Noam decay to the initial learning rate.
......
......@@ -42,7 +42,7 @@ __all__ = [
class Conv2D(layers.Layer):
"""
r"""
This interface is used to construct a callable object of the ``Conv2D`` class.
For more details, refer to code examples.
The convolution2D layer calculates the output based on the input, filter
......@@ -282,7 +282,7 @@ class Conv2D(layers.Layer):
class Conv3D(layers.Layer):
"""
r"""
**Convlution3D Layer**
The convolution3D layer calculates the output based on the input, filter
......@@ -484,7 +484,7 @@ class Conv3D(layers.Layer):
class Conv3DTranspose(layers.Layer):
"""
r"""
**Convlution3D transpose layer**
The convolution3D transpose layer calculates the output based on the input,
......@@ -701,7 +701,7 @@ class Conv3DTranspose(layers.Layer):
class Pool2D(layers.Layer):
"""
r"""
This interface is used to construct a callable object of the ``Pool2D`` class.
For more details, refer to code examples.
......@@ -1009,7 +1009,7 @@ class Linear(layers.Layer):
class InstanceNorm(layers.Layer):
"""
r"""
This interface is used to construct a callable object of the ``InstanceNorm`` class.
For more details, refer to code examples.
......@@ -1143,7 +1143,7 @@ class InstanceNorm(layers.Layer):
class BatchNorm(layers.Layer):
"""
r"""
:alias_main: paddle.nn.BatchNorm
:alias: paddle.nn.BatchNorm,paddle.nn.layer.BatchNorm,paddle.nn.layer.norm.BatchNorm
:old_api: paddle.fluid.dygraph.BatchNorm
......@@ -1492,7 +1492,7 @@ class Dropout(layers.Layer):
class Embedding(layers.Layer):
"""
r"""
:alias_main: paddle.nn.Embedding
:alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding
:old_api: paddle.fluid.dygraph.Embedding
......@@ -1652,7 +1652,7 @@ class Embedding(layers.Layer):
class LayerNorm(layers.Layer):
"""
r"""
:alias_main: paddle.nn.LayerNorm
:alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
:old_api: paddle.fluid.dygraph.LayerNorm
......@@ -2242,7 +2242,7 @@ class NCE(layers.Layer):
class PRelu(layers.Layer):
"""
r"""
This interface is used to construct a callable object of the ``PRelu`` class.
For more details, refer to code examples.
It implements three activation methods of the ``PRelu`` activation function.
......@@ -2350,7 +2350,7 @@ class PRelu(layers.Layer):
class BilinearTensorProduct(layers.Layer):
"""
r"""
**Add Bilinear Tensor Product Layer**
......@@ -2467,7 +2467,7 @@ class BilinearTensorProduct(layers.Layer):
class Conv2DTranspose(layers.Layer):
"""
r"""
This interface is used to construct a callable object of the ``Conv2DTranspose`` class.
For more details, refer to code examples.
The convolution2D transpose layer calculates the output based on the input,
......@@ -2979,7 +2979,7 @@ class GroupNorm(layers.Layer):
class SpectralNorm(layers.Layer):
"""
r"""
:alias_main: paddle.nn.SpectralNorm
:alias: paddle.nn.SpectralNorm,paddle.nn.layer.SpectralNorm,paddle.nn.layer.norm.SpectralNorm
:old_api: paddle.fluid.dygraph.SpectralNorm
......
......@@ -20,7 +20,7 @@ __all__ = ['LSTMCell', 'GRUCell']
class LSTMCell(Layer):
"""
r"""
LSTMCell implementation using basic operators.
There are two LSTMCell version, the default one is compatible with CUDNN LSTM implementation.
The algorithm can be described as the equations below.
......@@ -236,7 +236,7 @@ class LSTMCell(Layer):
class GRUCell(Layer):
"""
r"""
GRU implementation using basic operators.
There are two GRUCell version, the default one is compatible with CUDNN GRU implementation.
The algorithm can be described as the equations below.
......
......@@ -2255,7 +2255,7 @@ class Operator(object):
return self.desc.type()
def input(self, name):
"""
r"""
Get the input arguments according to the input parameter name.
Args:
......@@ -2306,7 +2306,7 @@ class Operator(object):
return self.desc.output_arg_names()
def output(self, name):
"""
r"""
Get output arguments by the output parameter name.
Args:
......
......@@ -527,7 +527,7 @@ class DownpourWorker(Worker):
def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars,
dense_start_table_id, sparse_table_names):
"""
r"""
Args:
table_id(int): id of sparse params table
learning_rate(float): the learning rate used to update parameters. \
......
......@@ -153,7 +153,7 @@ class FleetUtil(object):
stat_pos="_generated_var_2",
stat_neg="_generated_var_3",
print_prefix=""):
"""
r"""
Print global auc of all distributed workers.
Args:
......@@ -1073,7 +1073,7 @@ class FleetUtil(object):
hadoop_fs_name,
hadoop_fs_ugi,
hadoop_home="$HADOOP_HOME"):
"""
r"""
get last saved base xbox info from xbox_base_done.txt
Args:
......@@ -1118,7 +1118,7 @@ class FleetUtil(object):
hadoop_fs_name,
hadoop_fs_ugi,
hadoop_home="$HADOOP_HOME"):
"""
r"""
get last saved xbox info from xbox_patch_done.txt
Args:
......@@ -1164,7 +1164,7 @@ class FleetUtil(object):
hadoop_fs_name,
hadoop_fs_ugi,
hadoop_home="$HADOOP_HOME"):
"""
r"""
get last saved model info from donefile.txt
Args:
......@@ -1279,7 +1279,7 @@ class FleetUtil(object):
q_name="q",
pos_ins_num_name="pos",
total_ins_num_name="total"):
"""
r"""
get global metrics, including auc, bucket_error, mae, rmse,
actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num.
......@@ -1469,7 +1469,7 @@ class FleetUtil(object):
pos_ins_num_name="pos",
total_ins_num_name="total",
print_prefix=""):
"""
r"""
print global metrics, including auc, bucket_error, mae, rmse,
actual_ctr, predicted_ctr, copc, mean_predict_qvalue, total_ins_num.
......
......@@ -459,7 +459,7 @@ class TruncatedNormalInitializer(Initializer):
class XavierInitializer(Initializer):
"""
r"""
This class implements the Xavier weight initializer from the paper
`Understanding the difficulty of training deep feedforward neural
networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
......@@ -595,7 +595,7 @@ class XavierInitializer(Initializer):
class MSRAInitializer(Initializer):
"""Implements the MSRA initializer a.k.a. Kaiming Initializer
r"""Implements the MSRA initializer a.k.a. Kaiming Initializer
This class implements the weight initialization from the paper
`Delving Deep into Rectifiers: Surpassing Human-Level Performance on
......
......@@ -137,7 +137,7 @@ def embedding(input,
padding_idx=None,
param_attr=None,
dtype='float32'):
"""
r"""
:api_attr: Static Graph
The operator is used to lookup embeddings vector of ids provided by :attr:`input` .
......
......@@ -59,7 +59,7 @@ class LayerHelperBase(object):
return cls.__dtype
def to_variable(self, value, name=None):
"""
r"""
The API will create a ``Variable`` object from numpy\.ndarray or Variable object.
Parameters:
......
......@@ -3012,7 +3012,7 @@ class DynamicRNN(object):
self.mem_link = []
def step_input(self, x, level=0):
"""
r"""
This function is used to set sequence x as DynamicRNN's input.
The maximum sequence length in x determines the number of time steps
the RNN unit will be executed. DynamicRNN can take multiple inputs.
......@@ -3144,7 +3144,7 @@ class DynamicRNN(object):
return array_read(array=input_array, i=self.step_idx)
def static_input(self, x):
"""
r"""
This function is used to set x as DynamicRNN's static input. It is optional.
- Case 1, set static input with LoD
......@@ -3348,7 +3348,7 @@ class DynamicRNN(object):
value=0.0,
need_reorder=False,
dtype='float32'):
"""
r"""
Create a memory Variable for DynamicRNN to deliver data cross time steps.
It can be initialized by an existing Tensor or a constant Tensor of given
dtype and shape.
......
......@@ -77,7 +77,7 @@ def retinanet_target_assign(bbox_pred,
num_classes=1,
positive_overlap=0.5,
negative_overlap=0.4):
"""
r"""
**Target Assign Layer for the detector RetinaNet.**
This OP finds out positive and negative samples from all anchors
......@@ -471,7 +471,7 @@ def rpn_target_assign(bbox_pred,
def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25):
"""
r"""
:alias_main: paddle.nn.functional.sigmoid_focal_loss
:alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss
:old_api: paddle.fluid.layers.sigmoid_focal_loss
......@@ -821,7 +821,7 @@ def box_coder(prior_box,
box_normalized=True,
name=None,
axis=0):
"""
r"""
**Box Coder Layer**
......@@ -1523,7 +1523,7 @@ def ssd_loss(location,
mining_type='max_negative',
normalize=True,
sample_size=None):
"""
r"""
:alias_main: paddle.nn.functional.ssd_loss
:alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss
:old_api: paddle.fluid.layers.ssd_loss
......@@ -1930,7 +1930,7 @@ def density_prior_box(input,
offset=0.5,
flatten_to_2d=False,
name=None):
"""
r"""
This op generates density prior boxes for SSD(Single Shot MultiBox Detector)
algorithm. Each position of the input produce N prior boxes, N is
......@@ -2741,7 +2741,7 @@ def generate_proposal_labels(rpn_rois,
def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois,
labels_int32, num_classes, resolution):
"""
r"""
**Generate Mask Labels for Mask-RCNN**
......@@ -3671,7 +3671,7 @@ def distribute_fpn_proposals(fpn_rois,
refer_scale,
rois_num=None,
name=None):
"""
r"""
**This op only takes LoDTensor as input.** In Feature Pyramid Networks
(FPN) models, it is needed to distribute all proposals into different FPN
......
......@@ -113,7 +113,7 @@ class Distribution(object):
class Uniform(Distribution):
"""Uniform distribution with `low` and `high` parameters.
r"""Uniform distribution with `low` and `high` parameters.
Mathematical Details
......@@ -258,7 +258,7 @@ class Uniform(Distribution):
class Normal(Distribution):
"""The Normal distribution with location `loc` and `scale` parameters.
r"""The Normal distribution with location `loc` and `scale` parameters.
Mathematical details
......@@ -423,7 +423,7 @@ class Normal(Distribution):
class Categorical(Distribution):
"""
r"""
Categorical distribution is a discrete probability distribution that
describes the possible results of a random variable that can take on
one of K possible categories, with the probability of each category
......@@ -529,7 +529,7 @@ class Categorical(Distribution):
class MultivariateNormalDiag(Distribution):
"""
r"""
A multivariate normal (also called Gaussian) distribution parameterized by a mean vector
and a covariance matrix.
......
......@@ -440,7 +440,7 @@ Applies piecewise decay to the initial learning rate.
def cosine_decay(learning_rate, step_each_epoch, epochs):
"""
r"""
Applies cosine decay to the learning rate.
......
......@@ -57,7 +57,7 @@ def center_loss(input,
alpha,
param_attr,
update_center=True):
"""
r"""
:api_attr: Static Graph
**Center loss Cost layer**
......@@ -151,7 +151,7 @@ def center_loss(input,
def bpr_loss(input, label, name=None):
"""
r"""
**Bayesian Personalized Ranking Loss Operator**
......@@ -203,7 +203,7 @@ def bpr_loss(input, label, name=None):
def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
"""
r"""
:alias_main: paddle.nn.functional.cross_entropy
:alias: paddle.nn.functional.cross_entropy,paddle.nn.functional.loss.cross_entropy
:old_api: paddle.fluid.layers.cross_entropy
......@@ -300,7 +300,7 @@ def cross_entropy2(input, label, ignore_index=kIgnoreIndex):
def square_error_cost(input, label):
"""
r"""
This op accepts input predictions and target label and returns the
squared error cost.
......@@ -1185,7 +1185,7 @@ def softmax_with_cross_entropy(logits,
numeric_stable_mode=True,
return_softmax=False,
axis=-1):
"""
r"""
:alias_main: paddle.nn.functional.softmax_with_cross_entropy
:alias: paddle.nn.functional.softmax_with_cross_entropy,paddle.nn.functional.loss.softmax_with_cross_entropy
:old_api: paddle.fluid.layers.softmax_with_cross_entropy
......@@ -1312,7 +1312,7 @@ def softmax_with_cross_entropy(logits,
def rank_loss(label, left, right, name=None):
"""
r"""
This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model
with a training sample consisting of a pair of documents (A and B), The label (P)
......@@ -1375,7 +1375,7 @@ def rank_loss(label, left, right, name=None):
def margin_rank_loss(label, left, right, margin=0.1, name=None):
"""
r"""
Margin Ranking Loss Layer for ranking problem,
which compares left score and right score passed in.
The ranking loss can be defined as following equation:
......@@ -1551,7 +1551,7 @@ def teacher_student_sigmoid_loss(input,
def huber_loss(input, label, delta):
"""
r"""
This operator computes the Huber loss between input and label.
Huber loss is commonly used in regression tasks. Compared to square_error_cost, Huber loss is more robust and less sensitivity to outliers.
......@@ -1681,7 +1681,7 @@ from .control_flow import equal
def npair_loss(anchor, positive, labels, l2_reg=0.002):
'''
r'''
Read `Improved Deep Metric Learning with Multi class N pair Loss Objective\
<http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/\
......
......@@ -114,7 +114,7 @@ def auc(input,
num_thresholds=2**12 - 1,
topk=1,
slide_steps=1):
"""
r"""
**Area Under the Curve (AUC) Layer**
This implementation computes the AUC according to forward output and label.
......
......@@ -215,7 +215,7 @@ def fc(input,
bias_attr=None,
act=None,
name=None):
"""
r"""
:api_attr: Static Graph
**Fully Connected Layer**
......@@ -377,7 +377,7 @@ def embedding(input,
padding_idx=None,
param_attr=None,
dtype='float32'):
"""
r"""
:api_attr: Static Graph
**WARING:** This OP will be deprecated in a future release. This OP requires the
......@@ -530,7 +530,7 @@ def _pull_sparse(input,
padding_id=0,
dtype='float32',
scale_sparse_grad=True):
"""
r"""
**Pull Fleet Sparse Layer**
This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
......@@ -601,7 +601,7 @@ def _pull_sparse_v2(input,
padding_id=0,
dtype='float32',
scale_sparse_grad=True):
"""
r"""
**Pull Fleet Sparse Layer**
This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
......@@ -664,7 +664,7 @@ def _pull_sparse_v2(input,
def _pull_box_sparse(input, size, dtype='float32'):
"""
r"""
**Pull Box Sparse Layer**
This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
......@@ -1050,7 +1050,7 @@ def chunk_eval(input,
num_chunk_types,
excluded_chunk_types=None,
seq_length=None):
"""
r"""
This operator computes the precision, recall and F1-score for chunk detection.
It is often used in sequence tagging tasks, such as Named Entity Recognition(NER).
......@@ -1199,7 +1199,7 @@ def chunk_eval(input,
@deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax")
def softmax(input, use_cudnn=False, name=None, axis=-1):
"""
r"""
This operator implements the softmax layer. The calculation process is as follows:
1. The dimension :attr:`axis` of the ``input`` will be permuted to the last.
......@@ -1339,7 +1339,7 @@ def conv2d(input,
act=None,
name=None,
data_format="NCHW"):
"""
r"""
:api_attr: Static Graph
The convolution2D layer calculates the output based on the input, filter
......@@ -1618,7 +1618,7 @@ def conv3d(input,
act=None,
name=None,
data_format="NCDHW"):
"""
r"""
:api_attr: Static Graph
The convolution3D layer calculates the output based on the input, filter
......@@ -2325,7 +2325,7 @@ def adaptive_pool2d(input,
pool_type="max",
require_index=False,
name=None):
"""
r"""
This operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
......@@ -2471,7 +2471,7 @@ def adaptive_pool3d(input,
pool_type="max",
require_index=False,
name=None):
"""
r"""
This operation calculates the output based on the input, pool_size,
pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
......@@ -2638,7 +2638,7 @@ def batch_norm(input,
moving_variance_name=None,
do_model_average_for_mean_and_var=True,
use_global_stats=False):
"""
r"""
:api_attr: Static Graph
**Batch Normalization Layer**
......@@ -2902,7 +2902,7 @@ def inplace_abn(input,
do_model_average_for_mean_and_var=True,
use_global_stats=False,
act_alpha=1.0):
"""
r"""
**In-place Activation Batch Normalization Layer**
This layer calculates batch normalization and activation with in-place memory.
......@@ -3096,7 +3096,7 @@ def instance_norm(input,
param_attr=None,
bias_attr=None,
name=None):
"""
r"""
:api_attr: Static Graph
**Instance Normalization Layer**
......@@ -3231,7 +3231,7 @@ def data_norm(input,
sync_stats=False,
summary_decay_rate=0.9999999,
enable_scale_and_shift=False):
"""
r"""
:api_attr: Static Graph
**Data Normalization Layer**
......@@ -3416,7 +3416,7 @@ def layer_norm(input,
bias_attr=None,
act=None,
name=None):
"""
r"""
:api_attr: Static Graph
**Layer Normalization Layer**
......@@ -3646,7 +3646,7 @@ def group_norm(input,
@templatedoc()
def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None):
"""
r"""
:api_attr: Static Graph
**Spectral Normalization Layer**
......@@ -3765,7 +3765,7 @@ def conv2d_transpose(input,
act=None,
name=None,
data_format='NCHW'):
"""
r"""
:api_attr: Static Graph
The convolution2D transpose layer calculates the output based on the input,
......@@ -4057,7 +4057,7 @@ def conv3d_transpose(input,
act=None,
name=None,
data_format='NCDHW'):
"""
r"""
:api_attr: Static Graph
The convolution3D transpose layer calculates the output based on the input,
......@@ -4961,7 +4961,7 @@ def split(input, num_or_sections, dim=-1, name=None):
def l2_normalize(x, axis, epsilon=1e-12, name=None):
"""
r"""
This op normalizes `x` along dimension `axis` using an L2
norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes
......@@ -5286,7 +5286,7 @@ def ctc_greedy_decoder(input,
input_length=None,
padding_value=0,
name=None):
"""
r"""
This op is used to decode sequences by greedy policy by the following steps:
1. Get the indexes of maximum value for each row in input. a.k.a.
......@@ -5538,7 +5538,7 @@ def im2sequence(input,
input_image_size=None,
out_stride=1,
name=None):
"""
r"""
:api_attr: Static Graph
Extracts image patches from the input tensor to form a tensor of shape
......@@ -6046,7 +6046,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None):
"""
r"""
:alias_main: paddle.reshape
:alias: paddle.reshape,paddle.tensor.reshape,paddle.tensor.manipulation.reshape
......@@ -6535,7 +6535,7 @@ def lod_append(x, level):
def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None,
data_format='NCHW'):
"""
r"""
:alias_main: paddle.nn.functional.lrn
:alias: paddle.nn.functional.lrn,paddle.nn.functional.norm.lrn
:old_api: paddle.fluid.layers.lrn
......@@ -6625,7 +6625,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None,
def pad(x, paddings, pad_value=0., name=None):
"""
r"""
:alias_main: paddle.nn.functional.pad
:alias: paddle.nn.functional.pad,paddle.nn.functional.common.pad
:old_api: paddle.fluid.layers.pad
......@@ -6695,7 +6695,7 @@ def pad(x, paddings, pad_value=0., name=None):
def pad_constant_like(x, y, pad_value=0., name=None):
"""
r"""
Pad :attr:`y` with :attr:`pad_value`, the number of values padded to
the edges of each axis is specified by the difference of the shape
of :attr:`x` and :attr:`y` . ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n))
......@@ -6794,7 +6794,7 @@ def label_smooth(label,
epsilon=0.1,
dtype="float32",
name=None):
"""
r"""
:alias_main: paddle.nn.functional.label_smooth
:alias: paddle.nn.functional.label_smooth,paddle.nn.functional.common.label_smooth
:old_api: paddle.fluid.layers.label_smooth
......@@ -7067,7 +7067,7 @@ def roi_align(input,
def dice_loss(input, label, epsilon=0.00001, name=None):
"""
r"""
Dice loss for comparing the similarity between the input predictions and the label.
This implementation is for binary classification, where the input is sigmoid
......@@ -8500,7 +8500,7 @@ def scatter(input, index, updates, name=None, overwrite=True):
def scatter_nd_add(ref, index, updates, name=None):
"""
r"""
**Scatter_nd_add Layer**
Output is obtained by applying sparse addition to a single value
......@@ -8686,7 +8686,7 @@ def random_crop(x, shape, seed=None):
def log(x, name=None):
"""
r"""
Calculates the natural log of the given input tensor, element-wise.
.. math::
......@@ -8768,7 +8768,7 @@ def relu(x, name=None):
@deprecated(since="2.0.0", update_to="paddle.nn.functional.selu")
def selu(x, scale=None, alpha=None, name=None):
"""
r"""
Selu Operator.
......@@ -8836,7 +8836,7 @@ def selu(x, scale=None, alpha=None, name=None):
def mean_iou(input, label, num_classes):
"""
r"""
Mean Intersection-Over-Union is a common evaluation metric for
semantic image segmentation, which first computes the IOU for each
semantic class and then computes the average over classes.
......@@ -9640,7 +9640,7 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None):
@templatedoc()
def swish(x, beta=1.0, name=None):
"""
r"""
:alias_main: paddle.nn.functional.swish
:alias: paddle.nn.functional.swish,paddle.nn.functional.activation.swish
:old_api: paddle.fluid.layers.swish
......@@ -9725,7 +9725,7 @@ def swish(x, beta=1.0, name=None):
@deprecated(since="2.0.0", update_to="paddle.static.nn.prelu")
def prelu(x, mode, param_attr=None, name=None):
"""
r"""
prelu activation.
.. math::
......@@ -9883,7 +9883,7 @@ def leaky_relu(x, alpha=0.02, name=None):
def soft_relu(x, threshold=40.0, name=None):
"""
r"""
SoftRelu Activation Operator.
......@@ -9932,7 +9932,7 @@ def soft_relu(x, threshold=40.0, name=None):
def flatten(x, axis=1, name=None):
"""
r"""
**Flatten op**
Flatten the input tensor into a 2D matrix.
......@@ -12153,7 +12153,7 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
def logical_and(x, y, out=None, name=None):
"""
r"""
``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``.
Each element of ``out`` is calculated by
......@@ -12230,7 +12230,7 @@ def logical_or(x, y, out=None, name=None):
def logical_xor(x, y, out=None, name=None):
"""
r"""
``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Tensor``.
Each element of ``out`` is calculated by
......@@ -12565,7 +12565,7 @@ def maxout(x, groups, name=None, axis=1):
def space_to_depth(x, blocksize, name=None):
"""
r"""
Gives a blocksize to space_to_depth the input LoDtensor with Layout: [batch, channel, height, width]
......@@ -12753,7 +12753,7 @@ def affine_channel(x,
def similarity_focus(input, axis, indexes, name=None):
"""
r"""
SimilarityFocus Operator
Generate a similarity focus mask with the same shape of input using the following method:
......@@ -13034,7 +13034,7 @@ def grid_sampler(x, grid, name=None):
def log_loss(input, label, epsilon=1e-4, name=None):
"""
r"""
**Negative Log Loss Layer**
......@@ -13086,7 +13086,7 @@ def log_loss(input, label, epsilon=1e-4, name=None):
def add_position_encoding(input, alpha, beta, name=None):
"""
r"""
This operator performs weighted sum of input feature at each position
(position in the sequence) and the corresponding position encoding.
......@@ -13160,7 +13160,7 @@ def bilinear_tensor_product(x,
name=None,
param_attr=None,
bias_attr=None):
"""
r"""
:api_attr: Static Graph
**Bilinear Tensor Product Layer**
......@@ -13987,7 +13987,7 @@ def fsp_matrix(x, y):
def continuous_value_model(input, cvm, use_cvm=True):
"""
r"""
**continuous_value_model layers**
......@@ -14092,7 +14092,7 @@ def where(condition):
@deprecated(since="2.0.0", update_to="paddle.sign")
def sign(x):
"""
r"""
This OP returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero.
Args:
......@@ -14125,7 +14125,7 @@ def sign(x):
def unique(x, dtype='int32'):
"""
r"""
Return a unique tensor for `x` and an index tensor pointing to this unique tensor.
Args:
......@@ -14164,7 +14164,7 @@ def unique(x, dtype='int32'):
def unique_with_counts(x, dtype='int32'):
"""
r"""
This OP return a unique tensor for `x` , and count tensor that the count of unique result in raw input, \
and an index tensor pointing to this unique tensor.
......@@ -14236,7 +14236,7 @@ def deformable_conv(input,
bias_attr=None,
modulated=True,
name=None):
"""
r"""
:api_attr: Static Graph
**Deformable Convolution op**
......@@ -14453,7 +14453,7 @@ def deformable_conv(input,
def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
"""
r"""
This op returns a col buffer of sliding local blocks of input x, also known
as im2col for batched 2D image tensors. For each block under the convolution filter,
......@@ -14590,7 +14590,7 @@ def deformable_roi_pooling(input,
trans_std=0.1,
position_sensitive=False,
name=None):
"""
r"""
Deformable ROI Pooling Layer
......@@ -14821,7 +14821,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
@templatedoc()
def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
"""
r"""
This operator implements the hard_swish activation function.
Hard_swish is proposed in MobileNetV3, and performs better in computational stability and efficiency compared to swish function.
For more details please refer to: https://arxiv.org/pdf/1905.02244.pdf
......@@ -14890,7 +14890,7 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
@templatedoc()
def mish(x, threshold=20, name=None):
"""
r"""
This operator implements the mish activation function.
Refer to `Mish: A Self Regularized Non-Monotonic Neural
Activation Function <https://arxiv.org/abs/1908.08681>`_
......@@ -14964,7 +14964,7 @@ def mish(x, threshold=20, name=None):
def gather_tree(ids, parents):
"""
r"""
To be used after beam search. After beam search, we get selected ids at
each time step and the corresponding parents in the search tree. Both ids
and parents have the layout :attr:`[max_time, batch_size, beam_size]`. Then
......
......@@ -413,7 +413,7 @@ def softshrink(x, alpha=None):
return _softshrink_(**kwargs)
softshrink.__doc__ = """
softshrink.__doc__ = r"""
:alias_main: paddle.nn.functional.softshrink
:alias: paddle.nn.functional.softshrink,paddle.nn.functional.activation.softshrink
:old_api: paddle.fluid.layers.softshrink
......@@ -530,7 +530,7 @@ def thresholded_relu(x, threshold=None):
return _thresholded_relu_(**kwargs)
thresholded_relu.__doc__ = """
thresholded_relu.__doc__ = r"""
:alias_main: paddle.nn.functional.thresholded_relu
:alias: paddle.nn.functional.thresholded_relu,paddle.nn.functional.activation.thresholded_relu
:old_api: paddle.fluid.layers.thresholded_relu
......@@ -617,7 +617,7 @@ def gelu(x, approximate=False):
return _gelu_(**kwargs)
gelu.__doc__ = """
gelu.__doc__ = r"""
:strong:`GeLU Activation Operator`
For more details, see [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415).
......@@ -701,7 +701,7 @@ def erf(x, name=None):
return _erf_(**kwargs)
erf.__doc__ = """
erf.__doc__ = r"""
:strong:`Erf Operator`
For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function).
......
......@@ -67,7 +67,7 @@ class RNNCell(object):
"""
def call(self, inputs, states, **kwargs):
"""
r"""
Every cell must implement this method to do the calculations mapping the
inputs and states to the output and new states.
......@@ -97,7 +97,7 @@ class RNNCell(object):
dtype='float32',
init_value=0,
batch_dim_idx=0):
"""
r"""
Generate initialized states according to provided shape, data type and
value.
......@@ -225,7 +225,7 @@ class RNNCell(object):
class GRUCell(RNNCell):
"""
r"""
:api_attr: Static Graph
Gated Recurrent Unit cell. It is a wrapper for
......@@ -287,7 +287,7 @@ class GRUCell(RNNCell):
activation, dtype)
def call(self, inputs, states):
"""
r"""
Perform calculations of GRU.
Parameters:
......@@ -323,7 +323,7 @@ class GRUCell(RNNCell):
class LSTMCell(RNNCell):
"""
r"""
:api_attr: Static Graph
Long-Short Term Memory cell. It is a wrapper for
......@@ -390,7 +390,7 @@ class LSTMCell(RNNCell):
activation, forget_bias, dtype)
def call(self, inputs, states):
"""
r"""
Perform calculations of LSTM.
Parameters:
......@@ -782,7 +782,7 @@ class Decoder(object):
"""
def initialize(self, inits):
"""
r"""
Called once before the decoding iterations.
Parameters:
......@@ -797,7 +797,7 @@ class Decoder(object):
raise NotImplementedError
def step(self, time, inputs, states, **kwargs):
"""
r"""
Called per step of decoding.
Parameters:
......@@ -818,7 +818,7 @@ class Decoder(object):
raise NotImplementedError
def finalize(self, outputs, final_states, sequence_lengths):
"""
r"""
Called once after the decoding iterations if implemented.
Parameters:
......@@ -931,7 +931,7 @@ class BeamSearchDecoder(Decoder):
@staticmethod
def tile_beam_merge_with_batch(x, beam_size):
"""
r"""
Tile the batch dimension of a tensor. Specifically, this function takes
a tensor t shaped `[batch_size, s0, s1, ...]` composed of minibatch
entries `t[0], ..., t[batch_size - 1]` and tiles it to have a shape
......@@ -966,7 +966,7 @@ class BeamSearchDecoder(Decoder):
return x
def _split_batch_beams(self, x):
"""
r"""
Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new
tensor with shape `[batch_size, beam_size, ...]`.
......@@ -983,7 +983,7 @@ class BeamSearchDecoder(Decoder):
return nn.reshape(x, shape=[-1, self.beam_size] + list(x.shape[1:]))
def _merge_batch_beams(self, x):
"""
r"""
Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new
tensor with shape `[batch_size * beam_size, ...]`.
......@@ -1000,7 +1000,7 @@ class BeamSearchDecoder(Decoder):
return nn.reshape(x, shape=[-1] + list(x.shape[2:]))
def _expand_to_beam_size(self, x):
"""
r"""
This function takes a tensor t shaped `[batch_size, s0, s1, ...]` composed
of minibatch entries `t[0], ..., t[batch_size - 1]` and tiles it to have a
shape `[batch_size, beam_size, s0, s1, ...]` composed of minibatch entries
......@@ -1023,7 +1023,7 @@ class BeamSearchDecoder(Decoder):
return x
def _mask_probs(self, probs, finished):
"""
r"""
Mask log probabilities. It forces finished beams to allocate all probability
mass to eos and unfinished beams to remain unchanged.
......@@ -1052,7 +1052,7 @@ class BeamSearchDecoder(Decoder):
return probs
def _gather(self, x, indices, batch_size):
"""
r"""
Gather from the tensor `x` using `indices`.
Parameters:
......@@ -1104,7 +1104,7 @@ class BeamSearchDecoder(Decoder):
pass
def initialize(self, initial_cell_states):
"""
r"""
Initialize the BeamSearchDecoder.
Parameters:
......@@ -1162,7 +1162,7 @@ class BeamSearchDecoder(Decoder):
init_lengths), init_finished
def _beam_search_step(self, time, logits, next_cell_states, beam_state):
"""
r"""
Calculate scores and select candidate token ids.
Parameters:
......@@ -1235,7 +1235,7 @@ class BeamSearchDecoder(Decoder):
return beam_search_output, beam_search_state
def step(self, time, inputs, states, **kwargs):
"""
r"""
Perform a beam search decoding step, which uses `cell` to get probabilities,
and follows a beam search step to calculate scores and select candidate
token ids.
......@@ -1287,7 +1287,7 @@ class BeamSearchDecoder(Decoder):
return (beam_search_output, beam_search_state, next_inputs, finished)
def finalize(self, outputs, final_states, sequence_lengths):
"""
r"""
Use `gather_tree` to backtrace along the beam search tree and construct
the full predicted sequences.
......@@ -1572,7 +1572,7 @@ def dynamic_decode(decoder,
is_test=False,
return_length=False,
**kwargs):
"""
r"""
Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned
Tensor indicating finished status contains all True values or the number of
decoding step reaches to :attr:`max_step_num`.
......@@ -1664,7 +1664,7 @@ class DecodeHelper(object):
"""
def initialize(self):
"""
r"""
DecodeHelper initialization to produce inputs for the first decoding step
and give the initial status telling whether each sequence in the batch
is finished. It is the partial of the initialization of `BasicDecoder`.
......@@ -1698,7 +1698,7 @@ class DecodeHelper(object):
pass
def next_inputs(self, time, outputs, states, sample_ids):
"""
r"""
Produce the inputs and states for next time step and give status telling
whether each minibatch entry is finished. It is called after `sample` in
`BasicDecoder.step`. It is the partial of `BasicDecoder.step`.
......@@ -1787,7 +1787,7 @@ class TrainingHelper(DecodeHelper):
self.inputs)
def initialize(self):
"""
r"""
TrainingHelper initialization produces inputs for the first decoding
step by slicing at the first time step of full sequence inputs, and it
gives initial status telling whether each sequence in the batch is
......@@ -1809,7 +1809,7 @@ class TrainingHelper(DecodeHelper):
return init_inputs, init_finished
def sample(self, time, outputs, states):
"""
r"""
Perform sampling by using `argmax` according to the `outputs`. Mostly
the sampled ids would not be used since the inputs for next decoding
step would be got by slicing.
......@@ -1832,7 +1832,7 @@ class TrainingHelper(DecodeHelper):
return sample_ids
def next_inputs(self, time, outputs, states, sample_ids):
"""
r"""
Generate inputs for the next decoding step by slicing at corresponding
step of the full sequence inputs. Simultaneously, produce the states
for next time step by directly using the input `states` and emit status
......@@ -1909,7 +1909,7 @@ class GreedyEmbeddingHelper(DecodeHelper):
"""
def __init__(self, embedding_fn, start_tokens, end_token):
"""
r"""
Constructor of GreedyEmbeddingHelper.
Parameters:
......@@ -1934,7 +1934,7 @@ class GreedyEmbeddingHelper(DecodeHelper):
shape=[1], dtype="int64", value=end_token)
def initialize(self):
"""
r"""
GreedyEmbeddingHelper initialization produces inputs for the first decoding
step by using `start_tokens` of the constructor, and gives initial
status telling whether each sequence in the batch is finished.
......@@ -1957,7 +1957,7 @@ class GreedyEmbeddingHelper(DecodeHelper):
return init_inputs, init_finished
def sample(self, time, outputs, states):
"""
r"""
Perform sampling by using `argmax` according to the `outputs`.
Parameters:
......@@ -1978,7 +1978,7 @@ class GreedyEmbeddingHelper(DecodeHelper):
return sample_ids
def next_inputs(self, time, outputs, states, sample_ids):
"""
r"""
Generate inputs for the next decoding step by applying `embedding_fn`
to `sample_ids`. Simultaneously, produce the states for next time step
by directly using the input `states` and emit status telling whether
......@@ -2046,7 +2046,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper):
end_token,
softmax_temperature=None,
seed=None):
"""
r"""
Constructor of SampleEmbeddingHelper.
Parameters:
......@@ -2080,7 +2080,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper):
self.seed = seed
def sample(self, time, outputs, states):
"""
r"""
Perform sampling from a categorical distribution, and the distribution
is computed by `softmax(outputs/softmax_temperature)`.
......@@ -2165,7 +2165,7 @@ class BasicDecoder(Decoder):
self.output_fn = output_fn
def initialize(self, initial_cell_states):
"""
r"""
BasicDecoder initialization includes helper initialization and cell
initialization, and cell initialization uses `initial_cell_states` as
the result directly.
......@@ -2195,7 +2195,7 @@ class BasicDecoder(Decoder):
pass
def step(self, time, inputs, states, **kwargs):
"""
r"""
Perform one decoding step as following steps:
1. Perform `cell_outputs, cell_states = cell.call(inputs, states)`
......@@ -2258,7 +2258,7 @@ def dynamic_lstm(input,
candidate_activation='tanh',
dtype='float32',
name=None):
"""
r"""
:api_attr: Static Graph
**Note**:
......@@ -2430,7 +2430,7 @@ def lstm(input,
name=None,
default_initializer=None,
seed=-1):
"""
r"""
:api_attr: Static Graph
**Note**:
......@@ -2612,7 +2612,7 @@ def dynamic_lstmp(input,
c_0=None,
cell_clip=None,
proj_clip=None):
"""
r"""
:api_attr: Static Graph
**Note**:
......@@ -2823,7 +2823,7 @@ def dynamic_gru(input,
candidate_activation='tanh',
h_0=None,
origin_mode=False):
"""
r"""
:api_attr: Static Graph
**Note: The input type of this must be LoDTensor. If the input type to be
......@@ -2985,7 +2985,7 @@ def gru_unit(input,
activation='tanh',
gate_activation='sigmoid',
origin_mode=False):
"""
r"""
:api_attr: Static Graph
Gated Recurrent Unit (GRU) RNN cell. This operator performs GRU calculations for
......@@ -3143,7 +3143,7 @@ def beam_search(pre_ids,
is_accumulated=True,
name=None,
return_parent_idx=False):
"""
r"""
Beam search is a classical algorithm for selecting candidate words in a
machine translation task.
......@@ -3293,7 +3293,7 @@ def beam_search(pre_ids,
def beam_search_decode(ids, scores, beam_size, end_id, name=None):
"""
r"""
This operator is used after beam search has completed. It constructs the
full predicted sequences for each sample by walking back along the search
......@@ -3378,7 +3378,7 @@ def lstm_unit(x_t,
param_attr=None,
bias_attr=None,
name=None):
"""
r"""
:api_attr: Static Graph
Long-Short Term Memory (LSTM) RNN cell. This operator performs LSTM calculations for
......
......@@ -51,7 +51,7 @@ def sequence_conv(input,
param_attr=None,
act=None,
name=None):
"""
r"""
:api_attr: Static Graph
**Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use conv2d Op.(fluid.layers.** :ref:`api_fluid_layers_conv2d` ).
......@@ -175,7 +175,7 @@ def sequence_conv(input,
def sequence_softmax(input, use_cudnn=False, name=None):
"""
r"""
:api_attr: Static Graph
**Note**:
......@@ -259,7 +259,7 @@ def sequence_softmax(input, use_cudnn=False, name=None):
def sequence_pool(input, pool_type, is_test=False, pad_value=0.0):
"""
r"""
:api_attr: Static Graph
**Notes: The Op only receives LoDTensor as input. If your input is Tensor, please use pool2d Op.(fluid.layers.** :ref:`api_fluid_layers_pool2d` ).
......@@ -636,7 +636,7 @@ def sequence_slice(input, offset, length, name=None):
def sequence_expand(x, y, ref_level=-1, name=None):
"""
r"""
:api_attr: Static Graph
Sequence Expand Layer. This layer will expand the input variable ``x`` \
......@@ -772,7 +772,7 @@ def sequence_expand(x, y, ref_level=-1, name=None):
def sequence_expand_as(x, y, name=None):
"""
r"""
:api_attr: Static Graph
Sequence Expand As Layer. This OP will expand the input variable ``x`` \
......@@ -892,7 +892,7 @@ def sequence_expand_as(x, y, name=None):
def sequence_pad(x, pad_value, maxlen=None, name=None):
"""
r"""
:api_attr: Static Graph
This layer padding the sequences in a same batch to a common length (according \
......@@ -1233,7 +1233,7 @@ def sequence_scatter(input, index, updates, name=None):
def sequence_enumerate(input, win_size, pad_value=0, name=None):
"""
r"""
:api_attr: Static Graph
Generate a new sequence for the input index sequence with \
......@@ -1301,7 +1301,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
def sequence_mask(x, maxlen=None, dtype='int64', name=None):
"""
r"""
**SequenceMask Layer**
This layer outputs a mask according to the input :code:`x` and
......
......@@ -343,7 +343,7 @@ def concat(input, axis=0, name=None):
def tensor_array_to_tensor(input, axis=1, name=None, use_stack=False):
"""
r"""
This function concatenates or stacks all tensors in the input LoDTensorArray
along the axis mentioned and returns that as the output.
......@@ -452,7 +452,7 @@ def tensor_array_to_tensor(input, axis=1, name=None, use_stack=False):
def sums(input, out=None):
"""
r"""
This function computes the sum of multiple input Tensors elementwisely.
- Case 1, sum of 3 Tensors
......@@ -1391,7 +1391,7 @@ def range(start, end, step, dtype, name=None):
def linspace(start, stop, num, dtype=None, name=None):
"""
r"""
This OP return fixed number of evenly spaced values within a given interval.
Args:
......@@ -1527,7 +1527,7 @@ def zeros_like(x, out=None):
@deprecated(since="2.0.0", update_to="paddle.diag")
def diag(diagonal):
"""
r"""
:alias_main: paddle.diag
:alias: paddle.diag,paddle.tensor.diag,paddle.tensor.creation.diag
:old_api: paddle.fluid.layers.diag
......
......@@ -475,7 +475,7 @@ class Accuracy(MetricBase):
self.weight = .0
def update(self, value, weight):
"""
r"""
This function takes the minibatch states (value, weight) as input,
to accumulate and update the corresponding status of the Accuracy object. The update method is as follows:
......@@ -561,7 +561,7 @@ class ChunkEvaluator(MetricBase):
self.num_correct_chunks = 0
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
"""
r"""
This function takes (num_infer_chunks, num_label_chunks, num_correct_chunks) as input,
to accumulate and update the corresponding status of the ChunkEvaluator object. The update method is as follows:
......
......@@ -42,7 +42,7 @@ def simple_img_conv_pool(input,
bias_attr=None,
act=None,
use_cudnn=True):
"""
r"""
:api_attr: Static Graph
The simple_img_conv_pool api is composed of :ref:`api_fluid_layers_conv2d` and :ref:`api_fluid_layers_pool2d` .
......@@ -333,7 +333,7 @@ def sequence_conv_pool(input,
def glu(input, dim=-1):
"""
r"""
:api_attr: Static Graph
The Gated Linear Units(GLU) composed by :ref:`api_fluid_layers_split` ,
......@@ -384,7 +384,7 @@ def scaled_dot_product_attention(queries,
values,
num_heads=1,
dropout_rate=0.):
"""
r"""
:api_attr: Static Graph
This interface implements Multi-Head Attention using scaled dot product.
......
......@@ -954,7 +954,7 @@ class Optimizer(object):
class SGDOptimizer(Optimizer):
"""
r"""
Optimizer of the stochastic gradient descent algorithm.
.. math::
......@@ -1048,7 +1048,7 @@ class SGDOptimizer(Optimizer):
class MomentumOptimizer(Optimizer):
"""
r"""
Simple Momentum optimizer with velocity state
......@@ -1183,7 +1183,7 @@ class MomentumOptimizer(Optimizer):
class DGCMomentumOptimizer(Optimizer):
"""
r"""
:api_attr: Static Graph
DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887
......@@ -1603,7 +1603,7 @@ class DGCMomentumOptimizer(Optimizer):
class LarsMomentumOptimizer(Optimizer):
"""
r"""
Momentum optimizer with LARS support
The update equations are as follows:
......@@ -1735,7 +1735,7 @@ class LarsMomentumOptimizer(Optimizer):
class AdagradOptimizer(Optimizer):
"""
r"""
The Adaptive Gradient optimizer (Adagrad for short) can adaptively assign
different learning rates to individual parameters.
......@@ -1851,7 +1851,7 @@ class AdagradOptimizer(Optimizer):
class AdamOptimizer(Optimizer):
"""
r"""
The Adam optimizer uses an optimization described at the end
of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
it can dynamically adjust the learning rate of each parameter using
......@@ -2117,7 +2117,7 @@ class AdamOptimizer(Optimizer):
class AdamaxOptimizer(Optimizer):
"""
r"""
The Adamax optimizer is implemented based on the Adamax Optimization
in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_.
The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm,
......@@ -2289,7 +2289,7 @@ class AdamaxOptimizer(Optimizer):
class DpsgdOptimizer(Optimizer):
"""
r"""
We implement the Dpsgd optimizer according to CCS16 paper -
Deep Learning with Differential Privacy.
......@@ -2384,7 +2384,7 @@ class DpsgdOptimizer(Optimizer):
class DecayedAdagradOptimizer(Optimizer):
"""
r"""
The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces
the decay rate to solve the problem of a sharp drop in the learning rate
during model training when using the AdagradOptimizer.
......@@ -2494,7 +2494,7 @@ class DecayedAdagradOptimizer(Optimizer):
class AdadeltaOptimizer(Optimizer):
"""
r"""
**Notes: This API does not support sparse parameter optimization.**
Adadelta Optimizer. Please refer to this for details:
......@@ -2613,7 +2613,7 @@ class AdadeltaOptimizer(Optimizer):
class RMSPropOptimizer(Optimizer):
"""
r"""
Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
rate method. The original slides proposed RMSProp: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
......@@ -2801,7 +2801,7 @@ class RMSPropOptimizer(Optimizer):
class FtrlOptimizer(Optimizer):
"""
r"""
FTRL (Follow The Regularized Leader) Optimizer.
The paper that proposed Follow The Regularized Leader (FTRL):
......@@ -2960,7 +2960,7 @@ class FtrlOptimizer(Optimizer):
class LambOptimizer(AdamOptimizer):
"""
r"""
LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer.
LAMB Optimizer is designed to scale up the batch size of training without losing
......@@ -3132,7 +3132,7 @@ Lamb = LambOptimizer
class ModelAverage(Optimizer):
"""
r"""
:api_attr: Static Graph
The ModelAverage optimizer accumulates specific continuous historical parameters
......@@ -3441,7 +3441,7 @@ class ModelAverage(Optimizer):
class ExponentialMovingAverage(object):
"""
r"""
:api_attr: Static Graph
Compute the moving average of parameters with exponential decay.
......@@ -4795,7 +4795,7 @@ class RecomputeOptimizer(Optimizer):
class LookaheadOptimizer(object):
"""
r"""
:api_attr: Static Graph
This implements the Lookahead optimizer of the
......
......@@ -210,7 +210,7 @@ class ParamAttr(object):
class WeightNormParamAttr(ParamAttr):
"""
r"""
:api_attr: Static Graph
Note:
......
......@@ -1325,7 +1325,7 @@ class GeneratorLoader(DataLoaderBase):
class PyReader(DataLoaderBase):
"""
r"""
Create a reader object for data feeding in Python.
Data would be prefetched using Python thread and be pushed
into a queue asynchronously. Data in the queue would be extracted
......
......@@ -63,7 +63,7 @@ def _create_regularization_of_grad(param, grad, regularization=None):
def append_regularization_ops(parameters_and_grads, regularization=None):
"""Create and add backward regularization Operators
r"""Create and add backward regularization Operators
Creates and adds backward regularization operators in the BlockDesc.
This will add gradients of the regularizer function to the gradients
......@@ -132,7 +132,7 @@ class WeightDecayRegularizer(object):
class L2DecayRegularizer(WeightDecayRegularizer):
"""
r"""
Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting.
It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ).
......@@ -239,7 +239,7 @@ class L2DecayRegularizer(WeightDecayRegularizer):
class L1DecayRegularizer(WeightDecayRegularizer):
"""
r"""
Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse.
It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_fluid_optimizer_SGDOptimizer` ).
......
......@@ -204,8 +204,8 @@ def train(word_idx):
:rtype: callable
"""
return reader_creator(
re.compile("train/pos/.*\.txt$"),
re.compile("train/neg/.*\.txt$"), word_idx)
re.compile(r"train/pos/.*\.txt$"),
re.compile(r"train/neg/.*\.txt$"), word_idx)
def test(word_idx):
......@@ -221,8 +221,8 @@ def test(word_idx):
:rtype: callable
"""
return reader_creator(
re.compile("test/pos/.*\.txt$"),
re.compile("test/neg/.*\.txt$"), word_idx)
re.compile(r"test/pos/.*\.txt$"),
re.compile(r"test/neg/.*\.txt$"), word_idx)
if __name__ == "__main__":
......
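For context, the short sketch below is not part of this commit (the sample path "train/pos/0_10.txt" is made up); it only illustrates why the regex literals above gain the r prefix. In a plain string literal "\." is an invalid escape sequence that Python 3 flags with a DeprecationWarning (a SyntaxWarning on newer interpreters) when the literal is byte-compiled, while a raw string yields the identical pattern without the warning.

import re

# Plain literal: "\." is not a recognized escape sequence, so CPython keeps the
# backslash but warns when it byte-compiles this line.
plain_pattern = re.compile("train/pos/.*\.txt$")

# Raw literal: the backslash is literal by definition, so no warning is emitted.
raw_pattern = re.compile(r"train/pos/.*\.txt$")

# Both literals contain exactly the same characters, so the compiled regexes
# behave identically; only the warning disappears.
assert plain_pattern.pattern == raw_pattern.pattern
assert raw_pattern.match("train/pos/0_10.txt") is not None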
......@@ -230,7 +230,7 @@ class SoftsignLayer(object):
class FC(Layer):
"""
r"""
This interface is used to construct a callable object of the ``FC`` class.
For more details, refer to code examples.
It creates a fully connected layer in the network. It can take
......
......@@ -227,7 +227,7 @@ class SoftsignLayer(object):
class FC(paddle.nn.Layer):
"""
r"""
This interface is used to construct a callable object of the ``FC`` class.
For more details, refer to code examples.
It creates a fully connected layer in the network. It can take
......
......@@ -235,7 +235,7 @@ class EagerDeletionRecurrentOpTest1(unittest.TestCase):
class EagerDeletionRecurrentOpTest2(EagerDeletionRecurrentOpTest1):
'''
r'''
Test RNNOp
equation:
h_t = \sigma (W x_t + U h_{t-1})
......
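The docstrings prefixed with r''' above need the same treatment, because they embed LaTeX-style math such as \sigma. A minimal sketch follows; the class is hypothetical and written only to mirror the quoted docstring.

class RecurrentExample(object):
    r'''
    Test RNNOp

    equation:
        h_t = \sigma (W x_t + U h_{t-1})
    '''


# With the r prefix, "\sigma" stays a literal backslash followed by "sigma":
# the text handed to documentation tools is unchanged, and no invalid-escape
# warning is raised when the module is byte-compiled.
print(RecurrentExample.__doc__)

# An unrecognized escape in a plain literal produces the same characters,
# just with a compile-time warning -- which is all the commit silences.
assert "h_t = \sigma" == r"h_t = \sigma"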
......@@ -31,7 +31,8 @@ class TestFullOp(unittest.TestCase):
train_program = Program()
with program_guard(train_program, startup_program):
fill_value = 2.0
input = paddle.fluid.data(name='input', dtype='float32', shape=[2, 3])
input = paddle.fluid.data(
name='input', dtype='float32', shape=[2, 3])
output = paddle.full_like(input, fill_value)
output_dtype = paddle.full_like(input, fill_value, dtype='float32')
......
......@@ -25,7 +25,7 @@ from paddle.fluid import compiler, Program, program_guard
class TestLRNOp(OpTest):
def get_input(self):
''' TODO(gongweibao): why it's grad diff is so large?
r''' TODO(gongweibao): why it's grad diff is so large?
x = np.ndarray(
shape=(self.N, self.C, self.H, self.W), dtype=float, order='C')
for m in range(0, self.N):
......
......@@ -232,7 +232,7 @@ class RecurrentOpTest1(unittest.TestCase):
class RecurrentOpTest2(RecurrentOpTest1):
'''
r'''
Test RNNOp
equation:
h_t = \sigma (W x_t + U h_{t-1})
......@@ -469,7 +469,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1):
class RecurrentOpSubBlockTest(RecurrentOpTest1):
'''
r'''
Test RNNOp with subblock variable
equation:
y_ = emb * w1
......@@ -608,7 +608,7 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1):
class RecurrentOpStopGradientTest(RecurrentOpTest1):
"""
r"""
Test RNNOp with stop_gradient = True
equation:
h_t = \sigma (W x_t + U h_{t-1})
......
......@@ -79,7 +79,7 @@ class TestErrors(unittest.TestCase):
self.assertRaises(TypeError, test_input_type_1)
# The value of params must be in format '\d+(\.\d+){0,3}', like '1.5.2.0', '1.6' ...
# The value of params must be in format r'\d+(\.\d+){0,3}', like '1.5.2.0', '1.6' ...
def test_input_value_1():
fluid.require_version('string')
......
......@@ -35,7 +35,7 @@ def _is_numpy_(var):
@six.add_metaclass(abc.ABCMeta)
class Metric(object):
"""
r"""
Base class for metric, encapsulates metric logic and APIs
Usage:
......
......@@ -58,7 +58,7 @@ import paddle
def elu(x, alpha=1.0, name=None):
"""
r"""
elu activation.
.. math::
......@@ -101,7 +101,7 @@ def elu(x, alpha=1.0, name=None):
def gelu(x, approximate=False, name=None):
"""
r"""
gelu activation.
if approximate is True
......@@ -155,7 +155,7 @@ def gelu(x, approximate=False, name=None):
def hardshrink(x, threshold=0.5, name=None):
"""
r"""
hard shrinkage activation
.. math::
......@@ -204,7 +204,7 @@ def hardshrink(x, threshold=0.5, name=None):
def hardtanh(x, min=-1.0, max=1.0, name=None):
"""
r"""
hardtanh activation
.. math::
......@@ -254,7 +254,7 @@ def hardtanh(x, min=-1.0, max=1.0, name=None):
def hardsigmoid(x, name=None):
"""
r"""
hardsigmoid activation.
A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391),
......@@ -308,7 +308,7 @@ def hardsigmoid(x, name=None):
def hardswish(x, name=None):
"""
r"""
hardswish activation
hardswish is proposed in MobileNetV3, and performs better in computational stability
......@@ -357,7 +357,7 @@ def hardswish(x, name=None):
def leaky_relu(x, negative_slope=0.01, name=None):
"""
r"""
leaky_relu activation
.. math::
......@@ -515,7 +515,7 @@ def relu(x, name=None):
def log_sigmoid(x, name=None):
"""
r"""
log_sigmoid activation.
.. math::
......@@ -552,7 +552,7 @@ def log_sigmoid(x, name=None):
def maxout(x, groups, axis=1, name=None):
"""
r"""
maxout activation.
Assumed the input shape is (N, Ci, H, W).
......@@ -671,7 +671,7 @@ def selu(x,
scale=1.0507009873554804934193349852946,
alpha=1.6732632423543772848170429916717,
name=None):
"""
r"""
selu activation
.. math::
......@@ -726,7 +726,7 @@ def selu(x,
def softmax(x, axis=-1, dtype=None, name=None):
"""
r"""
This operator implements the softmax layer. The calculation process is as follows:
1. The dimension :attr:`axis` of ``x`` will be permuted to the last.
......@@ -880,7 +880,7 @@ def softmax(x, axis=-1, dtype=None, name=None):
def softplus(x, beta=1, threshold=20, name=None):
"""
r"""
softplus activation
.. math::
......@@ -925,7 +925,7 @@ def softplus(x, beta=1, threshold=20, name=None):
def softshrink(x, threshold=0.5, name=None):
"""
r"""
softshrink activation
.. math::
......@@ -976,7 +976,7 @@ def softshrink(x, threshold=0.5, name=None):
def softsign(x, name=None):
"""
r"""
softsign activation
.. math::
......@@ -1013,7 +1013,7 @@ def softsign(x, name=None):
def swish(x, name=None):
"""
r"""
swish activation.
.. math::
......@@ -1091,7 +1091,7 @@ def tanhshrink(x, name=None):
def thresholded_relu(x, threshold=1.0, name=None):
"""
r"""
thresholded relu activation.
.. math::
......@@ -1137,7 +1137,7 @@ def thresholded_relu(x, threshold=1.0, name=None):
def log_softmax(x, axis=-1, dtype=None, name=None):
"""
r"""
This operator implements the log_softmax layer. The calculation process is
as follows:
......
......@@ -1413,7 +1413,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8):
def linear(x, weight, bias=None, name=None):
"""
r"""
Fully-connected linear transformation operator. For each input :math:`X` ,
the equation is:
......@@ -1500,7 +1500,7 @@ def linear(x, weight, bias=None, name=None):
def label_smooth(label, prior_dist=None, epsilon=0.1, name=None):
"""
r"""
Label smoothing is a mechanism to regularize the classifier layer and is called
label-smoothing regularization (LSR).
......
......@@ -166,7 +166,7 @@ def conv1d(x,
groups=1,
data_format='NCL',
name=None):
"""
r"""
The convolution1D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input and
Output are in NCL format, where N is batch size, C is the number of
......@@ -392,7 +392,7 @@ def conv2d(x,
groups=1,
data_format="NCHW",
name=None):
"""
r"""
The convolution2D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input and
......@@ -568,7 +568,7 @@ def conv1d_transpose(x,
output_size=None,
data_format="NCL",
name=None):
"""
r"""
The 1-D convolution transpose layer calculates the output based on the input,
filter, and dilation, stride, padding. Input(Input) and output(Output)
are in 'NCL' format or 'NLC' where N is batch size, C is the number of channels,
......@@ -828,7 +828,7 @@ def conv2d_transpose(x,
output_size=None,
data_format='NCHW',
name=None):
"""
r"""
The convolution2D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
......@@ -1068,7 +1068,7 @@ def conv3d(x,
groups=1,
data_format="NCDHW",
name=None):
"""
r"""
The convolution3D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input(Input) and
......@@ -1233,7 +1233,7 @@ def conv3d_transpose(x,
output_size=None,
data_format='NCDHW',
name=None):
"""
r"""
The convolution3d transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
are in NCDHW or NDHWC format. Where N is batch size, C is the number of channels,
......
......@@ -14,10 +14,7 @@
# TODO: define the extention functions
__all__ = [
'diag_embed',
'row_conv'
]
__all__ = ['diag_embed', 'row_conv']
import numpy as np
from ...fluid.data_feeder import check_dtype
......
......@@ -111,7 +111,7 @@ def one_hot(x, num_classes, name=None):
def embedding(x, weight, padding_idx=None, sparse=False, name=None):
"""
r"""
The operator is used to lookup embeddings vector of ids provided by :attr:`x` .
The shape of output Tensor is generated by appending the last dimension of the input Tensor shape
......
......@@ -184,7 +184,7 @@ def binary_cross_entropy_with_logits(logit,
reduction='mean',
pos_weight=None,
name=None):
"""
r"""
This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
layer and some reduce operations.
......@@ -461,7 +461,7 @@ def hsigmoid_loss(input,
def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None):
"""
r"""
This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
term if the absolute element-wise error falls below 1 and an L1 term otherwise.
In some cases it can prevent exploding gradients and it is more robust and less
......@@ -544,7 +544,7 @@ def margin_ranking_loss(input,
margin=0.0,
reduction='mean',
name=None):
"""
r"""
This op calculates the margin rank loss between the input, other and label, using the math function as follows.
......@@ -646,7 +646,7 @@ def margin_ranking_loss(input,
def l1_loss(input, label, reduction='mean', name=None):
"""
r"""
This operator computes the L1 Loss of Tensor ``input`` and ``label`` as follows.
If `reduction` set to ``'none'``, the loss is:
......@@ -840,7 +840,7 @@ def nll_loss(input,
def kl_div(input, label, reduction='mean', name=None):
"""
r"""
This operator calculates the Kullback-Leibler divergence loss
between Input(X) and Input(Target). Notes that Input(X) is the
log-probability and Input(Target) is the probability.
......@@ -947,7 +947,7 @@ def kl_div(input, label, reduction='mean', name=None):
def mse_loss(input, label, reduction='mean', name=None):
"""
r"""
This op accepts input predictions and label and returns the mean square error.
If :attr:`reduction` is set to ``'none'``, loss is calculated as:
......@@ -1121,7 +1121,7 @@ def cross_entropy(input,
weight=None,
ignore_index=-100,
reduction='mean'):
"""
r"""
This operator implements the cross entropy loss function. This OP combines ``LogSoftmax``,
and ``NLLLoss`` together.
......@@ -1252,7 +1252,7 @@ def sigmoid_focal_loss(logit,
gamma=2.0,
reduction='sum',
name=None):
"""
r"""
`Focal Loss <https://arxiv.org/abs/1708.02002>`_ is proposed to address the
foreground-background class imbalance for classification tasks. It down-weights
easily-classified examples and thus focuses training on hard examples. For example,
......
......@@ -35,7 +35,7 @@ __all__ = [
def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
"""
r"""
This op normalizes ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes
.. math::
......@@ -412,7 +412,7 @@ def local_response_norm(x,
k=1.,
data_format="NCHW",
name=None):
"""
r"""
Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions.
For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
......
......@@ -54,11 +54,7 @@ import numpy as np
# from ...fluid.layers import roi_perspective_transform #DEFINE_ALIAS
# from ...fluid.layers import shuffle_channel #DEFINE_ALIAS
__all__ = [
'affine_grid',
'grid_sample',
'pixel_shuffle'
]
__all__ = ['affine_grid', 'grid_sample', 'pixel_shuffle']
def affine_grid(theta, out_shape, align_corners=True, name=None):
......
......@@ -19,7 +19,7 @@ __all__ = ['KaimingUniform', 'KaimingNormal']
class KaimingNormal(MSRAInitializer):
"""Implements the Kaiming Normal initializer
r"""Implements the Kaiming Normal initializer
This class implements the weight initialization from the paper
`Delving Deep into Rectifiers: Surpassing Human-Level Performance on
......@@ -62,7 +62,7 @@ class KaimingNormal(MSRAInitializer):
class KaimingUniform(MSRAInitializer):
"""Implements the Kaiming Uniform initializer
r"""Implements the Kaiming Uniform initializer
This class implements the weight initialization from the paper
`Delving Deep into Rectifiers: Surpassing Human-Level Performance on
......
......@@ -18,7 +18,7 @@ __all__ = ['XavierNormal', 'XavierUniform']
class XavierNormal(XavierInitializer):
"""
r"""
This class implements the Xavier weight initializer from the paper
`Understanding the difficulty of training deep feedforward neural
networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
......@@ -71,7 +71,7 @@ class XavierNormal(XavierInitializer):
class XavierUniform(XavierInitializer):
"""
r"""
This class implements the Xavier weight initializer from the paper
`Understanding the difficulty of training deep feedforward neural
networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
......
......@@ -50,7 +50,7 @@ from .. import functional as F
class ELU(layers.Layer):
"""
r"""
ELU Activation.
.. math::
......@@ -88,7 +88,7 @@ class ELU(layers.Layer):
class GELU(layers.Layer):
"""
r"""
GELU Activation.
If approximate is True
......@@ -137,7 +137,7 @@ class GELU(layers.Layer):
class Hardshrink(layers.Layer):
"""
r"""
Hardshrink Activation
.. math::
......@@ -181,7 +181,7 @@ class Hardshrink(layers.Layer):
class Hardswish(layers.Layer):
"""
r"""
Hardswish activation
Hardswish is proposed in MobileNetV3, and performs better in computational stability
......@@ -227,7 +227,7 @@ class Hardswish(layers.Layer):
class Tanh(layers.Layer):
"""
r"""
Tanh Activation.
.. math::
......@@ -264,7 +264,7 @@ class Tanh(layers.Layer):
class Hardtanh(layers.Layer):
"""
r"""
Hardtanh Activation
.. math::
......@@ -442,7 +442,7 @@ class ReLU6(layers.Layer):
class SELU(layers.Layer):
"""
r"""
SELU Activation
.. math::
......@@ -488,7 +488,7 @@ class SELU(layers.Layer):
class LeakyReLU(layers.Layer):
"""
r"""
Leaky ReLU Activation.
.. math::
......@@ -574,7 +574,7 @@ class Sigmoid(layers.Layer):
class Hardsigmoid(layers.Layer):
"""
r"""
This interface is used to construct a callable object of the ``Hardsigmoid`` class.
This layer calculates the `hardsigmoid` of input x.
......@@ -621,7 +621,7 @@ class Hardsigmoid(layers.Layer):
class Softplus(layers.Layer):
"""
r"""
Softplus Activation
.. math::
......@@ -661,7 +661,7 @@ class Softplus(layers.Layer):
class Softshrink(layers.Layer):
"""
r"""
Softshrink Activation
.. math::
......@@ -702,7 +702,7 @@ class Softshrink(layers.Layer):
class Softsign(layers.Layer):
"""
r"""
Softsign Activation
.. math::
......@@ -737,7 +737,7 @@ class Softsign(layers.Layer):
class Swish(layers.Layer):
"""
r"""
Swish Activation.
.. math::
......@@ -807,7 +807,7 @@ class Tanhshrink(layers.Layer):
class ThresholdedReLU(layers.Layer):
"""
r"""
Thresholded ReLU Activation
.. math::
......@@ -847,7 +847,7 @@ class ThresholdedReLU(layers.Layer):
class LogSigmoid(layers.Layer):
"""
r"""
LogSigmoid Activation.
.. math::
......@@ -882,7 +882,7 @@ class LogSigmoid(layers.Layer):
class Softmax(layers.Layer):
"""
r"""
Softmax Activation.
This operator implements the softmax layer. The calculation process is as follows:
......@@ -1005,7 +1005,7 @@ class Softmax(layers.Layer):
class LogSoftmax(layers.Layer):
"""
r"""
This operator implements the log_softmax layer. The calculation process is as follows:
.. math::
......@@ -1059,7 +1059,7 @@ class LogSoftmax(layers.Layer):
class Maxout(layers.Layer):
"""
r"""
Maxout Activation.
Assumed the input shape is (N, Ci, H, W).
......
......@@ -40,7 +40,7 @@ __all__ = [
class Linear(layers.Layer):
"""
r"""
Fully-connected linear transformation layer. For each input :math:`X` ,
the equation is:
......@@ -381,7 +381,7 @@ class Upsample(layers.Layer):
class Bilinear(layers.Layer):
"""
r"""
This layer performs bilinear on two inputs.
......@@ -988,7 +988,7 @@ class CosineSimilarity(layers.Layer):
class Embedding(layers.Layer):
"""
r"""
**Embedding Layer**
This interface is used to construct a callable object of the ``Embedding`` class.
......
......@@ -141,7 +141,7 @@ class _ConvNd(layers.Layer):
class Conv1D(_ConvNd):
"""
r"""
This interface is used to construct a callable object of the ``Conv1D`` class.
For more details, refer to code examples.
The convolution1D layer calculates the output based on the input, filter
......@@ -294,7 +294,7 @@ class Conv1D(_ConvNd):
class Conv1DTranspose(_ConvNd):
"""
r"""
This interface is used to construct a callable object of the ``Conv1DTranspose`` class.
For more details, refer to code examples.
The 1-D convolution transpose layer calculates the output based on the input,
......@@ -469,7 +469,7 @@ class Conv1DTranspose(_ConvNd):
class Conv2D(_ConvNd):
"""
r"""
This interface is used to construct a callable object of the ``Conv2D`` class.
For more details, refer to code examples.
The convolution2D layer calculates the output based on the input, filter
......@@ -626,7 +626,7 @@ class Conv2D(_ConvNd):
class Conv2DTranspose(_ConvNd):
"""
r"""
This interface is used to construct a callable object of the ``Conv2DTranspose`` class.
For more details, refer to code examples.
The convolution2D transpose layer calculates the output based on the input,
......@@ -786,7 +786,7 @@ class Conv2DTranspose(_ConvNd):
class Conv3D(_ConvNd):
"""
r"""
**Convolution3d Layer**
The convolution3d layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input(Input) and
......@@ -943,7 +943,7 @@ class Conv3D(_ConvNd):
class Conv3DTranspose(_ConvNd):
"""
r"""
**Convolution3D transpose layer**
The convolution3D transpose layer calculates the output based on the input,
filter, and dilations, strides, paddings. Input(Input) and output(Output)
......
......@@ -24,7 +24,7 @@ from ...fluid.layer_helper import LayerHelper
class PairwiseDistance(layers.Layer):
"""
r"""
This operator computes the pairwise distance between two vectors. The
distance is calculated by p-order norm:
......
......@@ -36,7 +36,7 @@ __all__ = [
class BCEWithLogitsLoss(fluid.dygraph.Layer):
"""
r"""
This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
layer and some reduce operations.
......@@ -141,7 +141,7 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer):
class CrossEntropyLoss(fluid.dygraph.Layer):
"""
r"""
:alias_main: paddle.nn.CrossEntropyLoss
:alias: paddle.nn.CrossEntropyLoss,paddle.nn.layer.CrossEntropyLoss,paddle.nn.layer.loss.CrossEntropyLoss
......@@ -375,7 +375,7 @@ class HSigmoidLoss(fluid.dygraph.Layer):
class MSELoss(fluid.dygraph.layers.Layer):
"""
r"""
**Mean Square Error Loss**
Computes the mean square error (squared L2 norm) of given input and label.
......@@ -454,7 +454,7 @@ class MSELoss(fluid.dygraph.layers.Layer):
class L1Loss(fluid.dygraph.Layer):
"""
r"""
This interface is used to construct a callable object of the ``L1Loss`` class.
The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
......@@ -622,7 +622,7 @@ class BCELoss(fluid.dygraph.Layer):
class NLLLoss(fluid.dygraph.Layer):
"""
r"""
This class accepts input and target label and returns negative log likelihood
cross error. It is useful to train a classification problem with C classes.
......@@ -733,7 +733,7 @@ class NLLLoss(fluid.dygraph.Layer):
class KLDivLoss(fluid.dygraph.Layer):
"""
r"""
This interface calculates the Kullback-Leibler divergence loss
between Input(X) and Input(Target). Notes that Input(X) is the
log-probability and Input(Target) is the probability.
......@@ -806,7 +806,7 @@ class KLDivLoss(fluid.dygraph.Layer):
class MarginRankingLoss(fluid.dygraph.Layer):
"""
r"""
This interface is used to construct a callable object of the ``MarginRankingLoss`` class.
The MarginRankingLoss layer calculates the margin rank loss between the input, other and label
......@@ -958,7 +958,7 @@ class CTCLoss(fluid.dygraph.Layer):
class SmoothL1Loss(fluid.dygraph.Layer):
"""
r"""
This operator calculates smooth_l1_loss. Creates a criterion that uses a squared
term if the absolute element-wise error falls below 1 and an L1 term otherwise.
In some cases it can prevent exploding gradients and it is more robust and less
......
......@@ -109,7 +109,7 @@ class _InstanceNormBase(layers.Layer):
class InstanceNorm1D(_InstanceNormBase):
"""
r"""
Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .
DataLayout: NCL `[batch, in_channels, length]`
......@@ -181,7 +181,7 @@ class InstanceNorm1D(_InstanceNormBase):
class InstanceNorm2D(_InstanceNormBase):
"""
r"""
Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .
DataLayout: NCHW `[batch, in_channels, in_height, in_width]`
......@@ -252,7 +252,7 @@ class InstanceNorm2D(_InstanceNormBase):
class InstanceNorm3D(_InstanceNormBase):
"""
r"""
Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization .
DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]`
......@@ -437,7 +437,7 @@ class GroupNorm(layers.Layer):
class LayerNorm(layers.Layer):
"""
r"""
:alias_main: paddle.nn.LayerNorm
:alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
:old_api: paddle.fluid.dygraph.LayerNorm
......@@ -649,7 +649,7 @@ class _BatchNormBase(layers.Layer):
class BatchNorm1D(_BatchNormBase):
"""
r"""
Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
When track_running_stats = False, the :math:`\\mu_{\\beta}`
......@@ -740,7 +740,7 @@ class BatchNorm1D(_BatchNormBase):
class BatchNorm2D(_BatchNormBase):
"""
r"""
Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
When track_running_stats = False, the :math:`\\mu_{\\beta}`
......@@ -829,7 +829,7 @@ class BatchNorm2D(_BatchNormBase):
class BatchNorm3D(_BatchNormBase):
"""
r"""
Applies Batch Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
When track_running_stats = False, the :math:`\\mu_{\\beta}`
......@@ -919,7 +919,7 @@ class BatchNorm3D(_BatchNormBase):
class SyncBatchNorm(_BatchNormBase):
"""
r"""
This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
be used as a normalizer function for other operations, such as conv2d and fully connected
......
......@@ -120,7 +120,7 @@ class AvgPool1D(layers.Layer):
class AvgPool2D(layers.Layer):
"""
r"""
This operation applies 2D average pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCHW format, where N is batch size, C is the number of channels,
......@@ -401,7 +401,7 @@ class MaxPool1D(layers.Layer):
class MaxPool2D(layers.Layer):
"""
r"""
This operation applies 2D max pooling over input feature based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCHW format, where N is batch size, C is the number of channels,
......@@ -595,7 +595,7 @@ class MaxPool3D(layers.Layer):
class AdaptiveAvgPool1D(layers.Layer):
"""
r"""
This operation applies a 1D adaptive average pooling over an input signal composed
of several input planes, based on the input, output_size, return_mask parameters.
......@@ -663,7 +663,7 @@ class AdaptiveAvgPool1D(layers.Layer):
class AdaptiveAvgPool2D(layers.Layer):
"""
r"""
This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size.
......@@ -745,7 +745,7 @@ class AdaptiveAvgPool2D(layers.Layer):
class AdaptiveAvgPool3D(layers.Layer):
"""
r"""
This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size.
......
......@@ -157,7 +157,7 @@ class MultiHeadAttention(Layer):
embed_dim, embed_dim, weight_attr, bias_attr=bias_attr)
def _prepare_qkv(self, query, key, value, cache=None):
"""
r"""
Prepares linear projected queries, keys and values for usage of subsequent
multiple parallel attention. If `cache` is not None, using cached results
to reduce redundant calculations.
......@@ -212,7 +212,7 @@ class MultiHeadAttention(Layer):
return (q, k, v) if cache is None else (q, k, v, cache)
def compute_kv(self, key, value):
"""
r"""
Applies linear projection on input keys and values, then splits heads
(reshape and transpose) to get keys and values from different representation
subspaces. The results are used as key-values pairs for subsequent multiple
......@@ -312,7 +312,7 @@ class MultiHeadAttention(Layer):
return self.Cache(key, value)
def forward(self, query, key, value, attn_mask=None, cache=None):
"""
r"""
Applies multi-head attention to map queries and a set of key-value pairs
to outputs.
......@@ -499,7 +499,7 @@ class TransformerEncoderLayer(Layer):
self.activation = getattr(F, activation)
def forward(self, src, src_mask=None):
"""
r"""
Applies a Transformer encoder layer on the input.
Parameters:
......@@ -575,7 +575,7 @@ class TransformerEncoder(Layer):
self.norm = norm
def forward(self, src, src_mask=None):
"""
r"""
Applies a stack of N Transformer encoder layers on inputs. If `norm` is
provided, also applies layer normalization on the output of last encoder
layer.
......@@ -725,7 +725,7 @@ class TransformerDecoderLayer(Layer):
self.activation = getattr(F, activation)
def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
"""
r"""
Applies a Transformer decoder layer on the input.
Parameters:
......@@ -801,7 +801,7 @@ class TransformerDecoderLayer(Layer):
static_cache))
def gen_cache(self, memory):
"""
r"""
Generates cache for `forward` usage. The generated cache is a tuple
composed of an instance of `MultiHeadAttention.Cache` and an instance
of `MultiHeadAttention.StaticCache`.
......@@ -873,7 +873,7 @@ class TransformerDecoder(Layer):
self.norm = norm
def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
"""
r"""
Applies a stack of N Transformer decoder layers on inputs. If `norm` is
provided, also applies layer normalization on the output of last decoder
layer.
......@@ -937,7 +937,7 @@ class TransformerDecoder(Layer):
return output if cache is None else (output, new_caches)
def gen_cache(self, memory, do_zip=False):
"""
r"""
Generates cache for `forward` usage. The generated cache is a list, and
each element in it is a tuple( :code:`(incremental_cache, static_cache)` )
produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache`
......@@ -1139,7 +1139,7 @@ class Transformer(Layer):
self.nhead = nhead
def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None):
"""
r"""
Applies a Transformer model on the inputs.
Parameters:
......
......@@ -153,7 +153,7 @@ class WeightNorm(object):
def weight_norm(layer, name='weight', dim=0):
"""
r"""
This weight_norm layer applies weight normalization to a parameter according to the
following formula:
......
......@@ -21,7 +21,7 @@ __all__ = ["Adadelta"]
class Adadelta(Optimizer):
"""
r"""
**Notes: This API does not support sparse parameter optimization.**
Adadelta Optimizer. Please refer to this for details:
......
......@@ -21,7 +21,7 @@ __all__ = ["Adagrad"]
class Adagrad(Optimizer):
"""
r"""
The Adaptive Gradient optimizer (Adagrad for short) uses an optimization described
in paper: `Adaptive Subgradient Methods for Online Learning and
Stochastic Optimization <http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf>`_.
......
......@@ -24,7 +24,7 @@ __all__ = ["Adam"]
class Adam(Optimizer):
"""
r"""
The Adam optimizer uses an optimization described at the end
of section 2 of `Adam paper <https://arxiv.org/abs/1412.6980>`_ ,
it can dynamically adjust the learning rate of each parameter using
......
......@@ -21,7 +21,7 @@ __all__ = ["Adamax"]
class Adamax(Optimizer):
"""
r"""
The Adamax optimizer is implemented based on the Adamax Optimization
in Section 7 of `Adam paper <https://arxiv.org/abs/1412.6980>`_.
The Adamax algorithm is a variant of the Adam algorithm based on the infinite norm,
......
......@@ -23,7 +23,7 @@ __all__ = ['AdamW']
class AdamW(Adam):
"""
r"""
The AdamW optimizer is implemented based on the AdamW Optimization
in paper `DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
it can resolve the problem of L2 regularization failure in the Adam optimizer.
......
......@@ -192,7 +192,7 @@ class LRScheduler(object):
class NoamDecay(LRScheduler):
"""
r"""
Applies Noam Decay to the initial learning rate.
......@@ -376,7 +376,7 @@ class PiecewiseDecay(LRScheduler):
class NaturalExpDecay(LRScheduler):
"""
r"""
Applies natural exponential decay to the initial learning rate.
......@@ -455,7 +455,7 @@ class NaturalExpDecay(LRScheduler):
class InverseTimeDecay(LRScheduler):
"""
r"""
Applies inverse time decay to the initial learning rate.
......@@ -536,7 +536,7 @@ class InverseTimeDecay(LRScheduler):
class PolynomialDecay(LRScheduler):
"""
r"""
Applies polynomial decay to the initial learning rate.
......@@ -656,7 +656,7 @@ class PolynomialDecay(LRScheduler):
class LinearWarmup(LRScheduler):
"""
r"""
Linear learning rate warm up strategy. Update the learning rate preliminarily before the normal learning rate scheduler.
For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_
......@@ -794,7 +794,7 @@ class LinearWarmup(LRScheduler):
class ExponentialDecay(LRScheduler):
"""
r"""
Update learning rate by `gamma` each epoch.
......@@ -1383,7 +1383,7 @@ class ReduceOnPlateau(LRScheduler):
class CosineAnnealingDecay(LRScheduler):
"""
r"""
Set the learning rate using a cosine annealing schedule, where :math:`\eta_{max}` is set to
the initial learning_rate. :math:`T_{cur}` is the number of epochs since the last restart in
......
......@@ -21,7 +21,7 @@ __all__ = ["Momentum"]
class Momentum(Optimizer):
"""
r"""
Simple Momentum optimizer with velocity state
......
......@@ -47,7 +47,7 @@ __all__ = ['Optimizer']
class Optimizer(object):
"""Optimizer Base class.
r"""Optimizer Base class.
Define the common interface of an optimizer.
User should not use this class directly,
......
......@@ -21,7 +21,7 @@ __all__ = ["RMSProp"]
class RMSProp(Optimizer):
"""
r"""
Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
rate method. The original slides proposed RMSProp: Slide 29 of
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
......
......@@ -21,7 +21,7 @@ __all__ = ["SGD"]
class SGD(Optimizer):
"""
r"""
Optimizer of the stochastic gradient descent algorithm.
.. math::
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
r"""
At training and testing time, PaddlePaddle programs need to read data. To ease
the users' work to write data reading code, we define that
......
......@@ -18,7 +18,7 @@ import paddle.fluid as fluid
class L1Decay(fluid.regularizer.L1Decay):
"""
r"""
Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse.
It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ).
......@@ -80,7 +80,7 @@ class L1Decay(fluid.regularizer.L1Decay):
class L2Decay(fluid.regularizer.L2Decay):
"""
r"""
Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting.
It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ).
......
......@@ -14,7 +14,6 @@
from __future__ import print_function
import errno
import inspect
import logging
......@@ -31,7 +30,6 @@ from paddle.fluid.io import prepend_feed_ops, append_fetch_ops, save_persistable
from paddle.fluid.io import load_persistables, _endpoints_replacement
from paddle.fluid.log_helper import get_logger
__all__ = [
'save_inference_model',
'load_inference_model',
......@@ -44,10 +42,13 @@ _logger = get_logger(
def _check_args(caller, args, supported_args=[], deprecated_args=[]):
for arg in args:
if arg in deprecated_args:
raise ValueError("argument '{}' in function '{}' is deprecated, only {} are supported.".format(arg, caller, supported_args))
raise ValueError(
"argument '{}' in function '{}' is deprecated, only {} are supported.".
format(arg, caller, supported_args))
elif arg not in supported_args:
raise ValueError(
"function '{}' doesn't support argument '{}',\n only {} are supported.".format(caller, arg, supported_args))
"function '{}' doesn't support argument '{}',\n only {} are supported.".
format(caller, arg, supported_args))
@static_only
......@@ -129,14 +130,18 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor):
# verify feed_vars
if not isinstance(feed_vars, list):
feed_vars = [feed_vars]
if not feed_vars or not all([isinstance(var, Variable) for var in feed_vars]):
raise ValueError("'feed_vars' should be a Variable or a list of Variable.")
if not feed_vars or not all(
[isinstance(var, Variable) for var in feed_vars]):
raise ValueError(
"'feed_vars' should be a Variable or a list of Variable.")
# verify fetch_vars
if not isinstance(fetch_vars, list):
fetch_vars = [fetch_vars]
if not fetch_vars or not all([isinstance(var, Variable) for var in fetch_vars]):
raise ValueError("'fetch_vars' should be a Variable or a list of Variable.")
if not fetch_vars or not all(
[isinstance(var, Variable) for var in fetch_vars]):
raise ValueError(
"'fetch_vars' should be a Variable or a list of Variable.")
main_program = _get_valid_program()
# remind users to set auc_states to 0 if auc op were found.
......@@ -145,7 +150,9 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor):
device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
op._set_attr(device_attr_name, "")
if op.type == 'auc':
warnings.warn("Be sure that you have set auc states to 0 before saving inference model.")
warnings.warn(
"Be sure that you have set auc states to 0 before saving inference model."
)
break
# fix the bug that the activation op's output as target will be pruned.
......@@ -154,10 +161,11 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor):
with program_guard(main_program):
uniq_fetch_vars = []
for i, var in enumerate(fetch_vars):
var = layers.scale(var, 1., name="save_infer_model/scale_{}".format(i))
var = layers.scale(
var, 1., name="save_infer_model/scale_{}".format(i))
uniq_fetch_vars.append(var)
fetch_vars = uniq_fetch_vars
# save model
origin_program = main_program.clone()
main_program = main_program.clone()
......@@ -257,7 +265,7 @@ def load_inference_model(path_prefix, executor, **configs):
"""
# check configs
supported_args = ('model_filename', 'params_filename')
deprecated_args = ('pserver_endpoints',)
deprecated_args = ('pserver_endpoints', )
caller = inspect.currentframe().f_code.co_name
_check_args(caller, configs, supported_args, deprecated_args)
......@@ -268,8 +276,7 @@ def load_inference_model(path_prefix, executor, **configs):
params_filename = configs.get('params_filename', None)
if params_filename is None:
raise ValueError(
"params_filename cannot be None when path_prefix is None."
)
"params_filename cannot be None when path_prefix is None.")
load_dirname = path_prefix
program_desc_str = model_filename
params_filename = params_filename
......@@ -297,18 +304,21 @@ def load_inference_model(path_prefix, executor, **configs):
if model_filename is None:
model_path = os.path.join(path_prefix, "__model__")
else:
model_path = os.path.join(path_prefix, model_filename + ".pdmodel")
model_path = os.path.join(path_prefix,
model_filename + ".pdmodel")
if not os.path.exists(model_path):
model_path = os.path.join(path_prefix, model_filename)
# set params_path
if params_filename is None:
params_path = os.path.join(path_prefix, "")
else:
params_path = os.path.join(path_prefix, params_filename + ".pdiparams")
params_path = os.path.join(path_prefix,
params_filename + ".pdiparams")
if not os.path.exists(params_path):
params_path = os.path.join(path_prefix, params_filename)
_logger.warning("The old way to load inference model is deprecated."
" model path: {}, params path: {}".format(model_path, params_path))
" model path: {}, params path: {}".format(
model_path, params_path))
with open(model_path, "rb") as f:
program_desc_str = f.read()
load_dirname = os.path.dirname(params_path)
......@@ -328,4 +338,3 @@ def load_inference_model(path_prefix, executor, **configs):
]
return [program, feed_target_names, fetch_targets]
......@@ -26,7 +26,7 @@ def fc(x,
bias_attr=None,
activation=None,
name=None):
"""
r"""
Fully-Connected layer can take a tensor or a list of tensor as its inputs.
It creates a 2-D weight tensor for each input tensor, which represents its
......@@ -180,7 +180,7 @@ def deform_conv2d(x,
weight_attr=None,
bias_attr=None,
name=None):
"""
r"""
Compute 2-D deformable convolution on 4-D input.
Given input image x, output feature map y, the deformable convolution operation can be expressed as follow:
......
......@@ -54,7 +54,7 @@ __all__ = [
@dygraph_only
def to_tensor(data, dtype=None, place=None, stop_gradient=True):
"""
r"""
Constructs a ``paddle.Tensor`` or ``paddle.ComplexTensor`` from ``data`` ,
which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor.
......@@ -609,7 +609,7 @@ def _tril_triu_op(helper):
def tril(x, diagonal=0, name=None):
"""
r"""
:alias_main: paddle.tril
:alias: paddle.tril,paddle.tensor.tril,paddle.tensor.creation.tril
......@@ -680,7 +680,7 @@ def tril(x, diagonal=0, name=None):
def triu(x, diagonal=0, name=None):
"""
r"""
:alias_main: paddle.triu
:alias: paddle.triu,paddle.tensor.triu,paddle.tensor.creation.triu
......
......@@ -453,7 +453,7 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
def dist(x, y, p=2):
"""
r"""
This OP returns the p-norm of (x - y). It is not a norm in a strict sense, only as a measure
of distance. The shapes of x and y must be broadcastable. The definition is as follows, for
......@@ -740,7 +740,7 @@ def cross(x, y, axis=None, name=None):
def cholesky(x, upper=False, name=None):
"""
r"""
Computes the Cholesky decomposition of one symmetric positive-definite
matrix or batches of symmetric positive-definite matrices.
......
......@@ -169,7 +169,7 @@ def flip(x, axis, name=None):
def flatten(x, start_axis=0, stop_axis=-1, name=None):
"""
r"""
**Flatten op**
Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis.
......@@ -565,7 +565,7 @@ def unique(x,
axis=None,
dtype="int64",
name=None):
"""
r"""
Returns the unique elements of `x` in ascending order.
Args:
......@@ -946,7 +946,7 @@ def scatter(x, index, updates, overwrite=True, name=None):
def scatter_nd_add(x, index, updates, name=None):
"""
r"""
**Scatter_nd_add Layer**
Output is obtained by applying sparse addition to a single value
......
......@@ -379,7 +379,7 @@ def floor_divide(x, y, name=None):
def remainder(x, y, name=None):
"""
r"""
Mod two tensors element-wise. The equation is:
.. math::
......@@ -981,7 +981,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
def logsumexp(x, axis=None, keepdim=False, name=None):
"""
r"""
This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` .
.. math::
......@@ -1281,7 +1281,7 @@ def min(x, axis=None, keepdim=False, name=None):
def log1p(x, name=None):
"""
r"""
Calculates the natural log of the given input tensor, element-wise.
.. math::
Out = \\ln(x+1)
......@@ -1315,7 +1315,7 @@ def log1p(x, name=None):
return out
def log2(x, name=None):
"""
r"""
Calculates the log to the base 2 of the given input tensor, element-wise.
.. math::
......@@ -1365,7 +1365,7 @@ def log2(x, name=None):
def log10(x, name=None):
"""
r"""
Calculates the log to the base 10 of the given input tensor, element-wise.
.. math::
......@@ -1947,7 +1947,7 @@ def sign(x, name=None):
def tanh(x, name=None):
"""
r"""
Tanh Activation Operator.
.. math::
......
......@@ -494,7 +494,7 @@ def sort(x, axis=-1, descending=False, name=None):
def where(condition, x, y, name=None):
"""
r"""
Return a tensor of elements selected from either $x$ or $y$, depending on $condition$.
.. math::
......
......@@ -93,7 +93,7 @@ class Imdb(Dataset):
def _build_work_dict(self, cutoff):
word_freq = collections.defaultdict(int)
pattern = re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$")
pattern = re.compile(r"aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$")
for doc in self._tokenize(pattern):
for word in doc:
word_freq[word] += 1
......@@ -123,8 +123,8 @@ class Imdb(Dataset):
return data
def _load_anno(self):
pos_pattern = re.compile("aclImdb/{}/pos/.*\.txt$".format(self.mode))
neg_pattern = re.compile("aclImdb/{}/neg/.*\.txt$".format(self.mode))
pos_pattern = re.compile(r"aclImdb/{}/pos/.*\.txt$".format(self.mode))
neg_pattern = re.compile(r"aclImdb/{}/neg/.*\.txt$".format(self.mode))
UNK = self.word_idx['<unk>']
......
#!/usr/bin/env python3.7
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pylint: skip-file
import functools
......
......@@ -42,11 +42,11 @@ Diff: set(['test_parallel_executor_crf'])
for l in fn.readlines():
if l.find("Test ") != -1 and \
l.find("Passed") != -1:
m = re.search("Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l))
m = re.search(r"Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l))
passed.add(m.group(1))
if l.find("Start ") != -1:
start_parts = escape(l).split(" ")
m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l))
m = re.search(r"Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l))
started.add(m.group(1))
print("Diff: ", started - passed)
......
......@@ -101,7 +101,7 @@ class Docstring(object):
def _arg_with_type(self):
for t in self.d['Args']:
m = re.search('([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
m = re.search(r'([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
if m:
self.args[m.group(1)] = m.group(2)
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
usage: coverage_diff.py info_file diff_file > coverage-diff.info
"""
......
(The diffs for 8 additional files are collapsed and not shown.)