Unverified commit de8c0ba5, authored by Nyakku Shigure, committed by GitHub

[CodeStyle][W291] trim trailing whitespace in python file (#45937)

* trim trailing whitespace

* fix `.cmake-format.py`

* revert npu ut changes, avoid npu ci error
Parent cbe64cc1
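For context, flake8's W291 reports trailing whitespace at the end of a source line (W293 reports whitespace-only lines). A cleanup of this kind can be reproduced with a small script along the lines of the sketch below (illustrative only; the exact tooling used for this commit is not shown here, and a pre-commit trailing-whitespace hook or an autoformatter is the more common choice):

import pathlib

# Strip trailing whitespace (W291/W293) from every .py file under the
# current directory, preserving the original final newline if present.
for path in pathlib.Path('.').rglob('*.py'):
    text = path.read_text(encoding='utf-8')
    cleaned = '\n'.join(line.rstrip() for line in text.splitlines())
    if text.endswith('\n'):
        cleaned += '\n'
    if cleaned != text:
        path.write_text(cleaned, encoding='utf-8')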
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
......@@ -22,7 +22,7 @@ def GenerateFileStructureForFinalDygraph(eager_dir):
|- generated
| |- CMakeLists.txt
| | "add_subdirectory(forwards), add_subdirectory(backwards)"
|
|
| |- forwards
| |- "dygraph_functions.cc"
| |- "dygraph_functions.h"
......@@ -59,7 +59,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
|- generated
| |- CMakeLists.txt
| | "add_subdirectory(forwards), add_subdirectory(nodes)"
|
|
| |- forwards
| |- "dygraph_forward_functions.cc"
| |- CMakeLists.txt
......@@ -70,7 +70,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count):
| |- "nodes.h"
| |- CMakeLists.txt
| | "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})"
|
|
| |- dygraph_forward_api.h
"""
# Directory Generation
......
......@@ -403,9 +403,9 @@ LAYOUT_LOGIC_TEMPLATE=\
if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) {{
VLOG(5) << "Check and Prepare For LAYOUT";
paddle::small_vector<std::vector<paddle::experimental::Tensor>, egr::kSlotSmallVectorSize> tensors_vector = {};
{}
{}
paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune();
{}
paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune();
{}
{}
paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune();
......@@ -1772,7 +1772,7 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
autograd_api = self.grad_api_contents['invoke'].replace(
forward_api_name, forward_api_name + '_dygraph_function', 1)
grad_function_call_str = f"""
if (trace_backward) {{
if (trace_backward) {{
{indent}{autograd_api_out} api_output = {autograd_api};
{out_assign_str}}} else {{
{indent}{autograd_api_out} api_output = paddle::experimental::{self.namespace}{self.grad_api_contents['invoke']};
......
......@@ -20,7 +20,7 @@ def untar(fname, dirs):
"""
extract the tar.gz file
:param fname: the name of tar.gz file
:param dirs: the path of decompressed file
:param dirs: the path of decompressed file
:return: bool
"""
try:
......
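The `untar` helper documented above is collapsed in this diff; a minimal sketch of such a function, based only on the docstring (illustrative, not the repository's actual implementation), could look like:

import tarfile

def untar(fname, dirs):
    """Extract the tar.gz archive `fname` into the directory `dirs`; return True on success."""
    try:
        with tarfile.open(fname, 'r:gz') as archive:
            archive.extractall(path=dirs)
        return True
    except Exception as err:
        print(err)
        return False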
......@@ -106,8 +106,8 @@ def convert_pascalvoc_local2bin(args):
for object in objects:
bbox_sample = []
# start from 1
bbox_sample.append(
float(label_list.index(object.find('name').text)))
bbox_sample.append(float(label_list.index(
object.find('name').text)))
bbox = object.find('bndbox')
difficult = float(object.find('difficult').text)
bbox_sample.append(float(bbox.find('xmin').text) / im_width)
......@@ -131,7 +131,7 @@ def convert_pascalvoc_local2bin(args):
f1.close()
object_nums_sum = sum(object_nums)
    # The data should contain
    # number of images + all images data + an array that represents the object numbers of each image
# + labels of all objects in images + bboxes of all objects + difficulties of all objects
# so the target size should be as follows:
......@@ -210,8 +210,8 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path):
for object in objects:
bbox_sample = []
bbox_sample.append(
float(label_list.index(object.find('name').text)))
bbox_sample.append(float(label_list.index(
object.find('name').text)))
bbox = object.find('bndbox')
difficult = float(object.find('difficult').text)
bbox_sample.append(float(bbox.find('xmin').text) / im_width)
......@@ -230,7 +230,7 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path):
if line_idx % per_percentage:
print_processbar(line_idx / per_percentage)
        # The data should be stored in binary in the following sequence:
        # number of images->all images data->an array that represents object numbers in each image
# ->labels of all objects in images->bboxes of all objects->difficulties of all objects
f1.write(np.array(object_nums).astype('uint64').tobytes())
......@@ -258,9 +258,9 @@ def download_pascalvoc(data_url, data_dir, tar_targethash, tar_path):
def run_convert():
try_limit = 2
retry = 0
while not (os.path.exists(DATA_OUT_PATH) and
os.path.getsize(DATA_OUT_PATH) == BIN_FULLSIZE and BIN_TARGETHASH
== hashlib.md5(open(DATA_OUT_PATH, 'rb').read()).hexdigest()):
while not (os.path.exists(DATA_OUT_PATH) and os.path.getsize(DATA_OUT_PATH)
== BIN_FULLSIZE and BIN_TARGETHASH == hashlib.md5(
open(DATA_OUT_PATH, 'rb').read()).hexdigest()):
if os.path.exists(DATA_OUT_PATH):
sys.stderr.write(
"The existing binary file is broken. It is being removed...\n")
......@@ -275,52 +275,52 @@ def run_convert():
def main_pascalvoc_preprocess(args):
parser = argparse.ArgumentParser(
description="Convert the full pascalvoc val set or local data to binary file.",
description=
"Convert the full pascalvoc val set or local data to binary file.",
usage=None,
add_help=True)
parser.add_argument(
'--local',
action="store_true",
help="If used, user need to set --data_dir and then convert file")
parser.add_argument(
"--data_dir", default="", type=str, help="Dataset root directory")
parser.add_argument("--data_dir",
default="",
type=str,
help="Dataset root directory")
parser.add_argument(
"--img_annotation_list",
type=str,
default="test_100.txt",
help="A file containing the image file path and corresponding annotation file path"
help=
"A file containing the image file path and corresponding annotation file path"
)
parser.add_argument(
"--label_file",
type=str,
default="label_list",
help="List of object labels with same sequence as denoted in the annotation file"
help=
"List of object labels with same sequence as denoted in the annotation file"
)
parser.add_argument(
"--output_file",
type=str,
default="pascalvoc_small.bin",
help="File path of the output binary file")
parser.add_argument(
"--resize_h",
type=int,
default=RESIZE_H,
help="Image preprocess with resize_h")
parser.add_argument(
"--resize_w",
type=int,
default=RESIZE_W,
help="Image prerocess with resize_w")
parser.add_argument(
"--mean_value",
type=str,
default=MEAN_VALUE,
help="Image preprocess with mean_value")
parser.add_argument(
"--ap_version",
type=str,
default=AP_VERSION,
help="Image preprocess with ap_version")
parser.add_argument("--output_file",
type=str,
default="pascalvoc_small.bin",
help="File path of the output binary file")
parser.add_argument("--resize_h",
type=int,
default=RESIZE_H,
help="Image preprocess with resize_h")
parser.add_argument("--resize_w",
type=int,
default=RESIZE_W,
help="Image prerocess with resize_w")
parser.add_argument("--mean_value",
type=str,
default=MEAN_VALUE,
help="Image preprocess with mean_value")
parser.add_argument("--ap_version",
type=str,
default=AP_VERSION,
help="Image preprocess with ap_version")
args = parser.parse_args()
if args.local:
convert_pascalvoc_local2bin(args)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -20,6 +20,7 @@ import sys
class AbsNet(paddle.nn.Layer):
def __init__(self):
super(AbsNet, self).__init__()
......@@ -32,7 +33,6 @@ if __name__ == '__main__':
# build network
model = AbsNet()
# save inferencing format model
net = to_static(
model, input_spec=[InputSpec(
shape=[None, 1, 28, 28], name='x')])
net = to_static(model,
input_spec=[InputSpec(shape=[None, 1, 28, 28], name='x')])
paddle.jit.save(net, sys.argv[1])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -20,7 +20,6 @@ import paddle
import sys
model = EfficientNet.from_name('efficientnet-b4')
net = to_static(
model, input_spec=[InputSpec(
shape=[None, 3, 256, 256], name='x')])
net = to_static(model,
input_spec=[InputSpec(shape=[None, 3, 256, 256], name='x')])
paddle.jit.save(net, sys.argv[1])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -38,8 +38,8 @@ class MBConvBlock(nn.Layer):
self._block_args = block_args
self._bn_mom = global_params.batch_norm_momentum
self._bn_eps = global_params.batch_norm_epsilon
self.has_se = (self._block_args.se_ratio is not None) and (
0 < self._block_args.se_ratio <= 1)
self.has_se = (self._block_args.se_ratio
is not None) and (0 < self._block_args.se_ratio <= 1)
self.id_skip = block_args.id_skip # skip connection and drop connect
# Get static or dynamic convolution depending on image size
......@@ -49,13 +49,13 @@ class MBConvBlock(nn.Layer):
inp = self._block_args.input_filters # number of input channels
oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels
if self._block_args.expand_ratio != 1:
self._expand_conv = Conv2d(
in_channels=inp,
out_channels=oup,
kernel_size=1,
bias_attr=False)
self._bn0 = nn.BatchNorm2D(
num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps)
self._expand_conv = Conv2d(in_channels=inp,
out_channels=oup,
kernel_size=1,
bias_attr=False)
self._bn0 = nn.BatchNorm2D(num_features=oup,
momentum=self._bn_mom,
epsilon=self._bn_eps)
# Depthwise convolution phase
k = self._block_args.kernel_size
......@@ -67,32 +67,31 @@ class MBConvBlock(nn.Layer):
kernel_size=k,
stride=s,
bias_attr=False)
self._bn1 = nn.BatchNorm2D(
num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps)
self._bn1 = nn.BatchNorm2D(num_features=oup,
momentum=self._bn_mom,
epsilon=self._bn_eps)
# Squeeze and Excitation layer, if desired
if self.has_se:
num_squeezed_channels = max(1,
int(self._block_args.input_filters *
self._block_args.se_ratio))
self._se_reduce = Conv2d(
in_channels=oup,
out_channels=num_squeezed_channels,
kernel_size=1)
self._se_expand = Conv2d(
in_channels=num_squeezed_channels,
out_channels=oup,
kernel_size=1)
num_squeezed_channels = max(
1,
int(self._block_args.input_filters * self._block_args.se_ratio))
self._se_reduce = Conv2d(in_channels=oup,
out_channels=num_squeezed_channels,
kernel_size=1)
self._se_expand = Conv2d(in_channels=num_squeezed_channels,
out_channels=oup,
kernel_size=1)
# Output phase
final_oup = self._block_args.output_filters
self._project_conv = Conv2d(
in_channels=oup,
out_channels=final_oup,
kernel_size=1,
bias_attr=False)
self._bn2 = nn.BatchNorm2D(
num_features=final_oup, momentum=self._bn_mom, epsilon=self._bn_eps)
self._project_conv = Conv2d(in_channels=oup,
out_channels=final_oup,
kernel_size=1,
bias_attr=False)
self._bn2 = nn.BatchNorm2D(num_features=final_oup,
momentum=self._bn_mom,
epsilon=self._bn_eps)
self._swish = nn.Hardswish()
def forward(self, inputs, drop_connect_rate=None):
......@@ -121,8 +120,9 @@ class MBConvBlock(nn.Layer):
input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
if drop_connect_rate:
x = drop_connect(
x, prob=drop_connect_rate, training=self.training)
x = drop_connect(x,
prob=drop_connect_rate,
training=self.training)
x = x + inputs # skip connection
return x
......@@ -162,10 +162,14 @@ class EfficientNet(nn.Layer):
in_channels = 3 # rgb
out_channels = round_filters(
32, self._global_params) # number of output channels
self._conv_stem = Conv2d(
in_channels, out_channels, kernel_size=3, stride=2, bias_attr=False)
self._bn0 = nn.BatchNorm2D(
num_features=out_channels, momentum=bn_mom, epsilon=bn_eps)
self._conv_stem = Conv2d(in_channels,
out_channels,
kernel_size=3,
stride=2,
bias_attr=False)
self._bn0 = nn.BatchNorm2D(num_features=out_channels,
momentum=bn_mom,
epsilon=bn_eps)
# Build blocks
self._blocks = nn.LayerList([])
......@@ -186,16 +190,19 @@ class EfficientNet(nn.Layer):
block_args = block_args._replace(
input_filters=block_args.output_filters, stride=1)
for _ in range(block_args.num_repeat - 1):
self._blocks.append(
MBConvBlock(block_args, self._global_params))
self._blocks.append(MBConvBlock(block_args,
self._global_params))
# Head
in_channels = block_args.output_filters # output of final block
out_channels = round_filters(1280, self._global_params)
self._conv_head = Conv2d(
in_channels, out_channels, kernel_size=1, bias_attr=False)
self._bn1 = nn.BatchNorm2D(
num_features=out_channels, momentum=bn_mom, epsilon=bn_eps)
self._conv_head = Conv2d(in_channels,
out_channels,
kernel_size=1,
bias_attr=False)
self._bn1 = nn.BatchNorm2D(num_features=out_channels,
momentum=bn_mom,
epsilon=bn_eps)
# Final linear layer
self._avg_pooling = nn.AdaptiveAvgPool2D(1)
......@@ -253,20 +260,21 @@ class EfficientNet(nn.Layer):
advprop=False,
num_classes=1000,
in_channels=3):
model = cls.from_name(
model_name, override_params={'num_classes': num_classes})
load_pretrained_weights(
model, model_name, load_fc=(num_classes == 1000), advprop=advprop)
model = cls.from_name(model_name,
override_params={'num_classes': num_classes})
load_pretrained_weights(model,
model_name,
load_fc=(num_classes == 1000),
advprop=advprop)
if in_channels != 3:
Conv2d = get_same_padding_conv2d(
image_size=model._global_params.image_size)
out_channels = round_filters(32, model._global_params)
model._conv_stem = Conv2d(
in_channels,
out_channels,
kernel_size=3,
stride=2,
bias_attr=False)
model._conv_stem = Conv2d(in_channels,
out_channels,
kernel_size=3,
stride=2,
bias_attr=False)
return model
@classmethod
......@@ -280,5 +288,5 @@ class EfficientNet(nn.Layer):
""" Validates model name. """
valid_models = ['efficientnet-b' + str(i) for i in range(9)]
if model_name not in valid_models:
raise ValueError('model_name should be one of: ' + ', '.join(
valid_models))
raise ValueError('model_name should be one of: ' +
', '.join(valid_models))
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -96,15 +96,14 @@ class Conv2dDynamicSamePadding(nn.Conv2D):
dilation=1,
groups=1,
bias_attr=None):
super().__init__(
in_channels,
out_channels,
kernel_size,
stride,
0,
dilation,
groups,
bias_attr=bias_attr)
super().__init__(in_channels,
out_channels,
kernel_size,
stride,
0,
dilation,
groups,
bias_attr=bias_attr)
self.stride = self._stride if len(
self._stride) == 2 else [self._stride[0]] * 2
......@@ -113,10 +112,12 @@ class Conv2dDynamicSamePadding(nn.Conv2D):
kh, kw = self.weight.shape[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self._dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self._dilation[1] + 1 - iw, 0)
pad_h = max(
(oh - 1) * self.stride[0] + (kh - 1) * self._dilation[0] + 1 - ih,
0)
pad_w = max(
(ow - 1) * self.stride[1] + (kw - 1) * self._dilation[1] + 1 - iw,
0)
if pad_h > 0 or pad_w > 0:
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
......@@ -142,15 +143,18 @@ class Conv2dStaticSamePadding(nn.Conv2D):
# Calculate padding based on image size and save it
assert image_size is not None
ih, iw = image_size if type(
image_size) == list else [image_size, image_size]
ih, iw = image_size if type(image_size) == list else [
image_size, image_size
]
kh, kw = self.weight.shape[-2:]
sh, sw = self.stride
oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
pad_h = max((oh - 1) * self.stride[0] +
(kh - 1) * self._dilation[0] + 1 - ih, 0)
pad_w = max((ow - 1) * self.stride[1] +
(kw - 1) * self._dilation[1] + 1 - iw, 0)
pad_h = max(
(oh - 1) * self.stride[0] + (kh - 1) * self._dilation[0] + 1 - ih,
0)
pad_w = max(
(ow - 1) * self.stride[1] + (kw - 1) * self._dilation[1] + 1 - iw,
0)
if pad_h > 0 or pad_w > 0:
self.static_padding = nn.Pad2D([
pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
......@@ -166,6 +170,7 @@ class Conv2dStaticSamePadding(nn.Conv2D):
class Identity(nn.Layer):
def __init__(self, ):
super().__init__()
......@@ -225,9 +230,12 @@ class BlockDecoder(object):
def _encode_block_string(block):
"""Encodes a block to a string."""
args = [
'r%d' % block.num_repeat, 'k%d' % block.kernel_size, 's%d%d' %
(block.strides[0], block.strides[1]), 'e%s' % block.expand_ratio,
'i%d' % block.input_filters, 'o%d' % block.output_filters
'r%d' % block.num_repeat,
'k%d' % block.kernel_size,
's%d%d' % (block.strides[0], block.strides[1]),
'e%s' % block.expand_ratio,
'i%d' % block.input_filters,
'o%d' % block.output_filters
]
if 0 < block.se_ratio <= 1:
args.append('se%s' % block.se_ratio)
......@@ -291,7 +299,8 @@ def efficientnet(width_coefficient=None,
depth_coefficient=depth_coefficient,
depth_divisor=8,
min_depth=None,
image_size=image_size, )
image_size=image_size,
)
return blocks_args, global_params
......@@ -300,11 +309,10 @@ def get_model_params(model_name, override_params):
""" Get the block args and global params for a given model """
if model_name.startswith('efficientnet'):
w, d, s, p = efficientnet_params(model_name)
blocks_args, global_params = efficientnet(
width_coefficient=w,
depth_coefficient=d,
dropout_rate=p,
image_size=s)
blocks_args, global_params = efficientnet(width_coefficient=w,
depth_coefficient=d,
dropout_rate=p,
image_size=s)
else:
raise NotImplementedError('model name is not pre-defined: %s' %
model_name)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -28,6 +28,7 @@ CLASS_NUM = 10
# define a random dataset
class RandomDataset(paddle.io.Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
......@@ -41,6 +42,7 @@ class RandomDataset(paddle.io.Dataset):
class LinearNet(nn.Layer):
def __init__(self):
super(LinearNet, self).__init__()
self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
......@@ -69,8 +71,11 @@ adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters())
# create data loader
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
loader = paddle.io.DataLoader(
dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2)
loader = paddle.io.DataLoader(dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
# train
train(layer, loader, loss_fn, adam)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -19,7 +19,6 @@ from paddle.static import InputSpec
import sys
model = resnet50(True)
net = to_static(
model, input_spec=[InputSpec(
shape=[None, 3, 256, 256], name='x')])
net = to_static(model,
input_spec=[InputSpec(shape=[None, 3, 256, 256], name='x')])
paddle.jit.save(net, sys.argv[1])
......@@ -706,29 +706,29 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
{code_indent} std::vector<std::pair<const char*, std::vector<phi::DDim>>> input_shapes{{"""
for input_name in single_tensor_names[:-1]:
if input_name in self.optional_vars:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{input_name}", {input_name}_record_shapes}},"""
else:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{input_name}", {{"""
input_tensors = input_name_tensor_map[input_name]
for input_tensor, _ in input_tensors[:-1]:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} (*{input_tensor}).dims(),"""
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} (*{input_tensors[-1][0]}).dims()}}}},"""
if single_tensor_names[-1] in self.optional_vars:
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{single_tensor_names[-1]}",
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{single_tensor_names[-1]}",
{code_indent} {single_tensor_names[-1]}_record_shapes}}}};"""
else:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} {{"{single_tensor_names[-1]}", {{"""
input_tensors = input_name_tensor_map[single_tensor_names[-1]]
for input_tensor, _ in input_tensors[:-1]:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} (*{input_tensor}).dims(),"""
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
{code_indent} (*{input_tensors[-1][0]}).dims()}}}}}};"""
if list_tensor_names:
input_tensor_code = input_tensor_code + f"""
......@@ -757,14 +757,14 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
{code_indent} ddims_vec.emplace_back((*{input_tensor_truncate}[i]).dims());
{code_indent} }}"""
else:
input_tensor_code = input_tensor_code + f"""
input_tensor_code = input_tensor_code + f"""
ddims_vec.emplace_back((*{input_tensor}).dims());
{code_indent} """
input_tensor_code = input_tensor_code + f"""
{code_indent} input_shapes.emplace_back("{input_name}", ddims_vec);"""
input_tensor_code = input_tensor_code + f"""
{code_indent} platform::RecordOpInfoSupplement("{self.api}", input_shapes);
input_tensor_code = input_tensor_code + f"""
{code_indent} platform::RecordOpInfoSupplement("{self.api}", input_shapes);
{code_indent} }}"""
kernel_args = ["*dev_ctx"]
for param in kernel_param:
......
#!/bin/python
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -24,17 +24,17 @@ import time
def parse_args():
parser = argparse.ArgumentParser("conda build for paddlepaddle version")
parser.add_argument(
"--paddle_version",
type=str,
required=True,
help="paddle version for conda build.")
parser.add_argument("--paddle_version",
type=str,
required=True,
help="paddle version for conda build.")
args = parser.parse_args()
return args
class ConstantVar:
def __init__(self):
self.build = r"""
build:
......@@ -89,7 +89,7 @@ about:
self.build_const = r"""
"""
self.blt_const = r"""
self.blt_const = r"""
"""
self.python36 = r" - python>=3.6, <3.7"
......
......@@ -25,21 +25,21 @@ def auto_cast(enable=True,
dtype='float16'):
"""
Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode.
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
If enabled, the input data type (float32 or float16) of each operator is decided
by autocast algorithm for better performance.
Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in
imperative mode. It is used together with `decorator` to achieve Pure fp16 in imperative mode.
Args:
enable(bool, optional): Enable auto-mixed-precision or not. Default is True.
custom_white_list(set|list|tuple, optional): The custom white_list. It's the set of ops that support
fp16 calculation and are considered numerically-safe and performance-critical. These ops
fp16 calculation and are considered numerically-safe and performance-critical. These ops
will be converted to fp16.
custom_black_list(set|list|tuple, optional): The custom black_list. The set of ops that support fp16
calculation and are considered numerically-dangerous and whose effects may also be
calculation and are considered numerically-dangerous and whose effects may also be
observed in downstream ops. These ops will not be converted to fp16.
        level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, where the input data type of each operator is cast according to the white_list and black_list;
             O2 represents pure fp16, where all operator parameters and input data are cast to fp16, except for operators in the black_list, operators without an fp16 kernel, and batch norm. Default is O1 (amp).
dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'.
......@@ -69,7 +69,7 @@ def auto_cast(enable=True,
with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}):
c = a + b
print(c.dtype) # paddle.float32
with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'):
d = a + b
print(d.dtype) # paddle.float32
......@@ -85,15 +85,15 @@ def decorate(models,
master_weight=None,
save_dtype=None):
"""
Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing.
Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing.
When level is O2(pure float16/bfloat16), the decorate will cast all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm.
Commonly, it is used together with `auto_cast` to achieve Pure float16/bfloat16 in imperative mode.
Args:
models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None.
optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None.
        level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represents mixed precision, where the decorator does nothing;
             O2 represents pure float16/bfloat16, where the decorator casts all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm. Default is O1 (amp).
dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'.
        master_weight(bool, optional): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, the optimizer will use multi-precision at the O2 level. Default is None.
......@@ -102,7 +102,7 @@ def decorate(models,
Examples:
.. code-block:: python
.. code-block:: python
# required: gpu
# Demo1: single model and optimizer:
......@@ -118,7 +118,7 @@ def decorate(models,
with paddle.amp.auto_cast(enable=True, custom_white_list=None, custom_black_list=None, level='O2'):
output = model(data)
print(output.dtype) # FP16
# required: gpu
# Demo2: multi models and optimizers:
model2 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
......@@ -133,7 +133,7 @@ def decorate(models,
output2 = models[1](data)
print(output.dtype) # FP16
print(output2.dtype) # FP16
# required: gpu
# Demo3: optimizers is None:
model3 = paddle.nn.Conv2D(3, 2, 3, bias_attr=False)
......
......@@ -25,7 +25,7 @@ def _refresh_optimizer_state():
class GradScaler(AmpScaler):
"""
GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode.
GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode.
It controls the scaling of loss, helps avoiding numerical overflow.
The object of this class has nineteen methods `scale()`, `unscale_()`, `minimize()`, `step()`, `update()` and `get`/`set` api of parameters.
......@@ -36,19 +36,19 @@ class GradScaler(AmpScaler):
`update` is used to update the loss_scaling.
Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in
dynamic graph mode.
Args:
enable(bool, optional): Enable loss scaling or not. Default is True.
init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15.
incr_ratio(float, optional): The multiplier to use when increasing the loss
incr_ratio(float, optional): The multiplier to use when increasing the loss
scaling. Default is 2.0.
decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing
the loss scaling. Default is 0.5.
incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
incr_every_n_steps(int, optional): Increases loss scaling every n consecutive
steps with finite gradients. Default is 1000.
decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n
accumulated steps with nan or inf gradients. Default is 2.
        use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
Returns:
......@@ -57,7 +57,7 @@ class GradScaler(AmpScaler):
Examples:
.. code-block:: python
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
......@@ -68,10 +68,10 @@ class GradScaler(AmpScaler):
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.minimize(optimizer, scaled) # update parameters
scaler.minimize(optimizer, scaled) # update parameters
optimizer.clear_grad()
"""
......@@ -90,18 +90,18 @@ class GradScaler(AmpScaler):
def scale(self, var):
"""
Multiplies a Tensor by the scale factor and returns scaled outputs.
Multiplies a Tensor by the scale factor and returns scaled outputs.
        If this instance of :class:`GradScaler` is not enabled, outputs are returned unmodified.
Args:
var (Tensor): The tensor to scale.
Returns:
The scaled tensor or original tensor.
Examples:
.. code-block:: python
import paddle
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
......@@ -113,9 +113,9 @@ class GradScaler(AmpScaler):
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.minimize(optimizer, scaled) # update parameters
scaler.minimize(optimizer, scaled) # update parameters
optimizer.clear_grad()
"""
return super(GradScaler, self).scale(var)
......@@ -123,7 +123,7 @@ class GradScaler(AmpScaler):
def minimize(self, optimizer, *args, **kwargs):
"""
This function is similar as `optimizer.minimize()`, which performs parameters updating.
If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.
......@@ -149,9 +149,9 @@ class GradScaler(AmpScaler):
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.minimize(optimizer, scaled) # update parameters
scaler.minimize(optimizer, scaled) # update parameters
optimizer.clear_grad()
"""
return super(GradScaler, self).minimize(optimizer, *args, **kwargs)
......@@ -159,7 +159,7 @@ class GradScaler(AmpScaler):
def step(self, optimizer):
"""
This function is similar as `optimizer.step()`, which performs parameters updating.
If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped.
Otherwise, if `unscale_()` has not been called, it first unscales the scaled gradients of parameters, then updates the parameters.
......@@ -169,7 +169,7 @@ class GradScaler(AmpScaler):
Examples:
.. code-block:: python
# required: gpu
import paddle
......@@ -180,7 +180,7 @@ class GradScaler(AmpScaler):
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.step(optimizer) # update parameters
scaler.update() # update the loss scaling ratio
......@@ -212,11 +212,11 @@ class GradScaler(AmpScaler):
def update(self):
"""
Updates the loss_scaling.
Examples:
.. code-block:: python
# required: gpu
import paddle
......@@ -227,11 +227,11 @@ class GradScaler(AmpScaler):
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.step(optimizer) # update parameters
scaler.update() # update the loss scaling ratio
optimizer.clear_grad()
optimizer.clear_grad()
"""
if not self._enable:
return
......@@ -242,7 +242,7 @@ class GradScaler(AmpScaler):
def unscale_(self, optimizer):
"""
Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).
Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).
        If this instance of :class:`GradScaler` is not enabled, outputs are returned unmodified.
Args:
......@@ -250,7 +250,7 @@ class GradScaler(AmpScaler):
Returns:
The unscaled parameters or original parameters.
Examples:
.. code-block:: python
......@@ -265,12 +265,12 @@ class GradScaler(AmpScaler):
with paddle.amp.auto_cast():
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss) # scale the loss
scaled = scaler.scale(loss) # scale the loss
scaled.backward() # do backward
scaler.unscale_(optimizer) # unscale the parameter
scaler.step(optimizer)
scaler.update()
optimizer.clear_grad()
scaler.update()
optimizer.clear_grad()
"""
return super(GradScaler, self)._unscale(optimizer)
......@@ -280,7 +280,7 @@ class GradScaler(AmpScaler):
Returns:
            bool: returns True if loss scaling is enabled, otherwise False.
Examples:
.. code-block:: python
......@@ -304,11 +304,11 @@ class GradScaler(AmpScaler):
Returns:
            bool: returns False if fixed loss_scaling is used, and True if the loss scaling is updated dynamically.
Examples:
.. code-block:: python
# required: gpu,xpu
# required: gpu,xpu
import paddle
scaler = paddle.amp.GradScaler(enable=True,
init_loss_scaling=1024,
......@@ -328,7 +328,7 @@ class GradScaler(AmpScaler):
        Returns:
float: the initial loss scaling factor.
Examples:
.. code-block:: python
......@@ -352,10 +352,10 @@ class GradScaler(AmpScaler):
Args:
new_init_loss_scaling(float): The new_init_loss_scaling used to update initial loss scaling factor.
Examples:
.. code-block:: python
# required: gpu,xpu
import paddle
scaler = paddle.amp.GradScaler(enable=True,
......@@ -378,7 +378,7 @@ class GradScaler(AmpScaler):
        Returns:
float: the multiplier to use when increasing the loss scaling.
Examples:
.. code-block:: python
......@@ -402,7 +402,7 @@ class GradScaler(AmpScaler):
Args:
new_incr_ratio(float): The new_incr_ratio used to update the multiplier to use when increasing the loss scaling.
Examples:
.. code-block:: python
......@@ -428,7 +428,7 @@ class GradScaler(AmpScaler):
        Returns:
float: the less-than-one-multiplier to use when decreasing the loss scaling.
Examples:
.. code-block:: python
......@@ -452,7 +452,7 @@ class GradScaler(AmpScaler):
Args:
new_decr_ratio(float): The new_decr_ratio used to update the less-than-one-multiplier to use when decreasing the loss scaling.
Examples:
.. code-block:: python
......@@ -478,7 +478,7 @@ class GradScaler(AmpScaler):
        Returns:
int: the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.
Examples:
.. code-block:: python
......@@ -502,7 +502,7 @@ class GradScaler(AmpScaler):
Args:
new_incr_every_n_steps(int): The new_incr_every_n_steps used to update the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients.
Examples:
.. code-block:: python
......@@ -528,7 +528,7 @@ class GradScaler(AmpScaler):
        Returns:
int: the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.
Examples:
.. code-block:: python
......@@ -552,7 +552,7 @@ class GradScaler(AmpScaler):
Args:
new_decr_every_n_nan_or_inf(int): The new_decr_every_n_nan_or_inf used to update the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients.
Examples:
.. code-block:: python
......@@ -588,7 +588,7 @@ class GradScaler(AmpScaler):
decr_count(int): The number of recent consecutive skipped steps.
            use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True.
Examples:
.. code-block:: python
......@@ -610,10 +610,10 @@ class GradScaler(AmpScaler):
def load_state_dict(self, state_dict):
"""
Loads the scaler state.
Args:
state_dict(dict): scaler state. Should be an object returned from a call to `GradScaler.state_dict()`.
Examples:
.. code-block:: python
......
......@@ -247,7 +247,7 @@ def create_dct(n_mfcc: int,
"""Create a discrete cosine transform(DCT) matrix.
Args:
        n_mfcc (int): Number of mel frequency cepstral coefficients.
        n_mels (int): Number of mel filterbanks.
        norm (Optional[str], optional): Normalization type. Defaults to 'ortho'.
dtype (str, optional): The data type of the return matrix. Defaults to 'float32'.
......
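As a rough illustration of what the `create_dct` docstring above describes, a DCT-II basis matrix with optional 'ortho' normalization can be built in NumPy as follows (a sketch under common DCT conventions, not Paddle's actual implementation; `make_dct_matrix` is a hypothetical name):

import numpy as np

def make_dct_matrix(n_mfcc, n_mels, norm='ortho', dtype='float32'):
    # DCT-II basis of shape (n_mels, n_mfcc): cos(pi / n_mels * (m + 0.5) * k)
    m = np.arange(n_mels, dtype=np.float64)
    k = np.arange(n_mfcc, dtype=np.float64)
    dct = np.cos(np.pi / n_mels * np.outer(m + 0.5, k))
    if norm == 'ortho':
        # Orthonormal scaling: first column by 1/sqrt(2), all columns by sqrt(2/n_mels)
        dct[:, 0] *= 1.0 / np.sqrt(2.0)
        dct *= np.sqrt(2.0 / n_mels)
    return dct.astype(dtype)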
......@@ -24,12 +24,12 @@ __all__ = []
def backward(tensors, grad_tensors=None, retain_graph=False):
"""
Compute the backward gradients of given tensors.
Args:
tensors(list of Tensors): the tensors which the gradient to be computed. The tensors can not contain the same tensor.
grad_tensors(list of Tensors of None, optional): the init gradients of the `tensors`` .If not None, it must have the same length with ``tensors`` ,
and if any of the elements is None, then the init gradient is the default value which is filled with 1.0.
and if any of the elements is None, then the init gradient is the default value which is filled with 1.0.
If None, all the gradients of the ``tensors`` is the default value which is filled with 1.0.
Defaults to None.
......@@ -37,7 +37,7 @@ def backward(tensors, grad_tensors=None, retain_graph=False):
like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
        :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
Defaults to False.
Returns:
NoneType: None
......
......@@ -54,16 +54,16 @@ class LegacyPyLayerContext(object):
def save_for_backward(self, *tensors):
"""
Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.
.. note::
This API should be called at most once, and only inside `forward`.
This API should be called at most once, and only inside `forward`.
Args:
tensors(list of Tensors): Tensors to be stored.
Returns:
None
Examples:
.. code-block:: python
......@@ -94,7 +94,7 @@ class LegacyPyLayerContext(object):
Get the tensors stored by ``save_for_backward``.
Returns:
list of Tensors or None: If context contains tensors stored by `save_for_backward`,
list of Tensors or None: If context contains tensors stored by `save_for_backward`,
then return these tensors, otherwise return None.
Examples:
......@@ -147,7 +147,7 @@ class CPyLayer(object):
Returns:
tensors or other types : output of PyLayer.
Examples:
.. code-block:: python
......@@ -210,15 +210,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules:
1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod.
Their first argument should be a context and `None` can not be included in the returned result.
    2. Input of backward contains a context as the first argument, and the remaining arguments are the
    gradients of forward's output tensors, so the number of backward's input tensors equals
    the number of forward's output tensors. If you need the forward's inputs or outputs in `backward`,
you can use `save_for_backward` to store the required tensors, and then use them in the backward.
3. Output of backward function can only be `Tensor` or tuple/list of `Tensor`.
    Output tensors of backward are the gradients of forward's input tensors,
    so the number of backward's output tensors equals the number of forward's input tensors.
After building the custom Layer, run it through the `apply` method.
Examples:
.. code-block:: python
......@@ -259,8 +259,8 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
@staticmethod
def forward(ctx, *args, **kwargs):
"""
        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
        the first argument, followed by any number of arguments (tensors or other types).
`None` can not be included in the returned result.
Args:
......@@ -269,7 +269,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
Returns:
tensors or other types : output of PyLayer.
Examples:
.. code-block:: python
......@@ -297,9 +297,9 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
@staticmethod
def backward(ctx, *args, **kwargs):
"""
        This is a function to calculate the gradient. It is to be overloaded by subclasses.
        It must accept an object of `PyLayerContext` as the first argument, and the remaining
        arguments are the gradients of forward's output tensors. Output tensors of backward
        are the gradients of forward's input tensors.
Args:
......@@ -308,7 +308,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
Returns:
Tensor or list of Tensors: The gradient of forward's input tensor(s).
Examples:
.. code-block:: python
......@@ -340,16 +340,16 @@ class EagerPyLayerContext(object):
def save_for_backward(self, *tensors):
"""
Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.
.. note::
This API should be called at most once, and only inside `forward`.
This API should be called at most once, and only inside `forward`.
Args:
tensors(list of Tensors): Tensors to be stored.
Returns:
None
Examples:
.. code-block:: python
......@@ -380,7 +380,7 @@ class EagerPyLayerContext(object):
Get the tensors stored by ``save_for_backward``.
Returns:
list of Tensors or None: If context contains tensors stored by `save_for_backward`,
list of Tensors or None: If context contains tensors stored by `save_for_backward`,
then return these tensors, otherwise return None.
Examples:
......@@ -410,11 +410,11 @@ class EagerPyLayerContext(object):
def mark_not_inplace(self, *args):
"""
Marks inputs as not inplace.
This should be called at most once, only from inside the `forward` method,
This should be called at most once, only from inside the `forward` method,
and all arguments should be Tensor inputs.
If the Tensor returned by `forward` method is the same as the Tensor input of forward,
and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output.
If the Tensor returned by `forward` method is the same as the Tensor input of forward,
and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output.
Thereby preventing the auto grad information of the input Tensor from being overwritten.
Examples:
......@@ -427,7 +427,7 @@ class EagerPyLayerContext(object):
def forward(ctx, x):
ctx.mark_not_inplace(x)
return x
@staticmethod
def backward(ctx, grad_output):
out = grad_output.exp()
......@@ -438,7 +438,7 @@ class EagerPyLayerContext(object):
attn_layers = []
for idx in range(0, 2):
attn_layers.append(Exp())
for step in range(0, 2):
a = x
for j in range(0,2):
......@@ -450,7 +450,7 @@ class EagerPyLayerContext(object):
def mark_non_differentiable(self, *args):
"""
Marks outputs as non-differentiable.
This should be called at most once, only from inside the `forward` method,
This should be called at most once, only from inside the `forward` method,
and all arguments should be tensor outputs.
This will mark outputs as not requiring gradients, increasing the
......@@ -564,8 +564,8 @@ class EagerPyLayer(
@staticmethod
def forward(ctx, *args, **kwargs):
"""
        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
        the first argument, followed by any number of arguments (tensors or other types).
`None` can not be included in the returned result.
Args:
......@@ -574,7 +574,7 @@ class EagerPyLayer(
Returns:
tensors or other types : output of PyLayer.
Examples:
.. code-block:: python
......@@ -602,9 +602,9 @@ class EagerPyLayer(
@staticmethod
def backward(ctx, *args):
"""
        This is a function to calculate the gradient. It is to be overloaded by subclasses.
        It must accept an object of `PyLayerContext` as the first argument, and the remaining
        arguments are the gradients of forward's output tensors. Output tensors of backward
        are the gradients of forward's input tensors.
Args:
......@@ -613,7 +613,7 @@ class EagerPyLayer(
Returns:
Tensor or list of Tensors: The gradient of forward's input tensor(s).
Examples:
.. code-block:: python
......
......@@ -17,30 +17,30 @@ __all__ = []
def batch(reader, batch_size, drop_last=False):
"""
    This operator creates a batched reader which combines the data from the
    input reader into batched data.
Args:
reader(generator): the data reader to read from.
batch_size(int): size of each mini-batch.
drop_last(bool, optional): If set to True, the last batch is dropped when
drop_last(bool, optional): If set to True, the last batch is dropped when
the size of last batch is not equal to batch_size, if set to False,
it will not. Default: False.
Returns:
The batched reader.
The batched reader.
Return Type:
generator
generator
Examples:
.. code-block:: python
import paddle
def reader():
for i in range(10):
yield i
batch_reader = paddle.batch(reader, batch_size=2)
for data in batch_reader():
print(data)
......
......@@ -25,7 +25,7 @@ long_type = int
def to_text(obj, encoding='utf-8', inplace=False):
"""
    All strings in PaddlePaddle should be represented as literal strings.
This function will convert object to a literal string without any encoding.
Especially, if the object type is a list or set container, we will iterate
all items in the object and convert them to literal string.
......@@ -43,7 +43,7 @@ def to_text(obj, encoding='utf-8', inplace=False):
Returns:
Decoded result of obj
Examples:
.. code-block:: python
......@@ -121,7 +121,7 @@ def _to_text(obj, encoding):
def to_bytes(obj, encoding='utf-8', inplace=False):
"""
    All strings in PaddlePaddle should be represented as literal strings.
This function will convert object to a bytes with specific encoding.
Especially, if the object type is a list or set container, we will iterate
all items in the object and convert them to bytes.
......@@ -140,7 +140,7 @@ def to_bytes(obj, encoding='utf-8', inplace=False):
Returns:
Decoded result of obj
Examples:
.. code-block:: python
......
......@@ -119,7 +119,7 @@ def XPUPlace(dev_id):
.. code-block:: python
# required: xpu
import paddle
place = paddle.device.XPUPlace(0)
"""
......@@ -163,15 +163,15 @@ def MLUPlace(dev_id):
def get_cudnn_version():
"""
    This function returns the version of cuDNN. The return value is an int which represents the
    cuDNN version. For example, a return value of 7600 means the cuDNN version is 7.6.
Returns:
        int: An int value which represents the cuDNN version. If cuDNN is not installed, it returns None.
Examples:
.. code-block:: python
import paddle
cudnn_version = paddle.device.get_cudnn_version()
......@@ -305,7 +305,7 @@ def set_device(device):
Examples:
.. code-block:: python
import paddle
paddle.device.set_device("cpu")
......@@ -322,13 +322,13 @@ def get_device():
"""
    This function gets the current global device on which the program is running.
    It is a string such as 'cpu', 'gpu:x', 'xpu:x', 'mlu:x' or 'npu:x'. If the global device is not
    set, it returns 'gpu:x' when CUDA is available, otherwise it returns 'cpu'.
Examples:
.. code-block:: python
import paddle
device = paddle.device.get_device()
......@@ -394,7 +394,7 @@ def get_all_custom_device_type():
"""
Get all available custom device types.
Returns:
Returns:
A list of all available custom device types.
Examples:
......
......@@ -42,12 +42,12 @@ def current_stream(device=None):
Return the current CUDA stream by the device.
Parameters:
device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from.
device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from.
If device is None, the device is the current device. Default: None.
Returns:
CUDAStream: the stream to the device.
Examples:
.. code-block:: python
......@@ -82,7 +82,7 @@ def synchronize(device=None):
Parameters:
device(paddle.CUDAPlace()|int, optional): The device or the ID of the device.
If device is None, the device is the current device. Default: None.
Examples:
.. code-block:: python
......@@ -111,7 +111,7 @@ def synchronize(device=None):
def device_count():
'''
Return the number of GPUs available.
Returns:
int: the number of GPUs available.
......@@ -158,7 +158,7 @@ def extract_cuda_device_id(device, op_name):
Return the id of the given cuda device. It is just a utility that will not be exposed to users.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'.
Default: None.
......@@ -197,12 +197,12 @@ def max_memory_allocated(device=None):
Return the peak size of gpu memory that is allocated to tensor of the given device.
.. note::
        The size of GPU memory allocated to a tensor is 256-byte aligned in Paddle, which may be larger than the memory size that the tensor actually needs.
For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
......@@ -232,8 +232,8 @@ def max_memory_reserved(device=None):
Return the peak size of GPU memory that is held by the allocator of the given device.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
......@@ -263,12 +263,12 @@ def memory_allocated(device=None):
Return the current size of gpu memory that is allocated to tensor of the given device.
.. note::
The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
......@@ -298,14 +298,14 @@ def memory_reserved(device=None):
Return the current size of GPU memory that is held by the allocator of the given device.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x'. If device is None, the device is the current device.
Default: None.
Return:
int: The current size of GPU memory that is held by the allocator of the given device, in bytes.
Examples:
Examples:
.. code-block:: python
# required: gpu
......@@ -389,18 +389,18 @@ def get_device_properties(device=None):
Return the properties of given device.
Args:
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x' which to get the properties of the
device from. If device is None, the device is the current device.
device(paddle.CUDAPlace or int or str): The device, the id of the device or
the string name of device like 'gpu:x' which to get the properties of the
device from. If device is None, the device is the current device.
Default: None.
Returns:
_gpuDeviceProperties: The properties of the device which include ASCII string
identifying device, major compute capability, minor compute capability, global
_gpuDeviceProperties: The properties of the device which include ASCII string
identifying device, major compute capability, minor compute capability, global
memory available and the number of multiprocessors on the device.
Examples:
.. code-block:: python
# required: gpu
......@@ -484,7 +484,7 @@ def get_device_capability(device=None):
Return the major and minor revision numbers defining the device's compute capability which are got from CUDA function `cudaDeviceProp <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g1bf9d625a931d657e08db2b4391170f0>`_.
Parameters:
device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device.
device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device.
Returns:
tuple(int,int): the major and minor revision numbers defining the device's compute capability.
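A small sketch querying both the properties and the compute capability of the current device (GPU build assumed; the printed values depend on the hardware):

.. code-block:: python

    import paddle

    props = paddle.device.cuda.get_device_properties()         # current device
    major, minor = paddle.device.cuda.get_device_capability()  # e.g. (7, 0) on V100
    print(props, major, minor)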
......
......@@ -49,14 +49,14 @@ class LinkType(IntEnum):
class DeviceMesh(core.DeviceMesh):
r"""
The class `DeviceMesh` describes the topology of physical devices.
The class `DeviceMesh` describes the topology of physical devices.
Args:
mesh (list|numpy.array): an N-dimensional array that describes the topology
of logical processes.
dim_names (list, optional): the i-th element of this list gives the name of the
i-th dimension.
Returns:
None
......@@ -65,9 +65,9 @@ class DeviceMesh(core.DeviceMesh):
import paddle
import paddle.distributed as dist
paddle.enable_static()
mesh = dist.DeviceMesh([[2, 4, 5], [0, 1, 3]])
assert mesh.shape == [2, 3]
assert mesh.device_ids == [2, 4, 5, 0, 1, 3]
......
......@@ -901,7 +901,7 @@ class Completer:
def _complete_high_order_grad_annotation(self, serial_main_program=None):
"""
NOTE:
NOTE:
[HighOrderGrad] Complete the annotation of vars and ops only for high order gradient.
This function is temporary to support high order gradient, and will be removed in the future.
"""
......
......@@ -21,18 +21,18 @@ from ..utils import get_logger
class Converter(object):
"""
Converter is a class object for auto parallel to convert tensors from
one parallel strategy to another one. Tensors will be merged and sliced
Converter is a class object for auto parallel to convert tensors from
one parallel strategy to another one. Tensors will be merged and sliced
according to their strategies when the strategies are different.
"""
def __init__(self, tensors_dict, pre_strategy, cur_strategy):
"""
Args:
tensors_dict(dict): tensors' value of all ranks that to be converted.
tensors_dict(dict): tensors' value of all ranks that to be converted.
key is tensor's name(str), value is all ranks' data(list(numpy.ndarray))
pre_strategy(dict): tensors' distributed attribute of last training process.
key is tensor's name(str), value is tensor's distributed attribute in last
key is tensor's name(str), value is tensor's distributed attribute in last
training process.
cur_strategy(dict): tensors' distributed attribute of current rank.
key is tensor's name(str), value is tensor's distributed attribute in current
......@@ -432,7 +432,7 @@ class Converter(object):
process_group = [0, 1, 2]
slice_tensor = _slice_tensor(complete_tensor, [[], [], [2, 4]], 3)
# slice_tensor:
# slice_tensor:
# [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]
index = _get_sliced_index(rank, complete_shape, dims_mapping
......
......@@ -433,9 +433,9 @@ class CostModel(object):
def merge_linear(self):
r'''
This method does the following:
This method does the following:
If X depends on Y only, they must be run sequentially.
[ e.g. A ->- C ->- D D and E depends on C only.]
[ e.g. A ->- C ->- D D and E depends on C only.]
[ B ->-/ \->- E C depends on A and B. ]
We merge X and Y into a new node and sum up their cost time.
'''
......@@ -453,7 +453,7 @@ class CostModel(object):
r'''
This method does the following:
If a node has more than one successor, there is *branch*.
[ e.g. A ->- B ->- D ]
[ e.g. A ->- B ->- D ]
[ \->- C ->- / , B and C can be run at the same time ]
case 1: if B or C is null (or D is directly dependent on A),
it's equivalent to A->C->D or A->B->D, fall back to self.merge_linear
......@@ -789,12 +789,12 @@ def estimate_cost(distributed_program, cluster, pipeline_config,
standalone_cost_data, batch_size):
"""
Estimated cost from distributed program, cluster model and distributed settings.
Args:
distributed_program(list): list of paddle programs
cluster(Cluster): cluster model
cluster(Cluster): cluster model
standalone_cost_data(CostData): cost data given by paddle.core
batch_size(int): batch size of the training workload
batch_size(int): batch size of the training workload
pipeline_config(list): configuration of pipeline stage allocation
"""
# the following line is left for now, cluster model will be involved in the future
......
......@@ -25,11 +25,11 @@ from .utils import _linear_idx2coordinate
class DistributedTensor:
"""
DistributedTensor represents the distribution of tensor on the process group and
DistributedTensor represents the distribution of tensor on the process group and
local tensors can be created by DistributedTensor.
Only support even sharding now and uneven sharding will be supported in the future.
Local tensor information can be obtained from the DistributedTensor instance object,
or obtained by the static methods provided by DistributedTensor,
Local tensor information can be obtained from the DistributedTensor instance object,
or obtained by the static methods provided by DistributedTensor,
including shard (i.e. the index in the serial tensor), offsets, and sizes.
"""
......
......@@ -39,8 +39,8 @@ def shard_tensor(x, dist_attr=None):
x (Tensor): the tensor to be sharded.
dist_attr (dict): the tensor distributed attributes. The accepted attributes are as follow:
"process_mesh": a nested list an to describe the mesh topology of logical processes.
"dims_mapping": a list to describe the mapping between `x` and `process_mesh`, the dimension
`i` of `x` is split across the dimension `dims_mapping[i]` of `process_mesh`,
"dims_mapping": a list to describe the mapping between `x` and `process_mesh`, the dimension
`i` of `x` is split across the dimension `dims_mapping[i]` of `process_mesh`,
where -1 means that tensor dimension is not split.
Both process_mesh and dims_mapping are optional and users can specify as need.
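A hedged sketch of the dict-style dist_attr described above; the process mesh and dims_mapping values are illustrative only:

.. code-block:: python

    import paddle
    import paddle.distributed as dist

    paddle.enable_static()
    x = paddle.ones([4, 6])
    # split dim 0 of x across dim 0 of a 2x2 process mesh; dim 1 stays replicated
    dist.shard_tensor(x, dist_attr={"process_mesh": [[0, 1], [2, 3]],
                                    "dims_mapping": [0, -1]})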
......@@ -52,7 +52,7 @@ def shard_tensor(x, dist_attr=None):
import paddle
import paddle.distributed as dist
paddle.enable_static()
x = paddle.ones([4, 6])
......@@ -76,12 +76,12 @@ def shard_op(op_fn, dist_attr=None):
Args:
op_fn (callable): a callable operator or module to be sharded.
dist_attr (dict): the operator distributed attributes. The accepted attributes are classified into
two categories. The first category describes the distributed attributes shared by all inputs and
dist_attr (dict): the operator distributed attributes. The accepted attributes are classified into
two categories. The first category describes the distributed attributes shared by all inputs and
outputs, and only `process_mesh` can be specified now. The second category describes distributed
attributes for inputs or outputs same as the `dist_attr` of `shard_tensor`. All of them are
optional and users can specify them as need. Note that `process_mesh` for operators must be the
same as these process_meshes for inputs and outputs.
same as these process_meshes for inputs and outputs.
Returns:
list: the outputs of the function `op_fn`, which are annotated with distributed attributes.
......@@ -93,7 +93,7 @@ def shard_op(op_fn, dist_attr=None):
import paddle.distributed as dist
paddle.enable_static()
x = paddle.ones([4, 6])
y = paddle.zeros([4, 6])
dist_add = dist.shard_op(paddle.add,
......
......@@ -176,7 +176,7 @@ def register_distributed_operator_impl(op_type, dist_impl):
def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True):
"""
Here we just return the first compatible implementation.
Here we just return the first compatible implementation.
This will be improved by cost model in the future.
"""
op_type = dist_op.serial_op.type
......@@ -327,9 +327,9 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank):
Args:
dist_ctx (DistributedContext): dist context.
op (Operator): the current (backward) operator which might need.
act_grad_names (list): list of input activation grads variable name to the current operator.
out_grad_names (list): list of the output parameter's grads variable name of the current operator.
op (Operator): the current (backward) operator which might need.
act_grad_names (list): list of input activation grads variable name to the current operator.
out_grad_names (list): list of the output parameter's grads variable name of the current operator.
rank (int): global ranks index for current process.
"""
dp_group = None
......@@ -360,13 +360,13 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank):
def sync_and_scale_gradients(dist_ctx, op, dp_group, allreduce_var_names):
"""
insert the allreduce and scale ops for gradients of model
insert the allreduce and scale ops for gradients of model
parameters for operator in data parallelism.
Args:
dist_ctx (DistributedContext): dist context.
op (Operator): the current (backward) operator which might need.
allreduce_var_names (list): list of the parameter's grads variable name in the current operator output.
op (Operator): the current (backward) operator which might need.
allreduce_var_names (list): list of the parameter's grads variable name in the current operator output.
"""
op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op)
......@@ -417,14 +417,14 @@ def sync_and_scale_gradients(dist_ctx, op, dp_group, allreduce_var_names):
def gradient_synchronization(dist_ctx, op, act_grad_names, out_grad_names,
rank):
"""
conduct the allreduce and scaling (by dp size) for gradients of model
conduct the allreduce and scaling (by dp size) for gradients of model
parameters for operator in data parallelism.
Args:
dist_ctx (DistributedContext): dist context.
op (Operator): the current (backward) operator which might need.
act_grad_names (list): list of input activation grads variable name to the current operator.
out_grad_names (list): list of the output parameter's grads variable name of the current operator.
op (Operator): the current (backward) operator which might need.
act_grad_names (list): list of input activation grads variable name to the current operator.
out_grad_names (list): list of the output parameter's grads variable name of the current operator.
rank (int): global ranks index for current process.
"""
......
......@@ -57,9 +57,9 @@ class AutoParallelizer:
AutoParallelizer is the main controller class to do the auto parallel process.
And the auto parallel process will be triggered in the wrapped parallelize function.
To facilitate the auto parallelization, it will contain information about program, cluster and the
related context. In this basic version, the program information will be retrieved from the
related context. In this basic version, the program information will be retrieved from the
Fleet object, and the cluster information can be retrieved from the newly created Cluster object,
and the context information can be retrieved from the newly created DistributedContext.
and the context information can be retrieved from the newly created DistributedContext.
"""
def __init__(self, fleet):
......
......@@ -39,7 +39,7 @@ class Partitioner(object):
warning:: Partitioner is experimental and subject to change.
Partitioner convert a program into another program.
Given a serial program which has been auto completed with shard annotation, the Partitioner
Given a serial program which has been auto completed with shard annotation, the Partitioner
convert the serial program into a "distributed" program. The Partitioner will modify the serial
program in following two ways, which is also the major difference between serial and distributed program:
1. partition op: replace a serial op into its corresponding dist op inferred from the shard annotation
......
......@@ -38,7 +38,7 @@ def _flatten_nested_list(nested_list):
class ProcessMesh(object):
r"""
The class `ProcessMesh` describes the topology of logical processes.
The class `ProcessMesh` describes the topology of logical processes.
A mesh is an N-dimensional array. The shape of the N-dimensional
array represents the topology of logical processes and every
element of the N-dimensional array represent a logical process. For
......@@ -52,9 +52,9 @@ class ProcessMesh(object):
Args:
mesh (list): an N-dimensional array (nested list) that describes the topology
of logical processes. The shape of the N-dimensional array
represents the topology of logical processes and every
represents the topology of logical processes and every
element of the N-dimensional array represents a logical process.
Returns:
None
......@@ -66,9 +66,9 @@ class ProcessMesh(object):
import paddle
import paddle.distributed as dist
paddle.enable_static()
mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
assert mesh.topology == [2, 3]
assert mesh.processes == [2, 4, 5, 0, 1, 3]
......
......@@ -19,14 +19,14 @@ from paddle.fluid import core
class ProcessMesh(core.ProcessMesh):
r"""
The class `ProcessMesh` describes the topology of logical processes.
The class `ProcessMesh` describes the topology of logical processes.
Args:
mesh (list|numpy.array): an N-dimensional array that describes the topology
of logical processes.
dim_names (list, optional): the i-th element of this list gives the name of the
i-th dimension.
Returns:
None
......@@ -35,9 +35,9 @@ class ProcessMesh(core.ProcessMesh):
import paddle
import paddle.distributed as dist
paddle.enable_static()
mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]])
assert mesh.shape == [2, 3]
assert mesh.process_ids == [2, 4, 5, 0, 1, 3]
......
......@@ -23,12 +23,12 @@ from .trial import OptimizationTunerTrial as Trial
class AlgorithmBase(ABC):
"""
A tuning algorithm is a class to find an optimal configuration
given the selected tuning optimization pass(es) and the arguments to be tuned.
A tuning algorithm is a class to find an optimal configuration
given the selected tuning optimization pass(es) and the arguments to be tuned.
Different optimization pass(es) will correspond to a different algorithm,
where different search space **pruning rules** will be applied.
In other words, the key "algorithm" for this class is the
In other words, the key "algorithm" for this class is the
set of search space pruning rules specific to the given optimization scenario.
"""
_REGISTERED_ALGORITHMS = {}
......@@ -52,9 +52,9 @@ class AlgorithmBase(ABC):
def collect_model_info(self, main_prog, startup_prog):
"""
Collect the static model info (from programs) that could be used to
prune candidate trials and save tuning time. For instance,
model info like the number of model parameters and activation memory could be
Collect the static model info (from programs) that could be used to
prune candidate trials and save tuning time. For instance,
model info like the number of model parameters and activation memory could be
used to prune candidate trials and decide the next trial.
"""
pass
......@@ -70,7 +70,7 @@ class AlgorithmBase(ABC):
@abstractmethod
def update(self, results):
"""
Update the algorithm with the results of the last trial. This information is used to
Update the algorithm with the results of the last trial. This information is used to
prune the search space for future trials.
"""
pass
......
......@@ -33,7 +33,7 @@ class TuningConfig(object):
"""
A uniform config wrap:
distributed strategy: the user defined configuration for optimization pass
tuning config: configuration for the tuning process: mode (profile or cost model), log dir, extra tuning config for optimization like search range for specific
tuning config: configuration for the tuning process: mode (profile or cost model), log dir, extra tuning config for optimization like search range for specific
"""
def __init__(self, user_config, strategy):
......
......@@ -161,7 +161,7 @@ def _copy_context(ref_dist_context):
class OptimizationTuner:
"""
OptimizationTuner is used to manage the tuning procedure of hyper-parameters (configs)
OptimizationTuner is used to manage the tuning procedure of hyper-parameters (configs)
of Optimization Pass in AutoParallel.
"""
......@@ -466,7 +466,7 @@ class OptimizationTuner:
Return the best optimization configuration found in the tuning.
Returns:
A object of fleet.DistributedStrategy with best configuration.
A object of fleet.DistributedStrategy with best configuration.
"""
assert self._best_iter >= 0, "The best configuration is not found yet !"
best_trial = self._finished_trials[self._best_iter]
......@@ -481,7 +481,7 @@ class OptimizationTuner:
summary_ = """
Tuning Result Summary
Run total {} trials with {} min.
The best trial is: [{}], whose configuration is following:
The best trial is: [{}], whose configuration is following:
""".format(len(self._finished_trials),
(time.time() - self._tuning_start_time) / 60,
best_trial.name)
......@@ -508,8 +508,8 @@ The best trial is: [{}], whose configuration is following:
def tune(self):
"""
Performs the search for the best hyperparameter configurations
for the selected optimization pass(es).
Performs the search for the best hyperparameter configurations
for the selected optimization pass(es).
"""
# step1: collect model info which might be used for
......
......@@ -171,7 +171,7 @@ def print_program_with_dist_attr(program, dist_context=None):
def _get_comm_group(processes, shape, axis, rank):
"""
Given a rank and the processes mesh the rank belongs to,
Given a rank and the processes mesh the rank belongs to,
compute the communication peers of the rank based on the give axis in the mesh.
Example: 16 processes managed in a 4-dimensional mesh with a shape of [2, 2, 2, 2].
......@@ -205,7 +205,7 @@ def _get_comm_group(processes, shape, axis, rank):
def _get_idx_in_axis(processes, shape, axis, rank):
"""
Given a rank and the processes mesh the rank belongs to,
Given a rank and the processes mesh the rank belongs to,
compute the index of the rank in given axis.
Example: 27 processes managed in a 3-dimensional mesh with a shape of [3, 3, 3].
......@@ -226,20 +226,20 @@ def _coordinate2linear_idx(mesh_shape, coordinate):
"""
convert a coordinate in multidimensional mesh space into a scalar idx in linear space.
it uses row-major order for dimension conversion.
it uses row-major order for dimension conversion.
so it has: [most_significant_dim, ..., least_significant_dim]
assume:
assume:
the size of i-th dimension to be: S[i]
the index of j-th dimension is: I[j]
linear_idx of a n dimensional coordinate is:
linear_idx of a n dimensional coordinate is:
I[n-1] * (S[n-2] * S[n-3] * S[n-4] * .... S[0]) +
I[n-2] * ( S[n-3] * S[n-4] * .... S[0]) +
I[n-3] * ( S[n-4] * .... S[0]) +
I[n-2] * ( S[n-3] * S[n-4] * .... S[0]) +
I[n-3] * ( S[n-4] * .... S[0]) +
...
I[1] * ( S[0]) +
I[1] * ( S[0]) +
I[0]
"""
......@@ -279,7 +279,7 @@ def _linear_idx2coordinate(mesh_shape, linear_idx):
map a linear scalar into multidimensional mesh space and return its coordinate in that space.
it is the inverse function of _coordinate2linear_idx.
assume:
assume:
the size of i-th dimension to be: S[i]
the index of j-th dimension is: I[j]
......@@ -460,8 +460,8 @@ def save_distributed_checkpoint(program,
addition_info=None,
is_integrated=False,
dist_context=None):
"""
Save model parameter state, optimizer state, distributed attribute and
"""
Save model parameter state, optimizer state, distributed attribute and
additional information of each rank.
Args:
......@@ -502,7 +502,7 @@ def save_distributed_checkpoint(program,
def load_distributed_checkpoint(checkpoint_path, dist_attr_path):
"""
"""
Load parameter, optimizer, distributed attribute and addition_info.
Args:
......@@ -512,7 +512,7 @@ def load_distributed_checkpoint(checkpoint_path, dist_attr_path):
Returns:
param_dict(dict): parameters' value of all ranks.
dist_attr(dict): parameters' distributed attribute.
addition_info(dict): additional information user saved in last training.
addition_info(dict): additional information user saved in last training.
Notes:
The return, 'addition_info', is belonging to the first file of checkpoint_path by default.
......@@ -520,9 +520,9 @@ def load_distributed_checkpoint(checkpoint_path, dist_attr_path):
Examples:
.. code-block:: python
ckpt_path = ['./model_state_rank0.pdmodel',
ckpt_path = ['./model_state_rank0.pdmodel',
'./model_state_rank1.pdmodel']
dist_attr_path = ['./dist_attr_rank0.pdattr',
dist_attr_path = ['./dist_attr_rank0.pdattr',
'./dist_attr_rank1.pdattr']
param_dict, dist_attr, add_info = load_distributed_checkpoint(ckpt_path, dist_attr_path)
"""
......@@ -542,7 +542,7 @@ def load_checkpoint_into_program(checkpoint_path,
dist_attr_path,
program,
dist_context=None):
"""
"""
Load parameter, optimizer, distributed attribute and addition_info into model.
Args:
......@@ -553,7 +553,7 @@ def load_checkpoint_into_program(checkpoint_path,
Returns:
addition_info(dict): user saved in last train.
Notes:
The return, 'addition_info', is belonging to the first file of checkpoint_path by default.
......@@ -561,9 +561,9 @@ def load_checkpoint_into_program(checkpoint_path,
.. code-block:: python
exe.run(startup_program)
ckpt_path = ['./model_state_rank0.pdmodel',
ckpt_path = ['./model_state_rank0.pdmodel',
'./model_state_rank1.pdmodel']
dist_attr_path = ['./dist_attr_rank0.pdattr',
dist_attr_path = ['./dist_attr_rank0.pdattr',
'./dist_attr_rank1.pdattr']
load_checkpoint_into_program(ckpt_path, dist_attr_path, main_program)
"""
......@@ -590,7 +590,7 @@ def load_checkpoint_into_program(checkpoint_path,
def load_parameter_into_program(param_dict, program):
"""
"""
Load parameters into program.
Args:
......@@ -672,7 +672,7 @@ def _load_distributed_state_dict(checkpoint_path):
def get_dist_attr(program, dist_context=None):
"""
"""
Get distributed attribute of current rank.
Args:
......@@ -935,7 +935,7 @@ def _get_sliced_param_index(rank, complete_shape, dims_mapping, process_shape,
process_group = [0, 1, 2]
slice_param = _slice_parameter(complete_param, [[], [], [2, 4]], 3)
# slice_param:
# slice_param:
# [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])]
index = _get_sliced_param_index(rank, complete_shape, dims_mapping
......
......@@ -579,10 +579,10 @@ def destroy_process_group(group=None):
Destroy a given group for communication
Args:
group (ProcessGroup, optional): The group to be destroyed. All of process groups, including
the default group, will be destroyed and the distributed
group (ProcessGroup, optional): The group to be destroyed. All of process groups, including
the default group, will be destroyed and the distributed
environment will be deinitialized.
Returns : None
Examples:
......@@ -776,7 +776,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True):
Reduce a tensor over all ranks so that all get the result.
As shown below, one process is started with a GPU and the data of this process is represented
by its group rank. The reduce operator is sum. Through all_reduce operator,
by its group rank. The reduce operator is sum. Through all_reduce operator,
each GPU will have the sum of the data from all GPUs.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/allreduce.png
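A hedged two-GPU sketch of the behaviour described above (launched with paddle.distributed.launch; the script name is a placeholder):

.. code-block:: python

    # e.g. python -m paddle.distributed.launch --gpus 0,1 allreduce_demo.py
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    # each rank starts from its own rank id; after all_reduce every rank holds the sum
    data = paddle.to_tensor([float(dist.get_rank())])
    dist.all_reduce(data)        # op defaults to ReduceOp.SUM
    print(data.numpy())          # [1.] on both ranks when world size is 2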
......@@ -1662,10 +1662,10 @@ def _parallel_linear(x,
"""
Parallel Linear
axis is the dimension of the parameter of the linear layer.
axis is the dimension of the parameter of the linear layer.
axis = 0: the row dimension
axis = 1: the col dimension
"""
if group is not None and not group.is_member():
return
......@@ -1840,7 +1840,7 @@ def split(x,
of which is a matrix with N/num_partitions rows and M column.
The linear layer put on single card is shown as below, the input variable is represented by X,
the weight matrix is represented by W and the output variable is O. The linear layer on a single card is
the weight matrix is represented by W and the output variable is O. The linear layer on a single card is
simple matrix multiplication operation, O = X * W.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_single.png
......@@ -1863,14 +1863,14 @@ def split(x,
of which is a matrix with N rows and M/num_partitions column.
The linear layer put on single card has been illustrated on case 2 and Column Parallel Linear
is shown as below. The Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and
these split matrices respectively multiply the input. Finally, apply AllGather on the output from each card to get the final output.
is shown as below. The Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and
these split matrices respectively multiply the input. Finally, apply AllGather on the output from each card to get the final output.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col.png
:width: 800
:alt: split_col
:align: center
As observed, the column parallel linear and row parallel linear can be combined to skip one ALLGATHER communication
operator. Furthermore, the Attention and MLP can be combined to improve the performance as shown below.
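A pure-NumPy check of why the column-wise split reproduces the single-card result; this only illustrates the math and does not call paddle.distributed.split itself:

.. code-block:: python

    import numpy as np

    X = np.random.rand(4, 8)             # input
    W = np.random.rand(8, 6)             # full weight matrix
    W_col1, W_col2 = W[:, :3], W[:, 3:]  # split along the column dimension

    O_full = X @ W
    O_parallel = np.concatenate([X @ W_col1, X @ W_col2], axis=1)  # the "AllGather" step
    assert np.allclose(O_full, O_parallel)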
......@@ -2019,10 +2019,10 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True):
data type of the input Tensors.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True.
Returns:
None.
Examples:
.. code-block:: python
......@@ -2116,16 +2116,16 @@ def alltoall_single(in_tensor,
Args:
in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8 or bool.
out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor.
in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor``
in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor``
must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None.
out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor``
out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor``
must be divisible by group size and ``out_tensor`` will be gathered averagely from all participators. Default: None.
group (Group, optional): The group instance return by ``new_group`` or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True.
Returns:
None, if ``use_calc_stream`` is set to ``True``; ``Task`` of ``group``, if ``use_calc_stream`` is set to ``False``.
Examples:
.. code-block:: python
......@@ -2207,7 +2207,7 @@ def send(tensor, dst=0, group=None, use_calc_stream=True):
dst (int): The destination rank id.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculate stream or communication stream. Default: True.
Returns:
None.
......@@ -2272,7 +2272,7 @@ def recv(tensor, src=0, group=None, use_calc_stream=True):
src (int): The source rank id.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculate stream or communication stream. Default: True.
Returns:
None.
......@@ -2353,11 +2353,11 @@ def isend(tensor, dst, group=None):
should be float16, float32, float64, int32, int64, int8, uint8 or bool.
dst (int): The destination rank.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
Returns:
A distributed task object.
Warning:
Warning:
This API only supports the dygraph mode.
Examples:
......@@ -2407,7 +2407,7 @@ def irecv(tensor, src=None, group=None):
Returns:
A distributed task object.
Warning:
Warning:
This API only supports the dygraph mode.
Examples:
......@@ -2456,7 +2456,7 @@ class P2POp(object):
The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``.
tensor (Tensor): Tensor to send or receive.
peer (int): The destination or source rank.
group (Group, optional): The group instance return by new_group or None for global
group (Group, optional): The group instance return by new_group or None for global
default group. Default: None.
"""
......@@ -2505,7 +2505,7 @@ def batch_isend_irecv(p2p_op_list):
"""
Send or Receive a batch of tensors asynchronously and return a list of requests.
Process each of the point-to-point operations in ``p2p_op_list`` and return the
Process each of the point-to-point operations in ``p2p_op_list`` and return the
corresponding tasks. NCCL are currently supported.
Args:
......@@ -2516,9 +2516,9 @@ def batch_isend_irecv(p2p_op_list):
Returns:
A list of distributed tasks returned by calling the corresponding
op in the op_list.
op in the op_list.
Warning:
Warning:
This API only supports the dygraph mode.
Examples:
......@@ -2546,7 +2546,7 @@ def batch_isend_irecv(p2p_op_list):
for task in tasks:
task.wait()
print(recv_t)
# paddle.tensor([1, 2]) # Rank-0
# paddle.tensor([0, 1]) # Rank-1
......@@ -2587,15 +2587,15 @@ def reduce_scatter(tensor,
tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type
should be float16, float32, float64, int32, int64, int8, uint8 or bool.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
group (Group, optional): The group instance return by new_group or None for global
group (Group, optional): The group instance return by new_group or None for global
default group. Default: None.
use_calc_stream (bool, optional): Whether this op should be an async op.
Returns:
Async task handle, if use_calc_stream is set to False.
None, if use_calc_stream or if not part of the group.
Warning:
Warning:
This API only supports the dygraph mode.
......@@ -2652,7 +2652,7 @@ def _reduce_scatter_base(output,
Args:
output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool.
input (Tensor): Input tensor that is of size output tensor size times world size. Its data type
input (Tensor): Input tensor that is of size output tensor size times world size. Its data type
should be float16, float32, float64, int32, int64, int8, uint8 or bool.
op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
group (ProcessGroup, optional): The process group to work on. If None,
......
......@@ -114,12 +114,12 @@ class DistributedStrategy(object):
"""
DistributedStrategy is the main configuration entry for distributed training of Paddle.
All of the distributed training configurations can be configured in DistributedStrategy,
such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
asynchronous update parameter server(ASGD), etc.
DistributedStrategy can be serialized into protobuf file or deserialized from protobuf file
Users who run local training usually configure BuildStrategy and ExecutionStrategy, and
Users who run local training usually configure BuildStrategy and ExecutionStrategy, and
DistributedStrategy supports configurations from BuildStrategy and ExecutionStrategy
"""
......@@ -290,7 +290,7 @@ class DistributedStrategy(object):
def a_sync(self):
"""
Indicating whether we are using asynchronous stochastic gradient descent updates
for training. This property is valid when we are using parameter server training,
for training. This property is valid when we are using parameter server training,
which is implied by setting an appropriate RoleMaker
Default value: True
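A minimal sketch of toggling this property (parameter-server mode assumed; the strategy is then passed to fleet.distributed_optimizer as usual):

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.a_sync = False   # synchronous updates; True (the default) means async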
......@@ -372,7 +372,7 @@ class DistributedStrategy(object):
@property
def trainer_desc_configs(self):
"""
Set trainer desc configurations.
Set trainer desc configurations.
**Notes**:
dump_fields_path(str): the path of dump fields
......@@ -381,7 +381,7 @@ class DistributedStrategy(object):
dump_param(list(str)): the param that you want to dump
stat_var_names(list(str)):
stat_var_names(list(str)):
Examples:
......@@ -443,12 +443,12 @@ class DistributedStrategy(object):
@property
def fs_client_param(self):
"""
Set fs client configurations.
Set fs client configurations.
**Notes**:
uri(str): the uri of fs client
user(str): the user_name of fs client
passwd(str): the passwd of fs client
hadoop_bin(str):
hadoop_bin(str):
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
......@@ -1001,15 +1001,15 @@ class DistributedStrategy(object):
@property
def last_comm_group_size_MB(self):
"""
Specifying the size of gradient to fuse in Mega-Bytes when
the last group of each batch communicates. Making the last group
small is useful to improve performance.
Specifying the size of gradient to fuse in Mega-Bytes when
the last group of each batch communicates. Making the last group
small is useful to improve performance.
Default value: 1
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.last_comm_group_size_MB = 2
......@@ -1027,7 +1027,7 @@ class DistributedStrategy(object):
@property
def find_unused_parameters(self):
"""
Indicating whether we are using find_unused_parameters to
Indicating whether we are using find_unused_parameters to
find unused parameters in DataParallel.
Default value: False
......@@ -1104,20 +1104,20 @@ class DistributedStrategy(object):
@property
def recompute_configs(self):
"""
Set recompute configurations.
Set recompute configurations.
**Note**:
checkpoints(list): list of string name of checkpoints. In general, the recompute
strategy of current implementation should have some manually assign checkpoints.
enable_offload(bool): enable recompute checkpoints offload feature. this feature
enable_offload(bool): enable recompute checkpoints offload feature. this feature
will offload the checkpoint to host memory to allow even larger batch size. since
the memcpy from host to device takes time, it is a trade off between larger batch
size and training speed.
checkpoint_shape(list): list of int that specifies the shape of checkpoints. So far
recompute-offload requires all checkpoints to be of the same shape, and every dimension
specified here should be determined ("-1" is not allowed).
specified here should be determined ("-1" is not allowed).
Examples:
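A hedged sketch of the keys listed in the note above; the checkpoint names are placeholders for real activation variable names:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.recompute = True
    strategy.recompute_configs = {
        "checkpoints": ["act_0_placeholder", "act_1_placeholder"],
        "enable_offload": True,
        "checkpoint_shape": [100, 512, 1024],
    }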
......@@ -1145,7 +1145,7 @@ class DistributedStrategy(object):
def sharding(self):
"""
Indicating whether we are using sharding Optimizer for memory
optimization. We implement the sharding optimizer following the ZeRO-DP
optimization. We implement the sharding optimizer following the ZeRO-DP
idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054).
Model parameters and Optimizer State are sharded into different ranks allowing to fit larger model.
......@@ -1174,26 +1174,26 @@ class DistributedStrategy(object):
@property
def sharding_configs(self):
"""
Set sharding configurations.
Set sharding configurations.
**Note**:
sharding_segment_strategy(string, optional): strategy used to segment the program (forward & backward operations). Two strategies are
available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and
sharding_segment_strategy(string, optional): strategy used to segment the program (forward & backward operations). Two strategies are
available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and
communication. Default is segment_broadcast_MB.
segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and
segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and
after every segment_broadcast_MB of parameters is broadcast, the program will be cut into one segment.
This configuration will affect the communication speed in sharding training, and should be an empirical value decided by your model size and network topology.
Only enable when sharding_segment_strategy = segment_broadcast_MB. Default is 32.0 .
segment_anchors(list): list of anchors used to segment the program, which allows a finer control of program segmentation.
segment_anchors(list): list of anchors used to segment the program, which allows a finer control of program segmentation.
this strategy is experimental by now. Only enable when sharding_segment_strategy = segment_anchors.
sharding_degree(int, optional): specific the number of gpus within each sharding parallelism group; and sharding will be turn off if sharding_degree=1. Default is 8.
gradient_merge_acc_step(int, optional): specific the accumulation steps in gradient merge; and gradient merge will be turn off if gradient_merge_acc_step=1. Default is 1.
optimize_offload(bool, optional): enable the optimizer offload, which will offload the moment vars to Host memory in order to save GPU memory for fitting a larger model.
optimize_offload(bool, optional): enable the optimizer offload, which will offload the moment vars to Host memory in order to save GPU memory for fitting a larger model.
the moment vars will be prefetched from and offloaded to Host memory during the update stage. It is a strategy that trades off between training speed and GPU memory, and is recommended to be turned on only when gradient_merge_acc_step is large, where
the number of update stages will be relatively small compared with forward & backward's. Default is False.
......@@ -1203,7 +1203,7 @@ class DistributedStrategy(object):
pp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1.
pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from the backward stage to the update (optimize) stage when pipeline parallelism is on.
pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from the backward stage to the update (optimize) stage when pipeline parallelism is on.
This configuration will affect the communication speed of Hybrid parallelism training depending on the network topology. This strategy is experimental for now. Default is False.
optimize_cast(bool, optional): [Hybrid parallelism ONLY] Move the cast op of AMP which cast fp32 param to fp16 param to optimizer. optimize_cast will persist fp16 param, it
......@@ -1385,11 +1385,11 @@ class DistributedStrategy(object):
"""
Set pipeline parallelism configurations. In pipeline parallelism,
different parts of neural networks are running on different GPUS.
There are Tensor queue buffers between each pair of neighboring GPUs
There are Tensor queue buffers between each pair of neighboring GPUs
that are responsible for synchronizing hidden Tensor results between
GPUs. Pipeline parallelism consists of several producer-consumer style
hardware pairs, such as GPU-GPU, CPU-GPU, GPU-XPU. The best way to speedup
pipeline parallelism is to make the size of Tensor in Tensor queue smaller,
pipeline parallelism is to make the size of Tensor in Tensor queue smaller,
so that we will have a faster producer for downstream consumers.
**Notes**:
......@@ -1475,7 +1475,7 @@ class DistributedStrategy(object):
@property
def hybrid_configs(self):
"""
Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism
Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism
needs to meet the following relationships
total_number_GPUs = dp_degree * mp_degree * pp_degree
......@@ -1483,7 +1483,7 @@ class DistributedStrategy(object):
**Note**:
dp_degree(int): set number of GPUs in a data parallel group. Default -1.
This value should be an integer greater than 0.
If it is not set, or set to -1, its value will be inferred
If it is not set, or set to -1, its value will be inferred
based on the total number of cards.
mp_degree(int): set number of GPUs in a model parallel group. Default 1
pp_degree(int): set number of GPUs in a pipeline parallel group. Default 1
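A hedged sketch for an 8-GPU job split as 2 (dp) x 2 (mp) x 2 (pp), which satisfies the relationship total_number_GPUs = dp_degree * mp_degree * pp_degree stated above:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": 2,   # data parallel groups
        "mp_degree": 2,   # model (tensor) parallel groups
        "pp_degree": 2,   # pipeline parallel stages
    }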
......@@ -1567,7 +1567,7 @@ class DistributedStrategy(object):
def adaptive_localsgd(self):
"""
Indicating whether we are using Adaptive Local SGD training. Default Value: False
For more details, please refer to `Adaptive Communication Strategies to Achieve
For more details, please refer to `Adaptive Communication Strategies to Achieve
the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
......@@ -1770,8 +1770,8 @@ class DistributedStrategy(object):
@property
def lars(self):
"""
Set lars configurations. lars is used to deal with the convergence problems when the global
batch size is larger than 8k. For more details, please refer to
Set lars configurations. lars is used to deal with the convergence problems when the global
batch size is larger than 8k. For more details, please refer to
[Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
Default Value: False
......@@ -1802,8 +1802,8 @@ class DistributedStrategy(object):
**Notes**:
**lars_coeff (float)**: trust ratio in lars formula.
**lars_weight_decay** (float): weight decay coefficient in lars formula.
**epsilon (float)**: argument used to avoid potential division-by-zero
when computing the local lr;
**epsilon (float)**: argument used to avoid potential division-by-zero
when computing the local lr;
**exclude_from_weight_decay ([string])**: a list of name strings of layers which
will be excluded from weight decay in the lars formula.
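A hedged sketch of the LARS knobs listed above; the values are illustrative, not recommendations:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.lars = True
    strategy.lars_configs = {
        "lars_coeff": 0.001,
        "lars_weight_decay": 0.0005,
        "epsilon": 0,
        "exclude_from_weight_decay": ["batch_norm", ".b"],
    }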
......@@ -1832,9 +1832,9 @@ class DistributedStrategy(object):
@property
def lamb(self):
"""
Set lamb configurations. lamb is used to deal with the convergence problems for large
batch size training, especially for attention-related models like BERT. For more details,
please refer to
Set lamb configurations. lamb is used to deal with the convergence problems for large
batch size training, especially for attention-related models like BERT. For more details,
please refer to
[Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).
Default Value: False
......@@ -1908,7 +1908,7 @@ class DistributedStrategy(object):
def auto(self):
"""
Indicating whether we are using auto-parallel configuration
This feature is currently an experimental feature. Currently,
This feature is currently an experimental feature. Currently,
auto-parallelism can be used only when a user does not set any other
strategy configs except auto. For details, please reference the following
code example
......@@ -1943,7 +1943,7 @@ class DistributedStrategy(object):
def semi_auto(self):
"""
Indicating whether we are using semi-auto parallel function
This feature is currently an experimental feature. Currently,
This feature is currently an experimental feature. Currently,
auto-parallelism can be used only when a user does not set any other
strategy configs except semi-auto. For details, please reference the following
code example
......@@ -2047,7 +2047,7 @@ class DistributedStrategy(object):
activation_bits(int): quantization bit number for activation. Default is 8.
not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope,
not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope,
the corresponding op will not be quantized.
algo(str): Other quantization training algorithm.
......
......@@ -24,11 +24,11 @@ def wait_server_ready(endpoints):
"""
Wait until parameter servers are ready, use connect_ex to detect
port readiness.
Args:
endpoints (list|tuple): endpoints string list, like:
["127.0.0.1:8080", "127.0.0.1:8081"]
Examples:
.. code-block:: python
......
......@@ -750,7 +750,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
def _get_previous_trainers(self):
"""
invoked by heter worker
invoked by heter worker
"""
if not self._role_is_generated:
self._generate_role()
......@@ -761,7 +761,7 @@ class PaddleCloudRoleMaker(RoleMakerBase):
def _get_next_trainers(self):
"""
invoked by heter worker
invoked by heter worker
"""
if not self._role_is_generated:
self._generate_role()
......
......@@ -116,7 +116,7 @@ class StrategyCompiler(StrategyCompilerBase):
"""
StrategyCompiler is responsible for meta optimizers combination
Generally, a user can define several distributed strategies that
can generate several meta optimizers. The combination of these
can generate several meta optimizers. The combination of these
meta optimizers should have the right order to apply the optimizers'
minimize function.
This class is responsible for the executable distributed optimizer
......@@ -162,7 +162,7 @@ class StrategyCompiler(StrategyCompilerBase):
"""
Meta Optimizer Type A: rewrite forward, backward. e.g. recompute, async, sync, pipeline.
results will be split into async, sync, pipeline
Meta Optimizer Type B: rewrite forward,
Meta Optimizer Type B: rewrite forward,
e.g. AMP and the corresponding backward is generated by rewritten forward
Meta Optimizer Type C: rewrite backward. e.g. gradient fusion
Meta Optimizer Type D: rewrite optimize. e.g. lars, lamb, localsgd, gradient merge, dgc
......
......@@ -32,7 +32,7 @@ class ParallelMode(object):
- DATA_PARALLEL: Distribute input data to different devices.
- TENSOR_PARALLEL: Shards tensors in the network to different devices.
- PIPELINE_PARALLEL: Place different layers of the network on different devices.
- SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states
- SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states
corresponding to the parameters to each device.
Examples:
......
......@@ -286,7 +286,7 @@ class UtilBase(object):
def print_on_rank(self, message, rank_id):
"""
Worker of rank `rank_id` prints some message.
Worker of rank `rank_id` prints some message.
Args:
message(str): Log to be printed.
......
......@@ -22,7 +22,7 @@ class DataGenerator(object):
"""
DataGenerator is a general Base class for user to inherit
A user who wants to define his/her own python processing logic
with paddle.distributed.InMemoryDataset/QueueDataset should
with paddle.distributed.InMemoryDataset/QueueDataset should
inherit this class.
"""
......@@ -96,7 +96,7 @@ class DataGenerator(object):
def run_from_stdin(self):
'''
This function reads the data row from stdin, parses it with the
process function, and further parses the return value of the
process function, and further parses the return value of the
process function with the _gen_str function. The parsed data will
be written to stdout and the corresponding protofile will be
generated.
......@@ -152,7 +152,7 @@ class DataGenerator(object):
def generate_sample(self, line):
'''
This function needs to be overridden by the user to process the
This function needs to be overridden by the user to process the
original data row into a list or tuple.
Args:
......@@ -160,8 +160,8 @@ class DataGenerator(object):
Returns:
Returns the data processed by the user.
The data format is list or tuple:
[(name, [feasign, ...]), ...]
The data format is list or tuple:
[(name, [feasign, ...]), ...]
or ((name, [feasign, ...]), ...)
For example:
......@@ -290,7 +290,7 @@ class MultiSlotDataGenerator(DataGenerator):
and updating proto_info information.
The input line will be in this format:
>>> [(name, [feasign, ...]), ...]
>>> [(name, [feasign, ...]), ...]
>>> or ((name, [feasign, ...]), ...)
The output will be in this format:
>>> [ids_num id1 id2 ...] ...
......
......@@ -46,7 +46,7 @@ class DatasetBase(object):
fs_ugi="",
download_cmd="cat"):
"""
should be called only once in user's python scripts to initialize settings of the dataset instance.
should be called only once in user's python scripts to initialize settings of the dataset instance.
Normally, it is called by InMemoryDataset or QueueDataset.
Args:
......@@ -341,7 +341,7 @@ class DatasetBase(object):
class InMemoryDataset(DatasetBase):
"""
:api_attr: Static Graph
It will load data into memory and shuffle data before training.
Examples:
......@@ -376,8 +376,8 @@ class InMemoryDataset(DatasetBase):
Args:
kwargs: Keyword arguments. Currently, we support following keys in **kwargs:
merge_size(int): ins size to merge, if merge_size > 0, set merge by line id,
instances of same line id will be merged after shuffle,
merge_size(int): ins size to merge, if merge_size > 0, set merge by line id,
instances of same line id will be merged after shuffle,
you should parse line id in data generator. default is -1.
parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False.
parse_content(bool): Set if Dataset need to parse content. default is False.
......@@ -404,7 +404,7 @@ class InMemoryDataset(DatasetBase):
parse_content=True,
fea_eval=True,
candidate_size=10000)
"""
merge_size = kwargs.get("merge_size", -1)
if merge_size > 0:
......@@ -449,8 +449,8 @@ class InMemoryDataset(DatasetBase):
data_feed_type(str): data feed type used in c++ code. default is "MultiSlotInMemoryDataFeed".
queue_num(int): Dataset output queue num, training threads get data from queues. default is-1, which is set same as thread number in c++.
merge_size(int): ins size to merge, if merge_size > 0, set merge by line id,
instances of same line id will be merged after shuffle,
merge_size(int): ins size to merge, if merge_size > 0, set merge by line id,
instances of same line id will be merged after shuffle,
you should parse line id in data generator. default is -1.
parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False.
parse_content(bool): Set if Dataset need to parse content. default is False.
......@@ -463,7 +463,7 @@ class InMemoryDataset(DatasetBase):
Examples:
.. code-block:: python
import paddle
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
......@@ -479,7 +479,7 @@ class InMemoryDataset(DatasetBase):
fea_eval=True,
candidate_size=10000)
dataset.update_settings(batch_size=2)
"""
for key in kwargs:
if key == "pipe_command":
......@@ -515,10 +515,10 @@ class InMemoryDataset(DatasetBase):
:api_attr: Static Graph
should be called only once in user's python scripts to initialize settings of the dataset instance
Args:
kwargs: Keyword arguments. Currently, we support following keys in **kwargs:
batch_size(int): batch size. It will be effective during training. default is 1.
thread_num(int): thread num, it is the num of readers. default is 1.
use_var(list): list of variables. Variables which you will use. default is [].
......@@ -561,7 +561,7 @@ class InMemoryDataset(DatasetBase):
dataset.set_filelist(
["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
dataset.load_into_memory()
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
startup_program = paddle.static.Program()
......@@ -569,7 +569,7 @@ class InMemoryDataset(DatasetBase):
exe.run(startup_program)
exe.train_from_dataset(main_program, dataset)
os.remove("./test_queue_dataset_run_a.txt")
os.remove("./test_queue_dataset_run_b.txt")
......@@ -831,7 +831,7 @@ class InMemoryDataset(DatasetBase):
def load_into_memory(self, is_shuffle=False):
"""
:api_attr: Static Graph
Load data into memory
Args:
......@@ -842,7 +842,7 @@ class InMemoryDataset(DatasetBase):
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
......@@ -1035,7 +1035,7 @@ class InMemoryDataset(DatasetBase):
def release_memory(self):
"""
:api_attr: Static Graph
Release InMemoryDataset memory data, when data will not be used again.
Examples:
......@@ -1043,7 +1043,7 @@ class InMemoryDataset(DatasetBase):
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
......@@ -1144,7 +1144,7 @@ class InMemoryDataset(DatasetBase):
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
......@@ -1180,13 +1180,13 @@ class InMemoryDataset(DatasetBase):
"""
set fea eval mode for slots shuffle to debug the importance level of
slots(features), fea_eval need to be set True for slots shuffle.
Args:
record_candidate_size(int): size of instances candidate to shuffle
record_candidate_size(int): size of instances candidate to shuffle
one slot
fea_eval(bool): whether enable fea eval mode to enable slots shuffle.
default is True.
Examples:
.. code-block:: python
......@@ -1202,12 +1202,12 @@ class InMemoryDataset(DatasetBase):
def slots_shuffle(self, slots):
"""
Slots Shuffle
Slots Shuffle is a shuffle method at the slots level, which is usually used
Slots Shuffle
Slots Shuffle is a shuffle method at the slots level, which is usually used
in sparse features with a large scale of instances. Compare the metric, i.e.
auc, while doing slots shuffle on one or several slots with a baseline to
auc, while doing slots shuffle on one or several slots with a baseline to
evaluate the importance level of slots (features).
Args:
slots(list[string]): the set of slots(string) to do slots shuffle.
......@@ -1216,7 +1216,7 @@ class InMemoryDataset(DatasetBase):
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset._init_distributed_settings(fea_eval=True)
slots = ["slot1", "slot2", "slot3", "slot4"]
......@@ -1442,7 +1442,7 @@ class BoxPSDataset(InMemoryDataset):
def begin_pass(self):
"""
Begin Pass
Notify BoxPS to load sparse parameters of next pass to GPU Memory
Notify BoxPS to load sparse parameters of next pass to GPU Memory
Examples:
.. code-block:: python
......@@ -1456,7 +1456,7 @@ class BoxPSDataset(InMemoryDataset):
def end_pass(self, need_save_delta):
"""
End Pass
Notify BoxPS that current pass ended
Notify BoxPS that current pass ended
Examples:
.. code-block:: python
......@@ -1522,12 +1522,12 @@ class BoxPSDataset(InMemoryDataset):
def slots_shuffle(self, slots):
"""
Slots Shuffle
Slots Shuffle is a shuffle method at the slots level, which is usually used
Slots Shuffle
Slots Shuffle is a shuffle method at the slots level, which is usually used
in sparse features with a large scale of instances. Compare the metric, i.e.
auc, while doing slots shuffle on one or several slots with a baseline to
auc, while doing slots shuffle on one or several slots with a baseline to
evaluate the importance level of slots (features).
Args:
slots(list[string]): the set of slots(string) to do slots shuffle.
......@@ -1585,7 +1585,7 @@ class BoxPSDataset(InMemoryDataset):
def preprocess_instance(self):
"""
Merge pv instance and convey it from input_channel to input_pv_channel.
Merge pv instance and convey it from input_channel to input_pv_channel.
It will be effective when enable_pv_merge_ is True.
Examples:
......
......@@ -360,7 +360,7 @@ class ElasticManager(object):
def _parse_np(self, np: str):
"""
np format is "MIN" or "MIN:MAX"
np format is "MIN" or "MIN:MAX"
"""
np_str = np or os.getenv('PADDLE_ELASTIC_NP', "0")
np_dict = np_str.split(":")
......
......@@ -174,14 +174,14 @@ class Fleet(object):
Args:
role_maker (RoleMakerBase, optional): A ``RoleMakerBase`` containing the configuration
of environment variables related to distributed training. If you did not initialize
of environment variables related to distributed training. If you did not initialize
the rolemaker by yourself, it will be automatically initialized to PaddleRoleMaker.
The default value is None.
is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program
is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program
runs on Collective mode or ParameterServer mode. True means the program runs on
Collective mode, and False means running on ParameterServer mode. The default value
Collective mode, and False means running on ParameterServer mode. The default value
is False.
strategy (DistributedStrategy): Extra properties for distributed training.
strategy (DistributedStrategy): Extra properties for distributed training.
For details, please refer to paddle.distributed.fleet.DistributedStrategy. Default: None.
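A minimal collective-mode sketch of these arguments (hedged illustration; assumes a GPU job launched with ``paddle.distributed.launch``):

.. code-block:: python

    import paddle.distributed.fleet as fleet

    # role_maker is left as None, so a PaddleCloudRoleMaker is created automatically
    strategy = fleet.DistributedStrategy()
    fleet.init(is_collective=True, strategy=strategy)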
......@@ -991,10 +991,10 @@ class Fleet(object):
Args:
optimizer(Optimizer): The executor to run for init server.
strategy(DistributedStrategy): Extra properties for distributed optimizer.
strategy(DistributedStrategy): Extra properties for distributed optimizer.
It is recommended to use DistributedStrategy in fleet.init(). The strategy
here is for compatibility. If the strategy in fleet.distributed_optimizer()
is not None, then it will overwrite the DistributedStrategy in fleet.init(),
here is for compatibility. If the strategy in fleet.distributed_optimizer()
is not None, then it will overwrite the DistributedStrategy in fleet.init(),
which will take effect in distributed training.
Returns:
......@@ -1057,14 +1057,14 @@ class Fleet(object):
use_fp16_test=False):
"""
Init the amp training, such as cast fp32 parameters to fp16 type.
Args:
place(CUDAPlace): place is used to initialize
place(CUDAPlace): place is used to initialize
fp16 parameters with fp32 values.
scope(Scope): The scope is used to find fp32 parameters.
test_program(Program): The program is used for testing.
use_fp16_test(bool): Whether to use fp16 testing.
Examples:
.. code-block:: python
......@@ -1086,7 +1086,7 @@ class Fleet(object):
loss = paddle.mean(hidden)
# 2) Create the optimizer and set `multi_precision` to True.
# Setting `multi_precision` to True can avoid the poor accuracy
# or the slow convergence in a way.
# or the slow convergence in a way.
optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
# 3) These ops in `custom_black_list` will keep in the float32 computation type.
amp_list = paddle.static.amp.CustomOpLists(
......@@ -1106,9 +1106,9 @@ class Fleet(object):
# 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`).
# If you want to perform the testing process, you should pass `test_program` into `amp_init`.
optimizer.amp_init(place, scope=paddle.static.global_scope())
if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
run_example_code()
run_example_code()
"""
amp_optimizer = self._get_amp_optimizer()
return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test)
......
......@@ -39,7 +39,7 @@ class TaskNode:
:param role (int): The role of the task node. (Will be removed in the future)
:param node_type (str): The type of the task node.
:param task_id (int): The id of task node.
:param ops (list): A list of op.desc to init the task node. (Will be removed in the future)
:param ops (list): A list of op.desc to init the task node. (Will be removed in the future)
:param program (Program): An instance of Program to init the task node.
:param lazy_initialize (bool): In user-defined task, the program may change adding feed/fetch op. As efficient consideration, the task node will have the C++ object later.
"""
......
......@@ -543,7 +543,7 @@ def which_distributed_mode(args):
def launch():
"""
Paddle distribution training entry ``python -m paddle.distributed.launch``.
Usage:
.. code-block:: bash
:name: code-block-bash1
......@@ -553,7 +553,7 @@ def launch():
[--worker_num WORKER_NUM] [--server_num SERVER_NUM] [--heter_worker_num HETER_WORKER_NUM]
[--http_port HTTP_PORT] [--elastic_server ELASTIC_SERVER] [--job_id JOB_ID] [--np NP] [--scale SCALE]
[--host HOST] [--force FORCE]
training_script ...
training_script ...
Base Parameters:
......@@ -566,9 +566,9 @@ def launch():
- ``--gpus``: It's for gpu training. e.g., ``--gpus=0,1,2,3`` will launch four training processes each bound to one gpu.
- ``--selected_gpus``: gpus aliases, recommend to use ``--gpus``.
- ``--xpus``: It's for xpu training if xpu is available. e.g., ``--xpus=0,1,2,3``.
- ``--selected_xpus``: xpus aliases, recommend to use ``--xpus``.
- ``--mlus``: It's for mlu training. e.g., ``--mlus=0,1,2,3`` will launch four training processes each bound to one mlu.
......@@ -594,7 +594,7 @@ def launch():
- ``--server_num``: Number of servers (it is recommended to set this when emulating a distributed environment using a single node)
- ``--heter_worker_num``: Number of heter_workers in each stage (it is recommended to set this when emulating a distributed environment using a single node)
- ``--heter_devices``: Type of heter_device in each stage
- ``--http_port``: Gloo http Port
......@@ -615,18 +615,18 @@ def launch():
Examples 1 (collective, single node):
.. code-block:: bash
:name: code-block-example-bash1
# For training on single node using 4 gpus.
python -m paddle.distributed.launch --gpus=0,1,2,3 train.py --lr=0.01
Examples 2 (collective, multi node):
.. code-block:: bash
:name: code-block-example-bash2
# The parameters of --gpus and --ips must be consistent in each node.
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# On 192.168.0.16:
......@@ -634,15 +634,15 @@ def launch():
# On 192.168.0.17:
python -m paddle.distributed.launch --gpus=0,1,2,3 --ips=192.168.0.16,192.168.0.17 train.py --lr=0.01
Examples 3 (ps, cpu, single node):
.. code-block:: bash
:name: code-block-example-bash3
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers.
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 4 (ps, cpu, multi node):
.. code-block:: bash
:name: code-block-example-bash4
......@@ -662,10 +662,10 @@ def launch():
:name: code-block-example-bash5
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu.
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 6 (ps, gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash6
......@@ -687,10 +687,10 @@ def launch():
:name: code-block-example-bash7
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu.
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01
Examples 8 (ps-heter, cpu + gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash8
......@@ -712,7 +712,7 @@ def launch():
:name: code-block-example-bash9
python -m paddle.distributed.launch --elastic_server=127.0.0.1:2379 --np=2 --job_id=job1 --gpus=0,1,2,3 train.py
"""
args = _parse_args()
......
......@@ -27,7 +27,7 @@ def _is_trainable(param):
class DygraphShardingOptimizer(object):
"""
A wrapper for Sharding Optimizer in Dygraph.
A wrapper for Sharding Optimizer in Dygraph.
.. warning: DygraphShardingOptimizer is experimental and subject to change.
......@@ -88,7 +88,7 @@ class DygraphShardingOptimizer(object):
Partitions parameters among sharding ranks.
Return:
Dict[int, List]
Dict[int, List]
"""
# TODO(JZ-LIANG) support multiple partition methods
# method1: greedy even but unorder
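A hypothetical sketch of the "greedy even" idea named in the comment above (assumed helper name and a plain size-based heuristic, not the actual implementation):

.. code-block:: python

    import numpy as np

    def greedy_even_partition(parameters, num_ranks):
        # always hand the next (largest remaining) parameter to the lightest rank
        rank2params = {rank: [] for rank in range(num_ranks)}
        sizes = [0] * num_ranks
        for p in sorted(parameters, key=lambda p: int(np.prod(p.shape)), reverse=True):
            rank = sizes.index(min(sizes))
            rank2params[rank].append(p)
            sizes[rank] += int(np.prod(p.shape))
        return rank2params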
......@@ -113,7 +113,7 @@ class DygraphShardingOptimizer(object):
mapping parameters to the shard which holds it.
Return:
Dict[str, int]
Dict[str, int]
"""
mapping = {}
for rank, params in self._rank2params.items():
......
......@@ -49,7 +49,7 @@ align = {
class ShardingOptimizerStage2(Optimizer):
"""
A wrapper for Sharding Stage2 Optimizer in Dygraph.
A wrapper for Sharding Stage2 Optimizer in Dygraph.
.. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer.
......
......@@ -816,7 +816,7 @@ def insert_scale_loss_grad_ops(block, scale=1.0):
def comm_analyse(main_program):
"""
Analyse the parameter sizes that need to be broadcast/allreduced during sharding training
Analyse the parameter sizes that need to be broadcast/allreduced during sharding training
"""
reduce_vars = {}
broadcast_vars = {}
......@@ -858,7 +858,7 @@ def comm_analyse(main_program):
def add_sync_comm(program, sharding_ring_id):
"""
When cloning a test prog from the sharding main prog,
When cloning a test prog from the sharding main prog,
part of the sync_comm ops may be pruned by mistake; this function
adds the sync_comm ops for the test prog.
......
......@@ -961,7 +961,7 @@ class ShardingOptimizer(MetaOptimizerBase):
2. prune cast_fp32_to_fp16; update amp_infine_checking
3. prune gradient_clip related; update global_norm_sum
4. prune optimizer op + param + gradient
"""
weightdecay_helper = WeightDecayHelper()
weightdecay_helper.prune_weight_decay(block, shard)
......@@ -1066,7 +1066,7 @@ class ShardingOptimizer(MetaOptimizerBase):
add broadcast allreduce op
if enable gradient_merge, insert related ops
if combined with pipeline(grad accumulate),
if combined with pipeline(grad accumulate),
the grad allreduce should be done in optimize role
"""
if len(self._segments) < 1:
......@@ -1302,7 +1302,7 @@ class ShardingOptimizer(MetaOptimizerBase):
pp: 4
pp-pair: >= 20
if one parallelism is not enable: -1
and only support parallelism hierarchy: mp --> sharding --> pp --> dp
and only support parallelism hierarchy: mp --> sharding --> pp --> dp
"""
# step 1: initialize nccl
self.global_word_size = self.role_maker._worker_num()
......@@ -1688,7 +1688,7 @@ class ShardingOptimizer(MetaOptimizerBase):
grad@gradientmerge / acc_step
re-create all optimize ops of origin main block and rename them
cast(backward)
amp
amp
clip
opt
# fill constant grad@gradientmerge
......
......@@ -198,11 +198,11 @@ class PipelineLayer(Layer):
"""PipelineLayer
Args:
layers(Iterable): A sequence of layers description to define the structure for pipeline.
num_stages(int, optional): pp degree, if not specified, 'topology' parameter must be given.
num_stages(int, optional): pp degree, if not specified, 'topology' parameter must be given.
topology(CommunicateTopology, optional): topo of hybrid parallel, if it is None, 'num_stages' parameters must be given.
loss_fn(callable, optional): Loss function.
seg_method(str, optional): the method of splitting pp layer, default 'uniform', or use specific layer to split, method's name must start with 'layer:'.
recompute_interval(int, optional): the number of layers to be used recompute, the value of 0 represents no recompute. default 0.
recompute_interval(int, optional): the number of layers to be used recompute, the value of 0 represents no recompute. default 0.
recompute_ctx(dict,optional): the context of recompute, when 'recompute_interval' > 0, the context must be given.
num_virtual_pipeline_stages(int, optional): the num of virtual pipeline stages for interleave pp.
Examples:
......@@ -212,7 +212,7 @@ class PipelineLayer(Layer):
from paddle.fluid.dygraph.layers import Layer
import paddle.nn.functional as F
from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
pipeline_parallel_size = 2
strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
......@@ -224,19 +224,19 @@ class PipelineLayer(Layer):
"accumulate_steps": 4,
"micro_batch_size": 2
}
fleet.init(is_collective=True, strategy=strategy)
hcg = fleet.get_hybrid_communicate_group()
class ReshapeHelp(Layer):
def __init__(self, shape):
super(ReshapeHelp, self).__init__()
self.shape = shape
def forward(self, x):
return x.reshape(shape=self.shape)
class AlexNetPipeDesc(PipelineLayer):
def __init__(self, num_classes=10, **kwargs):
self.num_classes = num_classes
......@@ -268,7 +268,7 @@ class PipelineLayer(Layer):
]
super(AlexNetPipeDesc, self).__init__(
layers=decs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
model = AlexNetPipeDesc(num_stages=pipeline_parallel_size, topology=hcg._topo)
"""
......
......@@ -107,7 +107,7 @@ def _initialize_recompute_hcg(hcg):
def _all_gather(tensor, group=None, use_calc_stream=True):
"""
The main difference with paddle.distributed.all_gather:
The main difference with paddle.distributed.all_gather:
no need to pass in tensor_list, the returned tensor is spliced
"""
if group is not None and not group.is_member():
......
......@@ -47,7 +47,7 @@ align = {
class GroupShardedOptimizerStage2(Optimizer):
"""
A wrapper for Sharding Stage2 Optimizer in Dygraph.
A wrapper for Sharding Stage2 Optimizer in Dygraph.
.. warning: ShardingOptimizer encapsulates the optimization strategy and integrates it into the optimizer.
......
......@@ -47,8 +47,8 @@ def _trainable(param):
class GroupShardedStage2(nn.Layer):
"""
A wrapper for Sharding Stage2 Layer in Dygraph.
"""
A wrapper for Sharding Stage2 Layer in Dygraph.
.. warning: GroupShardedStage2 encapsulates the layer strategy and integrates it into the nn.Layer.
.. ZeRO: https://arxiv.org/pdf/1910.02054.pdf.
"""
......
......@@ -33,7 +33,7 @@ from .group_sharded_utils import Type, GroupShardedClipGrad, device_guard
def _all_gather(tensor, buffer_size, group):
"""
The main difference with paddle.distributed.all_gather:
The main difference with paddle.distributed.all_gather:
no need to pass in tensor_list, the returned tensor is spliced
"""
......@@ -58,8 +58,8 @@ CHECK_LAYER = dict() # Help to check layer's id -> layer's name
class GroupShardedStage3(nn.Layer):
"""
A wrapper for Sharding Stage3 Layer in Dygraph.
"""
A wrapper for Sharding Stage3 Layer in Dygraph.
.. warning: GroupShardedStage3 encapsulates the layer strategy and integrates it into the nn.Layer.
......
......@@ -48,8 +48,8 @@ def _trainable(param):
class ShardingStage2(nn.Layer):
"""
A wrapper for Sharding Stage2 Layer in Dygraph.
"""
A wrapper for Sharding Stage2 Layer in Dygraph.
.. warning: ShardingStage2 encapsulates the layer strategy and integrates it into the nn.Layer.
.. ZeRO: https://arxiv.org/pdf/1910.02054.pdf.
"""
......
......@@ -50,8 +50,8 @@ CHECK_LAYER = dict() # Help to check layer's id -> layer's name
class ShardingStage3(nn.Layer):
"""
A wrapper for Sharding Stage3 Layer in Dygraph.
"""
A wrapper for Sharding Stage3 Layer in Dygraph.
.. warning: ShardingStage3 encapsulates the layer strategy and integrates it into the nn.Layer.
......
......@@ -41,7 +41,7 @@ def sum(input, scope=None, util=None):
global_cnt = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1], value=0)
tmp = fluid.layers.elementwise_add(cnt, global_cnt)
fluid.layers.assign(tmp, global_cnt)
# in train.py, after train or infer
res = np.array(scope.find_var(global_cnt.name).get_tensor())
print("sum array: ", paddle.distributed.fleet.sum(res))
......
......@@ -131,14 +131,14 @@ class LocalFS(FS):
"""
def ls_dir(self, fs_path):
"""
"""
List directories and files under `fs_path` .
Args:
fs_path(str): The local file path.
Returns:
Tuple: Return a 2-tuple, the first is a list of all its subdirectories,
Tuple: Return a 2-tuple, the first is a list of all its subdirectories,
and the second is a list of all its subfiles, e.g. ([subdirname1, subdirname2, ...], [filename1, filename2, ...]).
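A minimal usage sketch of this return value, assuming the documented ``LocalFS`` import path:

.. code-block:: python

    from paddle.distributed.fleet.utils import LocalFS

    client = LocalFS()
    subdirs, files = client.ls_dir("./")
    print(subdirs, files)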
Examples:
......@@ -290,7 +290,7 @@ class LocalFS(FS):
fs_path(str): The local file path.
Returns:
Bool: Whether it's a file or directory, return true if the path exists,
Bool: Whether it's a file or directory, return true if the path exists,
otherwise return false.
Examples:
......@@ -359,7 +359,7 @@ class LocalFS(FS):
return self.rename(src_path, dst_path)
def list_dirs(self, fs_path):
"""
"""
Only list directories under `fs_path` .
Args:
......@@ -430,7 +430,7 @@ class HDFSClient(FS):
A tool of HDFS.
Args:
hadoop_home(str): Hadoop home.
hadoop_home(str): Hadoop home.
configs(dict): Hadoop config. It is a dictionary and needs to contain the
keys: "fs.default.name" and "hadoop.job.ugi".
......@@ -491,7 +491,7 @@ class HDFSClient(FS):
@_handle_errors()
def list_dirs(self, fs_path):
"""
"""
Only list directories under `fs_path` .
Args:
......@@ -523,14 +523,14 @@ class HDFSClient(FS):
@_handle_errors()
def ls_dir(self, fs_path):
"""
"""
List directories and files under `fs_path` .
Args:
fs_path(str): The HDFS file path.
Returns:
Tuple: Return a 2-tuple, the first element is the list of all its subdirectories,
Tuple: Return a 2-tuple, the first element is the list of all its subdirectories,
and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname2, ...], [filename1, filename2, ...]).
Examples:
......@@ -923,7 +923,7 @@ class HDFSClient(FS):
fs_src_path(str): Name of the file or directory, that's needed to be moved.
fs_dst_path(str): Name of the file or directory to which to move to.
overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False.
test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.
test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.
Examples:
......@@ -1174,7 +1174,7 @@ class AFSClient(FS):
self._fs.init(fs_name, fs_user, fs_passwd, fs_conf)
def list_dirs(self, fs_path):
"""
"""
Only list directories under `fs_path` .
Args:
......@@ -1200,14 +1200,14 @@ class AFSClient(FS):
return dirs
def ls_dir(self, fs_path):
"""
"""
List directories and files under `fs_path` .
Args:
fs_path(str): The HDFS file path.
Returns:
Tuple: Return a 2-tuple, the first element is the list of all its subdirectories,
Tuple: Return a 2-tuple, the first element is the list of all its subdirectories,
and the second one is the list of all its subfiles, e.g. ([subdirname1, subdirname2, ...], [filename1, filename2, ...]).
Examples:
......@@ -1438,7 +1438,7 @@ class AFSClient(FS):
fs_src_path(str): Name of the file or directory, that's needed to be moved.
fs_dst_path(str): Name of the file or directory to which to move to.
overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False.
test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.
test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception.
Examples:
......
......@@ -23,7 +23,7 @@ import numpy as np
class HybridParallelInferenceHelper(object):
"""
A helper class to split program for inference with hybrid parallelism.
Args:
startup_program (Program): the startup program.
main_program (Program): the main program.
......@@ -34,15 +34,15 @@ class HybridParallelInferenceHelper(object):
init_comm (bool): whether to initialize the communication group. Default ``True``.
role_maker (RoleMakerBase or subclass): user custom define RoleMakerBase.
If ``role_maker==None``, then use PaddleCloudRoleMaker. Default ``None``.
Returns:
None.
Write Paradigm:
.. code-block:: bash
:name: bash-example1
# while op pattern
with paddle.fluid.device_guard(f'{device}:all'):
# init global cond
......@@ -51,10 +51,10 @@ class HybridParallelInferenceHelper(object):
cond_int = layers.fill_constant(shape=[1], dtype="int64", value=0, force_cpu=False, name="cond_int")
cond = layers.cast(step_idx < max_len, dtype="bool")
while_op = layers.While(cond, is_test=True)
# init global lod_tensor_array for generation task
arr = layers.array_write(data, step_idx)
with while_op.block():
with paddle.fluid.device_guard(f'{device}:all'):
# read data from global lod_tensor_array
......@@ -63,36 +63,36 @@ class HybridParallelInferenceHelper(object):
# it need for send_v2 of lod_tensor_array
layers.increment(x=step_idx, value=1.0, in_place=True)
layers.array_write(element_in_arr, i=step_idx, array=arr)
with paddle.fluid.device_guard(f'{device}:0'):
... some code
with paddle.fluid.device_guard(f'{device}:1'):
... some code
with paddle.fluid.device_guard(f'{device}:{num_pp-1}'):
# generate some data in while block and write to global lod_tensor_array
# that they are read in next while step.
# we will using send_v2 to send global lod_tensor_array to other pipeline and sync
layers.array_write(other_var, i=step_idx, array=arr)
# update cond and assign to cond_int, we will sync cond_int
layers.assign(layers.cast(cond, dtype="int32"), cond_int)
with paddle.fluid.device_guard(f'{model._device}:all'):
# the code below must at end of while block and exists in device:all
layers.assign(layers.cast(cond_int, dtype='bool'), cond)
with paddle.fluid.device_guard(f'{model._device}:all'):
# use a empty lod_tensor_array to clear lod_tensor_array
layers.assign(layers.create_array(data.dtype), arr)
Examples:
.. code-block:: python
:name: code-example1
# required: distributed
import os
import numpy as np
......@@ -172,7 +172,7 @@ class HybridParallelInferenceHelper(object):
exe = paddle.static.Executor(paddle.CUDAPlace(dev_id))
exe.run(startup_program)
np.random.seed(2333)
for step in range(5):
init_data = np.random.uniform(low=0.0, high=1.0, size=[2, 2]).astype('float32')
......@@ -358,7 +358,7 @@ class HybridParallelInferenceHelper(object):
Args:
stage (int): pipeline stage
block_idx (int): block index
Returns:
used_var_names (set): used var names in block_idx block
"""
......@@ -445,9 +445,9 @@ class HybridParallelInferenceHelper(object):
def _add_op_device_attr(self, block):
"""
Add op_device attribute for ops in block that have
Add op_device attribute for ops in block that have
not that attribute set.
Args:
block (Block): the block to process.
"""
......@@ -474,7 +474,7 @@ class HybridParallelInferenceHelper(object):
def _check_validation(self, block):
"""
Check whether ops in a block have both the op_device and the
Check whether ops in a block have both the op_device and the
op_role attributes set.
"""
assert isinstance(block, Block)
......@@ -729,7 +729,7 @@ class HybridParallelInferenceHelper(object):
"""
Generate inference program.
Params:
sync_in_while_lastpp2firstpp_var_names (list(str)): the vars in the last pipeline
sync_in_while_lastpp2firstpp_var_names (list(str)): the vars in the last pipeline
that need to send var to first pipeline and exclude bool dtype var
sync_in_while_var_names (list(str)): the vars sync among all pipeline in while block
e.g cond. Note that cond cannot be bool dtype.
......
......@@ -352,13 +352,13 @@ def recompute(function, *args, **kwargs):
recompute intermediate activations to save memory.
Parameters:
function(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model
whose intermediate activations will be released to save memory in forward stage and will be recomputed
in backward stage for gradient calculation.
*args(Tensor): inputs to the function.
**kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to
indicate whether to save the forward rng. If it is True, then the last forward rng value will be
restored when the forward recalculation of backpropagation is performed. The default
function(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model
whose intermediate activations will be released to save memory in forward stage and will be recomputed
in backward stage for gradient calculation.
*args(Tensor): inputs to the function.
**kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to
indicate whether to save the forward rng. If it is True, then the last forward rng value will be
restored when the forward recalculation of backpropagation is performed. The default
preserve_rng_state is True.
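A small hedged sketch of this API in dynamic graph mode, assuming ``recompute`` is imported from ``paddle.distributed.fleet.utils``:

.. code-block:: python

    import paddle
    from paddle.distributed.fleet.utils import recompute

    block = paddle.nn.Sequential(paddle.nn.Linear(16, 16), paddle.nn.ReLU())
    x = paddle.randn([4, 16])
    x.stop_gradient = False

    y = recompute(block, x)   # activations inside `block` are recomputed in backward
    y.sum().backward()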
Returns:
......
......@@ -18,7 +18,7 @@ from .context import Context
def launch():
"""
Paddle distribution training entry ``python -m paddle.distributed.launch``.
Usage:
.. code-block:: bash
:name: code-block-bash1
......@@ -77,7 +77,7 @@ def launch():
- ``--heter_workers``: User defined heter workers ip1:port1;ip2:port2, e.g., ``--heter_workers="192.168.0.16:6172;192.168.0.17:6172"``
- ``--heter_worker_num``: Number of heter_workers in each stage (it is recommended to set this when emulating a distributed environment using a single node)
- ``--heter_devices``: Type of heter_device in each stage
- ``--gloo_port``: Gloo http Port. Default ``--gloo_port=6767``.
......@@ -94,12 +94,12 @@ def launch():
IPU Parameters:
IPU distributed launch only requires and allows three arguments ``--devices``, ``training_script`` and ``training_script_args``.
The ``--devices`` is the number of IPU devices. e.g., ``--devices=4`` will launch the training program with four IPU devices.
The ``training_script`` is only allowed to be set as ``ipu``.
The ``training_script`` is only allowed to be set as ``ipu``.
The ``training_script_args`` includes arguments required by IPU distributed launch and is illustrated below.
``Examples 10`` provides an example of paddle.distributed.launch with IPUs.
- ``--hosts``: The hosts for IPU distributed training. Each host is able to include multiple processes.
- ``--nproc_per_host``: The number of processes launched per host. Each process is able to include multiple replicas.
- ``--ipus_per_replica``: The number of IPUs requested per replica. Each replica is able to include multiple IPUs.
......@@ -144,16 +144,16 @@ def launch():
Examples 1 (collective, single node):
.. code-block:: bash
:name: code-block-example-bash1
# For training on single node using 4 gpus.
python -m paddle.distributed.launch --devices=0,1,2,3 train.py --lr=0.01
Examples 2 (collective, multi node):
.. code-block:: bash
:name: code-block-example-bash2
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# On 192.168.0.16:
......@@ -161,15 +161,15 @@ def launch():
# On 192.168.0.17:
python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
Examples 3 (ps, cpu, single node):
.. code-block:: bash
:name: code-block-example-bash3
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers.
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 4 (ps, cpu, multi node):
.. code-block:: bash
:name: code-block-example-bash4
......@@ -194,10 +194,10 @@ def launch():
:name: code-block-example-bash5
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu.
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 6 (ps, gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash6
......@@ -219,10 +219,10 @@ def launch():
:name: code-block-example-bash7
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu.
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01
Examples 8 (ps-heter, cpu + gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash8
......@@ -246,7 +246,7 @@ def launch():
# With the following command, the job will begin to run immediately if 4 nodes are ready,
# or it will run after elastic_timeout if only 2 or 3 nodes ready
python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py
# once the number of nodes changes between 2:4 during training, the strategy holds
Examples 10 (ipu):
......
......@@ -60,18 +60,18 @@ def _number_count(numbers, upper_range):
def _assign_pos(x, cum_count):
"""
Assign pos decides which tokens should be fetched and assigned to
Assign pos decides which tokens should be fetched and assigned to
a specific expert, in order.
Args:
x (Tensor): Tensor. Every element in the list must be a Tensor whose data type
should be float16, float32, float64, int32 or int64.
cum_count (Tensor): The cumulative sum tokens of counters. Every element in the list must be a Tensor whose
cum_count (Tensor): The cumulative sum tokens of counters. Every element in the list must be a Tensor whose
data type should be int64.
Returns:
out (Tensor): Assemble numbers in the order of counters.
out (Tensor): Assemble numbers in the order of counters.
Examples:
.. code-block:: python
......@@ -185,10 +185,10 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker):
gate_idx (Tensor): Represents the gate_id sequence corresponding to the input data with type int32, int64.
expert_count (Tensor): The quantity value counted on the gate_id sequence of the input data with type int32, int64.
n_worker(int,optional): The number of workers on the trainer with type int64.
Returns:
new_gate_idx (Tensor): The gate_id sequence corresponding to the new input data after passing through prune.
Examples:
.. code-block:: python
......
......@@ -105,7 +105,7 @@ def init_parallel_env():
Returns:
None
Examples:
.. code-block:: python
# required: gpu
......@@ -119,7 +119,7 @@ def init_parallel_env():
super(LinearNet, self).__init__()
self._linear1 = nn.Linear(10, 10)
self._linear2 = nn.Linear(10, 1)
def forward(self, x):
return self._linear2(self._linear1(x))
......@@ -140,7 +140,7 @@ def init_parallel_env():
outputs = dp_layer(inputs)
labels = paddle.randn([10, 1], 'float32')
loss = loss_fn(outputs, labels)
loss.backward()
adam.step()
......
......@@ -41,7 +41,7 @@ def numel(var):
class DataParallelOptimizationPass(PassBase):
"""
Apply Optimizations that specialized for data parallelism in Auto Parallel.
1. prune grad scaling
1. prune grad scaling
2. overlap comm and calc
3. fuse allreduce
"""
......@@ -350,9 +350,9 @@ class DataParallelOptimizationPass(PassBase):
"""
conditions for gradients to be grouped:
1. group size < max_fuse_numel
2. same dp group
2. same dp group
3. same dtype
4. dependency: grad would NOT be used by other ops within group segment
4. dependency: grad would NOT be used by other ops within group segment
gradients inside same group would be fuse into one coalesce tensor
"""
......
......@@ -126,7 +126,7 @@ class FP16State(object):
def _build_state(self):
"""
mark the execution mode (fp16 or fp32) for ops in all blocks
mark the execution mode (fp16 or fp32) for ops in all blocks
include forward ops & backward ops
"""
# mark op dtype
......
......@@ -95,7 +95,7 @@ class RecomputeState(ProgramStats):
def modify_forward_desc_for_recompute(self, dist_context):
"""
If the program's forward part has a 'dropout' op, this function will insert
If the program's forward part has a 'dropout' op, this function will insert
a seed op before it to guarantee that two dropout op have the same outputs.
"""
op_types = [op.desc.type() for op in self._ops]
......
......@@ -86,11 +86,11 @@ def prune_program(program, start_op_idx, end_op_idx):
def split_program(program, op_indices):
"""
Split the program by op_indices.
Split the program by op_indices.
For example, a program has 100 ops, and op_indices = [25, 60].
Then the program is split into 3 parts, containing 25, 35 and 40
ops respectively.
ops respectively.
The return values are a tuple with 3 elements: the split program
list, the input var names of each split program, and the output
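The index arithmetic above can be checked with a plain list standing in for the op list (illustration only, not the Paddle API):

.. code-block:: python

    ops = list(range(100))
    op_indices = [25, 60]
    bounds = [0] + op_indices + [len(ops)]
    parts = [ops[start:end] for start, end in zip(bounds[:-1], bounds[1:])]
    print([len(p) for p in parts])   # [25, 35, 40]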
......
......@@ -1140,7 +1140,7 @@ class SplitTrainerOpsPass(PassBase):
split cpu-trainer program from origin-program
1. find heter op (located on different device)
2. find input&output of every heter-block
3. create cpu-trainer program, add send&recv op
3. create cpu-trainer program, add send&recv op
"""
attrs = pass_ctx._attrs
default_device_ = 'cpu'
......
......@@ -611,7 +611,7 @@ def find_heter_ops(program, default_device="cpu"):
if no_grad_var in var2idx:
"""
insert sum op & remove sum op from var2idx and origin place
"""
op_list = list(block.ops)
sum_op = op_list[var2idx[no_grad_var]]
......@@ -1335,7 +1335,7 @@ def build_var_distributed(context):
context["param_name_to_grad_name"] = param_name_to_grad_name
context["grad_name_to_param_name"] = grad_name_to_param_name
'''
'''
print("public build_var_distributed origin_sparse_pairs:",
context["origin_sparse_pairs"])
print("public build_var_distributed origin_for_dense:",
......
......@@ -62,12 +62,12 @@ def group_sharded_parallel(model,
buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23.
segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20.
sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used.
Returns:
model: A wrapper for group sharded given model.
optimizer: A wrapper for group sharded given optimizer.
scaler: A wrapper for group sharded given scaler.
Examples:
.. code-block:: python
......@@ -184,7 +184,7 @@ def save_group_sharded_model(model, output, optimizer=None):
model (Layer): A wrapper for group sharded given model.
output (str): Save directory.
optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved.
Examples:
.. code-block:: python
......
......@@ -60,10 +60,10 @@ def global_scatter(x,
group=None,
use_calc_stream=True):
"""
The global_scatter operator distributes the data of x to n_expert * world_size experts according to local_count,
and then receives data according to global_count. The expert refers to a user-defined expert network,
The global_scatter operator distributes the data of x to n_expert * world_size experts according to local_count,
and then receives data according to global_count. The expert refers to a user-defined expert network,
n_expert refers to the number of expert networks owned by each card, and world_size refers to the number of graphics cards running the network.
As shown below, the value of the world size is 2, n_expert 2, the batch size of the x 4 and local_count is [2, 0, 2, 0].
The global_count of the rank 0 is [2, 0, , ], rank 1 is [2, 0, ,](Due to the limited space, only the data calculated on rank 0 is shown here).
In the global_scatter operator, local_count[i] represents sending local_count[i] data to the (i % n_expert)th expert of the (i // n_expert)th card,
......@@ -101,10 +101,10 @@ def global_scatter(x,
how many data needed to be received. The tensor data type should be int64.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True.
Returns:
out (Tensor): The data received from all experts.
out (Tensor): The data received from all experts.
Examples:
.. code-block:: python
......@@ -120,7 +120,7 @@ def global_scatter(x,
local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], \
dtype=np.float32)
if paddle.distributed.ParallelEnv().local_rank == 0:
local_count = np.array([2, 1, 1, 1])
local_count = np.array([2, 1, 1, 1])
global_count = np.array([2, 1, 1, 1])
else:
local_count = np.array([1, 1, 2, 1])
......@@ -195,11 +195,11 @@ def global_gather(x,
The process of global_gather sending data is as follows:
The global_count[0] of the 0th card represents sending 2 data to the 0th expert of the 0th card;
The global_count[1] of the 0th card represents sending 0 data to the 1th expert of the 0th card;
The global_count[0] of the 1th card represents sending 2 data to the 0th expert of the 0th card;
The global_count[1] of the 1th card represents sending 0 data to the 1th expert of the 0th card.
.. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/global_scatter_gather.png
......@@ -216,10 +216,10 @@ def global_gather(x,
how many data needed to be sent. Tensor data type should be int64.
group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True.
Returns:
out (Tensor): The data received from all experts.
out (Tensor): The data received from all experts.
Examples:
.. code-block:: python
......
......@@ -21,11 +21,11 @@ class Beta(exponential_family.ExponentialFamily):
r"""
Beta distribution parameterized by alpha and beta.
In probability theory and statistics, the beta distribution is a family of
continuous probability distributions defined on the interval [0, 1]
parameterized by two positive shape parameters, denoted by alpha and beta,
that appear as exponents of the random variable and control the shape of
the distribution. The generalization to multiple variables is called a
In probability theory and statistics, the beta distribution is a family of
continuous probability distributions defined on the interval [0, 1]
parameterized by two positive shape parameters, denoted by alpha and beta,
that appear as exponents of the random variable and control the shape of
the distribution. The generalization to multiple variables is called a
Dirichlet distribution.
The probability density function (pdf) is
......@@ -38,18 +38,18 @@ class Beta(exponential_family.ExponentialFamily):
.. math::
B(\alpha, \beta) = \int_{0}^{1} t^{\alpha - 1} (1-t)^{\beta - 1}\mathrm{d}t
B(\alpha, \beta) = \int_{0}^{1} t^{\alpha - 1} (1-t)^{\beta - 1}\mathrm{d}t
Args:
alpha (float|Tensor): Alpha parameter. It supports broadcast semantics.
The value of alpha must be positive. When the parameter is a tensor,
it represents multiple independent distribution with
alpha (float|Tensor): Alpha parameter. It supports broadcast semantics.
The value of alpha must be positive. When the parameter is a tensor,
it represents multiple independent distribution with
a batch_shape(refer to ``Distribution`` ).
beta (float|Tensor): Beta parameter. It supports broadcast semantics.
The value of beta must be positive(>0). When the parameter is tensor,
it represent multiple independent distribution with
a batch_shape(refer to ``Distribution`` ).
beta (float|Tensor): Beta parameter. It supports broadcast semantics.
The value of beta must be positive(>0). When the parameter is tensor,
it represent multiple independent distribution with
a batch_shape(refer to ``Distribution`` ).
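A short hedged sketch of scalar and tensor parameterization:

.. code-block:: python

    import paddle
    from paddle.distribution import Beta

    # scalar parameters -> a single distribution
    beta = Beta(alpha=0.5, beta=0.5)
    print(beta.mean)

    # tensor parameters -> a batch of independent distributions
    beta_batch = Beta(paddle.to_tensor([0.5, 2.0]), paddle.to_tensor([0.5, 2.0]))
    print(beta_batch.prob(paddle.to_tensor([0.3, 0.5])))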
Examples:
......@@ -114,7 +114,7 @@ class Beta(exponential_family.ExponentialFamily):
Args:
value (Tensor): Value to be evaluated.
Returns:
Tensor: Probability.
"""
......@@ -125,7 +125,7 @@ class Beta(exponential_family.ExponentialFamily):
Args:
value (Tensor): Value to be evaluated
Returns:
Tensor: Log probability.
"""
......
......@@ -31,9 +31,9 @@ from paddle.tensor import arange, concat, gather_nd, multinomial
class Categorical(distribution.Distribution):
r"""
Categorical distribution is a discrete probability distribution that
describes the possible results of a random variable that can take on
one of K possible categories, with the probability of each category
Categorical distribution is a discrete probability distribution that
describes the possible results of a random variable that can take on
one of K possible categories, with the probability of each category
separately specified.
The probability mass function (pmf) is:
......@@ -267,9 +267,9 @@ class Categorical(distribution.Distribution):
def probs(self, value):
"""Probabilities of the given category (``value``).
If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as
If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as
category, and the others represent the different distributions.
At the same time, if ``value`` is a 1-D Tensor, ``value`` will be broadcast to the
At the same time, if ``value`` is a 1-D Tensor, ``value`` will be broadcast to the
same number of distributions as ``logits``.
If ``value`` is not a 1-D Tensor, ``value`` should have the same number of distributions
as ``logits``. That is, ``value[:-1] = logits[:-1]``.
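A hedged sketch of the broadcasting behaviour described above:

.. code-block:: python

    import paddle
    from paddle.distribution import Categorical

    paddle.seed(100)
    logits = paddle.rand([3, 6])            # 3 distributions over 6 categories
    cat = Categorical(logits)

    value = paddle.to_tensor([0, 1, 5])     # 1-D value, broadcast to all 3 distributions
    print(cat.probs(value))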
......
......@@ -23,32 +23,32 @@ class Dirichlet(exponential_family.ExponentialFamily):
r"""
Dirichlet distribution with parameter "concentration".
The Dirichlet distribution is defined over the `(k-1)-simplex` using a
The Dirichlet distribution is defined over the `(k-1)-simplex` using a
positive, length-k vector concentration(`k > 1`).
The Dirichlet is identically the Beta distribution when `k = 2`.
For independent and identically distributed continuous random variable
:math:`\boldsymbol X \in R_k` , and support
:math:`\boldsymbol X \in (0,1), ||\boldsymbol X|| = 1` ,
For independent and identically distributed continuous random variable
:math:`\boldsymbol X \in R_k` , and support
:math:`\boldsymbol X \in (0,1), ||\boldsymbol X|| = 1` ,
The probability density function (pdf) is
.. math::
f(\boldsymbol X; \boldsymbol \alpha) = \frac{1}{B(\boldsymbol \alpha)} \prod_{i=1}^{k}x_i^{\alpha_i-1}
where :math:`\boldsymbol \alpha = {\alpha_1,...,\alpha_k}, k \ge 2` is
f(\boldsymbol X; \boldsymbol \alpha) = \frac{1}{B(\boldsymbol \alpha)} \prod_{i=1}^{k}x_i^{\alpha_i-1}
where :math:`\boldsymbol \alpha = {\alpha_1,...,\alpha_k}, k \ge 2` is
parameter, the normalizing constant is the multivariate beta function.
.. math::
B(\boldsymbol \alpha) = \frac{\prod_{i=1}^{k} \Gamma(\alpha_i)}{\Gamma(\alpha_0)}
:math:`\alpha_0=\sum_{i=1}^{k} \alpha_i` is the sum of parameters,
:math:`\alpha_0=\sum_{i=1}^{k} \alpha_i` is the sum of parameters,
:math:`\Gamma(\alpha)` is gamma function.
Args:
concentration (Tensor): "Concentration" parameter of dirichlet
distribution, also called :math:`\alpha`. When it's over one
concentration (Tensor): "Concentration" parameter of dirichlet
distribution, also called :math:`\alpha`. When it's over one
dimension, the last axis denotes the parameter of distribution,
``event_shape=concentration.shape[-1:]`` , axes other than last are
considered batch dimensions with ``batch_shape=concentration.shape[:-1]`` .
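A small hedged sketch:

.. code-block:: python

    import paddle
    from paddle.distribution import Dirichlet

    dirichlet = Dirichlet(paddle.to_tensor([1.0, 2.0, 3.0]))
    x = paddle.to_tensor([0.3, 0.5, 0.2])   # a point on the 2-simplex
    print(dirichlet.prob(x))
    print(dirichlet.entropy())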
......
......@@ -39,15 +39,15 @@ from paddle.tensor import arange, concat, gather_nd, multinomial
class Distribution(object):
"""
The abstract base class for probability distributions. Functions are
The abstract base class for probability distributions. Functions are
implemented in specific distributions.
Args:
batch_shape(Sequence[int], optional): independent, not identically
batch_shape(Sequence[int], optional): independent, not identically
distributed draws, aka a "collection" or "bunch" of distributions.
event_shape(Sequence[int], optional): the shape of a single
draw from the distribution; it may be dependent across dimensions.
For scalar distributions, the event shape is []. For n-dimension
event_shape(Sequence[int], optional): the shape of a single
draw from the distribution; it may be dependent across dimensions.
For scalar distributions, the event shape is []. For n-dimension
multivariate distribution, the event shape is [n].
"""
......@@ -118,16 +118,16 @@ class Distribution(object):
def probs(self, value):
"""Probability density/mass function.
.. note::
This method will be deprecated in the future, please use `prob`
.. note::
This method will be deprecated in the future, please use `prob`
instead.
"""
raise NotImplementedError
def _extend_shape(self, sample_shape):
"""compute shape of the sample
"""compute shape of the sample
Args:
sample_shape (Tensor): sample shape
......@@ -239,9 +239,9 @@ class Distribution(object):
def _probs_to_logits(self, probs, is_binary=False):
r"""
Converts probabilities into logits. For the binary, probs denotes the
probability of occurrence of the event indexed by `1`. For the
multi-dimensional, values of last axis denote the probabilities of
Converts probabilities into logits. For the binary, probs denotes the
probability of occurrence of the event indexed by `1`. For the
multi-dimensional, values of last axis denote the probabilities of
occurrence of each of the events.
"""
return (paddle.log(probs) - paddle.log1p(-probs)) \
......@@ -249,8 +249,8 @@ class Distribution(object):
def _logits_to_probs(self, logits, is_binary=False):
r"""
Converts logits into probabilities. For the binary, each value denotes
log odds, whereas for the multi-dimensional case, the values along the
Converts logits into probabilities. For the binary, each value denotes
log odds, whereas for the multi-dimensional case, the values along the
last dimension denote the log probabilities of the events.
"""
return paddle.nn.functional.sigmoid(logits) \
......
......@@ -18,19 +18,19 @@ from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
class ExponentialFamily(distribution.Distribution):
r"""
ExponentialFamily is the base class for probability distributions belonging
to exponential family, whose probability mass/density function has the
r"""
ExponentialFamily is the base class for probability distributions belonging
to exponential family, whose probability mass/density function has the
form defined below
ExponentialFamily is derived from `paddle.distribution.Distribution`.
.. math::
f_{F}(x; \theta) = \exp(\langle t(x), \theta\rangle - F(\theta) + k(x))
where :math:`\theta` denotes the natural parameters, :math:`t(x)` denotes
the sufficient statistic, :math:`F(\theta)` is the log normalizer function
where :math:`\theta` denotes the natural parameters, :math:`t(x)` denotes
the sufficient statistic, :math:`F(\theta)` is the log normalizer function
for a given family and :math:`k(x)` is the carrier measure.
Distribution belongs to exponential family referring to https://en.wikipedia.org/wiki/Exponential_family
......@@ -48,7 +48,7 @@ class ExponentialFamily(distribution.Distribution):
raise NotImplementedError
def entropy(self):
"""caculate entropy use `bregman divergence`
"""caculate entropy use `bregman divergence`
https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf
"""
entropy_value = -self._mean_carrier_measure
......
......@@ -20,17 +20,17 @@ class Independent(distribution.Distribution):
Reinterprets some of the batch dimensions of a distribution as event dimensions.
This is mainly useful for changing the shape of the result of
:meth:`log_prob`.
:meth:`log_prob`.
Args:
base (Distribution): The base distribution.
reinterpreted_batch_rank (int): The number of batch dimensions to
reinterpreted_batch_rank (int): The number of batch dimensions to
reinterpret as event dimensions.
Examples:
.. code-block:: python
import paddle
from paddle.distribution import independent
......
......@@ -35,7 +35,7 @@ def kl_divergence(p, q):
.. math::
KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x
KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x
Args:
p (Distribution): ``Distribution`` object.
......@@ -64,11 +64,11 @@ def kl_divergence(p, q):
def register_kl(cls_p, cls_q):
"""Decorator for register a KL divergence implemention function.
The ``kl_divergence(p, q)`` function will search concrete implemention
functions registered by ``register_kl``, according to multi-dispatch pattern.
If an implemention function is found, it will return the result, otherwise,
it will raise ``NotImplementError`` exception. Users can register
implemention funciton by the decorator.
The ``kl_divergence(p, q)`` function will search concrete implemention
functions registered by ``register_kl``, according to multi-dispatch pattern.
If an implemention function is found, it will return the result, otherwise,
it will raise ``NotImplementError`` exception. Users can register
implemention funciton by the decorator.
Args:
cls_p(Distribution): Subclass derived from ``Distribution``.
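A hedged sketch of registering a custom implementation (the body below is a toy placeholder, not a real KL formula):

.. code-block:: python

    from paddle.distribution import Beta, kl_divergence, register_kl

    @register_kl(Beta, Beta)
    def kl_beta_beta(p, q):
        # toy placeholder; a real implementation returns the analytic KL divergence
        return p.alpha - q.alpha

    p = Beta(alpha=0.5, beta=0.5)
    q = Beta(alpha=0.3, beta=0.7)
    print(kl_divergence(p, q))   # dispatches to kl_beta_beta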
......
......@@ -24,14 +24,14 @@ except:
class Multinomial(distribution.Distribution):
r"""
Multinomial distribution parameterized by :attr:`total_count` and
Multinomial distribution parameterized by :attr:`total_count` and
:attr:`probs`.
In probability theory, the multinomial distribution is a generalization of
In probability theory, the multinomial distribution is a generalization of
the binomial distribution, it models the probability of counts for each side
of a k-sided die rolled n times. When k is 2 and n is 1, the multinomial is
the bernoulli distribution, when k is 2 and n is greater than 1, it is the
binomial distribution, when k is greater than 2 and n is 1, it is the
of a k-sided die rolled n times. When k is 2 and n is 1, the multinomial is
the bernoulli distribution, when k is 2 and n is greater than 1, it is the
binomial distribution, when k is greater than 2 and n is 1, it is the
categorical distribution.
The probability mass function (PMF) for multinomial is
......@@ -40,18 +40,18 @@ class Multinomial(distribution.Distribution):
f(x_1, ..., x_k; n, p_1,...,p_k) = \frac{n!}{x_1!...x_k!}p_1^{x_1}...p_k^{x_k}
where, :math:`n` is number of trials, k is the number of categories,
:math:`p_i` denote probability of a trial falling into each category,
:math:`{\textstyle \sum_{i=1}^{k}p_i=1}, p_i \ge 0`, and :math:`x_i` denote
count of each category.
where, :math:`n` is number of trials, k is the number of categories,
:math:`p_i` denote probability of a trial falling into each category,
:math:`{\textstyle \sum_{i=1}^{k}p_i=1}, p_i \ge 0`, and :math:`x_i` denote
count of each category.
Args:
total_count (int): Number of trials.
probs (Tensor): Probability of a trial falling into each category. Last
probs (Tensor): Probability of a trial falling into each category. Last
axis of probs indexes over categories, other axes index over batches.
Probs value should be between [0, 1], and sum to 1 along the last axis. If
the values sum over 1, they will be normalized to sum to 1 along the last
axis.
Probs value should be between [0, 1], and sum to 1 along the last axis. If
the values sum over 1, they will be normalized to sum to 1 along the last
axis.
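A short hedged sketch:

.. code-block:: python

    import paddle
    from paddle.distribution import Multinomial

    multinomial = Multinomial(total_count=10, probs=paddle.to_tensor([0.2, 0.3, 0.5]))
    counts = multinomial.sample((2,))        # 2 draws, each summing to 10
    print(counts)
    print(multinomial.log_prob(counts))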
Examples:
......
......@@ -55,7 +55,7 @@ class Normal(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
from paddle.distribution import Normal
......@@ -248,7 +248,7 @@ class Normal(distribution.Distribution):
.. math::
ratio = \\frac{\sigma_0}{\sigma_1}
.. math::
diff = \mu_1 - \mu_0
......
......@@ -50,45 +50,45 @@ class Type(enum.Enum):
class Transform(object):
r"""Base class for the transformations of random variables.
``Transform`` can be used to represent any differentiable and injective
function from the subset of :math:`R^n` to subset of :math:`R^m`, generally
used for transforming a random sample generated by ``Distribution``
instance.
Suppose :math:`X` is a K-dimensional random variable with probability
density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may
be defined by transforming :math:`X` with a suitably well-behaved function
:math:`f`. It suffices for what follows to note that if f is one-to-one and
its inverse :math:`f^{-1}` has a well-defined Jacobian, then the density of
``Transform`` can be used to represent any differentiable and injective
function from the subset of :math:`R^n` to subset of :math:`R^m`, generally
used for transforming a random sample generated by ``Distribution``
instance.
Suppose :math:`X` is a K-dimensional random variable with probability
density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may
be defined by transforming :math:`X` with a suitably well-behaved function
:math:`f`. It suffices for what follows to note that if f is one-to-one and
its inverse :math:`f^{-1}` has a well-defined Jacobian, then the density of
:math:`Y` is
.. math::
p_Y(y) = p_X(f^{-1}(y)) |det J_{f^{-1}}(y)|
where det is the matrix determinant operation and :math:`J_{f^{-1}}(y)` is
where det is the matrix determinant operation and :math:`J_{f^{-1}}(y)` is
the Jacobian matrix of :math:`f^{-1}` evaluated at :math:`y`.
Taking :math:`x = f^{-1}(y)`, the Jacobian matrix is defined by
.. math::
J(y) = \begin{bmatrix}
{\frac{\partial x_1}{\partial y_1}} &{\frac{\partial x_1}{\partial y_2}}
{\frac{\partial x_1}{\partial y_1}} &{\frac{\partial x_1}{\partial y_2}}
&{\cdots} &{\frac{\partial x_1}{\partial y_K}} \\
{\frac{\partial x_2}{\partial y_1}} &{\frac{\partial x_2}
{\partial y_2}}&{\cdots} &{\frac{\partial x_2}{\partial y_K}} \\
{\vdots} &{\vdots} &{\ddots} &{\vdots}\\
{\frac{\partial x_K}{\partial y_1}} &{\frac{\partial x_K}{\partial y_2}}
&{\cdots} &{\frac{\partial x_K}{\partial y_K}}
{\frac{\partial x_K}{\partial y_1}} &{\frac{\partial x_K}{\partial y_2}}
&{\cdots} &{\frac{\partial x_K}{\partial y_K}}
\end{bmatrix}
A ``Transform`` can be characterized by three operations:
#. forward
Forward implements :math:`x \rightarrow f(x)`, and is used to convert
Forward implements :math:`x \rightarrow f(x)`, and is used to convert
one random outcome into another.
#. inverse
Undoes the transformation :math:`y \rightarrow f^{-1}(y)`.
Undoes the transformation :math:`y \rightarrow f^{-1}(y)`.
#. log_det_jacobian
The log of the absolute value of the determinant of the matrix of all
first-order partial derivatives of the inverse function.
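These three operations can be seen on a concrete subclass; a hedged sketch using ``ExpTransform``:

.. code-block:: python

    import paddle
    from paddle.distribution import ExpTransform

    t = ExpTransform()
    x = paddle.to_tensor([0.0, 1.0, 2.0])

    y = t.forward(x)                        # exp(x)
    print(t.inverse(y))                     # recovers x
    print(t.forward_log_det_jacobian(x))    # log|d exp(x)/dx| = x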
......@@ -121,14 +121,14 @@ class Transform(object):
return Type.is_injective(cls._type)
def __call__(self, input):
"""Make this instance as a callable object. The return value is
depening on the input type.
"""Make this instance as a callable object. The return value is
depening on the input type.
* If the input is a ``Tensor`` instance, return
* If the input is a ``Tensor`` instance, return
``self.forward(input)`` .
* If the input is a ``Distribution`` instance, return
* If the input is a ``Distribution`` instance, return
``TransformedDistribution(base=input, transforms=[self])`` .
* If the input is a ``Transform`` instance, return
* If the input is a ``Transform`` instance, return
``ChainTransform([self, input])`` .
Args:
......@@ -145,12 +145,12 @@ class Transform(object):
return self.forward(x)
def forward(self, x):
"""Forward transformation with mapping :math:`y = f(x)`.
"""Forward transformation with mapping :math:`y = f(x)`.
Useful for turning one random outcome into another.
Args:
x (Tensor): Input parameter, generally a sample generated
x (Tensor): Input parameter, generally a sample generated
from ``Distribution``.
Returns:
......@@ -166,7 +166,7 @@ class Transform(object):
return self._forward(x)
def inverse(self, y):
"""Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing"
"""Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing"
a transformation to compute one probability in terms of another.
Args:
......@@ -185,15 +185,15 @@ class Transform(object):
return self._inverse(y)
def forward_log_det_jacobian(self, x):
"""The log of the absolute value of the determinant of the matrix of all
"""The log of the absolute value of the determinant of the matrix of all
first-order partial derivatives of the inverse function.
Args:
x (Tensor): Input tensor, generally is a sample generated from
x (Tensor): Input tensor, generally is a sample generated from
``Distribution``
Returns:
Tensor: The log of the absolute value of Jacobian determinant.
Tensor: The log of the absolute value of Jacobian determinant.
"""
if not isinstance(x, paddle.fluid.framework.Variable):
raise TypeError(
......@@ -212,11 +212,11 @@ class Transform(object):
def inverse_log_det_jacobian(self, y):
"""Compute :math:`log|det J_{f^{-1}}(y)|`.
Note that ``forward_log_det_jacobian`` is the negative of this function,
Note that ``forward_log_det_jacobian`` is the negative of this function,
evaluated at :math:`f^{-1}(y)`.
Args:
y (Tensor): The input to the ``inverse`` Jacobian determinant
y (Tensor): The input to the ``inverse`` Jacobian determinant
evaluation.
Returns:
......@@ -269,13 +269,13 @@ class Transform(object):
return variable.real
def _forward(self, x):
"""Inner method for publid API ``forward``, subclass should
"""Inner method for publid API ``forward``, subclass should
overwrite this method for supporting forward transformation.
"""
raise NotImplementedError('Forward not implemented')
def _inverse(self, y):
"""Inner method of public API ``inverse``, subclass should
"""Inner method of public API ``inverse``, subclass should
overwrite this method for supporting inverse transformation.
"""
raise NotImplementedError('Inverse not implemented')
......@@ -301,35 +301,35 @@ class Transform(object):
'is implemented. One of them is required')
def _forward_shape(self, shape):
"""Inner method called by ``forward_shape``, which is used to infer the
forward shape. Subclass should overwrite this method for supporting
"""Inner method called by ``forward_shape``, which is used to infer the
forward shape. Subclass should overwrite this method for supporting
``forward_shape``.
"""
return shape
def _inverse_shape(self, shape):
"""Inner method called by ``inverse_shape``, whic is used to infer the
invese shape. Subclass should overwrite this method for supporting
"""Inner method called by ``inverse_shape``, whic is used to infer the
invese shape. Subclass should overwrite this method for supporting
``inverse_shape``.
"""
return shape
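A hypothetical subclass showing which inner hooks are intended to be overridden (a sketch only; the exact set of required attributes, such as the transform type or variable domains, may vary across Paddle versions):
.. code-block:: python

    import paddle
    from paddle.distribution import Transform

    class SquareTransform(Transform):
        # Hypothetical elementwise transform y = x ** 2, restricted to x > 0
        # so that the mapping stays bijective.
        def _forward(self, x):
            return x * x

        def _inverse(self, y):
            return paddle.sqrt(y)

        def _forward_log_det_jacobian(self, x):
            return paddle.log(2. * x)

        def _forward_shape(self, shape):
            return shape    # elementwise, so the shape is unchanged

        def _inverse_shape(self, shape):
            return shape

    t = SquareTransform()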
class AbsTransform(Transform):
r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`,
r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`,
element-wise.
This non-injective transformation allows for transformations of scalar
distributions with the absolute value function, which maps ``(-inf, inf)``
This non-injective transformation allows for transformations of scalar
distributions with the absolute value function, which maps ``(-inf, inf)``
to ``[0, inf)`` .
* For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set inverse
* For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set inverse
``{x in (-inf, inf) : |x| = y}`` as a tuple, ``-y, y`` .
* For ``y`` equal to ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not
the set inverse (the set inverse is the singleton {0}), but "works" in
conjunction with ``TransformedDistribution`` to produce a left
* For ``y`` equal to ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not
the set inverse (the set inverse is the singleton {0}), but "works" in
conjunction with ``TransformedDistribution`` to produce a left
semi-continuous pdf.
* For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the
* For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the
wrong thing ``-y, y``. This is done for efficiency.
Examples:
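The behaviour described above can be sketched as follows (assuming ``paddle.distribution.AbsTransform``):
.. code-block:: python

    import paddle
    from paddle.distribution import AbsTransform

    abs_t = AbsTransform()
    print(abs_t.forward(paddle.to_tensor([-1., 0., 1.])))    # [1., 0., 1.]
    print(abs_t.inverse(paddle.to_tensor(1.)))               # the pair (-1., 1.) described above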
......@@ -388,7 +388,7 @@ class AbsTransform(Transform):
class AffineTransform(Transform):
r"""Affine transformation with mapping
r"""Affine transformation with mapping
:math:`y = \text{loc} + \text{scale} \times x`.
Args:
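A minimal sketch of the affine mapping and its log-det Jacobian (assuming the ``forward`` / ``inverse`` API described earlier in this file):
.. code-block:: python

    import paddle
    from paddle.distribution import AffineTransform

    affine = AffineTransform(paddle.to_tensor(1.), paddle.to_tensor(2.))
    x = paddle.to_tensor([0., 1., 2.])
    y = affine.forward(x)                      # 1 + 2 * x -> [1., 3., 5.]
    x_back = affine.inverse(y)                 # (y - 1) / 2 -> [0., 1., 2.]
    ldj = affine.forward_log_det_jacobian(x)   # log|scale| = log(2)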
......@@ -638,26 +638,26 @@ class ExpTransform(Transform):
class IndependentTransform(Transform):
r"""
``IndependentTransform`` wraps a base transformation and reinterprets
``IndependentTransform`` wraps a base transformation and reinterprets
some of the rightmost batch axes as event axes.
Generally, it is used to expand the event axes. This has no effect on the
forward or inverse transformation, but does sum out the
``reinterpreted_batch_rank`` rightmost dimensions in computing the determinant
forward or inverse transformation, but does sum out the
``reinterpreted_batch_rank`` rightmost dimensions in computing the determinant
of the Jacobian matrix.
To see this, consider the ``ExpTransform`` applied to a Tensor which has
sample, batch, and event ``(S,B,E)`` shape semantics. Suppose the Tensor's
To see this, consider the ``ExpTransform`` applied to a Tensor which has
sample, batch, and event ``(S,B,E)`` shape semantics. Suppose the Tensor's
partitioned shape is ``(S=[4], B=[2, 2], E=[3])`` , reinterpreted_batch_rank
is 1. Then the reinterpreted Tensor's shape is ``(S=[4], B=[2], E=[2, 3])`` .
The shape returned by ``forward`` and ``inverse`` is unchanged, i.e.,
``[4,2,2,3]`` . However, the shape returned by ``inverse_log_det_jacobian``
is ``[4,2]``, because the Jacobian determinant is a reduction over the
The shape returned by ``forward`` and ``inverse`` is unchanged, i.e.,
``[4,2,2,3]`` . However, the shape returned by ``inverse_log_det_jacobian``
is ``[4,2]``, because the Jacobian determinant is a reduction over the
event dimensions.
Args:
base (Transform): The base transformation.
reinterpreted_batch_rank (int): The number of rightmost batch ranks that
reinterpreted_batch_rank (int): The number of rightmost batch ranks that
will be reinterpreted as event rank.
Examples:
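A sketch of the shape behaviour described above (assuming ``ExpTransform`` and ``IndependentTransform`` from ``paddle.distribution``; the exact reduced shape depends on the base transform's event rank):
.. code-block:: python

    import paddle
    from paddle.distribution import ExpTransform, IndependentTransform

    x = paddle.ones((4, 2, 2, 3))
    t = IndependentTransform(ExpTransform(), 1)

    y = t.forward(x)                     # shape is unchanged by the wrapper
    ldj = t.forward_log_det_jacobian(x)  # the reinterpreted rightmost dims are summed out
    print(y.shape, ldj.shape)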
......@@ -793,7 +793,7 @@ class PowerTransform(Transform):
class ReshapeTransform(Transform):
r"""Reshape the event shape of a tensor.
Note that ``in_event_shape`` and ``out_event_shape`` must have the same
Note that ``in_event_shape`` and ``out_event_shape`` must have the same
number of elements.
Args:
......@@ -943,8 +943,8 @@ class SigmoidTransform(Transform):
class SoftmaxTransform(Transform):
r"""Softmax transformation with mapping :math:`y=\exp(x)` then normalizing.
It's generally used to convert unconstrained space to the simplex. This mapping
is not injective, so ``forward_log_det_jacobian`` and
It's generally used to convert unconstrained space to the simplex. This mapping
is not injective, so ``forward_log_det_jacobian`` and
``inverse_log_det_jacobian`` are not implemented.
Examples:
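A small sketch of the simplex mapping (assuming ``paddle.distribution.SoftmaxTransform``):
.. code-block:: python

    import paddle
    from paddle.distribution import SoftmaxTransform

    t = SoftmaxTransform()
    x = paddle.to_tensor([[1., 2., 3.], [4., 5., 6.]])
    p = t.forward(x)    # each row now sums to 1
    x2 = t.inverse(p)   # not a true inverse, since the mapping is not injective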
......@@ -997,11 +997,11 @@ class SoftmaxTransform(Transform):
class StackTransform(Transform):
r""" ``StackTransform`` applies a sequence of transformations along the
r""" ``StackTransform`` applies a sequence of transformations along the
specified axis.
Args:
transforms(Sequence[Transform]): The sequence of transformations.
transforms(Sequence[Transform]): The sequence of transformations.
axis(int): The axis along which the transformations are applied.
Examples:
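A sketch of applying different transforms along one axis (assuming ``ExpTransform`` and ``PowerTransform`` from ``paddle.distribution``):
.. code-block:: python

    import paddle
    from paddle.distribution import ExpTransform, PowerTransform, StackTransform

    x = paddle.stack((paddle.to_tensor([1., 2., 3.]),) * 2, axis=1)
    t = StackTransform([ExpTransform(),
                        PowerTransform(paddle.to_tensor(2.))], axis=1)
    y = t.forward(x)    # column 0 -> exp(x), column 1 -> x ** 2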
......@@ -1102,7 +1102,7 @@ class StackTransform(Transform):
class StickBreakingTransform(Transform):
r"""Convert an unconstrained vector to the simplex with one additional
r"""Convert an unconstrained vector to the simplex with one additional
dimension by the stick-breaking construction.
Examples:
......@@ -1213,8 +1213,8 @@ class TanhTransform(Transform):
return y.atanh()
def _forward_log_det_jacobian(self, x):
"""We implicitly rely on _forward_log_det_jacobian rather than
explicitly implement ``_inverse_log_det_jacobian`` since directly using
"""We implicitly rely on _forward_log_det_jacobian rather than
explicitly implement ``_inverse_log_det_jacobian`` since directly using
``-tf.math.log1p(-tf.square(y))`` has lower numerical precision.
See details: https://github.com/tensorflow/probability/blob/master/tensorflow_probability/python/bijectors/tanh.py#L69-L80
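For reference, the numerically stable identity commonly used instead (as in the TensorFlow Probability implementation linked above) is :math:`\log(1-\tanh^2(x)) = 2(\log 2 - x - \mathrm{softplus}(-2x))`; a sketch comparing it with the naive form:
.. code-block:: python

    import math
    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([0.5, 2., 8.])
    naive = paddle.log1p(-paddle.tanh(x) ** 2)               # loses precision for large |x|
    stable = 2. * (math.log(2.) - x - F.softplus(-2. * x))   # stable form of the same quantity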
......
......@@ -20,8 +20,8 @@ from paddle.distribution import independent
class TransformedDistribution(distribution.Distribution):
r"""
Applies a sequence of Transforms to a base distribution.
r"""
Applies a sequence of Transforms to a base distribution.
Args:
base (Distribution): The base distribution.
......@@ -30,12 +30,12 @@ class TransformedDistribution(distribution.Distribution):
Examples:
.. code-block:: python
import paddle
import paddle
from paddle.distribution import transformed_distribution
d = transformed_distribution.TransformedDistribution(
paddle.distribution.Normal(0., 1.),
paddle.distribution.Normal(0., 1.),
[paddle.distribution.AffineTransform(paddle.to_tensor(1.), paddle.to_tensor(2.))]
)
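Continuing the example above, the resulting object behaves like a regular distribution (a sketch assuming the standard ``sample`` and ``log_prob`` methods):
.. code-block:: python

    x = d.sample([10])                      # samples drawn from the base Normal, then affinely mapped
    lp = d.log_prob(paddle.to_tensor(0.5))  # density corrected by the transform's log-det Jacobian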
......
......@@ -37,7 +37,7 @@ class Variable(object):
return self._event_rank
def constraint(self, value):
"""Check whether the 'value' meet the constraint conditions of this
"""Check whether the 'value' meet the constraint conditions of this
random variable."""
return self._constraint(value)
......@@ -59,8 +59,8 @@ class Independent(Variable):
Args:
base (Variable): Base variable.
reinterpreted_batch_rank (int): The rightmost batch rank to be
reinterpreted.
reinterpreted_batch_rank (int): The rightmost batch rank to be
reinterpreted.
"""
def __init__(self, base, reinterpreted_batch_rank):
......
......@@ -19,9 +19,9 @@ import warnings
"""
Class of all kinds of Average.
All Averages are implemented purely in Python.
All Averages are implemented purely in Python.
They do not change Paddle's Program, nor do anything to
modify the NN model's configuration. They are simply
modify the NN model's configuration. They are simply
wrappers around Python functions.
"""
......@@ -41,9 +41,9 @@ class WeightedAverage(object):
"""
Calculate weighted average.
The average calculation is implemented purely in Python.
The average calculation is implemented purely in Python.
It does not change Paddle's Program, nor does it
modify the NN model's configuration. It is simply a
modify the NN model's configuration. It is simply a
wrapper around Python functions.
Examples:
......
......@@ -1409,11 +1409,11 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
"""
ops_to_remove = []
'''
NOTE(paddle-dev): while_grad op may hold some inputs which are not found
in the parent/forward block, and they are also the outputs of while_grad
op. These kinds of inputs are the recursive outputs inside while_grad op.
They should be considered as "already created" when scanning the inner
ops of while_grad ops.
NOTE(paddle-dev): while_grad op may hold some inputs which are not found
in the parent/forward block, and they are also the outputs of while_grad
op. These kinds of inputs are the recursive outputs inside while_grad op.
They should be considered as "already created" when scanning the inner
ops of while_grad ops.
'''
parent_op = _find_parent_op_(block)
parent_op_vars = []
......@@ -1452,7 +1452,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
continue
else:
'''
If the output is not empty and there is any grad input, find
If the output is not empty and there is any grad input, find
whether there is any existing input. If not, just remove it.
'''
if grad_var_ins:
......@@ -1464,11 +1464,11 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
if not existing_grad_var_ins:
'''
FIXME(paddle-dev, zengjinle): rnn_memory_helper_grad is used
in recurrent op. The input of this op does not even exist in
the program! Therefore, any dependency analysis would not
in recurrent op. The input of this op does not even exist in
the program! Therefore, any dependency analysis would not
work for this op! If I do not add the following code, this op
would be pruned, and the calculation result would be wrong.
Maybe we should re-design this op later...
would be pruned, and the calculation result would be wrong.
Maybe we should re-design this op later...
'''
if op_desc.type() not in ['rnn_memory_helper_grad']:
ops_to_remove.append(op_idx)
......@@ -2206,7 +2206,7 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
will be None.
Examples:
.. code-block:: python
:name: code-example
import paddle
......
......@@ -209,34 +209,34 @@ class ClipGradBase(object):
class ClipGradByValue(ClipGradBase):
"""
Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].
- Any values less than min are set to ``min``.
- Any values greater than max are set to ``max``.
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
(for example: :ref:`api_paddle_optimizer_SGD`).
Note:
``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args:
max (float): The maximum value to clip by.
min (float, optional): The minimum value to clip by. If not set by user, it will be set to ``-max``
min (float, optional): The minimum value to clip by. If not set by user, it will be set to ``-max``
automatically. In this case, ``max`` must be greater than 0.
Examples:
.. code-block:: python
import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
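The example above is truncated by the diff; it typically continues by handing the clip object to an optimizer. A sketch of that wiring (assuming ``paddle.nn.ClipGradByValue`` and the ``grad_clip`` argument of ``paddle.optimizer.SGD``):
.. code-block:: python

    loss.backward()
    clip = paddle.nn.ClipGradByValue(min=-1., max=1.)
    sgd = paddle.optimizer.SGD(learning_rate=0.1,
                               parameters=linear.parameters(),
                               grad_clip=clip)
    sgd.step()    # gradients are clipped element-wise to [-1, 1] before the update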
......@@ -300,17 +300,17 @@ class ClipGradByValue(ClipGradBase):
class ClipGradByNorm(ClipGradBase):
r"""
Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .
- If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.
- If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.
The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
(for example: :ref:`api_paddle_optimizer_SGD`).
The clipping formula is:
.. math::
......@@ -329,7 +329,7 @@ class ClipGradByNorm(ClipGradBase):
norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}
Note:
``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args:
......@@ -337,12 +337,12 @@ class ClipGradByNorm(ClipGradBase):
Examples:
.. code-block:: python
import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
......@@ -415,17 +415,17 @@ def _allow_pure_fp16_global_norm_clip(*args):
class ClipGradByGlobalNorm(ClipGradBase):
r"""
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
:math:`t\_list` , and limit it to ``clip_norm`` .
- If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.
- If the global norm is less than or equal to ``clip_norm`` , nothing will be done.
The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
Gradient clipping will take effect after being set in ``optimizer`` ; see the ``optimizer`` documentation
(for example: :ref:`api_paddle_optimizer_SGD`).
The clipping formula is:
......@@ -441,7 +441,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
Note:
``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
``need_clip`` of ``ClipGradByGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
Please use ``need_clip`` in ``ParamAttr`` to specify the clip scope.
Args:
......@@ -450,12 +450,12 @@ class ClipGradByGlobalNorm(ClipGradBase):
Examples:
.. code-block:: python
import paddle
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
linear = paddle.nn.Linear(in_features=10, out_features=10,
weight_attr=paddle.ParamAttr(need_clip=True),
bias_attr=paddle.ParamAttr(need_clip=False))
out = linear(x)
loss = paddle.mean(out)
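As with the classes above, the clip object is handed to the optimizer. A sketch (assuming ``paddle.nn.ClipGradByGlobalNorm``); the global norm here is :math:`\sqrt{\sum_i \|g_i\|_2^2}` over all clipped gradients:
.. code-block:: python

    loss.backward()
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    sgd = paddle.optimizer.SGD(learning_rate=0.1,
                               parameters=linear.parameters(),
                               grad_clip=clip)
    sgd.step()    # all gradients are rescaled by clip_norm / max(global_norm, clip_norm)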
......@@ -719,23 +719,23 @@ class ClipGradByGlobalNorm(ClipGradBase):
def set_gradient_clip(clip, param_list=None, program=None):
"""
:api_attr: Static Graph
Warning:
This API must be used after building the network, and before ``minimize`` ,
and it may be removed in future releases, so it is not recommended.
This API must be used after building the network, and before ``minimize`` ,
and it may be removed in future releases, so it is not recommended.
It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
which is a better way to clip gradients. There are three clipping strategies:
:ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` .
Use ``param_list`` to specify the parameters that require gradient clipping.
Args:
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
some derived class of ``GradientClipBase`` . There are three clipping strategies
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
:ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
gradient clipping.
param_list (list(Variable), optional): Parameters that require gradient clipping.
It can be a list of parameters or a list of parameter names.
......@@ -789,7 +789,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
param_list=[param_var1, param_var2])
sgd = fluid.optimizer.SGD(learning_rate=1e-3)
sgd.minimize(loss)
# network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
with fluid.program_guard(fluid.Program(), fluid.Program()):
loss = network()
......@@ -800,10 +800,10 @@ def set_gradient_clip(clip, param_list=None, program=None):
# Set the gradient clipping strategy: clip2
sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
sgd.minimize(loss)
# 'set_gradient_clip' will not take effect when setting has a conflict,
# 'set_gradient_clip' will not take effect when the settings conflict,
# 'set_gradient_clip' will not take effect when the settings conflict,
"""
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
"and may be deprecated in future! "
......
......@@ -102,7 +102,7 @@ def _should_broadcast_or_not_exists(program, var_name):
class CompiledProgram(object):
"""
:api_attr: Static Graph
The CompiledProgram is used to transform a program or graph for
various optimizations according to the configuration of build_strategy,
for example, the operators' fusion in the computation graph, memory
......@@ -187,12 +187,12 @@ class CompiledProgram(object):
exec_strategy to set some optimizations that can be applied during the construction
and computation of the Graph, such as reducing the number of AllReduce operations,
specifying the size of the thread pool used in the computation Graph running the model,
and so on.
and so on.
.. note::
If build_strategy is specified when building CompiledProgram and calling
with_data_parallel, build_strategy in CompiledProgram will be overwritten; therefore,
for data parallel training, it is recommended to set build_strategy when calling the
If build_strategy is specified when building CompiledProgram and calling
with_data_parallel, build_strategy in CompiledProgram will be overwritten; therefore,
for data parallel training, it is recommended to set build_strategy when calling the
with_data_parallel interface.
Args:
......@@ -228,7 +228,7 @@ class CompiledProgram(object):
export CPU_NUM=4, if the environment variable is not set, the executor will
add the variable to the environment variable and set its value to 1.
The default is None. If ``places`` is the list of string, the string in the list
can be ``cpu``, ``gpu:x``, where ``x`` is the index of the GPUs.
can be ``cpu``, ``gpu:x``, where ``x`` is the index of the GPUs.
Returns:
CompiledProgram
......@@ -270,7 +270,7 @@ class CompiledProgram(object):
static.default_main_program()).with_data_parallel(
loss_name=loss.name, places=parallel_places)
# NOTE: if not set share_vars_from=compiled_train_prog,
# the parameters used in test process are different with
# the parameters used in test process are different with
# the parameters used by train process
compiled_test_prog = static.CompiledProgram(
test_program).with_data_parallel(
......@@ -701,7 +701,7 @@ class IpuStrategy(object):
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -744,7 +744,7 @@ class IpuStrategy(object):
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -762,7 +762,7 @@ class IpuStrategy(object):
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -780,13 +780,13 @@ class IpuStrategy(object):
Args:
optimizer (Optimizer): Optimizer to be used in training.
Returns:
None.
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -812,13 +812,13 @@ class IpuStrategy(object):
Args:
optimizer (Optimizer): Optimizer to be parsed.
Returns:
Dict.
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -857,15 +857,15 @@ class IpuStrategy(object):
is_training (bool, optional): True is training graph, False is inference graph. Default True, which means is training mode.
batch_size (int, optional): The batch-size in the graph. Used to make the graph batch-size fixed,
if the batch-size in the graph is dynamic. Default 1, which means the batch-size would be set to 1 if the batch-size is dynamic.
enable_manual_shard (bool, optional): Enable graph sharding or not. Only if num_ipus > 1 can enable_manual_shard be set to True.
Default False, which means disabled.
enable_manual_shard (bool, optional): Enable graph sharding or not. Only if num_ipus > 1 can enable_manual_shard be set to True.
Default False, which means disabled.
Returns:
None.
Examples:
.. code-block:: python
# required: ipu
import paddle
......@@ -900,15 +900,15 @@ class IpuStrategy(object):
Set pipelining configuration to the IpuStrategy instance. Used to optimize the throughput performance.
Args:
enable_pipelining (bool, optional): Enable data pipelining between subgraphs. Only if enable_manual_shard=True can enable_pipelining be set to True.
enable_pipelining (bool, optional): Enable data pipelining between subgraphs. Only if enable_manual_shard=True can enable_pipelining be set to True.
Default False, which means disabled.
batches_per_step (int, optional): Set the batches per run in data pipelining mode. Only if enable_pipelining=True can batches_per_step be set to a value greater than 1.
Default 1, which means no data pipelining.
enable_gradient_accumulation (bool, optional): Enable to accumulate gradients before updating the weights in training mode. Only if enable_pipelining=True,
enable_gradient_accumulation can be set to True. Default False, which means no gradient accumulation.
accumulation_factor (int, optional): Specify the number of micro-batches to accumulate
enable_gradient_accumulation can be set to True. Default False, which means no gradient accumulation.
accumulation_factor (int, optional): Specify the number of micro-batches to accumulate
before applying the varUpdate. Default 1, which means disable the accumulation.
Returns:
None.
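A sketch of how these pipelining options are typically set together with the graph configuration (mirroring the ``IpuCompiledProgram`` example later in this diff; requires an IPU build of Paddle):
.. code-block:: python

    # required: ipu
    import paddle
    import paddle.static as static

    paddle.enable_static()
    ipu_strategy = static.IpuStrategy()
    ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
    ipu_strategy.set_pipelining_config(enable_pipelining=False,
                                       batches_per_step=1,
                                       enable_gradient_accumulation=False,
                                       accumulation_factor=1)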
......@@ -947,7 +947,7 @@ class IpuStrategy(object):
Args:
enable_fp16 (bool, optional): Enable FLOAT16 mode and transform FLOAT32 to FLOAT16. Default False, which means disable FLOAT16 mode.
Returns:
None.
......@@ -985,7 +985,7 @@ class IpuStrategy(object):
domain(str): domain name of custom op in popart.
version(int): version of custom op in popart.
Returns:
None.
......@@ -1021,7 +1021,7 @@ class IpuStrategy(object):
Args:
options(dict): dict of options.
Returns:
None.
......@@ -1051,7 +1051,7 @@ class IpuStrategy(object):
Args:
option(str): name of option.
Returns:
option value.
......@@ -1076,7 +1076,7 @@ class IpuStrategy(object):
Args:
pattern(string): the name of the pattern.
Returns:
None.
......@@ -1101,7 +1101,7 @@ class IpuStrategy(object):
Args:
pattern(string): the name of the pattern.
Returns:
None.
......@@ -1156,21 +1156,21 @@ class IpuCompiledProgram(object):
Args:
program(Program, optional): This parameter represents the :code:`Program`
to be executed. Default is None, which means the program will be set to
to be executed. Default is None, which means the program will be set to
the default program :code:`paddle.static.default_main_program()` .
scope(Scope, optional): The scope used to run this program, you can switch
it to different scope. Default is None, which means use the global
it to different scope. Default is None, which means use the global
scope :code:`paddle.static.global_scope()` .
ipu_strategy(IpuStrategy, optional): This argument is used to build the program with the
specified options, such as half computation, training or inference session, the number of IPUs, etc.
Default is None, which means build the program based on the default `ipu_strategy`.
Default is None, which means build the program based on the default `ipu_strategy`.
Returns:
IpuCompiledProgram
Example:
.. code-block:: python
# required: ipu
import paddle
......@@ -1181,12 +1181,12 @@ class IpuCompiledProgram(object):
a = static.data(name='data', shape=[None, 1], dtype='int32')
b = a + 1
main_prog = static.default_main_program()
ipu_strategy = static.IpuStrategy()
ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1)
ipu_strategy.set_precision_config(enable_fp16=False)
ipu_compiled_program = static.IpuCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy)
......@@ -1232,7 +1232,7 @@ class IpuCompiledProgram(object):
"""
This interface is used to compile the input Program to a program
to run the model on the ipu.
Args:
feed_list(list): This parameter represents the input Tensors of the model.
......@@ -1244,14 +1244,14 @@ class IpuCompiledProgram(object):
Example:
.. code-block:: python
# required: ipu
import paddle
import paddle.static as static
paddle.enable_static()
a = static.data(name='data', shape=[None, 1], dtype='int32')
b = a + 1
main_prog = static.default_main_program()
......@@ -1260,7 +1260,7 @@ class IpuCompiledProgram(object):
ipu_strategy.set_graph_config(num_ipus=1, is_training=True, micro_batch_size=1)
ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, enable_gradient_accumulation=False, accumulation_factor=1)
ipu_strategy.set_precision_config(enable_fp16=False)
program = static.IpuCompiledProgram(
main_prog,
ipu_strategy=ipu_strategy).compile([a.name], [b.name])
......
......@@ -49,7 +49,7 @@ def ctr_metric_bundle(input, label, ins_tag_weight=None):
label(Tensor): A 2D int Tensor indicating the label of the training
data. The height is batch size and width is always 1.
ins_tag_weight(Tensor): A 2D int Tensor indicating the ins_tag_weight of the training
data. 1 means real data, 0 means fake data.
data. 1 means real data, 0 means fake data.
A LoDTensor or Tensor with type float32,float64.
Returns:
......