From cfee9c13dcb0c983dd5857913cf8c15696b26f67 Mon Sep 17 00:00:00 2001
From: Ligoml <39876205+Ligoml@users.noreply.github.com>
Date: Fri, 4 Nov 2022 10:49:04 +0800
Subject: [PATCH] [cherry-pick2.4]for CodeStyle (#47608)

* only run pre-commit

* only run pre-commit
---
 python/paddle/autograd/py_layer.py            |  112 +-
 python/paddle/device/cuda/__init__.py         |   82 +-
 python/paddle/distributed/collective.py       |  689 +++---
 .../fleet/base/private_helper_function.py     |   14 +-
 python/paddle/distributed/fleet/fleet.py      |  435 ++--
 python/paddle/distributed/parallel.py         |  117 +-
 .../distributed/sharding/group_sharded.py     |  146 +-
 python/paddle/distribution/distribution.py    |  112 +-
 python/paddle/distribution/kl.py              |   89 +-
 python/paddle/distribution/normal.py          |  116 +-
 python/paddle/distribution/transform.py       |  300 +--
 python/paddle/distribution/uniform.py         |  111 +-
 python/paddle/fluid/framework.py              | 1806 +++++++++-------
 python/paddle/framework/framework.py          |   20 +-
 python/paddle/framework/io.py                 |  336 +--
 python/paddle/incubate/autograd/primapi.py    |   70 +-
 python/paddle/incubate/autograd/primx.py      |  133 +-
 python/paddle/incubate/autograd/utils.py      |   21 +-
 .../nn/functional/fused_transformer.py        |  734 ++++---
 python/paddle/incubate/optimizer/lookahead.py |   95 +-
 .../paddle/incubate/optimizer/modelaverage.py |  173 +-
 python/paddle/nn/functional/activation.py     |  520 +++--
 python/paddle/nn/functional/common.py         | 1105 ++++++----
 python/paddle/nn/functional/conv.py           |  930 ++++++---
 python/paddle/nn/functional/extension.py      |  133 +-
 python/paddle/nn/functional/loss.py           | 1739 ++++++++++------
 python/paddle/nn/functional/norm.py           |  435 ++--
 python/paddle/nn/functional/pooling.py        | 1587 ++++++++------
 .../paddle/nn/functional/sparse_attention.py  |  114 +-
 python/paddle/nn/layer/activation.py          |   80 +-
 python/paddle/nn/layer/common.py              |  513 +++--
 python/paddle/nn/layer/conv.py                |  652 +++---
 python/paddle/nn/layer/loss.py                |  477 +++--
 python/paddle/nn/layer/norm.py                |  657 +++---
 python/paddle/nn/layer/pooling.py             |  420 ++--
 python/paddle/nn/utils/spectral_norm_hook.py  |   64 +-
 python/paddle/nn/utils/weight_norm_hook.py    |   54 +-
 python/paddle/onnx/export.py                  |   31 +-
 python/paddle/optimizer/adadelta.py           |   85 +-
 python/paddle/optimizer/adagrad.py            |   75 +-
 python/paddle/optimizer/adamax.py             |  161 +-
 python/paddle/optimizer/lr.py                 |  482 +++--
 python/paddle/optimizer/rmsprop.py            |  111 +-
 python/paddle/reader/decorator.py             |  138 +-
 python/paddle/regularizer.py                  |   30 +-
 python/paddle/static/input.py                 |   50 +-
 python/paddle/static/io.py                    |  169 +-
 python/paddle/static/nn/common.py             |   72 +-
 python/paddle/tensor/creation.py              |  844 +++++---
 python/paddle/tensor/linalg.py                | 1541 +++++++-------
 python/paddle/tensor/logic.py                 |  359 ++--
 python/paddle/tensor/manipulation.py          | 1847 ++++++++++-------
 python/paddle/tensor/math.py                  |   21 +-
 python/paddle/tensor/stat.py                  |  151 +-
 .../utils/cpp_extension/cpp_extension.py      |  301 ++-
 python/paddle/vision/datasets/cifar.py        |   70 +-
 python/paddle/vision/ops.py                   | 1430 ++++++++-----
 57 files changed, 13869 insertions(+), 9290 deletions(-)

diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py
index 673b047d5a3..7e04d02e903 100644
--- a/python/paddle/autograd/py_layer.py
+++ b/python/paddle/autograd/py_layer.py
@@ -54,16 +54,16 @@ class LegacyPyLayerContext(object):
     def save_for_backward(self, *tensors):
         """
         Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.
-        
+
         .. note::
-            This API should be called at most once, and only inside `forward`. 
+            This API should be called at most once, and only inside `forward`.
 
         Args:
             tensors(list of Tensors): Tensors to be stored.
 
         Returns:
             None
-        
+
         Examples:
             .. code-block:: python
 
@@ -94,7 +94,7 @@ class LegacyPyLayerContext(object):
         Get the tensors stored by ``save_for_backward``.
 
         Returns:
-            list of Tensors or None: If context contains tensors stored by `save_for_backward`, 
+            list of Tensors or None: If context contains tensors stored by `save_for_backward`,
             then return these tensors, otherwise return None.
 
         Examples:
@@ -124,9 +124,7 @@ class LegacyPyLayerContext(object):
 
 
 def with_mateclass(meta, *bases):
-
     class impl(meta):
-
         def __new__(cls, name, temp_bases, attrs):
             return meta(name, bases, attrs)
 
@@ -134,7 +132,6 @@ def with_mateclass(meta, *bases):
 
 
 class CPyLayer(object):
-
     @classmethod
     @dygraph_only
     def apply(cls, *args, **kwargs):
@@ -147,7 +144,7 @@ class CPyLayer(object):
 
         Returns:
             tensors or other types : output of PyLayer.
-        
+
         Examples:
             .. code-block:: python
 
@@ -182,12 +179,14 @@ class CPyLayer(object):
 
 
 class PyLayerBackward(LegacyPyLayerContext):
-
     def backward(self, *args, **kwargs):
         with paddle.fluid.dygraph.guard():
             with paddle.fluid.dygraph.no_grad():
-                if self._amp_state and 'enable' in self._amp_state and self._amp_state[
-                        'enable']:
+                if (
+                    self._amp_state
+                    and 'enable' in self._amp_state
+                    and self._amp_state['enable']
+                ):
                     with auto_cast(**args[0]._amp_state):
                         return self._forward_cls.backward(*args, **kwargs)
                 else:
@@ -197,10 +196,10 @@ class PyLayerBackward(LegacyPyLayerContext):
 
 
 class LayerMeta(type):
-
     def __init__(cls, name, bases, attrs):
-        cls._backward_function = type(name + '_backward', (PyLayerBackward, ),
-                                      {"_forward_cls": cls})
+        cls._backward_function = type(
+            name + '_backward', (PyLayerBackward,), {"_forward_cls": cls}
+        )
 
         return super(LayerMeta, cls).__init__(name, bases, attrs)
 
@@ -210,15 +209,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
     Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules:
     1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod.
     Their first argument should be a context and `None` can not be included in the returned result.
-    2. Input of backward contains a context as the first argument, and the rest arguments are the 
-    gradient of forward's output tensors. so the number of backward's input tensors equal to 
-    the number of forward output tensors. If you need the forward's inputs or outputs in `backward`, 
+    2. Input of backward contains a context as the first argument, and the rest arguments are the
+    gradient of forward's output tensors. so the number of backward's input tensors equal to
+    the number of forward output tensors. If you need the forward's inputs or outputs in `backward`,
     you can use `save_for_backward` to store the required tensors, and then use them in the backward.
     3. Output of backward function can only be `Tensor` or tuple/list of `Tensor`.
-    Output tensors of backward are the gradient of forward's input tensors, 
+    Output tensors of backward are the gradient of forward's input tensors,
     so the number of backward's output tensors equal to the number of forward input tensors.
     After building the custom Layer, run it through the `apply` method.
-    
+
 
     Examples:
         .. code-block:: python
@@ -259,8 +258,8 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
     @staticmethod
     def forward(ctx, *args, **kwargs):
         """
-        It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as 
-        the first argument, followed by any number of arguments (tensors or other types). 
+        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
+        the first argument, followed by any number of arguments (tensors or other types).
         `None` can not be included in the returned result.
 
         Args:
@@ -269,7 +268,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
 
         Returns:
             tensors or other types : output of PyLayer.
-        
+
         Examples:
             .. code-block:: python
 
@@ -292,14 +291,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
                         return grad
         """
         raise NotImplementedError(
-            "You must implement the forward function for PyLayer.")
+            "You must implement the forward function for PyLayer."
+        )
 
     @staticmethod
     def backward(ctx, *args, **kwargs):
         """
-        This is a function to calculate the gradient. It is to be overloaded by subclasses. 
-        It must accept a object of `PyLayerContext` as the first argument, and the rest 
-        arguments are the gradient of forward's output tensors. Output tensors of backward 
+        This is a function to calculate the gradient. It is to be overloaded by subclasses.
+        It must accept an object of `PyLayerContext` as the first argument, and the rest
+        arguments are the gradient of forward's output tensors. Output tensors of backward
         are the gradient of forward's input tensors.
 
         Args:
@@ -308,7 +308,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
 
         Returns:
             Tensor or list of Tensors: The gradient of forward's input tensor(s).
-        
+
         Examples:
             .. code-block:: python
 
@@ -332,24 +332,24 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
         """
 
         raise NotImplementedError(
-            "You must implement the backward function for PyLayer.")
+            "You must implement the backward function for PyLayer."
+        )
 
 
 class EagerPyLayerContext(object):
-
     def save_for_backward(self, *tensors):
         """
         Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.
-        
+
         .. note::
-            This API should be called at most once, and only inside `forward`. 
+            This API should be called at most once, and only inside `forward`.
 
         Args:
             tensors(list of Tensors): Tensors to be stored.
 
         Returns:
             None
-        
+
         Examples:
             .. code-block:: python
 
@@ -380,7 +380,7 @@ class EagerPyLayerContext(object):
         Get the tensors stored by ``save_for_backward``.
 
         Returns:
-            list of Tensors or None: If context contains tensors stored by `save_for_backward`, 
+            list of Tensors or None: If context contains tensors stored by `save_for_backward`,
             then return these tensors, otherwise return None.
 
         Examples:
@@ -410,11 +410,11 @@ class EagerPyLayerContext(object):
     def mark_not_inplace(self, *args):
         """
         Marks inputs as not inplace.
-        This should be called at most once, only from inside the `forward` method, 
+        This should be called at most once, only from inside the `forward` method,
         and all arguments should be Tensor inputs.
 
-        If the Tensor returned by `forward` method is the same as the Tensor input of forward, 
-        and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output. 
+        If the Tensor returned by `forward` method is the same as the Tensor input of forward,
+        and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output.
         Thereby preventing the auto grad information of the input Tensor from being overwritten.
 
         Examples:
@@ -427,7 +427,7 @@ class EagerPyLayerContext(object):
                     def forward(ctx, x):
                         ctx.mark_not_inplace(x)
                         return x
-                    
+
                     @staticmethod
                     def backward(ctx, grad_output):
                         out = grad_output.exp()
@@ -438,7 +438,7 @@ class EagerPyLayerContext(object):
                 attn_layers = []
                 for idx in range(0, 2):
                     attn_layers.append(Exp())
-                
+
                 for step in range(0, 2):
                     a = x
                     for j in range(0,2):
@@ -450,7 +450,7 @@ class EagerPyLayerContext(object):
     def mark_non_differentiable(self, *args):
         """
         Marks outputs as non-differentiable.
-        This should be called at most once, only from inside the `forward` method, 
+        This should be called at most once, only from inside the `forward` method,
         and all arguments should be tensor outputs.
 
         This will mark outputs as not requiring gradients, increasing the
@@ -542,30 +542,27 @@ class EagerPyLayerContext(object):
 
 
 class EagerPyLayerBackward(core.eager.PyLayer, EagerPyLayerContext):
-
     def backward(self, *args):
         return self._forward_cls.backward(self, *args)
 
 
 class EagerPyLayerMeta(type):
-
     def __init__(cls, name, bases, attrs):
-        cls._backward_function = type(name + '_backward',
-                                      (EagerPyLayerBackward, ),
-                                      {"_forward_cls": cls})
+        cls._backward_function = type(
+            name + '_backward', (EagerPyLayerBackward,), {"_forward_cls": cls}
+        )
 
         return super(EagerPyLayerMeta, cls).__init__(name, bases, attrs)
 
 
 class EagerPyLayer(
-        with_mateclass(EagerPyLayerMeta, core.eager.PyLayer,
-                       EagerPyLayerContext)):
-
+    with_mateclass(EagerPyLayerMeta, core.eager.PyLayer, EagerPyLayerContext)
+):
     @staticmethod
     def forward(ctx, *args, **kwargs):
         """
-        It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as 
-        the first argument, followed by any number of arguments (tensors or other types). 
+        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
+        the first argument, followed by any number of arguments (tensors or other types).
         `None` can not be included in the returned result.
 
         Args:
@@ -574,7 +571,7 @@ class EagerPyLayer(
 
         Returns:
             tensors or other types : output of PyLayer.
-        
+
         Examples:
             .. code-block:: python
 
@@ -597,14 +594,15 @@ class EagerPyLayer(
                         return grad
         """
         raise NotImplementedError(
-            "You must implement the forward function for PyLayer.")
+            "You must implement the forward function for PyLayer."
+        )
 
     @staticmethod
     def backward(ctx, *args):
         """
-        This is a function to calculate the gradient. It is to be overloaded by subclasses. 
-        It must accept a object of `PyLayerContext` as the first argument, and the rest 
-        arguments are the gradient of forward's output tensors. Output tensors of backward 
+        This is a function to calculate the gradient. It is to be overloaded by subclasses.
+        It must accept an object of `PyLayerContext` as the first argument, and the rest
+        arguments are the gradient of forward's output tensors. Output tensors of backward
         are the gradient of forward's input tensors.
 
         Args:
@@ -613,7 +611,7 @@ class EagerPyLayer(
 
         Returns:
             Tensor or list of Tensors: The gradient of forward's input tensor(s).
-        
+
         Examples:
             .. code-block:: python
 
@@ -637,11 +635,11 @@ class EagerPyLayer(
         """
 
         raise NotImplementedError(
-            "You must implement the backward function for PyLayer.")
+            "You must implement the backward function for PyLayer."
+        )
 
 
 def once_differentiable(backward):
-
     def wrapper(ctx, *args):
         with paddle.fluid.dygraph.no_grad():
             outputs = backward(ctx, *args)
diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py
index d867f071229..4460e05d797 100644
--- a/python/paddle/device/cuda/__init__.py
+++ b/python/paddle/device/cuda/__init__.py
@@ -42,12 +42,12 @@ def current_stream(device=None):
     Return the current CUDA stream by the device.
 
     Parameters:
-        device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from. 
+        device(paddle.CUDAPlace()|int, optional): The device or the ID of the device to get the stream from.
         If device is None, the device is the current device. Default: None.
-    
+
     Returns:
         CUDAStream: the stream to the device.
-    
+
     Examples:
         .. code-block:: python
 
@@ -82,7 +82,7 @@ def synchronize(device=None):
     Parameters:
         device(paddle.CUDAPlace()|int, optional): The device or the ID of the device.
         If device is None, the device is the current device. Default: None.
-    
+
     Examples:
         .. code-block:: python
 
@@ -111,7 +111,7 @@ def synchronize(device=None):
 def device_count():
     '''
     Return the number of GPUs available.
-    
+
     Returns:
         int: the number of GPUs available.
 
@@ -124,8 +124,11 @@ def device_count():
 
     '''
 
-    num_gpus = core.get_cuda_device_count() if hasattr(
-        core, 'get_cuda_device_count') else 0
+    num_gpus = (
+        core.get_cuda_device_count()
+        if hasattr(core, 'get_cuda_device_count')
+        else 0
+    )
 
     return num_gpus
 
@@ -158,14 +161,14 @@ def extract_cuda_device_id(device, op_name):
     Return the id of the given cuda device. It is just a utility that will not be exposed to users.
 
     Args:
-        device(paddle.CUDAPlace or int or str): The device, the id of the device or 
+        device(paddle.CUDAPlace or int or str): The device, the id of the device or
             the string name of device like 'gpu:x'.
             Default: None.
 
     Return:
         int: The id of the given device. If device is None, return the id of current device.
     '''
-    if (device is None):
+    if device is None:
         return core.get_cuda_current_device_id()
 
     if isinstance(device, int):
@@ -178,15 +181,19 @@ def extract_cuda_device_id(device, op_name):
         else:
             raise ValueError(
                 "The current string {} is not expected. Because {} only support string which is like 'gpu:x'. "
-                "Please input appropriate string again!".format(
-                    device, op_name))
+                "Please input appropriate string again!".format(device, op_name)
+            )
     else:
         raise ValueError(
             "The device type {} is not expected. Because {} only support int, str or paddle.CUDAPlace. "
-            "Please input appropriate device again!".format(device, op_name))
+            "Please input appropriate device again!".format(device, op_name)
+        )
 
-    assert device_id >= 0, f"The device id must be not less than 0, but got id = {device_id}."
-    assert device_id < device_count(
+    assert (
+        device_id >= 0
+    ), f"The device id must be not less than 0, but got id = {device_id}."
+    assert (
+        device_id < device_count()
     ), f"The device id {device_id} exceeds gpu card number {device_count()}"
 
     return device_id
@@ -197,12 +204,12 @@ def max_memory_allocated(device=None):
     Return the peak size of gpu memory that is allocated to tensor of the given device.
 
     .. note::
-        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may larger than the memory size that tensor actually need. 
+        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually needs.
         For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
 
     Args:
-        device(paddle.CUDAPlace or int or str): The device, the id of the device or 
-            the string name of device like 'gpu:x'. If device is None, the device is the current device. 
+        device(paddle.CUDAPlace or int or str): The device, the id of the device or
+            the string name of device like 'gpu:x'. If device is None, the device is the current device.
             Default: None.
 
     Return:
@@ -232,8 +239,8 @@ def max_memory_reserved(device=None):
     Return the peak size of GPU memory that is held by the allocator of the given device.
 
     Args:
-        device(paddle.CUDAPlace or int or str): The device, the id of the device or 
-            the string name of device like 'gpu:x'. If device is None, the device is the current device. 
+        device(paddle.CUDAPlace or int or str): The device, the id of the device or
+            the string name of device like 'gpu:x'. If device is None, the device is the current device.
             Default: None.
 
     Return:
@@ -263,12 +270,12 @@ def memory_allocated(device=None):
     Return the current size of gpu memory that is allocated to tensor of the given device.
 
     .. note::
-        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need. 
-        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. 
+        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually needs.
+        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.
 
     Args:
-        device(paddle.CUDAPlace or int or str): The device, the id of the device or 
-            the string name of device like 'gpu:x'. If device is None, the device is the current device. 
+        device(paddle.CUDAPlace or int or str): The device, the id of the device or
+            the string name of device like 'gpu:x'. If device is None, the device is the current device.
             Default: None.
 
     Return:
@@ -298,14 +305,14 @@ def memory_reserved(device=None):
     Return the current size of GPU memory that is held by the allocator of the given device.
 
     Args:
-        device(paddle.CUDAPlace or int or str): The device, the id of the device or 
-            the string name of device like 'gpu:x'. If device is None, the device is the current device. 
+        device(paddle.CUDAPlace or int or str): The device, the id of the device or
+            the string name of device like 'gpu:x'. If device is None, the device is the current device.
             Default: None.
 
     Return:
         int: The current size of GPU memory that is held by the allocator of the given device, in bytes.
 
-    Examples:    
+    Examples:
         .. code-block:: python
 
             # required: gpu
@@ -389,18 +396,18 @@ def get_device_properties(device=None):
     Return the properties of given device.
 
     Args:
-        device(paddle.CUDAPlace or int or str): The device, the id of the device or 
-            the string name of device like 'gpu:x' which to get the properties of the 
-            device from. If device is None, the device is the current device. 
+        device(paddle.CUDAPlace or int or str): The device, the id of the device or
+            the string name of device like 'gpu:x' which to get the properties of the
+            device from. If device is None, the device is the current device.
             Default: None.
 
     Returns:
-        _gpuDeviceProperties: The properties of the device which include ASCII string 
-        identifying device, major compute capability, minor compute capability, global 
+        _gpuDeviceProperties: The properties of the device which include ASCII string
+        identifying device, major compute capability, minor compute capability, global
         memory available and the number of multiprocessors on the device.
 
     Examples:
-    
+
         .. code-block:: python
 
             # required: gpu
@@ -424,7 +431,8 @@ def get_device_properties(device=None):
         raise ValueError(
             "The API paddle.device.cuda.get_device_properties is not supported in "
             "CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support "
-            "to call this API.")
+            "to call this API."
+        )
 
     if device is not None:
         if isinstance(device, int):
@@ -438,12 +446,14 @@ def get_device_properties(device=None):
                 raise ValueError(
                     "The current string {} is not expected. Because paddle.device."
                     "cuda.get_device_properties only support string which is like 'gpu:x'. "
-                    "Please input appropriate string again!".format(device))
+                    "Please input appropriate string again!".format(device)
+                )
         else:
             raise ValueError(
                 "The device type {} is not expected. Because paddle.device.cuda."
                 "get_device_properties only support int, str or paddle.CUDAPlace. "
-                "Please input appropriate device again!".format(device))
+                "Please input appropriate device again!".format(device)
+            )
     else:
         device_id = -1
 
@@ -484,7 +494,7 @@ def get_device_capability(device=None):
     Return the major and minor revision numbers defining the device's compute capability which are got from CUDA function `cudaDeviceProp <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g1bf9d625a931d657e08db2b4391170f0>`_.
 
     Parameters:
-        device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device. 
+        device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device.
 
     Returns:
         tuple(int,int): the major and minor revision numbers defining the device's compute capability.
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index 95b63cb0518..91805016105 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -105,8 +105,9 @@ def _get_group_map():
     global _group_map
     if _global_env_gid not in _group_map:
         genv = _get_global_env()
-        _group_map[_global_env_gid] = Group(genv.rank, 0,
-                                            list(range(genv.world_size)))
+        _group_map[_global_env_gid] = Group(
+            genv.rank, 0, list(range(genv.world_size))
+        )
     return _group_map
 
 
@@ -121,8 +122,10 @@ def _get_group_map_by_name():
 
 def _get_default_group():
     global _group_map_by_name
-    assert is_initialized(), ("Call paddle.distributed.init_parallel_env first "
-                              "to initialize the distributed environment.")
+    assert is_initialized(), (
+        "Call paddle.distributed.init_parallel_env first "
+        "to initialize the distributed environment."
+    )
     return _get_group_map_by_name()[_default_group_name]
 
 
@@ -178,21 +181,23 @@ def get_group(id=0):
     return gm[id] if id in gm else None
 
 
-def _new_process_group_impl(backend,
-                            store,
-                            rank,
-                            world_size,
-                            group_name,
-                            pg_options,
-                            group_id=0,
-                            src_rank=None,
-                            dst_rank=None):
+def _new_process_group_impl(
+    backend,
+    store,
+    rank,
+    world_size,
+    group_name,
+    pg_options,
+    group_id=0,
+    src_rank=None,
+    dst_rank=None,
+):
     pg = None
     genv = _get_global_env()
     if backend != 'heter':
         assert src_rank is None and dst_rank is None, (
-            "src_rank and dst_rank "
-            "can only be set for heter backend.")
+            "src_rank and dst_rank " "can only be set for heter backend."
+        )
     assert backend in _valid_backend_list, "Unsupported backend: %s." % backend
     if backend == "gloo":
         place = core.CPUPlace()
@@ -221,24 +226,27 @@ def _new_process_group_impl(backend,
         switch_ep = os.getenv("CLUSTER_SWITCH", None)
         assert switch_ep, "please set the CLUSTER_SWITCH variable."
         cluster_size_cumsum = np.cumsum(cluster_size)
-        cluster_offset = 0 if cluster_id == 0 else cluster_size_cumsum[
-            cluster_id - 1]
+        cluster_offset = (
+            0 if cluster_id == 0 else cluster_size_cumsum[cluster_id - 1]
+        )
         global_rank = cluster_offset + rank
         global_world_size = cluster_size_cumsum[-1]
         global_rank, global_world_size = _get_global_config(backend, rank)
-        pg = core.ProcessGroupHeter(store,
-                                    rank=global_rank,
-                                    world_size=global_world_size,
-                                    place=place,
-                                    gid=group_id,
-                                    local_rank=rank,
-                                    local_size=world_size,
-                                    gloo_rank=cluster_id,
-                                    gloo_size=len(cluster_size),
-                                    with_switch=True,
-                                    switch_endpoint=switch_ep,
-                                    src_rank=src_rank,
-                                    dst_rank=dst_rank)
+        pg = core.ProcessGroupHeter(
+            store,
+            rank=global_rank,
+            world_size=global_world_size,
+            place=place,
+            gid=group_id,
+            local_rank=rank,
+            local_size=world_size,
+            gloo_rank=cluster_id,
+            gloo_size=len(cluster_size),
+            with_switch=True,
+            switch_endpoint=switch_ep,
+            src_rank=src_rank,
+            dst_rank=dst_rank,
+        )
 
     return pg
 
@@ -284,10 +292,12 @@ def barrier(group=None):
     if not isinstance(ring_id, int):
         raise ValueError("The type of 'group' for barrier must be int.")
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(type=op_type,
-                     inputs={'X': [temp]},
-                     outputs={'Out': [temp]},
-                     attrs={'ring_id': ring_id})
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [temp]},
+        outputs={'Out': [temp]},
+        attrs={'ring_id': ring_id},
+    )
 
 
 # _custom_gid provides a way for users to
@@ -309,7 +319,7 @@ def _barrier_by_tcp_store(group_name, store, timeout):
         return
 
     barrier_prefix = "Barrier/" + group_name + "/"
-    is_master = (global_rank == 0)
+    is_master = global_rank == 0
 
     def _check_keys_ready(wait_keys):
         start_time = time.time()
@@ -322,9 +332,12 @@ def _barrier_by_tcp_store(group_name, store, timeout):
                     "Keys {} are not ready sinck rank {} is waiting them."
                     "Two reason may cause this error:\n 1. The create process group api should be called by all ranks.\n"
                     " 2. Try to increase the waiting time.\n".format(
-                        group_name, wait_keys, global_rank))
+                        group_name, wait_keys, global_rank
+                    )
+                )
             wait_keys = list(
-                filter(lambda key: int(store.get(key)) != 1, wait_keys))
+                filter(lambda key: int(store.get(key)) != 1, wait_keys)
+            )
 
     # all the workers set their exiting key and exit
     # the master will wait for all workers' exiting key, ensure to exit in the end
@@ -376,22 +389,25 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
                 ranks = global_ranks
             assert len(ranks) <= len(global_ranks), (
                 "Size of new group must be less than or "
-                "equal to that of the default global group.")
+                "equal to that of the default global group."
+            )
         size = len(ranks)
         ranks = sorted(ranks)
         if backend == 'heter' or (size > 1 and global_rank in ranks):
             rank = 0 if backend == 'heter' else ranks.index(global_rank)
             src_rank = ranks[0] if backend == 'heter' else None
             dst_rank = ranks[1] if backend == 'heter' else None
-            pg = _new_process_group_impl(backend,
-                                         _default_store,
-                                         rank,
-                                         size,
-                                         group_name,
-                                         pg_options=None,
-                                         group_id=gid,
-                                         src_rank=src_rank,
-                                         dst_rank=dst_rank)
+            pg = _new_process_group_impl(
+                backend,
+                _default_store,
+                rank,
+                size,
+                group_name,
+                pg_options=None,
+                group_id=gid,
+                src_rank=src_rank,
+                dst_rank=dst_rank,
+            )
         else:
             rank = -1
             pg = None
@@ -399,7 +415,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
         _group_map_by_name[group_name] = group
         _group_map[gid] = group
         _group_map_backend[group] = backend
-        #TODO: The method below is a new method for group management, will replace the previous
+        # TODO: The method below is a new method for group management, will replace the previous
         # three in the future.
         _add_new_group(group)
 
@@ -415,7 +431,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
 
     if not backend:
         backend = 'nccl'
-    assert backend == 'nccl', ("backend other than nccl is not supported yet")
+    assert backend == 'nccl', "backend other than nccl is not supported yet"
 
     genv = _get_global_env()
     global_rank = genv.rank
@@ -444,30 +460,36 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
 
             if core.is_compiled_with_cuda():
                 place = core.CUDAPlace(genv.device_id)
-                core.NCCLParallelContext(strategy,
-                                         place).init_with_ring_id(ring_id)
+                core.NCCLParallelContext(strategy, place).init_with_ring_id(
+                    ring_id
+                )
             elif core.is_compiled_with_npu():
                 place = core.NPUPlace(genv.device_id)
-                core.HCCLParallelContext(strategy,
-                                         place).init_with_ring_id(ring_id)
+                core.HCCLParallelContext(strategy, place).init_with_ring_id(
+                    ring_id
+                )
             elif core.is_compiled_with_mlu():
                 place = core.MLUPlace(genv.device_id)
-                core.CNCLParallelContext(strategy,
-                                         place).init_with_ring_id(ring_id)
+                core.CNCLParallelContext(strategy, place).init_with_ring_id(
+                    ring_id
+                )
             elif core.is_compiled_with_xpu():
                 place = core.XPUPlace(genv.device_id)
-                core.BKCLParallelContext(strategy,
-                                         place).init_with_ring_id(ring_id)
+                core.BKCLParallelContext(strategy, place).init_with_ring_id(
+                    ring_id
+                )
             else:
-                assert False, ("no cuda device found")
+                assert False, "no cuda device found"
         else:
             return gp
 
     # TODO(shenliang03): This is a temporary solution to solve the problem of
     # hang caused by cross-creation of new_group
-    tmp = paddle.to_tensor(
-        [1], dtype="int32") if _non_static_mode() else fill_constant(
-            [0], dtype="int32", value="1")
+    tmp = (
+        paddle.to_tensor([1], dtype="int32")
+        if _non_static_mode()
+        else fill_constant([0], dtype="int32", value="1")
+    )
     paddle.distributed.all_reduce(tmp, sync_op=True)
     paddle.distributed.wait(tmp)
     return gp
@@ -504,10 +526,10 @@ def destroy_process_group(group=None):
     Destroy a given group for communication
 
     Args:
-        group (ProcessGroup, optional): The group to be destroyed. All of process groups, including 
-                                        the default group, will be destroyed and the distributed 
+        group (ProcessGroup, optional): The group to be destroyed. All of process groups, including
+                                        the default group, will be destroyed and the distributed
                                         environment will be deinitialized.
-    
+
     Returns : None
 
     Examples:
@@ -599,8 +621,9 @@ def _sync_calc_stream(tensor):
 def _sync_comm_stream(tensor, ring_id=0):
 
     if _non_static_mode():
-        return _legacy_C_ops.c_sync_comm_stream([tensor], [tensor], 'ring_id',
-                                                ring_id)
+        return _legacy_C_ops.c_sync_comm_stream(
+            [tensor], [tensor], 'ring_id', ring_id
+        )
 
     op_type = 'c_sync_comm_stream'
 
@@ -661,7 +684,7 @@ def broadcast(tensor, src, group=None, sync_op=True):
     if in_dygraph_mode():
         group = _get_default_group() if group is None else group
         gsrc = group.get_group_rank(src)
-        assert gsrc >= 0, ("src rank out of group, need global rank")
+        assert gsrc >= 0, "src rank out of group, need global rank"
         task = group.process_group.broadcast(tensor, gsrc)
         if sync_op:
             task.wait()
@@ -672,28 +695,48 @@ def broadcast(tensor, src, group=None, sync_op=True):
     use_calc_stream = sync_op
     ring_id = ring_id = 0 if group is None else group.id
     gsrc = src if group is None else group.get_group_rank(src)
-    assert gsrc >= 0, ("src rank out of group, need global rank")
+    assert gsrc >= 0, "src rank out of group, need global rank"
 
     if _non_static_mode():
-        return _legacy_C_ops.c_broadcast(tensor, tensor, 'root', gsrc,
-                                         'use_calc_stream', use_calc_stream,
-                                         'ring_id', ring_id)
+        return _legacy_C_ops.c_broadcast(
+            tensor,
+            tensor,
+            'root',
+            gsrc,
+            'use_calc_stream',
+            use_calc_stream,
+            'ring_id',
+            ring_id,
+        )
 
     op_type = 'c_broadcast'
-    check_variable_and_dtype(tensor, 'tensor', [
-        'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8',
-        'bool'
-    ], 'broadcast')
+    check_variable_and_dtype(
+        tensor,
+        'tensor',
+        [
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'int8',
+            'uint8',
+            'bool',
+        ],
+        'broadcast',
+    )
 
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(type=op_type,
-                     inputs={'X': [tensor]},
-                     outputs={'Out': [tensor]},
-                     attrs={
-                         'root': gsrc,
-                         'use_calc_stream': use_calc_stream,
-                         'ring_id': ring_id,
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'root': gsrc,
+            'use_calc_stream': use_calc_stream,
+            'ring_id': ring_id,
+        },
+    )
 
 
 def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True):
@@ -743,7 +786,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True):
         op_type = _get_reduce_op(op, "reduce")
         group = _get_default_group() if group is None else group
         gdst = group.get_group_rank(dst)
-        assert gdst >= 0, ("dst rank out of group, need global rank")
+        assert gdst >= 0, "dst rank out of group, need global rank"
         task = group.process_group.reduce(tensor, gdst, op_type)
         if sync_op:
             task.wait()
@@ -754,34 +797,72 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True):
     use_calc_stream = sync_op
     ring_id = 0 if group is None else group.id
     gdst = dst if group is None else group.get_group_rank(dst)
-    assert gdst >= 0, ("dst rank out of group, need global rank")
+    assert gdst >= 0, "dst rank out of group, need global rank"
 
     if _non_static_mode():
         if op == ReduceOp.SUM:
-            return _legacy_C_ops.c_reduce_sum(tensor, tensor, 'use_calc_stream',
-                                              use_calc_stream, 'ring_id',
-                                              ring_id, 'root_id', gdst)
+            return _legacy_C_ops.c_reduce_sum(
+                tensor,
+                tensor,
+                'use_calc_stream',
+                use_calc_stream,
+                'ring_id',
+                ring_id,
+                'root_id',
+                gdst,
+            )
         elif op == ReduceOp.MAX:
-            return _legacy_C_ops.c_reduce_max(tensor, tensor, 'use_calc_stream',
-                                              use_calc_stream, 'ring_id',
-                                              ring_id, 'root_id', gdst)
+            return _legacy_C_ops.c_reduce_max(
+                tensor,
+                tensor,
+                'use_calc_stream',
+                use_calc_stream,
+                'ring_id',
+                ring_id,
+                'root_id',
+                gdst,
+            )
         elif op == ReduceOp.MIN:
-            return _legacy_C_ops.c_reduce_min(tensor, tensor, 'use_calc_stream',
-                                              use_calc_stream, 'ring_id',
-                                              ring_id, 'root_id', gdst)
+            return _legacy_C_ops.c_reduce_min(
+                tensor,
+                tensor,
+                'use_calc_stream',
+                use_calc_stream,
+                'ring_id',
+                ring_id,
+                'root_id',
+                gdst,
+            )
         elif op == ReduceOp.PROD:
-            return _legacy_C_ops.c_reduce_prod(tensor, tensor,
-                                               'use_calc_stream',
-                                               use_calc_stream, 'ring_id',
-                                               ring_id, 'root_id', gdst)
+            return _legacy_C_ops.c_reduce_prod(
+                tensor,
+                tensor,
+                'use_calc_stream',
+                use_calc_stream,
+                'ring_id',
+                ring_id,
+                'root_id',
+                gdst,
+            )
         else:
             raise ValueError("Unknown parameter: {}.".format(op))
 
     op_type = 'c_reduce'
-    check_variable_and_dtype(tensor, 'tensor', [
-        'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8',
-        'bool'
-    ], 'reduce')
+    check_variable_and_dtype(
+        tensor,
+        'tensor',
+        [
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'int8',
+            'uint8',
+            'bool',
+        ],
+        'reduce',
+    )
 
     if op == ReduceOp.SUM:
         op_type = 'c_reduce_sum'
@@ -793,14 +874,16 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True):
         op_type = 'c_reduce_prod'
 
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(type=op_type,
-                     inputs={'X': [tensor]},
-                     outputs={'Out': [tensor]},
-                     attrs={
-                         'ring_id': ring_id,
-                         'use_calc_stream': use_calc_stream,
-                         'root_id': gdst,
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': ring_id,
+            'use_calc_stream': use_calc_stream,
+            'root_id': gdst,
+        },
+    )
 
 
 def all_gather(tensor_list, tensor, group=None, sync_op=True):
@@ -853,8 +936,9 @@ def all_gather(tensor_list, tensor, group=None, sync_op=True):
             list_of_complex.append(paddle.as_complex(tensor))
         return list_of_complex
 
-    is_input_complex = (tensor.dtype == paddle.complex64
-                        or tensor.dtype == paddle.complex128)
+    is_input_complex = (
+        tensor.dtype == paddle.complex64 or tensor.dtype == paddle.complex128
+    )
     if is_input_complex:
         tensor = paddle.as_real(tensor)
 
@@ -881,33 +965,68 @@ def all_gather(tensor_list, tensor, group=None, sync_op=True):
     nranks = _get_global_group().nranks if group is None else group.nranks
 
     if _non_static_mode():
-        out = _legacy_C_ops.c_allgather(tensor, 'use_calc_stream',
-                                        use_calc_stream, 'ring_id', ring_id,
-                                        'nranks', nranks)
+        out = _legacy_C_ops.c_allgather(
+            tensor,
+            'use_calc_stream',
+            use_calc_stream,
+            'ring_id',
+            ring_id,
+            'nranks',
+            nranks,
+        )
     else:
         op_type = 'c_allgather'
         helper = LayerHelper(op_type, **locals())
         out = helper.create_variable_for_type_inference(dtype=tensor.dtype)
         if not isinstance(tensor_list, list):
-            raise ValueError("The type of 'tensor_list' for all_gather "
-                             "should be list.")
+            raise ValueError(
+                "The type of 'tensor_list' for all_gather " "should be list."
+            )
         for elem in tensor_list:
-            check_variable_and_dtype(elem, 'tensor_list', [
-                'float16', 'float32', 'float64', 'int32', 'int64', 'bool',
-                'int8', 'uint8', 'complex64', 'complex128'
-            ], 'all_gather')
-        check_variable_and_dtype(tensor, 'tensor', [
-            'float16', 'float32', 'float64', 'int32', 'int64', 'bool', 'int8',
-            'uint8', 'complex64', 'complex128'
-        ], 'all_gather')
-        helper.append_op(type=op_type,
-                         inputs={'X': [tensor]},
-                         outputs={'Out': [out]},
-                         attrs={
-                             'ring_id': ring_id,
-                             'use_calc_stream': use_calc_stream,
-                             'nranks': nranks
-                         })
+            check_variable_and_dtype(
+                elem,
+                'tensor_list',
+                [
+                    'float16',
+                    'float32',
+                    'float64',
+                    'int32',
+                    'int64',
+                    'bool',
+                    'int8',
+                    'uint8',
+                    'complex64',
+                    'complex128',
+                ],
+                'all_gather',
+            )
+        check_variable_and_dtype(
+            tensor,
+            'tensor',
+            [
+                'float16',
+                'float32',
+                'float64',
+                'int32',
+                'int64',
+                'bool',
+                'int8',
+                'uint8',
+                'complex64',
+                'complex128',
+            ],
+            'all_gather',
+        )
+        helper.append_op(
+            type=op_type,
+            inputs={'X': [tensor]},
+            outputs={'Out': [out]},
+            attrs={
+                'ring_id': ring_id,
+                'use_calc_stream': use_calc_stream,
+                'nranks': nranks,
+            },
+        )
 
     list_of_tensor = paddle.split(out, nranks, 0)
     if is_input_complex:
@@ -963,7 +1082,8 @@ def all_gather_object(object_list, obj, group=None):
             print(object_list)
             # [{'foo': [1, 2, 3]}, {'bar': [4, 5, 6]}] (2 GPUs)
     """
-    assert in_dygraph_mode(
+    assert (
+        in_dygraph_mode()
     ), "all_gather_object doesn't support static graph mode."
 
     tensor, len_of_tensor = _convert_object_to_tensor(obj)
@@ -984,7 +1104,8 @@ def all_gather_object(object_list, obj, group=None):
     all_gather(tensor_list, input_tensor, group)
     for i, tensor in enumerate(tensor_list):
         object_list.append(
-            _convert_tensor_to_object(tensor, list_len_of_tensor[i]))
+            _convert_tensor_to_object(tensor, list_len_of_tensor[i])
+        )
 
 
 def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True):
@@ -1046,7 +1167,7 @@ def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True):
         gsrc = src if group is None else group.get_group_rank(src)
         rank = _get_global_group().rank if group is None else group.rank
         nranks = _get_global_group().nranks if group is None else group.nranks
-    assert gsrc >= 0, ("src rank out of group, need global rank")
+    assert gsrc >= 0, "src rank out of group, need global rank"
 
     if rank != gsrc:
         tensor_list = []
@@ -1063,24 +1184,46 @@ def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True):
 
     use_calc_stream = sync_op
     if _non_static_mode():
-        return _legacy_C_ops.c_scatter(temp, tensor, 'use_calc_stream',
-                                       use_calc_stream, 'ring_id', ring_id,
-                                       'nranks', nranks, 'root', gsrc)
+        return _legacy_C_ops.c_scatter(
+            temp,
+            tensor,
+            'use_calc_stream',
+            use_calc_stream,
+            'ring_id',
+            ring_id,
+            'nranks',
+            nranks,
+            'root',
+            gsrc,
+        )
     op_type = 'c_scatter'
-    check_variable_and_dtype(tensor, 'tensor', [
-        'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8',
-        'bool'
-    ], 'scatter')
+    check_variable_and_dtype(
+        tensor,
+        'tensor',
+        [
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'int8',
+            'uint8',
+            'bool',
+        ],
+        'scatter',
+    )
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(type=op_type,
-                     inputs={'X': [temp]},
-                     outputs={'Out': [tensor]},
-                     attrs={
-                         'ring_id': ring_id,
-                         'root': gsrc,
-                         'use_calc_stream': use_calc_stream,
-                         'nranks': nranks,
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [temp]},
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': ring_id,
+            'root': gsrc,
+            'use_calc_stream': use_calc_stream,
+            'nranks': nranks,
+        },
+    )
 
 
 def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
@@ -1105,7 +1248,7 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
 
     Returns:
         None.
-    
+
     Examples:
         .. code-block:: python
 
@@ -1132,7 +1275,7 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
     if in_dygraph_mode():
         group = _get_default_group() if group is None else group
         backend = _group_map_backend[group]
-        assert backend != 'gloo', ("backend gloo is not supported yet")
+        assert backend != 'gloo', "backend gloo is not supported yet"
     else:
         ring_id = 0 if group is None else group.id
 
@@ -1153,44 +1296,56 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
 
     use_calc_stream = sync_op
     if _non_static_mode():
-        out = _legacy_C_ops.alltoall(temp, 'use_calc_stream', use_calc_stream,
-                                     'ring_id', ring_id)
+        out = _legacy_C_ops.alltoall(
+            temp, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id
+        )
     else:
         op_type = 'alltoall'
         helper = LayerHelper(op_type, **locals())
         out = helper.create_variable_for_type_inference(
-            dtype=in_tensor_list[0].dtype)
+            dtype=in_tensor_list[0].dtype
+        )
 
         if not isinstance(in_tensor_list, list):
-            raise ValueError("The type of 'in_tensor_list' for all_to_all "
-                             "should be list.")
+            raise ValueError(
+                "The type of 'in_tensor_list' for all_to_all " "should be list."
+            )
         for elem in in_tensor_list:
             check_variable_and_dtype(
-                elem, 'in_tensor_list',
+                elem,
+                'in_tensor_list',
                 ['float16', 'float32', 'float64', 'int32', 'int64'],
-                'all_to_all')
+                'all_to_all',
+            )
         if not isinstance(out_tensor_list, list):
-            raise ValueError("The type of 'out_tensor_list' for all_to_all "
-                             "should be list.")
+            raise ValueError(
+                "The type of 'out_tensor_list' for all_to_all "
+                "should be list."
+            )
         if len(out_tensor_list) != 0:
-            raise ValueError("The 'out_tensor_list' for all_to_all "
-                             "must be an empty list.")
-        helper.append_op(type=op_type,
-                         inputs={'X': [temp]},
-                         outputs={'Out': [out]},
-                         attrs={
-                             'ring_id': ring_id,
-                             'use_calc_stream': use_calc_stream,
-                         })
+            raise ValueError(
+                "The 'out_tensor_list' for all_to_all " "must be an empty list."
+            )
+        helper.append_op(
+            type=op_type,
+            inputs={'X': [temp]},
+            outputs={'Out': [out]},
+            attrs={
+                'ring_id': ring_id,
+                'use_calc_stream': use_calc_stream,
+            },
+        )
     out_tensor_list.extend(paddle.split(out, nranks, 0))
 
 
-def alltoall_single(in_tensor,
-                    out_tensor,
-                    in_split_sizes=None,
-                    out_split_sizes=None,
-                    group=None,
-                    sync_op=True):
+def alltoall_single(
+    in_tensor,
+    out_tensor,
+    in_split_sizes=None,
+    out_split_sizes=None,
+    group=None,
+    sync_op=True,
+):
     """
     Scatter a single input tensor to all participators and gather the received tensors in out_tensor.
 
@@ -1200,9 +1355,9 @@ def alltoall_single(in_tensor,
     Args:
         in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
         out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor.
-        in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` 
+        in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor``
             must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None.
-        out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor`` 
+        out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor``
             must be divisible by group size and ``out_tensor`` will be gathered averagely from all participators. Default: None.
         group (Group, optional): The group instance return by ``new_group`` or None for global default group. Default: None.
         sync_op (bool, optional): Whether this op is a sync op. The default value is True.
@@ -1263,13 +1418,14 @@ def alltoall_single(in_tensor,
 
     group = _get_default_group() if group is None else group
     backend = _group_map_backend[group]
-    assert backend != 'gloo', ("backend gloo is not supported yet")
+    assert backend != 'gloo', "backend gloo is not supported yet"
 
     in_split_sizes = [] if in_split_sizes is None else in_split_sizes
     out_split_sizes = [] if out_split_sizes is None else out_split_sizes
 
-    task = group.process_group.alltoall_single(in_tensor, out_tensor,
-                                               in_split_sizes, out_split_sizes)
+    task = group.process_group.alltoall_single(
+        in_tensor, out_tensor, in_split_sizes, out_split_sizes
+    )
     if sync_op:
         task.wait()
         return
@@ -1318,7 +1474,7 @@ def send(tensor, dst=0, group=None, sync_op=True):
     if in_dygraph_mode():
         group = _get_default_group() if group is None else group
         backend = _group_map_backend[group]
-        assert backend != 'gloo', ("backend gloo is not supported yet")
+        assert backend != 'gloo', "backend gloo is not supported yet"
         task = group.process_group.send(tensor, dst)
         if sync_op:
             task.wait()
@@ -1330,21 +1486,33 @@ def send(tensor, dst=0, group=None, sync_op=True):
     ring_id = 0 if group is None else group.id
 
     if _non_static_mode():
-        return _legacy_C_ops.send_v2(tensor, 'use_calc_stream', use_calc_stream,
-                                     'ring_id', ring_id, 'peer', dst)
+        return _legacy_C_ops.send_v2(
+            tensor,
+            'use_calc_stream',
+            use_calc_stream,
+            'ring_id',
+            ring_id,
+            'peer',
+            dst,
+        )
     op_type = 'send_v2'
     check_variable_and_dtype(
-        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
-        'send')
+        tensor,
+        'tensor',
+        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'send',
+    )
 
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(type=op_type,
-                     inputs={'X': [tensor]},
-                     attrs={
-                         'ring_id': ring_id,
-                         'peer': dst,
-                         'use_calc_stream': use_calc_stream,
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        attrs={
+            'ring_id': ring_id,
+            'peer': dst,
+            'use_calc_stream': use_calc_stream,
+        },
+    )
 
 
 def recv(tensor, src=0, group=None, sync_op=True):
@@ -1385,7 +1553,7 @@ def recv(tensor, src=0, group=None, sync_op=True):
     if in_dygraph_mode():
         group = _get_default_group() if group is None else group
         backend = _group_map_backend[group]
-        assert backend != 'gloo', ("backend gloo is not supported yet")
+        assert backend != 'gloo', "backend gloo is not supported yet"
         task = group.process_group.recv(tensor, src)
         if sync_op:
             task.wait()
@@ -1397,37 +1565,58 @@ def recv(tensor, src=0, group=None, sync_op=True):
     ring_id = 0 if group is None else group.id
 
     if _non_static_mode():
-        return _legacy_C_ops.recv_v2(tensor, 'use_calc_stream', use_calc_stream,
-                                     'ring_id', ring_id, 'peer', src, 'dtype',
-                                     tensor.dtype, 'out_shape', tensor.shape)
+        return _legacy_C_ops.recv_v2(
+            tensor,
+            'use_calc_stream',
+            use_calc_stream,
+            'ring_id',
+            ring_id,
+            'peer',
+            src,
+            'dtype',
+            tensor.dtype,
+            'out_shape',
+            tensor.shape,
+        )
     op_type = 'recv_v2'
     check_variable_and_dtype(
-        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
-        'recv')
+        tensor,
+        'tensor',
+        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'recv',
+    )
     helper = LayerHelper(op_type, **locals())
-    helper.append_op(type=op_type,
-                     outputs={'Out': [tensor]},
-                     attrs={
-                         'ring_id': ring_id,
-                         'peer': src,
-                         'out_shape': tensor.shape,
-                         'dtype': tensor.dtype,
-                         'use_calc_stream': use_calc_stream,
-                     })
+    helper.append_op(
+        type=op_type,
+        outputs={'Out': [tensor]},
+        attrs={
+            'ring_id': ring_id,
+            'peer': src,
+            'out_shape': tensor.shape,
+            'dtype': tensor.dtype,
+            'use_calc_stream': use_calc_stream,
+        },
+    )
 
 
 def _check_single_tensor(tensor, tensor_name):
     if not isinstance(tensor, (core.eager.Tensor, paddle.Tensor)):
-        raise RuntimeError("Invalid function argument. Expected parameter {}"
-                           "to be of type paddle.Tensor, but it's {}".format(
-                               tensor_name, type(tensor)))
+        raise RuntimeError(
+            "Invalid function argument. Expected parameter {} "
+            "to be of type paddle.Tensor, but it's {}".format(
+                tensor_name, type(tensor)
+            )
+        )
 
 
 def _check_tensor_list(tensor_list, tensor_name):
-    if not isinstance(tensor_list, list) or \
-        not all(isinstance(t, (core.eager.Tensor, paddle.Tensor)) for t in tensor_list):
-        raise RuntimeError("Invalid function argument. Expected parameter {}"
-                           "to be of type paddle.Tensor".format(tensor_name))
+    if not isinstance(tensor_list, list) or not all(
+        isinstance(t, (core.eager.Tensor, paddle.Tensor)) for t in tensor_list
+    ):
+        raise RuntimeError(
+            "Invalid function argument. Expected parameter {} "
+            "to be of type paddle.Tensor".format(tensor_name)
+        )
 
 
 def isend(tensor, dst, group=None):
@@ -1439,11 +1628,11 @@ def isend(tensor, dst, group=None):
             should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
         dst (int): The destination rank.
         group (Group, optional): The group instance return by new_group or None for global default group. Default: None.
-    
+
     Returns:
         A distributed task object.
 
-    Warning:    
+    Warning:
         This API only supports the dygraph mode.
 
     Examples:
@@ -1472,9 +1661,9 @@ def isend(tensor, dst, group=None):
     if in_dygraph_mode():
         group = _get_default_group() if group is None else group
         backend = _group_map_backend[group]
-        assert backend != 'gloo', ("backend gloo is not supported yet")
+        assert backend != 'gloo', "backend gloo is not supported yet"
         group_dst_rank = group.get_group_rank(dst)
-        assert group_dst_rank >= 0, ("dst rank out of group, need global rank")
+        assert group_dst_rank >= 0, "dst rank out of group, need global rank"
         return group.process_group.send(tensor, group_dst_rank)
     else:
         raise RuntimeError("Only support eager dygraph mode.")
@@ -1493,7 +1682,7 @@ def irecv(tensor, src=None, group=None):
     Returns:
         A distributed task object.
 
-    Warning:    
+    Warning:
         This API only supports the dygraph mode.
 
     Examples:
@@ -1521,9 +1710,9 @@ def irecv(tensor, src=None, group=None):
     if in_dygraph_mode():
         group = _get_default_group() if group is None else group
         backend = _group_map_backend[group]
-        assert backend != 'gloo', ("backend gloo is not supported yet")
+        assert backend != 'gloo', "backend gloo is not supported yet"
         group_src_rank = group.get_group_rank(src)
-        assert group_src_rank >= 0, ("src rank out of group, need global rank")
+        assert group_src_rank >= 0, "src rank out of group, need global rank"
         return group.process_group.recv(tensor, group_src_rank)
     else:
         raise RuntimeError("Only support eager dygraph mode.")
@@ -1542,16 +1731,18 @@ class P2POp(object):
             The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``.
         tensor (Tensor): Tensor to send or receive.
         peer (int): The destination or source rank.
-        group (Group, optional): The group instance return by new_group or None for global 
+        group (Group, optional): The group instance return by new_group or None for global
             default group. Default: None.
 
     """
 
     def __init__(self, op, tensor, peer, group=None):
         if op not in [isend, irecv]:
-            raise RuntimeError("Invalid ``op`` function. Expected ``op`` "
-                               "to be of type ``paddle.distributed.isend`` or "
-                               "``paddle.distributed.irecv``.")
+            raise RuntimeError(
+                "Invalid ``op`` function. Expected ``op`` "
+                "to be of type ``paddle.distributed.isend`` or "
+                "``paddle.distributed.irecv``."
+            )
         _check_single_tensor(tensor, "tensor")
 
         self.op = op
@@ -1577,13 +1768,17 @@ def _check_p2p_op_list(p2p_op_list):
     all ops use the same backend.
     """
     if not isinstance(p2p_op_list, list) or not all(
-            isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list):
-        raise RuntimeError("Invalid ``p2p_op_list``. Each op is expected to "
-                           "to be of type ``paddle.distributed.P2POp``.")
+        isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list
+    ):
+        raise RuntimeError(
+            "Invalid ``p2p_op_list``. Each op is expected to "
+            "to be of type ``paddle.distributed.P2POp``."
+        )
 
     backend = _group_map_backend[p2p_op_list[0].group]
-    if not all(backend == _group_map_backend[p2p_op.group]
-               for p2p_op in p2p_op_list):
+    if not all(
+        backend == _group_map_backend[p2p_op.group] for p2p_op in p2p_op_list
+    ):
         raise RuntimeError("All groups need to use the same backend.")
 
 
@@ -1591,7 +1786,7 @@ def batch_isend_irecv(p2p_op_list):
     """
     Send or Receive a batch of tensors asynchronously and return a list of requests.
 
-    Process each of the point-to-point operations in ``p2p_op_list`` and return the 
+    Process each of the point-to-point operations in ``p2p_op_list`` and return the
     corresponding tasks. NCCL are currently supported.
 
     Args:
@@ -1602,9 +1797,9 @@ def batch_isend_irecv(p2p_op_list):
 
     Returns:
         A list of distributed tasks returned by calling the corresponding
-        op in the op_list. 
+        op in the op_list.
 
-    Warning:    
+    Warning:
         This API only supports the dygraph mode.
 
     Examples:
@@ -1632,7 +1827,7 @@ def batch_isend_irecv(p2p_op_list):
 
             for task in tasks:
                 task.wait()
-            
+
             print(recv_t)
             # paddle.tensor([1, 2])     # Rank-0
             # paddle.tensor([0, 1])     # Rank-1
@@ -1660,11 +1855,9 @@ def batch_isend_irecv(p2p_op_list):
         raise RuntimeError("Don't support static graph mode currently.")
 
 
-def reduce_scatter(tensor,
-                   tensor_list,
-                   op=ReduceOp.SUM,
-                   group=None,
-                   sync_op=True):
+def reduce_scatter(
+    tensor, tensor_list, op=ReduceOp.SUM, group=None, sync_op=True
+):
     """
     Reduces, then scatters a list of tensors to all processes in a group
 
@@ -1673,7 +1866,7 @@ def reduce_scatter(tensor,
         tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type
             should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16.
         op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM.
-        group (Group, optional): The group instance return by new_group or None for global 
+        group (Group, optional): The group instance return by new_group or None for global
             default group. Default: None.
         sync_op (bool, optional): Whether this op is a sync op. The default value is True.
 
@@ -1715,7 +1908,7 @@ def reduce_scatter(tensor,
         op_type = _get_reduce_op(op, "reduce_scatter")
         group = _get_default_group() if group is None else group
         backend = _group_map_backend[group]
-        assert backend != 'gloo', ("backend gloo is not supported yet")
+        assert backend != 'gloo', "backend gloo is not supported yet"
 
         temp = paddle.concat(tensor_list, axis=0)
         task = group.process_group._reduce_scatter_base(tensor, temp, op_type)
@@ -1728,11 +1921,9 @@ def reduce_scatter(tensor,
         raise RuntimeError("Don't support static graph mode currently.")
 
 
-def _reduce_scatter_base(output,
-                         input,
-                         op=ReduceOp.SUM,
-                         group=None,
-                         sync_op=True):
+def _reduce_scatter_base(
+    output, input, op=ReduceOp.SUM, group=None, sync_op=True
+):
     """
     Reduces, then scatters a flattened tensor to all processes in a group.
 
diff --git a/python/paddle/distributed/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py
index 8e2871272a9..42cd25fb38a 100644
--- a/python/paddle/distributed/fleet/base/private_helper_function.py
+++ b/python/paddle/distributed/fleet/base/private_helper_function.py
@@ -24,11 +24,11 @@ def wait_server_ready(endpoints):
     """
     Wait until parameter servers are ready, use connext_ex to detect
     port readiness.
-    
+
     Args:
     endpoints (list|tuple): endpoints string list, like:
     ["127.0.0.1:8080", "127.0.0.1:8081"]
-    
+
     Examples:
     .. code-block:: python
 
@@ -40,8 +40,9 @@ def wait_server_ready(endpoints):
         not_ready_endpoints = []
         for ep in endpoints:
             ip_port = ep.split(":")
-            with closing(socket.socket(socket.AF_INET,
-                                       socket.SOCK_STREAM)) as sock:
+            with closing(
+                socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            ) as sock:
                 sock.settimeout(2)
                 sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                 if hasattr(socket, 'SO_REUSEPORT'):
@@ -53,8 +54,9 @@ def wait_server_ready(endpoints):
                     not_ready_endpoints.append(ep)
         if not all_ok:
             sys.stderr.write("server not ready, wait 3 sec to retry...\n")
-            sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) +
-                             "\n")
+            sys.stderr.write(
+                "not ready endpoints:" + str(not_ready_endpoints) + "\n"
+            )
             sys.stderr.flush()
             time.sleep(3)
         else:
diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py
index dcf427c3954..ae665b86bb7 100644
--- a/python/paddle/distributed/fleet/fleet.py
+++ b/python/paddle/distributed/fleet/fleet.py
@@ -19,7 +19,11 @@ from types import MethodType
 import numpy as np
 from paddle.fluid.framework import _global_flags
 from paddle.fluid import compiler
-from .base.role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase
+from .base.role_maker import (
+    UserDefinedRoleMaker,
+    PaddleCloudRoleMaker,
+    RoleMakerBase,
+)
 from .base.strategy_compiler import StrategyCompiler
 from .base.distributed_strategy import DistributedStrategy
 from .base.meta_optimizer_factory import MetaOptimizerFactory
@@ -60,12 +64,12 @@ def apply_ir_passes(main_program, startup_program, config):
         )
         build_strategy.fuse_all_optimizer_ops = False
 
-    return apply_build_strategy(main_program, startup_program, build_strategy,
-                                pass_attrs)
+    return apply_build_strategy(
+        main_program, startup_program, build_strategy, pass_attrs
+    )
 
 
 def _inited_runtime_handler_(func):
-
     def __impl__(*args, **kwargs):
         cls = args[0]
 
@@ -78,15 +82,17 @@ def _inited_runtime_handler_(func):
 
 
 def _is_non_distributed_check_(func):
-
     def __impl__(*args, **kwargs):
         cls = args[0]
 
-        if cls._role_maker is not None and cls._role_maker._is_non_distributed(
-        ) is True:
+        if (
+            cls._role_maker is not None
+            and cls._role_maker._is_non_distributed() is True
+        ):
             logger.warning(
-                "%s() function doesn't work when use non_distributed fleet." %
-                (func.__name__))
+                "%s() function doesn't work when use non_distributed fleet."
+                % (func.__name__)
+            )
             return
 
         return func(*args, **kwargs)
@@ -166,11 +172,13 @@ class Fleet(object):
         self._context = {}
         self.user_defined_optimizer = paddle.optimizer.Optimizer(0.0)
 
-    def init(self,
-             role_maker=None,
-             is_collective=False,
-             strategy=None,
-             log_level="INFO"):
+    def init(
+        self,
+        role_maker=None,
+        is_collective=False,
+        strategy=None,
+        log_level="INFO",
+    ):
         """
         Initialize role_maker in Fleet.
 
@@ -179,13 +187,13 @@ class Fleet(object):
 
         Args:
             role_maker (RoleMakerBase, optional): A ``RoleMakerBase`` containing the configuration
-                of environment variables related to distributed training.If you did not initialize 
+                of environment variables related to distributed training.If you did not initialize
                 the rolemaker by yourself, it will be automatically initialized to PaddleRoleMaker.
                 The default value is None.
-            is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program 
+            is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program
                 runs on the CPU or GPU. False means set distributed training using CPU, and True means
                 GPU.The default value is False.The default value is False.
-            strategy (DistributedStrategy): Extra properties for distributed training. 
+            strategy (DistributedStrategy): Extra properties for distributed training.
                 For details, please refer to paddle.distributed.fleet.DistributedStrategy. Default: None.
             log_level (Integer, String, optional): A ``Integer`` or ``String`` Variable determining how hight
                 the logging level is. Default is "INFO".
@@ -244,22 +252,28 @@ class Fleet(object):
             if isinstance(is_collective, bool):
                 self._is_collective = is_collective
                 self._role_maker = PaddleCloudRoleMaker(
-                    is_collective=self._is_collective)
+                    is_collective=self._is_collective
+                )
             else:
                 raise ValueError(
-                    "`is_collective` should be instance of `bool`, but got {}".
-                    format(type(is_collective)))
+                    "`is_collective` should be instance of `bool`, but got {}".format(
+                        type(is_collective)
+                    )
+                )
         else:
             if isinstance(role_maker, RoleMakerBase):
                 self._role_maker = role_maker
                 self._is_collective = role_maker._is_collective
             else:
                 raise ValueError(
-                    "`role_maker` should be subclass of `RoleMakerBase`, but got {}"
-                    .format(type(role_maker)))
+                    "`role_maker` should be subclass of `RoleMakerBase`, but got {}".format(
+                        type(role_maker)
+                    )
+                )
         self._role_maker._generate_role()
 
         import paddle.distributed.fleet as fleet
+
         fleet.util._set_role_maker(self._role_maker)
 
         self.strategy_compiler = StrategyCompiler()
@@ -280,17 +294,20 @@ class Fleet(object):
                 return
             if parallel_helper._is_parallel_ctx_initialized():
                 logger.warning(
-                    "The dygraph parallel environment has been initialized.")
+                    "The dygraph parallel environment has been initialized."
+                )
             else:
                 # FLAGS_nccl_nrings is used for dynamic graph multi-stream communication
                 if "FLAGS_nccl_nrings" in os.environ:
                     logger.warning(
                         "You have set the environment variable FLAGS_nccl_nrings "
                         "outside the program, so the nccl_comm_num in "
-                        "DistributedStrategy will not take effect here.")
+                        "DistributedStrategy will not take effect here."
+                    )
                 else:
                     os.environ["FLAGS_nccl_nrings"] = str(
-                        self._user_defined_strategy.nccl_comm_num)
+                        self._user_defined_strategy.nccl_comm_num
+                    )
                 paddle.distributed.init_parallel_env()
 
             # hybrid parallel not support for npu/xpu
@@ -312,17 +329,24 @@ class Fleet(object):
             global_ring_id = 3 if use_sharding else 0
             global_ranks = list(range(global_world_size))
 
-            if tp._HYBRID_PARALLEL_GROUP is None: tp._CommunicateGroup()
+            if tp._HYBRID_PARALLEL_GROUP is None:
+                tp._CommunicateGroup()
             cg = tp._HYBRID_PARALLEL_GROUP
             self._hcg = cg
-            cg.set_comm_group('global', global_rank, global_world_size,
-                              global_ring_id, global_ranks)
+            cg.set_comm_group(
+                'global',
+                global_rank,
+                global_world_size,
+                global_ring_id,
+                global_ranks,
+            )
 
             use_tensor_parallel = self._user_defined_strategy.tensor_parallel
             use_mp = use_sharding or use_tensor_parallel
 
             # hybrid group
-            if use_mp is False: return
+            if use_mp is False:
+                return
 
             mp_degree_sharding = 1
             mp_degree_tensor_parallel = 1
@@ -331,14 +355,21 @@ class Fleet(object):
                 mp_degree_sharding = int(sharding_configs['mp_degree'])
 
             if use_tensor_parallel:
-                tensor_parallel_configs = self._user_defined_strategy.tensor_parallel_configs
+                tensor_parallel_configs = (
+                    self._user_defined_strategy.tensor_parallel_configs
+                )
                 mp_degree_tensor_parallel = int(
-                    tensor_parallel_configs['tensor_parallel_degree'])
+                    tensor_parallel_configs['tensor_parallel_degree']
+                )
 
             if use_sharding and use_tensor_parallel:
                 assert mp_degree_sharding == mp_degree_tensor_parallel
 
-            mp_degree = mp_degree_sharding if use_sharding else mp_degree_tensor_parallel
+            mp_degree = (
+                mp_degree_sharding
+                if use_sharding
+                else mp_degree_tensor_parallel
+            )
 
             if mp_degree > 1:
                 assert global_world_size % mp_degree == 0
@@ -347,16 +378,17 @@ class Fleet(object):
                 mp_rank = global_rank % mp_degree
                 mp_group_id = global_rank // mp_degree
                 mp_group_ranks = [
-                    idx for idx in global_ranks
+                    idx
+                    for idx in global_ranks
                     if idx // mp_degree == mp_group_id
                 ]
-                cg.set_comm_group('model', mp_rank, mp_degree, mp_ring_id,
-                                  mp_group_ranks)
+                cg.set_comm_group(
+                    'model', mp_rank, mp_degree, mp_ring_id, mp_group_ranks
+                )
         return self
 
     def _init_hybrid_parallel_env(self):
-        """initialize the hybrid environment
-        """
+        """initialize the hybrid environment"""
         self.hybrid_configs = self._user_defined_strategy.hybrid_configs
         self.dp_degree = self.hybrid_configs["dp_degree"]
         self.mp_degree = self.hybrid_configs["mp_degree"]
@@ -365,7 +397,9 @@ class Fleet(object):
 
         assert self.mp_degree >= 0, "mp_degree should be greater or equal to 0"
         assert self.pp_degree >= 0, "pp_degree should be greater or equal to 0"
-        assert self.sharding_degree >= 0, "sharding_degree should be greater or equal to 0"
+        assert (
+            self.sharding_degree >= 0
+        ), "sharding_degree should be greater or equal to 0"
 
         self.mp_degree = max(self.mp_degree, 1)
         self.pp_degree = max(self.pp_degree, 1)
@@ -379,14 +413,19 @@ class Fleet(object):
         self._topology = tp.CommunicateTopology(
             hybrid_group_names=["data", "pipe", "sharding", "model"],
             dims=[
-                self.dp_degree, self.pp_degree, self.sharding_degree,
-                self.mp_degree
-            ])
+                self.dp_degree,
+                self.pp_degree,
+                self.sharding_degree,
+                self.mp_degree,
+            ],
+        )
 
         self._hcg = tp.HybridCommunicateGroup(self._topology)
 
         if self.mp_degree > 1:
-            tensor_parallel_configs = self._user_defined_strategy.tensor_parallel_configs
+            tensor_parallel_configs = (
+                self._user_defined_strategy.tensor_parallel_configs
+            )
             tensor_init_seed = tensor_parallel_configs["tensor_init_seed"]
             if tensor_init_seed == -1:
                 model_parallel_random_seed()
@@ -826,29 +865,29 @@ class Fleet(object):
                 for name in fetch_var_names
             ]
 
-            self._runtime_handle._save_inference_model(executor, dirname,
-                                                       feeded_var_names,
-                                                       fetch_vars, None, True,
-                                                       0)
+            self._runtime_handle._save_inference_model(
+                executor, dirname, feeded_var_names, fetch_vars, None, True, 0
+            )
         else:
             increment_mode = 0
             if "mode" in configs:
                 increment_mode = int(configs["mode"])
-            self._runtime_handle._save_persistables(executor,
-                                                    dirname,
-                                                    main_program=None,
-                                                    mode=increment_mode)
+            self._runtime_handle._save_persistables(
+                executor, dirname, main_program=None, mode=increment_mode
+            )
 
     @is_non_distributed_check
     @inited_runtime_handler
-    def save_inference_model(self,
-                             executor,
-                             dirname,
-                             feeded_var_names,
-                             target_vars,
-                             main_program=None,
-                             export_for_deployment=True,
-                             mode=0):
+    def save_inference_model(
+        self,
+        executor,
+        dirname,
+        feeded_var_names,
+        target_vars,
+        main_program=None,
+        export_for_deployment=True,
+        mode=0,
+    ):
         """
         save inference model for inference.
 
@@ -869,10 +908,15 @@ class Fleet(object):
 
         """
 
-        self._runtime_handle._save_inference_model(executor, dirname,
-                                                   feeded_var_names,
-                                                   target_vars, main_program,
-                                                   export_for_deployment, mode)
+        self._runtime_handle._save_inference_model(
+            executor,
+            dirname,
+            feeded_var_names,
+            target_vars,
+            main_program,
+            export_for_deployment,
+            mode,
+        )
 
     @is_non_distributed_check
     @inited_runtime_handler
@@ -917,8 +961,9 @@ class Fleet(object):
                 fleet.save_persistables(exe, "dirname", paddle.static.default_main_program())
 
         """
-        self._runtime_handle._save_persistables(executor, dirname, main_program,
-                                                mode)
+        self._runtime_handle._save_persistables(
+            executor, dirname, main_program, mode
+        )
 
     @is_non_distributed_check
     @inited_runtime_handler
@@ -957,12 +1002,9 @@ class Fleet(object):
 
     @is_non_distributed_check
     @inited_runtime_handler
-    def save_dense_params(self,
-                          executor,
-                          dirname,
-                          scope,
-                          program,
-                          var_names=None):
+    def save_dense_params(
+        self, executor, dirname, scope, program, var_names=None
+    ):
         """
         save fleet one table from path
 
@@ -986,8 +1028,9 @@ class Fleet(object):
                 fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program())
 
         """
-        self._runtime_handle._save_dense_params(executor, dirname, scope,
-                                                program, var_names)
+        self._runtime_handle._save_dense_params(
+            executor, dirname, scope, program, var_names
+        )
 
     def shrink(self, threshold=None):
         self._runtime_handle._shrink(threshold)
@@ -1001,10 +1044,10 @@ class Fleet(object):
 
         Args:
             optimizer(Optimizer): The executor to run for init server.
-            strategy(DistributedStrategy): Extra properties for distributed optimizer. 
+            strategy(DistributedStrategy): Extra properties for distributed optimizer.
                 It is recommended to use DistributedStrategy in fleet.init(). The strategy
-                here is for compatibility. If the strategy in fleet.distributed_optimizer() 
-                is not None, then it will overwrite the DistributedStrategy in fleet.init(), 
+                here is for compatibility. If the strategy in fleet.distributed_optimizer()
+                is not None, then it will overwrite the DistributedStrategy in fleet.init(),
                 which will take effect in distributed training.
 
         Returns:
@@ -1031,7 +1074,8 @@ class Fleet(object):
                     "in fleet.init(). The strategy here is only for compatibility. "
                     "If the strategy in fleet.distributed_optimizer() is "
                     "not None, then it will overwrite the DistributedStrategy in fleet.init(), "
-                    "which will take effect in distributed training.")
+                    "which will take effect in distributed training."
+                )
             self._user_defined_strategy = copy.deepcopy(strategy)
 
         self._context = {}
@@ -1050,31 +1094,29 @@ class Fleet(object):
             if hasattr(self.user_defined_optimizer, 'amp_init'):
                 amp_optimizer = self.user_defined_optimizer
 
-        assert amp_optimizer is not None, \
-            "amp_init can only be used when the amp(auto mixed precision) strategy is turned on."
+        assert (
+            amp_optimizer is not None
+        ), "amp_init can only be used when the amp(auto mixed precision) strategy is turned on."
         return amp_optimizer
 
     def get_loss_scaling(self):
-        """Return the real-time loss scaling factor.
-        """
+        """Return the real-time loss scaling factor of the AMP optimizer."""
         amp_optimizer = self._get_amp_optimizer()
         return amp_optimizer.get_loss_scaling()
 
-    def amp_init(self,
-                 place,
-                 scope=None,
-                 test_program=None,
-                 use_fp16_test=False):
+    def amp_init(
+        self, place, scope=None, test_program=None, use_fp16_test=False
+    ):
         """
         Init the amp training, such as cast fp32 parameters to fp16 type.
-  
+
         Args:
-            place(CUDAPlace): place is used to initialize 
+            place(CUDAPlace): place is used to initialize
                 fp16 parameters with fp32 values.
             scope(Scope): The scope is used to find fp32 parameters.
             test_program(Program): The program is used for testing.
             use_fp16_test(bool): Whether to use fp16 testing.
-            
+
         Examples:
             .. code-block:: python
 
@@ -1096,7 +1138,7 @@ class Fleet(object):
                         loss = paddle.mean(hidden)
                     # 2) Create the optimizer and set `multi_precision` to True.
                     # Setting `multi_precision` to True can avoid the poor accuracy
-                    # or the slow convergence in a way. 
+                    # or the slow convergence in a way.
                     optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True)
                     # 3) These ops in `custom_black_list` will keep in the float32 computation type.
                     amp_list = paddle.static.amp.CustomOpLists(
@@ -1116,9 +1158,9 @@ class Fleet(object):
                     # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`).
                     # If you want to perform the testing process, you should pass `test_program` into `amp_init`.
                     optimizer.amp_init(place, scope=paddle.static.global_scope())
-                    
+
                 if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0:
-                    run_example_code()       
+                    run_example_code()
         """
         amp_optimizer = self._get_amp_optimizer()
         return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test)
@@ -1150,11 +1192,9 @@ class Fleet(object):
         else:
             return self._context["applied_graph_list"]
 
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameter_list=None,
-                 no_grad_set=None):
+    def minimize(
+        self, loss, startup_program=None, parameter_list=None, no_grad_set=None
+    ):
         """
         Add distributed operations to minimize ``loss`` by updating ``parameter_list``.
 
@@ -1206,23 +1246,27 @@ class Fleet(object):
 
         """
         if not isinstance(loss, list):
-            return self._minimize_impl(loss, startup_program, parameter_list,
-                                       no_grad_set)
+            return self._minimize_impl(
+                loss, startup_program, parameter_list, no_grad_set
+            )
         else:
-            if paddle.fluid.framework._non_static_mode(
-            ) or self._role_maker._is_non_distributed() or self._is_collective:
+            if (
+                paddle.fluid.framework._non_static_mode()
+                or self._role_maker._is_non_distributed()
+                or self._is_collective
+            ):
                 raise ValueError("loss can be list only in PS mode")
-            return self._minimize_losses_impl(loss, startup_program,
-                                              parameter_list, no_grad_set)
-
-    def _minimize_impl(self,
-                       loss,
-                       startup_program=None,
-                       parameter_list=None,
-                       no_grad_set=None):
+            return self._minimize_losses_impl(
+                loss, startup_program, parameter_list, no_grad_set
+            )
+
+    def _minimize_impl(
+        self, loss, startup_program=None, parameter_list=None, no_grad_set=None
+    ):
         context = {}
         context["user_defined_strategy"] = copy.deepcopy(
-            self._user_defined_strategy)
+            self._user_defined_strategy
+        )
         if paddle.fluid.framework._non_static_mode():
             # imitate target optimizer retrieval
             target_opt = self.user_defined_optimizer
@@ -1235,49 +1279,62 @@ class Fleet(object):
         if not hasattr(self.origin_main_program, "distributed_info_"):
             setattr(self.origin_main_program, "distributed_info_", dict())
             self.origin_main_program.distributed_info_[
-                "dp_degree"] = self._user_defined_strategy.sharding_configs[
-                    "dp_degree"]
+                "dp_degree"
+            ] = self._user_defined_strategy.sharding_configs["dp_degree"]
             self.origin_main_program.distributed_info_[
-                "mp_degree"] = self._user_defined_strategy.sharding_configs[
-                    "mp_degree"]
+                "mp_degree"
+            ] = self._user_defined_strategy.sharding_configs["mp_degree"]
             self.origin_main_program.distributed_info_[
-                "pp_degree"] = self._user_defined_strategy.sharding_configs[
-                    "pp_degree"]
+                "pp_degree"
+            ] = self._user_defined_strategy.sharding_configs["pp_degree"]
             self.origin_main_program.distributed_info_[
-                "sharding_degree"] = self._user_defined_strategy.sharding_configs[
-                    "sharding_degree"]
+                "sharding_degree"
+            ] = self._user_defined_strategy.sharding_configs["sharding_degree"]
 
         context["origin_main_program"] = self.origin_main_program
         context["origin_main_programs"] = [self.origin_main_program]
         context["loss"] = loss
         if startup_program == None:
-            self.origin_startup_program = \
+            self.origin_startup_program = (
                 paddle.static.default_startup_program().clone(for_test=False)
+            )
             startup_program = paddle.static.default_startup_program()
         else:
-            self.origin_startup_program = \
-                startup_program.clone(for_test=False)
+            self.origin_startup_program = startup_program.clone(for_test=False)
 
         context["origin_startup_program"] = startup_program
         context["origin_startup_programs"] = [startup_program]
         context["role_maker"] = self._role_maker
 
         # Use the auto-parallel's routines instead
-        if self._user_defined_strategy.semi_auto or self._user_defined_strategy.auto_search:
+        if (
+            self._user_defined_strategy.semi_auto
+            or self._user_defined_strategy.auto_search
+        ):
             from ..auto_parallel.parallelizer import AutoParallelizer
+
             auto_parallelizer = AutoParallelizer(self)
-            optimize_ops, params_grads, dist_startup_prog, dist_main_prog = auto_parallelizer.parallelize(
-                loss, startup_program, parameter_list, no_grad_set)
+            (
+                optimize_ops,
+                params_grads,
+                dist_startup_prog,
+                dist_main_prog,
+            ) = auto_parallelizer.parallelize(
+                loss, startup_program, parameter_list, no_grad_set
+            )
 
             return optimize_ops, params_grads, dist_startup_prog, dist_main_prog
 
         # compile time
-        distributed_optimizer_list = \
+        distributed_optimizer_list = (
             MetaOptimizerFactory()._get_valid_meta_optimizers(
-                self.user_defined_optimizer)
+                self.user_defined_optimizer
+            )
+        )
 
         context["user_defined_strategy"] = copy.deepcopy(
-            self._user_defined_strategy)
+            self._user_defined_strategy
+        )
         copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)
 
         # trigger the auto-parallel in very strict condition
@@ -1295,9 +1352,12 @@ class Fleet(object):
         can_not_apply_optimizer_list = []
         # recall meta optimizers for ranking
         for opt in distributed_optimizer_list:
-            opt._set_basic_info(loss, self._role_maker,
-                                self.user_defined_optimizer,
-                                copy_user_defined_strategy)
+            opt._set_basic_info(
+                loss,
+                self._role_maker,
+                self.user_defined_optimizer,
+                copy_user_defined_strategy,
+            )
             if opt._can_apply() and not opt._is_graph_out():
                 valid_optimizer_list.append(opt)
             elif opt._can_apply() and opt._is_graph_out():
@@ -1305,19 +1365,27 @@ class Fleet(object):
             else:
                 can_not_apply_optimizer_list.append(opt)
         # combine recalled meta optimizers to be a valid meta optimizer
-        meta_optimizer, graph_optimizer = \
-            self.strategy_compiler.generate_optimizer(
-                loss, self._role_maker, self.user_defined_optimizer,
-                copy_user_defined_strategy, valid_optimizer_list,
-                valid_graph_optimizer_list)
+        (
+            meta_optimizer,
+            graph_optimizer,
+        ) = self.strategy_compiler.generate_optimizer(
+            loss,
+            self._role_maker,
+            self.user_defined_optimizer,
+            copy_user_defined_strategy,
+            valid_optimizer_list,
+            valid_graph_optimizer_list,
+        )
 
         valid_strategy = self.strategy_compiler._get_valid_strategy(
-            copy_user_defined_strategy, can_not_apply_optimizer_list)
+            copy_user_defined_strategy, can_not_apply_optimizer_list
+        )
 
         context["valid_strategy"] = copy.deepcopy(valid_strategy)
         logger.debug("valid_strategy: " + str(context["valid_strategy"]))
-        logger.debug("user_defined_strategy: " +
-                     str(context["user_defined_strategy"]))
+        logger.debug(
+            "user_defined_strategy: " + str(context["user_defined_strategy"])
+        )
 
         applied_meta_list = self.strategy_compiler._get_applied_meta_list()
         applied_graph_list = self.strategy_compiler._get_applied_graph_list()
@@ -1338,41 +1406,48 @@ class Fleet(object):
                 self._runtime_handle = RuntimeFactory()._create_runtime(context)
 
             compiled_program = compiler.CompiledProgram(
-                self.origin_main_program).with_data_parallel(
-                    loss_name=loss.name, share_vars_from=None)
+                self.origin_main_program
+            ).with_data_parallel(loss_name=loss.name, share_vars_from=None)
             loss.block.program._graph = compiled_program
-            return self.user_defined_optimizer.minimize(loss,
-                                                        startup_program,
-                                                        parameter_list,
-                                                        no_grad_set=no_grad_set)
+            return self.user_defined_optimizer.minimize(
+                loss, startup_program, parameter_list, no_grad_set=no_grad_set
+            )
 
         if meta_optimizer:
-            logger.debug("before minimize program id: " +
-                         str(id(loss.block.program)))
+            logger.debug(
+                "before minimize program id: " + str(id(loss.block.program))
+            )
             optimize_ops, params_grads = meta_optimizer.minimize(
-                loss, startup_program, parameter_list, no_grad_set=no_grad_set)
-            logger.debug("after minimize program id: " +
-                         str(id(loss.block.program)))
+                loss, startup_program, parameter_list, no_grad_set=no_grad_set
+            )
+            logger.debug(
+                "after minimize program id: " + str(id(loss.block.program))
+            )
             default_program = paddle.static.default_main_program()
             logger.debug("default program id: " + str(id(default_program)))
 
             if id(default_program) != id(loss.block.program):
                 paddle.fluid.framework.switch_main_program(loss.block.program)
-            logger.debug("default program id after switch: " +
-                         str(id(default_program)))
+            logger.debug(
+                "default program id after switch: " + str(id(default_program))
+            )
 
         else:
             optimize_ops, params_grads = self.user_defined_optimizer.minimize(
-                loss, startup_program, parameter_list, no_grad_set=no_grad_set)
+                loss, startup_program, parameter_list, no_grad_set=no_grad_set
+            )
 
         context["program_optimize_ops"] = optimize_ops
         context["program_params_grads"] = params_grads
 
         if graph_optimizer:
-            logger.debug("before graph minimize program id: " +
-                         str(id(loss.block.program)))
+            logger.debug(
+                "before graph minimize program id: "
+                + str(id(loss.block.program))
+            )
             optimize_ops, params_grads = graph_optimizer.minimize(
-                loss, startup_program, parameter_list, no_grad_set=no_grad_set)
+                loss, startup_program, parameter_list, no_grad_set=no_grad_set
+            )
             # since we do not encourage users to use graph operations
             # if a graph optimizer takes effect, mostly
             # optimizers_ops and params_grads are None
@@ -1387,8 +1462,10 @@ class Fleet(object):
             opt_info = {} if program._fleet_opt is None else program._fleet_opt
             opt_info["mpi_size"] = self.worker_num()
             opt_info["mpi_rank"] = self.worker_index()
-            for k, v in self._user_defined_strategy.trainer_desc_configs.items(
-            ):
+            for (
+                k,
+                v,
+            ) in self._user_defined_strategy.trainer_desc_configs.items():
                 if v or k not in opt_info:
                     opt_info[k] = v
             program._fleet_opt = opt_info
@@ -1397,15 +1474,18 @@ class Fleet(object):
             self._runtime_handle = RuntimeFactory()._create_runtime(context)
 
         import paddle.distributed.fleet as fleet
+
         fleet.util._set_strategy(context["valid_strategy"])
 
         return optimize_ops, params_grads
 
-    def _minimize_losses_impl(self,
-                              losses,
-                              startup_programs=None,
-                              parameter_list=None,
-                              no_grad_set=None):
+    def _minimize_losses_impl(
+        self,
+        losses,
+        startup_programs=None,
+        parameter_list=None,
+        no_grad_set=None,
+    ):
         context = {}
 
         # cache original feed forward program
@@ -1421,7 +1501,8 @@ class Fleet(object):
                 startup_programs = [paddle.static.default_startup_program()]
             else:
                 raise ValueError(
-                    "startup_program can't be None when loss is list.")
+                    "startup_program can't be None when loss is list."
+                )
         self.origin_startup_program = startup_programs[0].clone(for_test=False)
         context["origin_startup_program"] = startup_programs[0]
         context["origin_startup_programs"] = []
@@ -1431,7 +1512,8 @@ class Fleet(object):
         context["role_maker"] = self._role_maker
 
         context["user_defined_strategy"] = copy.deepcopy(
-            self._user_defined_strategy)
+            self._user_defined_strategy
+        )
 
         context["valid_strategy"] = copy.deepcopy(self._user_defined_strategy)
 
@@ -1444,12 +1526,17 @@ class Fleet(object):
         params_grads = []
 
         from .meta_optimizers import ParameterServerOptimizer
+
         ps_optimizer = ParameterServerOptimizer(self.user_defined_optimizer)
-        ps_optimizer._set_basic_info(losses, self._role_maker,
-                                     self.user_defined_optimizer,
-                                     self._user_defined_strategy)
+        ps_optimizer._set_basic_info(
+            losses,
+            self._role_maker,
+            self.user_defined_optimizer,
+            self._user_defined_strategy,
+        )
         optimize_ops, params_grads = ps_optimizer.minimize_losses_impl(
-            losses, startup_programs, parameter_list, no_grad_set=no_grad_set)
+            losses, startup_programs, parameter_list, no_grad_set=no_grad_set
+        )
 
         # default_program = paddle.static.default_main_program()
 
@@ -1464,18 +1551,24 @@ class Fleet(object):
             opt_info = {} if program._fleet_opt is None else program._fleet_opt
             opt_info["mpi_size"] = self.worker_num()
             opt_info["mpi_rank"] = self.worker_index()
-            for k, v in self._user_defined_strategy.trainer_desc_configs.items(
-            ):
+            for (
+                k,
+                v,
+            ) in self._user_defined_strategy.trainer_desc_configs.items():
                 if v or k not in opt_info:
                     opt_info[k] = v
             program._fleet_opt = opt_info
-            logger.debug("fleet base opt info: " + str(id(program)) +
-                         str(program._fleet_opt))
+            logger.debug(
+                "fleet base opt info: "
+                + str(id(program))
+                + str(program._fleet_opt)
+            )
 
         if self._runtime_handle is None:
             self._runtime_handle = RuntimeFactory()._create_runtime(context)
 
         import paddle.distributed.fleet as fleet
+
         fleet.util._set_strategy(context["valid_strategy"])
 
         return optimize_ops, params_grads
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index 507a765d0c5..b498ca6a31b 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -30,7 +30,9 @@ from paddle.fluid.framework import _set_expected_place
 from paddle.fluid.dygraph import parallel_helper
 from paddle.distributed.fleet.launch_utils import check_backend
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.distributed.fleet.base.private_helper_function import wait_server_ready  # noqa: F401
+from paddle.distributed.fleet.base.private_helper_function import (
+    wait_server_ready,
+)  # noqa: F401
 from paddle.distributed import collective
 from paddle.distributed.collective import _set_group_map
 from paddle.distributed.collective import _set_group_map_by_name
@@ -63,6 +65,7 @@ def _get_global_parallel_env():
 
 def _start_kv_server(port, http_server_d, size):
     from paddle.distributed.fleet.utils.http_server import KVServer
+
     http_server = KVServer(int(port), size=size)
     http_server.start()
     wait_seconds = 3
@@ -73,10 +76,15 @@ def _start_kv_server(port, http_server_d, size):
 
 def _is_cpuonly(backend):
     check_backend(backend)
-    if (backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'] and
-        (core.is_compiled_with_cuda() or core.is_compiled_with_xpu()
-         or core.is_compiled_with_npu()
-         or core.is_compiled_with_mlu())) or backend is 'xccl':
+    if (
+        backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl']
+        and (
+            core.is_compiled_with_cuda()
+            or core.is_compiled_with_xpu()
+            or core.is_compiled_with_npu()
+            or core.is_compiled_with_mlu()
+        )
+    ) or backend is 'xccl':
 
         # passes 'auto' and can use cuda or xpu, use the default logics. so return False
         return False
@@ -87,9 +95,10 @@ def _is_cpuonly(backend):
 def _check_var_exists(var_name):
     var = os.environ.get(var_name, None)
     if var is None:
-        raise ValueError("paddle.distributed initialize error, "
-                         "environment variable %s is needed, but not set." %
-                         var_name)
+        raise ValueError(
+            "paddle.distributed initialize error, "
+            "environment variable %s is needed, but not set." % var_name
+        )
 
 
 def init_parallel_env():
@@ -106,7 +115,7 @@ def init_parallel_env():
 
     Returns:
         None
-        
+
     Examples:
         .. code-block:: python
             # required: gpu
@@ -120,7 +129,7 @@ def init_parallel_env():
                     super(LinearNet, self).__init__()
                     self._linear1 = nn.Linear(10, 10)
                     self._linear2 = nn.Linear(10, 1)
-                    
+
                 def forward(self, x):
                     return self._linear2(self._linear1(x))
 
@@ -141,7 +150,7 @@ def init_parallel_env():
                 outputs = dp_layer(inputs)
                 labels = paddle.randn([10, 1], 'float32')
                 loss = loss_fn(outputs, labels)
-                
+
                 loss.backward()
 
                 adam.step()
@@ -167,15 +176,21 @@ def init_parallel_env():
     backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
     is_cpu_only = _is_cpuonly(backend)
     # 1. gpu xpu check, must be gpu or xpu,
-    if not (is_cpu_only or core.is_compiled_with_cuda()
-            or core.is_compiled_with_xpu() or core.is_compiled_with_npu()
-            or core.is_compiled_with_mlu()):
+    if not (
+        is_cpu_only
+        or core.is_compiled_with_cuda()
+        or core.is_compiled_with_xpu()
+        or core.is_compiled_with_npu()
+        or core.is_compiled_with_mlu()
+    ):
         raise NotImplementedError(
-            "If you want to use CPU-only version, please use 'gloo' as backend")
+            "If you want to use CPU-only version, please use 'gloo' as backend"
+        )
 
     if backend == "xccl":
         FLAGS_selected_custom_devices = 'FLAGS_selected_{}s'.format(
-            parallel_env.device_type)
+            parallel_env.device_type
+        )
         _check_var_exists(FLAGS_selected_custom_devices)
     else:
         if not is_cpu_only and core.is_compiled_with_cuda():
@@ -203,8 +218,9 @@ def init_parallel_env():
     # they need to call a function to change default place,
     # here just set correctly place to users
     if backend == "xccl":
-        place = core.CustomPlace(parallel_env.device_type,
-                                 parallel_env.device_id)
+        place = core.CustomPlace(
+            parallel_env.device_type, parallel_env.device_id
+        )
     elif is_cpu_only:
         place = core.CPUPlace()
     elif core.is_compiled_with_cuda():
@@ -228,11 +244,15 @@ def init_parallel_env():
         assert rank >= 0 and world_size > rank and world_size > 1, (
             "rank must be non-negative and world_size must be the "
             "maximum rank plus one. Moreover, at least two processes are "
-            "required to create a process group.")
+            "required to create a process group."
+        )
         master_addr = os.getenv("MASTER_ADDR", None)
         master_port = os.getenv("MASTER_PORT", None)
-        endpoints = ":".join([master_addr, master_port
-                              ]) if master_addr and master_port else None
+        endpoints = (
+            ":".join([master_addr, master_port])
+            if master_addr and master_port
+            else None
+        )
         if endpoints is None:
             endpoints = os.getenv("PADDLE_MASTER", None)
         if endpoints is None:
@@ -241,23 +261,28 @@ def init_parallel_env():
             "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' "
             "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
             "and 'export MASTER_ADDR=54612'. Or you can start your training"
-            "with paddle.distributed.run module.")
+            "with paddle.distributed.run module."
+        )
         master_addr, master_port = endpoints.split(":")
         master_port = int(master_port)
         is_master = rank == 0
         stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
-        default_store = core.TCPStore(master_addr,
-                                      master_port,
-                                      is_master,
-                                      world_size,
-                                      timeout=stop_check_timeout)
+        default_store = core.TCPStore(
+            master_addr,
+            master_port,
+            is_master,
+            world_size,
+            timeout=stop_check_timeout,
+        )
         _set_default_store(default_store)
-        pg = _new_process_group_impl(backend,
-                                     default_store,
-                                     rank,
-                                     world_size,
-                                     _default_group_name,
-                                     pg_options=None)
+        pg = _new_process_group_impl(
+            backend,
+            default_store,
+            rank,
+            world_size,
+            _default_group_name,
+            pg_options=None,
+        )
         ranks = list(range(world_size))
         group = Group(rank, 0, ranks, pg=pg, name=_default_group_name)
         _set_group_map_by_name(_default_group_name, group)
@@ -283,8 +308,10 @@ def init_parallel_env():
             size = {'_worker': parallel_env.world_size}
             if backend == "heter":
                 size = {'_worker': len(node_num)}
-            http_server = Process(target=_start_kv_server,
-                                  args=(int(ep_rank_0[1]), http_server_d, size))
+            http_server = Process(
+                target=_start_kv_server,
+                args=(int(ep_rank_0[1]), http_server_d, size),
+            )
             http_server.daemon = True
             http_server_d["running"] = True
             http_server.start()
@@ -302,22 +329,28 @@ def init_parallel_env():
     # init nccl or hccl or bkcl or heter context
     if is_cpu_only:
         parallel_helper._set_parallel_ctx(
-            core.GLOOParallelContext(strategy, place))
-    elif (backend == "heter"):
+            core.GLOOParallelContext(strategy, place)
+        )
+    elif backend == "heter":
         parallel_helper._set_parallel_ctx(
-            core.HeterParallelContext(strategy, parallel_env.device_id))
+            core.HeterParallelContext(strategy, parallel_env.device_id)
+        )
     elif core.is_compiled_with_cuda():
         parallel_helper._set_parallel_ctx(
-            core.NCCLParallelContext(strategy, place))
+            core.NCCLParallelContext(strategy, place)
+        )
     elif core.is_compiled_with_xpu():
         parallel_helper._set_parallel_ctx(
-            core.BKCLParallelContext(strategy, place))
+            core.BKCLParallelContext(strategy, place)
+        )
     elif core.is_compiled_with_npu():
         parallel_helper._set_parallel_ctx(
-            core.HCCLParallelContext(strategy, place))
+            core.HCCLParallelContext(strategy, place)
+        )
     elif core.is_compiled_with_mlu():
         parallel_helper._set_parallel_ctx(
-            core.CNCLParallelContext(strategy, place))
+            core.CNCLParallelContext(strategy, place)
+        )
 
     if backend != "heter":
         other_endpoints = strategy.trainer_endpoints[:]
diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py
index 1d67989c065..7c19b9ca64d 100644
--- a/python/paddle/distributed/sharding/group_sharded.py
+++ b/python/paddle/distributed/sharding/group_sharded.py
@@ -23,30 +23,48 @@ from paddle.distributed.utils.log_utils import get_logger
 from paddle.fluid.framework import in_dygraph_mode
 
 # Old version
-from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2
-from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2
-from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3
-from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler
+from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
+    ShardingOptimizerStage2,
+)
+from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import (
+    ShardingStage2,
+)
+from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import (
+    ShardingStage3,
+)
+from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import (
+    ShardingScaler,
+)
 
 # New version
-from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2
-from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2
-from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import GroupShardedStage3
-from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler
+from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
+    GroupShardedOptimizerStage2,
+)
+from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import (
+    GroupShardedStage2,
+)
+from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import (
+    GroupShardedStage3,
+)
+from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
+    GroupShardedScaler,
+)
 
 logger_ = get_logger(logging.WARNING)
 
 
-def group_sharded_parallel(model,
-                           optimizer,
-                           level,
-                           scaler=None,
-                           group=None,
-                           offload=False,
-                           sync_buffers=False,
-                           buffer_max_size=2**23,
-                           segment_size=2**20,
-                           sync_comm=False):
+def group_sharded_parallel(
+    model,
+    optimizer,
+    level,
+    scaler=None,
+    group=None,
+    offload=False,
+    sync_buffers=False,
+    buffer_max_size=2**23,
+    segment_size=2**20,
+    sync_comm=False,
+):
     """
     Use group_sharded_parallel can perform group shared configuration on the model, optimizer and GradScaler. Level has three string options, 'os', 'os_g' and 'p_g_os' corresponds to three different usage scenarios: optimizer state segmentation, optimizer state + gradient segmentation, and parameter + gradient + optimizer state segmentation.
     Usually, optimizer state + gradient segmentation is actually a re optimization of optimizer state segmentation, so optimizer state + gradient segmentation can be used to realize optimizer state segmentation.
@@ -62,12 +80,12 @@ def group_sharded_parallel(model,
         buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23.
         segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20.
         sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used.
-    
+
     Returns:
         model: A wrapper for group sharded given model.
         optimizer: A wrapper for group sharded given optimizer.
         scaler: A wrapper for group sharded given scaler.
-    
+
     Examples:
         .. code-block:: python
 
@@ -100,13 +118,16 @@ def group_sharded_parallel(model,
     """
     # check optition type
     assert isinstance(
-        model,
-        paddle.nn.Layer), "The model must be the instance of paddle.nn.Layer."
+        model, paddle.nn.Layer
+    ), "The model must be the instance of paddle.nn.Layer."
     assert isinstance(
         optimizer, Optimizer
     ), "The optimizer must be the instance of paddle.optimizer.Optimizer."
-    assert level in ['os', 'os_g',
-                     'p_g_os'], "The level must be os, os_g or p_g_os."
+    assert level in [
+        'os',
+        'os_g',
+        'p_g_os',
+    ], "The level must be os, os_g or p_g_os."
 
     def check_dtype(param):
         return param.dtype == paddle.float16
@@ -124,39 +145,50 @@ def group_sharded_parallel(model,
                 params=optimizer._parameter_list,
                 optim=optimizer,
                 group=group,
-                offload=offload)
-            model = GroupShardedStage2(model,
-                                       optimizer,
-                                       group=group,
-                                       sync_buffers=sync_buffers,
-                                       buffer_max_size=buffer_max_size)
+                offload=offload,
+            )
+            model = GroupShardedStage2(
+                model,
+                optimizer,
+                group=group,
+                sync_buffers=sync_buffers,
+                buffer_max_size=buffer_max_size,
+            )
         else:
-            optimizer = ShardingOptimizerStage2(params=model.parameters(),
-                                                optim=optimizer,
-                                                group=group,
-                                                offload=offload)
-            model = ShardingStage2(model,
-                                   optimizer,
-                                   group=group,
-                                   sync_buffers=sync_buffers,
-                                   buffer_max_size=buffer_max_size)
+            optimizer = ShardingOptimizerStage2(
+                params=model.parameters(),
+                optim=optimizer,
+                group=group,
+                offload=offload,
+            )
+            model = ShardingStage2(
+                model,
+                optimizer,
+                group=group,
+                sync_buffers=sync_buffers,
+                buffer_max_size=buffer_max_size,
+            )
     elif level == 'p_g_os':
         if in_dygraph_mode():
-            model = GroupShardedStage3(model,
-                                       optimizer=optimizer,
-                                       group=group,
-                                       sync_buffers=sync_buffers,
-                                       segment_size=segment_size,
-                                       offload=offload,
-                                       sync_comm=sync_comm)
+            model = GroupShardedStage3(
+                model,
+                optimizer=optimizer,
+                group=group,
+                sync_buffers=sync_buffers,
+                segment_size=segment_size,
+                offload=offload,
+                sync_comm=sync_comm,
+            )
         else:
-            model = ShardingStage3(model,
-                                   optimizer=optimizer,
-                                   group=group,
-                                   sync_buffers=sync_buffers,
-                                   segment_size=segment_size,
-                                   offload=offload,
-                                   sync_comm=sync_comm)
+            model = ShardingStage3(
+                model,
+                optimizer=optimizer,
+                group=group,
+                sync_buffers=sync_buffers,
+                segment_size=segment_size,
+                offload=offload,
+                sync_comm=sync_comm,
+            )
     else:
         raise ValueError("Please enter the correct level.")
     if isinstance(scaler, paddle.amp.GradScaler):
@@ -184,7 +216,7 @@ def save_group_sharded_model(model, output, optimizer=None):
         model (Layer): A wrapper for group sharded given model.
         output (str): Save directory.
         optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved.
-    
+
     Examples:
         .. code-block:: python
 
@@ -219,7 +251,8 @@ def save_group_sharded_model(model, output, optimizer=None):
             save_group_sharded_model(model, optimizer, output=output_dir)
     """
     logger_.info(
-        "==========Begin to save group sharded model and optimizer==========")
+        "==========Begin to save group sharded model and optimizer=========="
+    )
     assert not os.path.isfile(
         output
     ), "Saving directory ({}) should be a directory, not a file".format(output)
@@ -243,4 +276,5 @@ def save_group_sharded_model(model, output, optimizer=None):
         output_opt = os.path.join(output, "model.pdopt")
         paddle.save(optimizer._optim.state_dict(), output_opt)
     logger_.info(
-        "==========End to save group sharded model and optimizer==========")
+        "==========End to save group sharded model and optimizer=========="
+    )
diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py
index 8c5843521b0..6e36976e5ee 100644
--- a/python/paddle/distribution/distribution.py
+++ b/python/paddle/distribution/distribution.py
@@ -28,35 +28,56 @@ import numpy as np
 import paddle
 from paddle import _C_ops, _legacy_C_ops
 from paddle.fluid import core
-from paddle.fluid.data_feeder import (check_dtype, check_type,
-                                      check_variable_and_dtype, convert_dtype)
-from paddle.fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph
-from paddle.fluid.layers import (control_flow, elementwise_add, elementwise_div,
-                                 elementwise_mul, elementwise_sub, nn, ops,
-                                 tensor)
+from paddle.fluid.data_feeder import (
+    check_dtype,
+    check_type,
+    check_variable_and_dtype,
+    convert_dtype,
+)
+from paddle.fluid.framework import (
+    _non_static_mode,
+    in_dygraph_mode,
+    _in_legacy_dygraph,
+)
+from paddle.fluid.layers import (
+    control_flow,
+    elementwise_add,
+    elementwise_div,
+    elementwise_mul,
+    elementwise_sub,
+    nn,
+    ops,
+    tensor,
+)
 from paddle.tensor import arange, concat, gather_nd, multinomial
 
 
 class Distribution(object):
     """
-    The abstract base class for probability distributions. Functions are 
+    The abstract base class for probability distributions. Functions are
     implemented in specific distributions.
 
     Args:
-        batch_shape(Sequence[int], optional):  independent, not identically 
+        batch_shape(Sequence[int], optional):  independent, not identically
             distributed draws, aka a "collection" or "bunch" of distributions.
-        event_shape(Sequence[int], optional): the shape of a single 
-            draw from the distribution; it may be dependent across dimensions. 
-            For scalar distributions, the event shape is []. For n-dimension 
+        event_shape(Sequence[int], optional): the shape of a single
+            draw from the distribution; it may be dependent across dimensions.
+            For scalar distributions, the event shape is []. For n-dimension
             multivariate distribution, the event shape is [n].
     """
 
     def __init__(self, batch_shape=(), event_shape=()):
 
-        self._batch_shape = batch_shape if isinstance(
-            batch_shape, tuple) else tuple(batch_shape)
-        self._event_shape = event_shape if isinstance(
-            event_shape, tuple) else tuple(event_shape)
+        self._batch_shape = (
+            batch_shape
+            if isinstance(batch_shape, tuple)
+            else tuple(batch_shape)
+        )
+        self._event_shape = (
+            event_shape
+            if isinstance(event_shape, tuple)
+            else tuple(event_shape)
+        )
 
         super(Distribution, self).__init__()
 
@@ -118,16 +139,16 @@ class Distribution(object):
 
     def probs(self, value):
         """Probability density/mass function.
-        
-        .. note:: 
-        
-            This method will be deprecated in the future, please use `prob` 
+
+        .. note::
+
+            This method will be deprecated in the future, please use `prob`
             instead.
         """
         raise NotImplementedError
 
     def _extend_shape(self, sample_shape):
-        """compute shape of the sample 
+        """compute shape of the sample
 
         Args:
             sample_shape (Tensor): sample shape
@@ -155,7 +176,8 @@ class Distribution(object):
 
         if is_variable and is_number:
             raise ValueError(
-                'if one argument is Tensor, all arguments should be Tensor')
+                'if one argument is Tensor, all arguments should be Tensor'
+            )
 
         return is_variable
 
@@ -170,15 +192,17 @@ class Distribution(object):
         """
         numpy_args = []
         variable_args = []
-        tmp = 0.
+        tmp = 0.0
 
         for arg in args:
             if isinstance(arg, float):
                 arg = [arg]
             if not isinstance(arg, (list, tuple, np.ndarray, tensor.Variable)):
                 raise TypeError(
-                    "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}"
-                    .format(type(arg)))
+                    "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}".format(
+                        type(arg)
+                    )
+                )
 
             arg_np = np.array(arg)
             arg_dtype = arg_np.dtype
@@ -216,20 +240,24 @@ class Distribution(object):
             value (Tensor): Change value's dtype if value's dtype is different from param.
         """
         if _non_static_mode():
-            if value.dtype != param.dtype and convert_dtype(
-                    value.dtype) in ['float32', 'float64']:
+            if value.dtype != param.dtype and convert_dtype(value.dtype) in [
+                'float32',
+                'float64',
+            ]:
                 warnings.warn(
                     "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."
                 )
                 if in_dygraph_mode():
                     return _C_ops.cast(value, param.dtype)
                 if _in_legacy_dygraph():
-                    return _legacy_C_ops.cast(value, 'in_dtype', value.dtype,
-                                              'out_dtype', param.dtype)
+                    return _legacy_C_ops.cast(
+                        value, 'in_dtype', value.dtype, 'out_dtype', param.dtype
+                    )
             return value
 
-        check_variable_and_dtype(value, 'value', ['float32', 'float64'],
-                                 'log_prob')
+        check_variable_and_dtype(
+            value, 'value', ['float32', 'float64'], 'log_prob'
+        )
         if value.dtype != param.dtype:
             warnings.warn(
                 "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."
@@ -239,19 +267,25 @@ class Distribution(object):
 
     def _probs_to_logits(self, probs, is_binary=False):
         r"""
-        Converts probabilities into logits. For the binary, probs denotes the 
-        probability of occurrence of the event indexed by `1`. For the 
-        multi-dimensional, values of last axis denote the probabilities of 
+        Converts probabilities into logits. For the binary, probs denotes the
+        probability of occurrence of the event indexed by `1`. For the
+        multi-dimensional, values of last axis denote the probabilities of
         occurrence of each of the events.
         """
-        return (paddle.log(probs) - paddle.log1p(-probs)) \
-            if is_binary else paddle.log(probs)
+        return (
+            (paddle.log(probs) - paddle.log1p(-probs))
+            if is_binary
+            else paddle.log(probs)
+        )
 
     def _logits_to_probs(self, logits, is_binary=False):
         r"""
-        Converts logits into probabilities. For the binary, each value denotes 
-        log odds, whereas for the multi-dimensional case, the values along the 
+        Converts logits into probabilities. For the binary, each value denotes
+        log odds, whereas for the multi-dimensional case, the values along the
         last dimension denote the log probabilities of the events.
         """
-        return paddle.nn.functional.sigmoid(logits) \
-            if is_binary else paddle.nn.functional.softmax(logits, axis=-1)
+        return (
+            paddle.nn.functional.sigmoid(logits)
+            if is_binary
+            else paddle.nn.functional.softmax(logits, axis=-1)
+        )
diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py
index c5ad3f04358..f6ab37c3a63 100644
--- a/python/paddle/distribution/kl.py
+++ b/python/paddle/distribution/kl.py
@@ -35,7 +35,7 @@ def kl_divergence(p, q):
 
     .. math::
 
-        KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x 
+        KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x
 
     Args:
         p (Distribution): ``Distribution`` object.
@@ -64,11 +64,11 @@ def kl_divergence(p, q):
 def register_kl(cls_p, cls_q):
     """Decorator for register a KL divergence implemention function.
 
-    The ``kl_divergence(p, q)`` function will search concrete implemention 
-    functions registered by ``register_kl``, according to multi-dispatch pattern. 
-    If an implemention function is found, it will return the result, otherwise, 
-    it will raise ``NotImplementError`` exception. Users can register 
-    implemention funciton by the decorator. 
+    The ``kl_divergence(p, q)`` function will search concrete implementation
+    functions registered by ``register_kl``, according to multi-dispatch pattern.
+    If an implementation function is found, it will return the result, otherwise,
+    it will raise ``NotImplementedError`` exception. Users can register
+    implementation function by the decorator.
 
     Args:
         cls_p(Distribution): Subclass derived from ``Distribution``.
@@ -83,8 +83,9 @@ def register_kl(cls_p, cls_q):
             def kl_beta_beta():
                 pass # insert implementation here
     """
-    if (not issubclass(cls_p, Distribution)
-            or not issubclass(cls_q, Distribution)):
+    if not issubclass(cls_p, Distribution) or not issubclass(
+        cls_q, Distribution
+    ):
         raise TypeError('cls_p and cls_q must be subclass of Distribution')
 
     def decorator(f):
@@ -98,8 +99,11 @@ def _dispatch(cls_p, cls_q):
     """Multiple dispatch into concrete implement function"""
 
     # find all matched super class pair of p and q
-    matchs = [(super_p, super_q) for super_p, super_q in _REGISTER_TABLE
-              if issubclass(cls_p, super_p) and issubclass(cls_q, super_q)]
+    matchs = [
+        (super_p, super_q)
+        for super_p, super_q in _REGISTER_TABLE
+        if issubclass(cls_p, super_p) and issubclass(cls_q, super_q)
+    ]
     if not matchs:
         raise NotImplementedError
 
@@ -108,16 +112,20 @@ def _dispatch(cls_p, cls_q):
 
     if _REGISTER_TABLE[left_p, left_q] is not _REGISTER_TABLE[right_p, right_q]:
         warnings.warn(
-            'Ambiguous kl_divergence({}, {}). Please register_kl({}, {})'.
-            format(cls_p.__name__, cls_q.__name__, left_p.__name__,
-                   right_q.__name__), RuntimeWarning)
+            'Ambiguous kl_divergence({}, {}). Please register_kl({}, {})'.format(
+                cls_p.__name__,
+                cls_q.__name__,
+                left_p.__name__,
+                right_q.__name__,
+            ),
+            RuntimeWarning,
+        )
 
     return _REGISTER_TABLE[left_p, left_q]
 
 
 @functools.total_ordering
 class _Compare(object):
-
     def __init__(self, *classes):
         self.classes = classes
 
@@ -135,22 +143,33 @@ class _Compare(object):
 
 @register_kl(Beta, Beta)
 def _kl_beta_beta(p, q):
-    return ((q.alpha.lgamma() + q.beta.lgamma() + (p.alpha + p.beta).lgamma()) -
-            (p.alpha.lgamma() + p.beta.lgamma() + (q.alpha + q.beta).lgamma()) +
-            ((p.alpha - q.alpha) * p.alpha.digamma()) +
-            ((p.beta - q.beta) * p.beta.digamma()) +
-            (((q.alpha + q.beta) - (p.alpha + p.beta)) *
-             (p.alpha + p.beta).digamma()))
+    return (
+        (q.alpha.lgamma() + q.beta.lgamma() + (p.alpha + p.beta).lgamma())
+        - (p.alpha.lgamma() + p.beta.lgamma() + (q.alpha + q.beta).lgamma())
+        + ((p.alpha - q.alpha) * p.alpha.digamma())
+        + ((p.beta - q.beta) * p.beta.digamma())
+        + (
+            ((q.alpha + q.beta) - (p.alpha + p.beta))
+            * (p.alpha + p.beta).digamma()
+        )
+    )
 
 
 @register_kl(Dirichlet, Dirichlet)
 def _kl_dirichlet_dirichlet(p, q):
     return (
-        (p.concentration.sum(-1).lgamma() - q.concentration.sum(-1).lgamma()) -
-        ((p.concentration.lgamma() - q.concentration.lgamma()).sum(-1)) +
-        (((p.concentration - q.concentration) *
-          (p.concentration.digamma() -
-           p.concentration.sum(-1).digamma().unsqueeze(-1))).sum(-1)))
+        (p.concentration.sum(-1).lgamma() - q.concentration.sum(-1).lgamma())
+        - ((p.concentration.lgamma() - q.concentration.lgamma()).sum(-1))
+        + (
+            (
+                (p.concentration - q.concentration)
+                * (
+                    p.concentration.digamma()
+                    - p.concentration.sum(-1).digamma().unsqueeze(-1)
+                )
+            ).sum(-1)
+        )
+    )
 
 
 @register_kl(Categorical, Categorical)
@@ -170,8 +189,7 @@ def _kl_uniform_uniform(p, q):
 
 @register_kl(ExponentialFamily, ExponentialFamily)
 def _kl_expfamily_expfamily(p, q):
-    """Compute kl-divergence using `Bregman divergences <https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf>`_
-    """
+    """Compute kl-divergence using `Bregman divergences <https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf>`_"""
     if not type(p) == type(q):
         raise NotImplementedError
 
@@ -187,19 +205,22 @@ def _kl_expfamily_expfamily(p, q):
 
     try:
         if _non_static_mode():
-            p_grads = paddle.grad(p_log_norm,
-                                  p_natural_params,
-                                  create_graph=True)
+            p_grads = paddle.grad(
+                p_log_norm, p_natural_params, create_graph=True
+            )
         else:
             p_grads = paddle.static.gradients(p_log_norm, p_natural_params)
     except RuntimeError as e:
         raise TypeError(
-            "Cann't compute kl_divergence({cls_p}, {cls_q}) use bregman divergence. Please register_kl({cls_p}, {cls_q})."
-            .format(cls_p=type(p).__name__, cls_q=type(q).__name__)) from e
+            "Cann't compute kl_divergence({cls_p}, {cls_q}) use bregman divergence. Please register_kl({cls_p}, {cls_q}).".format(
+                cls_p=type(p).__name__, cls_q=type(q).__name__
+            )
+        ) from e
 
     kl = q._log_normalizer(*q_natural_params) - p_log_norm
-    for p_param, q_param, p_grad in zip(p_natural_params, q_natural_params,
-                                        p_grads):
+    for p_param, q_param, p_grad in zip(
+        p_natural_params, q_natural_params, p_grads
+    ):
         term = (q_param - p_param) * p_grad
         kl -= _sum_rightmost(term, len(q.event_shape))
 
diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py
index f248e1a0927..69b473c037e 100644
--- a/python/paddle/distribution/normal.py
+++ b/python/paddle/distribution/normal.py
@@ -19,12 +19,23 @@ import numpy as np
 from paddle import _C_ops, _legacy_C_ops
 from paddle.distribution import distribution
 from paddle.fluid import core
-from paddle.fluid.data_feeder import (check_dtype, check_type,
-                                      check_variable_and_dtype, convert_dtype)
+from paddle.fluid.data_feeder import (
+    check_dtype,
+    check_type,
+    check_variable_and_dtype,
+    convert_dtype,
+)
 from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
-from paddle.fluid.layers import (control_flow, elementwise_add, elementwise_div,
-                                 elementwise_mul, elementwise_sub, nn, ops,
-                                 tensor)
+from paddle.fluid.layers import (
+    control_flow,
+    elementwise_add,
+    elementwise_div,
+    elementwise_mul,
+    elementwise_sub,
+    nn,
+    ops,
+    tensor,
+)
 
 
 class Normal(distribution.Distribution):
@@ -55,7 +66,7 @@ class Normal(distribution.Distribution):
 
     Examples:
         .. code-block:: python
-          
+
           import paddle
           from paddle.distribution import Normal
 
@@ -90,12 +101,18 @@ class Normal(distribution.Distribution):
 
     def __init__(self, loc, scale, name=None):
         if not _non_static_mode():
-            check_type(loc, 'loc',
-                       (int, float, np.ndarray, tensor.Variable, list, tuple),
-                       'Normal')
-            check_type(scale, 'scale',
-                       (int, float, np.ndarray, tensor.Variable, list, tuple),
-                       'Normal')
+            check_type(
+                loc,
+                'loc',
+                (int, float, np.ndarray, tensor.Variable, list, tuple),
+                'Normal',
+            )
+            check_type(
+                scale,
+                'scale',
+                (int, float, np.ndarray, tensor.Variable, list, tuple),
+                'Normal',
+            )
 
         self.batch_size_unknown = False
         self.all_arg_is_float = False
@@ -115,11 +132,15 @@ class Normal(distribution.Distribution):
         else:
             if isinstance(loc, float) and isinstance(scale, float):
                 self.all_arg_is_float = True
-            if isinstance(loc, np.ndarray) and str(
-                    loc.dtype) in ['float32', 'float64']:
+            if isinstance(loc, np.ndarray) and str(loc.dtype) in [
+                'float32',
+                'float64',
+            ]:
                 self.dtype = loc.dtype
-            elif isinstance(scale, np.ndarray) and str(
-                    scale.dtype) in ['float32', 'float64']:
+            elif isinstance(scale, np.ndarray) and str(scale.dtype) in [
+                'float32',
+                'float64',
+            ]:
                 self.dtype = scale.dtype
             # pylint: disable=unbalanced-tuple-unpacking
             self.loc, self.scale = self._to_tensor(loc, scale)
@@ -149,21 +170,21 @@ class Normal(distribution.Distribution):
         if self.batch_size_unknown:
             output_shape = shape + batch_shape
             zero_tmp = tensor.fill_constant_batch_size_like(
-                self.loc + self.scale, batch_shape + shape, self.dtype, 0.)
+                self.loc + self.scale, batch_shape + shape, self.dtype, 0.0
+            )
             zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
             zero_tmp_shape = nn.shape(zero_tmp_reshape)
-            normal_random_tmp = nn.gaussian_random(zero_tmp_shape,
-                                                   mean=0.,
-                                                   std=1.,
-                                                   seed=seed,
-                                                   dtype=self.dtype)
+            normal_random_tmp = nn.gaussian_random(
+                zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
+            )
             output = normal_random_tmp * (zero_tmp_reshape + self.scale)
             output = elementwise_add(output, self.loc, name=name)
             return output
         else:
             output_shape = shape + batch_shape
-            output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed, dtype=self.dtype) * \
-                     (tensor.zeros(output_shape, dtype=self.dtype) + self.scale)
+            output = nn.gaussian_random(
+                output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
+            ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale)
             output = elementwise_add(output, self.loc, name=name)
             if self.all_arg_is_float:
                 return nn.reshape(output, shape, name=name)
@@ -189,13 +210,14 @@ class Normal(distribution.Distribution):
         """
         name = self.name + '_entropy'
         batch_shape = list((self.loc + self.scale).shape)
-        zero_tmp = tensor.fill_constant_batch_size_like(self.loc + self.scale,
-                                                        batch_shape, self.dtype,
-                                                        0.)
-        return elementwise_add(0.5 + zero_tmp,
-                               0.5 * math.log(2 * math.pi) + nn.log(
-                                   (self.scale + zero_tmp)),
-                               name=name)
+        zero_tmp = tensor.fill_constant_batch_size_like(
+            self.loc + self.scale, batch_shape, self.dtype, 0.0
+        )
+        return elementwise_add(
+            0.5 + zero_tmp,
+            0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
+            name=name,
+        )
 
     def log_prob(self, value):
         """Log probability density/mass function.
@@ -212,10 +234,11 @@ class Normal(distribution.Distribution):
 
         var = self.scale * self.scale
         log_scale = nn.log(self.scale)
-        return elementwise_sub(-1. * ((value - self.loc) * (value - self.loc)) /
-                               (2. * var),
-                               log_scale + math.log(math.sqrt(2. * math.pi)),
-                               name=name)
+        return elementwise_sub(
+            -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var),
+            log_scale + math.log(math.sqrt(2.0 * math.pi)),
+            name=name,
+        )
 
     def probs(self, value):
         """Probability density/mass function.
@@ -231,10 +254,13 @@ class Normal(distribution.Distribution):
         value = self._check_values_dtype_in_probs(self.loc, value)
 
         var = self.scale * self.scale
-        return elementwise_div(ops.exp(-1. * ((value - self.loc) *
-                                              (value - self.loc)) / (2. * var)),
-                               (math.sqrt(2 * math.pi) * self.scale),
-                               name=name)
+        return elementwise_div(
+            ops.exp(
+                -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var)
+            ),
+            (math.sqrt(2 * math.pi) * self.scale),
+            name=name,
+        )
 
     def kl_divergence(self, other):
         r"""The KL-divergence between two normal distributions.
@@ -248,7 +274,7 @@ class Normal(distribution.Distribution):
         .. math::
 
-            ratio = \\frac{\sigma_0}{\sigma_1}
+            ratio = \frac{\sigma_0}{\sigma_1}
-        
+
         .. math::
 
             diff = \mu_1 - \mu_0
@@ -274,9 +300,9 @@ class Normal(distribution.Distribution):
 
         name = self.name + '_kl_divergence'
         var_ratio = self.scale / other.scale
-        var_ratio = (var_ratio * var_ratio)
+        var_ratio = var_ratio * var_ratio
         t1 = (self.loc - other.loc) / other.scale
-        t1 = (t1 * t1)
-        return elementwise_add(0.5 * var_ratio,
-                               0.5 * (t1 - 1. - nn.log(var_ratio)),
-                               name=name)
+        t1 = t1 * t1
+        return elementwise_add(
+            0.5 * var_ratio, 0.5 * (t1 - 1.0 - nn.log(var_ratio)), name=name
+        )
diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py
index d7a512aade2..7e2d7f447cb 100644
--- a/python/paddle/distribution/transform.py
+++ b/python/paddle/distribution/transform.py
@@ -21,20 +21,33 @@ import typing
 
 import paddle
 import paddle.nn.functional as F
-from paddle.distribution import (constraint, distribution,
-                                 transformed_distribution, variable)
+from paddle.distribution import (
+    constraint,
+    distribution,
+    transformed_distribution,
+    variable,
+)
 
 __all__ = [  # noqa
-    'Transform', 'AbsTransform', 'AffineTransform', 'ChainTransform',
-    'ExpTransform', 'IndependentTransform', 'PowerTransform',
-    'ReshapeTransform', 'SigmoidTransform', 'SoftmaxTransform',
-    'StackTransform', 'StickBreakingTransform', 'TanhTransform'
+    'Transform',
+    'AbsTransform',
+    'AffineTransform',
+    'ChainTransform',
+    'ExpTransform',
+    'IndependentTransform',
+    'PowerTransform',
+    'ReshapeTransform',
+    'SigmoidTransform',
+    'SoftmaxTransform',
+    'StackTransform',
+    'StickBreakingTransform',
+    'TanhTransform',
 ]
 
 
 class Type(enum.Enum):
-    """Mapping type of a transformation.
-    """
+    """Mapping type of a transformation."""
+
     BIJECTION = 'bijection'  # bijective(injective and surjective)
     INJECTION = 'injection'  # injective-only
     SURJECTION = 'surjection'  # surjective-only
@@ -42,8 +55,7 @@ class Type(enum.Enum):
 
     @classmethod
     def is_injective(cls, _type):
-        """Both bijection and injection are injective mapping.
-        """
+        """Both bijection and injection are injective mapping."""
         return _type in (cls.BIJECTION, cls.INJECTION)
 
 
@@ -121,14 +133,14 @@ class Transform(object):
         return Type.is_injective(cls._type)
 
     def __call__(self, input):
-        """Make this instance as a callable object. The return value is 
-        depening on the input type. 
+        """Make this instance as a callable object. The return value is
+        depending on the input type.
 
-        * If the input is a ``Tensor`` instance, return 
+        * If the input is a ``Tensor`` instance, return
           ``self.forward(input)`` .
-        * If the input is a ``Distribution`` instance, return 
+        * If the input is a ``Distribution`` instance, return
           ``TransformedDistribution(base=input, transforms=[self])`` .
-        * If the input is a ``Transform`` instance, return 
+        * If the input is a ``Transform`` instance, return
           ``ChainTransform([self, input])`` .
 
         Args:
@@ -139,18 +151,19 @@ class Transform(object):
         """
         if isinstance(input, distribution.Distribution):
             return transformed_distribution.TransformedDistribution(
-                input, [self])
+                input, [self]
+            )
         if isinstance(input, Transform):
             return ChainTransform([self, input])
         return self.forward(x)
 
     def forward(self, x):
-        """Forward transformation with mapping :math:`y = f(x)`. 
+        """Forward transformation with mapping :math:`y = f(x)`.
 
         Useful for turning one random outcome into another.
 
         Args:
-            x (Tensos): Input parameter, generally is a sample generated 
+            x (Tensor): Input parameter, generally is a sample generated
                 from ``Distribution``.
 
         Returns:
@@ -158,15 +171,17 @@ class Transform(object):
         """
         if not isinstance(x, paddle.fluid.framework.Variable):
             raise TypeError(
-                f"Expected 'x' is a Tensor or Real, but got {type(x)}.")
+                f"Expected 'x' is a Tensor or Real, but got {type(x)}."
+            )
         if x.dim() < self._domain.event_rank:
             raise ValueError(
                 f'The dimensions of x({x.dim()}) should be '
-                f'grater than or equal to {self._domain.event_rank}')
+                f'greater than or equal to {self._domain.event_rank}'
+            )
         return self._forward(x)
 
     def inverse(self, y):
-        """Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing" 
+        """Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing"
         a transformation to compute one probability in terms of another.
 
         Args:
@@ -177,46 +192,53 @@ class Transform(object):
         """
         if not isinstance(y, paddle.fluid.framework.Variable):
             raise TypeError(
-                f"Expected 'y' is a Tensor or Real, but got {type(y)}.")
+                f"Expected 'y' is a Tensor or Real, but got {type(y)}."
+            )
         if y.dim() < self._codomain.event_rank:
             raise ValueError(
                 f'The dimensions of y({y.dim()}) should be '
-                f'grater than or equal to {self._codomain.event_rank}')
+                f'greater than or equal to {self._codomain.event_rank}'
+            )
         return self._inverse(y)
 
     def forward_log_det_jacobian(self, x):
-        """The log of the absolute value of the determinant of the matrix of all 
+        """The log of the absolute value of the determinant of the matrix of all
         first-order partial derivatives of the inverse function.
 
         Args:
-            x (Tensor): Input tensor, generally is a sample generated from 
+            x (Tensor): Input tensor, generally is a sample generated from
                 ``Distribution``
 
         Returns:
-            Tensor: The log of the absolute value of Jacobian determinant. 
+            Tensor: The log of the absolute value of Jacobian determinant.
         """
         if not isinstance(x, paddle.fluid.framework.Variable):
             raise TypeError(
-                f"Expected 'y' is a Tensor or Real, but got {type(x)}.")
-        if isinstance(x, paddle.fluid.framework.Variable
-                      ) and x.dim() < self._domain.event_rank:
+                f"Expected 'x' is a Tensor or Real, but got {type(x)}."
+            )
+        if (
+            isinstance(x, paddle.fluid.framework.Variable)
+            and x.dim() < self._domain.event_rank
+        ):
             raise ValueError(
                 f'The dimensions of x({x.dim()}) should be '
-                f'grater than or equal to {self._domain.event_rank}')
+                f'greater than or equal to {self._domain.event_rank}'
+            )
         if not self._is_injective():
             raise NotImplementedError(
                 "forward_log_det_jacobian can't be implemented for non-injective"
-                "transforms.")
+                " transforms."
+            )
 
         return self._call_forward_log_det_jacobian(x)
 
     def inverse_log_det_jacobian(self, y):
         """Compute :math:`log|det J_{f^{-1}}(y)|`.
-        Note that ``forward_log_det_jacobian`` is the negative of this function, 
+        Note that ``forward_log_det_jacobian`` is the negative of this function,
         evaluated at :math:`f^{-1}(y)`.
 
         Args:
-            y (Tensor): The input to the ``inverse`` Jacobian determinant 
+            y (Tensor): The input to the ``inverse`` Jacobian determinant
                 evaluation.
 
         Returns:
@@ -227,7 +249,8 @@ class Transform(object):
         if y.dim() < self._codomain.event_rank:
             raise ValueError(
                 f'The dimensions of y({y.dim()}) should be '
-                f'grater than or equal to {self._codomain.event_rank}')
+                f'greater than or equal to {self._codomain.event_rank}'
+            )
         return self._call_inverse_log_det_jacobian(y)
 
     def forward_shape(self, shape):
@@ -241,7 +264,8 @@ class Transform(object):
         """
         if not isinstance(shape, typing.Sequence):
             raise TypeError(
-                f"Expected shape is Sequence[int] type, but got {type(shape)}.")
+                f"Expected shape is Sequence[int] type, but got {type(shape)}."
+            )
         return self._forward_shape(shape)
 
     def inverse_shape(self, shape):
@@ -255,7 +279,8 @@ class Transform(object):
         """
         if not isinstance(shape, typing.Sequence):
             raise TypeError(
-                f"Expected shape is Sequence[int] type, but got {type(shape)}.")
+                f"Expected shape is Sequence[int] type, but got {type(shape)}."
+            )
         return self._inverse_shape(shape)
 
     @property
@@ -269,13 +294,13 @@ class Transform(object):
         return variable.real
 
     def _forward(self, x):
-        """Inner method for publid API ``forward``, subclass should 
+        """Inner method for public API ``forward``, subclass should
         overwrite this method for supporting forward transformation.
         """
         raise NotImplementedError('Forward not implemented')
 
     def _inverse(self, y):
-        """Inner method of public API ``inverse``, subclass should 
+        """Inner method of public API ``inverse``, subclass should
         overwrite this method for supporting inverse transformation.
         """
         raise NotImplementedError('Inverse not implemented')
@@ -288,7 +313,8 @@ class Transform(object):
             return -self._inverse_log_det_jacobian(self.forward(y))
         raise NotImplementedError(
             'Neither _forward_log_det_jacobian nor _inverse_log_det_jacobian'
-            'is implemented. One of them is required.')
+            'is implemented. One of them is required.'
+        )
 
     def _call_inverse_log_det_jacobian(self, y):
         """Inner method called by ``inverse_log_det_jacobian``"""
@@ -298,38 +324,39 @@ class Transform(object):
             return -self._forward_log_det_jacobian(self._inverse(y))
         raise NotImplementedError(
             'Neither _forward_log_det_jacobian nor _inverse_log_det_jacobian '
-            'is implemented. One of them is required')
+            'is implemented. One of them is required'
+        )
 
     def _forward_shape(self, shape):
-        """Inner method called by ``forward_shape``, which is used to infer the 
-        forward shape. Subclass should overwrite this method for supporting 
+        """Inner method called by ``forward_shape``, which is used to infer the
+        forward shape. Subclass should overwrite this method for supporting
         ``forward_shape``.
         """
         return shape
 
     def _inverse_shape(self, shape):
-        """Inner method called by ``inverse_shape``, whic is used to infer the 
-        invese shape. Subclass should overwrite this method for supporting 
+        """Inner method called by ``inverse_shape``, which is used to infer the
+        inverse shape. Subclass should overwrite this method for supporting
         ``inverse_shape``.
         """
         return shape
 
 
 class AbsTransform(Transform):
-    r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`, 
+    r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`,
     element-wise.
 
-    This non-injective transformation allows for transformations of scalar 
-    distributions with the absolute value function, which maps ``(-inf, inf)`` 
+    This non-injective transformation allows for transformations of scalar
+    distributions with the absolute value function, which maps ``(-inf, inf)``
     to ``[0, inf)`` .
 
-    * For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set invese 
+    * For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set inverse
       ``{x  in (-inf, inf) : |x| = y}`` as a tuple, ``-y, y`` .
-    * For ``y`` equal ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not 
-      the set inverse (the set inverse is the singleton {0}), but "works" in 
-      conjunction with ``TransformedDistribution`` to produce a left 
+    * For ``y`` equal ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not
+      the set inverse (the set inverse is the singleton {0}), but "works" in
+      conjunction with ``TransformedDistribution`` to produce a left
       semi-continuous pdf.
-    * For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the 
+    * For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the
       wrong thing ``-y, y``. This is done for efficiency.
 
     Examples:
@@ -388,7 +415,7 @@ class AbsTransform(Transform):
 
 
 class AffineTransform(Transform):
-    r"""Affine transformation with mapping 
+    r"""Affine transformation with mapping
     :math:`y = \text{loc} + \text{scale} \times x`.
 
     Args:
@@ -421,7 +448,8 @@ class AffineTransform(Transform):
             raise TypeError(f"Expected 'loc' is a Tensor, but got {type(loc)}")
         if not isinstance(scale, paddle.fluid.framework.Variable):
             raise TypeError(
-                f"Expected scale is a Tensor, but got {type(scale)}")
+                f"Expected scale is a Tensor, but got {type(scale)}"
+            )
         self._loc = loc
         self._scale = scale
         super(AffineTransform, self).__init__()
@@ -447,13 +475,17 @@ class AffineTransform(Transform):
         return tuple(
             paddle.broadcast_shape(
                 paddle.broadcast_shape(shape, self._loc.shape),
-                self._scale.shape))
+                self._scale.shape,
+            )
+        )
 
     def _inverse_shape(self, shape):
         return tuple(
             paddle.broadcast_shape(
                 paddle.broadcast_shape(shape, self._loc.shape),
-                self._scale.shape))
+                self._scale.shape,
+            )
+        )
 
     @property
     def _domain(self):
@@ -505,7 +537,8 @@ class ChainTransform(Transform):
             )
         if not all(isinstance(t, Transform) for t in transforms):
             raise TypeError(
-                "All elements of transforms should be Transform type.")
+                "All elements of transforms should be Transform type."
+            )
 
         self.transforms = transforms
         super(ChainTransform, self).__init__()
@@ -524,11 +557,12 @@ class ChainTransform(Transform):
         return y
 
     def _forward_log_det_jacobian(self, x):
-        value = 0.
+        value = 0.0
         event_rank = self._domain.event_rank
         for t in self.transforms:
-            value += self._sum_rightmost(t.forward_log_det_jacobian(x),
-                                         event_rank - t._domain.event_rank)
+            value += self._sum_rightmost(
+                t.forward_log_det_jacobian(x), event_rank - t._domain.event_rank
+            )
             x = t.forward(x)
             event_rank += t._codomain.event_rank - t._domain.event_rank
         return value
@@ -638,26 +672,26 @@ class ExpTransform(Transform):
 
 class IndependentTransform(Transform):
     r"""
-    ``IndependentTransform`` wraps a base transformation, reinterprets 
+    ``IndependentTransform`` wraps a base transformation, reinterprets
     some of the rightmost batch axes as event axes.
 
     Generally, it is used to expand the event axes. This has no effect on the
-    forward or inverse transformaion, but does sum out the 
-    ``reinterpretd_bach_rank`` rightmost dimensions in computing the determinant 
+    forward or inverse transformation, but does sum out the
+    ``reinterpreted_batch_rank`` rightmost dimensions in computing the determinant
     of Jacobian matrix.
 
-    To see this, consider the ``ExpTransform`` applied to a Tensor which has 
-    sample, batch, and event ``(S,B,E)`` shape semantics. Suppose the Tensor's 
+    To see this, consider the ``ExpTransform`` applied to a Tensor which has
+    sample, batch, and event ``(S,B,E)`` shape semantics. Suppose the Tensor's
     paritioned-shape is ``(S=[4], B=[2, 2], E=[3])`` , reinterpreted_batch_rank
     is 1. Then the reinterpreted Tensor's shape  is ``(S=[4], B=[2], E=[2, 3])`` .
-    The shape returned by ``forward`` and ``inverse`` is unchanged, ie, 
-    ``[4,2,2,3]`` . However the shape returned by ``inverse_log_det_jacobian`` 
-    is ``[4,2]``, because the Jacobian determinant is a reduction over the 
+    The shape returned by ``forward`` and ``inverse`` is unchanged, i.e.,
+    ``[4,2,2,3]`` . However the shape returned by ``inverse_log_det_jacobian``
+    is ``[4,2]``, because the Jacobian determinant is a reduction over the
     event dimensions.
 
     Args:
         base (Transform): The base transformation.
-        reinterpreted_batch_rank (int): The num of rightmost batch rank that 
+        reinterpreted_batch_rank (int): The num of rightmost batch rank that
             will be reinterpreted as event rank.
 
     Examples:
@@ -683,7 +717,8 @@ class IndependentTransform(Transform):
     def __init__(self, base, reinterpreted_batch_rank):
         if not isinstance(base, Transform):
             raise TypeError(
-                f"Expected 'base' is Transform type, but get {type(base)}")
+                f"Expected 'base' is Transform type, but get {type(base)}"
+            )
         if reinterpreted_batch_rank <= 0:
             raise ValueError(
                 f"Expected 'reinterpreted_batch_rank' is grater than zero, but got {reinterpreted_batch_rank}"
@@ -708,7 +743,8 @@ class IndependentTransform(Transform):
 
     def _forward_log_det_jacobian(self, x):
         return self._base.forward_log_det_jacobian(x).sum(
-            list(range(-self._reinterpreted_batch_rank, 0)))
+            list(range(-self._reinterpreted_batch_rank, 0))
+        )
 
     def _forward_shape(self, shape):
         return self._base.forward_shape(shape)
@@ -718,13 +754,15 @@ class IndependentTransform(Transform):
 
     @property
     def _domain(self):
-        return variable.Independent(self._base._domain,
-                                    self._reinterpreted_batch_rank)
+        return variable.Independent(
+            self._base._domain, self._reinterpreted_batch_rank
+        )
 
     @property
     def _codomain(self):
-        return variable.Independent(self._base._codomain,
-                                    self._reinterpreted_batch_rank)
+        return variable.Independent(
+            self._base._codomain, self._reinterpreted_batch_rank
+        )
 
 
 class PowerTransform(Transform):
@@ -758,7 +796,8 @@ class PowerTransform(Transform):
     def __init__(self, power):
         if not isinstance(power, paddle.fluid.framework.Variable):
             raise TypeError(
-                f"Expected 'power' is a tensor, but got {type(power)}")
+                f"Expected 'power' is a tensor, but got {type(power)}"
+            )
         self._power = power
         super(PowerTransform, self).__init__()
 
@@ -793,7 +832,7 @@ class PowerTransform(Transform):
 class ReshapeTransform(Transform):
     r"""Reshape the event shape of a tensor.
 
-    Note that ``in_event_shape`` and ``out_event_shape`` must have the same 
+    Note that ``in_event_shape`` and ``out_event_shape`` must have the same
     number of elements.
 
     Args:
@@ -827,13 +866,16 @@ class ReshapeTransform(Transform):
 
     def __init__(self, in_event_shape, out_event_shape):
         if not isinstance(in_event_shape, typing.Sequence) or not isinstance(
-                out_event_shape, typing.Sequence):
+            out_event_shape, typing.Sequence
+        ):
             raise TypeError(
                 f"Expected type of 'in_event_shape' and 'out_event_shape' is "
                 f"Squence[int], but got 'in_event_shape': {in_event_shape}, "
-                f"'out_event_shape': {out_event_shape}")
+                f"'out_event_shape': {out_event_shape}"
+            )
         if functools.reduce(operator.mul, in_event_shape) != functools.reduce(
-                operator.mul, out_event_shape):
+            operator.mul, out_event_shape
+        ):
             raise ValueError(
                 f"The numel of 'in_event_shape' should be 'out_event_shape', "
                 f"but got {functools.reduce(operator.mul, in_event_shape)}!={functools.reduce(operator.mul, out_event_shape)}"
@@ -861,39 +903,45 @@ class ReshapeTransform(Transform):
 
     def _forward(self, x):
         return x.reshape(
-            tuple(x.shape)[:x.dim() - len(self._in_event_shape)] +
-            self._out_event_shape)
+            tuple(x.shape)[: x.dim() - len(self._in_event_shape)]
+            + self._out_event_shape
+        )
 
     def _inverse(self, y):
         return y.reshape(
-            tuple(y.shape)[:y.dim() - len(self._out_event_shape)] +
-            self._in_event_shape)
+            tuple(y.shape)[: y.dim() - len(self._out_event_shape)]
+            + self._in_event_shape
+        )
 
     def _forward_shape(self, shape):
         if len(shape) < len(self._in_event_shape):
             raise ValueError(
                 f"Expected length of 'shape' is not less than {len(self._in_event_shape)}, but got {len(shape)}"
             )
-        if shape[-len(self._in_event_shape):] != self._in_event_shape:
+        if shape[-len(self._in_event_shape) :] != self._in_event_shape:
             raise ValueError(
                 f"Event shape mismatch, expected: {self._in_event_shape}, but got {shape[-len(self._in_event_shape):]}"
             )
-        return tuple(shape[:-len(self._in_event_shape)]) + self._out_event_shape
+        return (
+            tuple(shape[: -len(self._in_event_shape)]) + self._out_event_shape
+        )
 
     def _inverse_shape(self, shape):
         if len(shape) < len(self._out_event_shape):
             raise ValueError(
                 f"Expected 'shape' length is not less than {len(self._out_event_shape)}, but got {len(shape)}"
             )
-        if shape[-len(self._out_event_shape):] != self._out_event_shape:
+        if shape[-len(self._out_event_shape) :] != self._out_event_shape:
             raise ValueError(
                 f"Event shape mismatch, expected: {self._out_event_shape}, but got {shape[-len(self._out_event_shape):]}"
             )
-        return tuple(shape[:-len(self._out_event_shape)]) + self._in_event_shape
+        return (
+            tuple(shape[: -len(self._out_event_shape)]) + self._in_event_shape
+        )
 
     def _forward_log_det_jacobian(self, x):
         # paddle.zeros not support zero dimension Tensor.
-        shape = x.shape[:x.dim() - len(self._in_event_shape)] or [1]
+        shape = x.shape[: x.dim() - len(self._in_event_shape)] or [1]
         return paddle.zeros(shape, dtype=x.dtype)
 
 
@@ -928,7 +976,7 @@ class SigmoidTransform(Transform):
 
     @property
     def _codomain(self):
-        return variable.Variable(False, 0, constraint.Range(0., 1.))
+        return variable.Variable(False, 0, constraint.Range(0.0, 1.0))
 
     def _forward(self, x):
         return F.sigmoid(x)
@@ -943,8 +991,8 @@ class SigmoidTransform(Transform):
 class SoftmaxTransform(Transform):
     r"""Softmax transformation with mapping :math:`y=\exp(x)` then normalizing.
 
-    It's generally used to convert unconstrained space to simplex. This mapping 
-    is not injective, so ``forward_log_det_jacobian`` and 
+    It's generally used to convert unconstrained space to simplex. This mapping
+    is not injective, so ``forward_log_det_jacobian`` and
     ``inverse_log_det_jacobian`` are not implemented.
 
     Examples:
@@ -997,11 +1045,11 @@ class SoftmaxTransform(Transform):
 
 
 class StackTransform(Transform):
-    r""" ``StackTransform`` applies a sequence of transformations along the 
+    r"""``StackTransform`` applies a sequence of transformations along the
     specific axis.
 
     Args:
-        transforms(Sequence[Transform]): The sequence of transformations. 
+        transforms(Sequence[Transform]): The sequence of transformations.
         axis(int): The axis along which will be transformed.
 
     Examples:
@@ -1042,7 +1090,8 @@ class StackTransform(Transform):
             )
         if not all(isinstance(t, Transform) for t in transforms):
             raise TypeError(
-                'Expected all element in transforms is Transform Type.')
+                'Expected all element in transforms is Transform Type.'
+            )
         if not isinstance(axis, int):
             raise TypeError(f"Expected 'axis' is int, but got{type(axis)}.")
 
@@ -1062,34 +1111,45 @@ class StackTransform(Transform):
 
     def _forward(self, x):
         self._check_size(x)
-        return paddle.stack([
-            t.forward(v)
-            for v, t in zip(paddle.unstack(x, self._axis), self._transforms)
-        ], self._axis)
+        return paddle.stack(
+            [
+                t.forward(v)
+                for v, t in zip(paddle.unstack(x, self._axis), self._transforms)
+            ],
+            self._axis,
+        )
 
     def _inverse(self, y):
         self._check_size(y)
-        return paddle.stack([
-            t.inverse(v)
-            for v, t in zip(paddle.unstack(y, self._axis), self._transforms)
-        ], self._axis)
+        return paddle.stack(
+            [
+                t.inverse(v)
+                for v, t in zip(paddle.unstack(y, self._axis), self._transforms)
+            ],
+            self._axis,
+        )
 
     def _forward_log_det_jacobian(self, x):
         self._check_size(x)
-        return paddle.stack([
-            t.forward_log_det_jacobian(v)
-            for v, t in zip(paddle.unstack(x, self._axis), self._transforms)
-        ], self._axis)
+        return paddle.stack(
+            [
+                t.forward_log_det_jacobian(v)
+                for v, t in zip(paddle.unstack(x, self._axis), self._transforms)
+            ],
+            self._axis,
+        )
 
     def _check_size(self, v):
         if not (-v.dim() <= self._axis < v.dim()):
             raise ValueError(
                 f'Input dimensions {v.dim()} should be grater than stack '
-                f'transform axis {self._axis}.')
+                f'transform axis {self._axis}.'
+            )
         if v.shape[self._axis] != len(self._transforms):
             raise ValueError(
                 f'Input size along {self._axis} should be equal to the '
-                f'length of transforms.')
+                f'length of transforms.'
+            )
 
     @property
     def _domain(self):
@@ -1097,12 +1157,13 @@ class StackTransform(Transform):
 
     @property
     def _codomain(self):
-        return variable.Stack([t._codomain for t in self._transforms],
-                              self._axis)
+        return variable.Stack(
+            [t._codomain for t in self._transforms], self._axis
+        )
 
 
 class StickBreakingTransform(Transform):
-    r"""Convert an unconstrained vector to the simplex with one additional 
+    r"""Convert an unconstrained vector to the simplex with one additional
     dimension by the stick-breaking construction.
 
     Examples:
@@ -1131,8 +1192,9 @@ class StickBreakingTransform(Transform):
         offset = x.shape[-1] + 1 - paddle.ones([x.shape[-1]]).cumsum(-1)
         z = F.sigmoid(x - offset.log())
         z_cumprod = (1 - z).cumprod(-1)
-        return F.pad(z, [0]*2*(len(x.shape)-1) + [0, 1], value=1) * \
-            F.pad(z_cumprod, [0]*2*(len(x.shape)-1) + [1, 0], value=1)
+        return F.pad(z, [0] * 2 * (len(x.shape) - 1) + [0, 1], value=1) * F.pad(
+            z_cumprod, [0] * 2 * (len(x.shape) - 1) + [1, 0], value=1
+        )
 
     def _inverse(self, y):
         y_crop = y[..., :-1]
@@ -1150,12 +1212,12 @@ class StickBreakingTransform(Transform):
     def _forward_shape(self, shape):
         if not shape:
             raise ValueError(f"Expected 'shape' is not empty, but got {shape}")
-        return shape[:-1] + (shape[-1] + 1, )
+        return shape[:-1] + (shape[-1] + 1,)
 
     def _inverse_shape(self, shape):
         if not shape:
             raise ValueError(f"Expected 'shape' is not empty, but got {shape}")
-        return shape[:-1] + (shape[-1] - 1, )
+        return shape[:-1] + (shape[-1] - 1,)
 
     @property
     def _domain(self):
@@ -1213,10 +1275,10 @@ class TanhTransform(Transform):
         return y.atanh()
 
     def _forward_log_det_jacobian(self, x):
-        """We implicitly rely on _forward_log_det_jacobian rather than 
-        explicitly implement ``_inverse_log_det_jacobian`` since directly using 
+        """We implicitly rely on _forward_log_det_jacobian rather than
+        explicitly implement ``_inverse_log_det_jacobian`` since directly using
         ``-tf.math.log1p(-tf.square(y))`` has lower numerical precision.
 
         See details: https://github.com/tensorflow/probability/blob/master/tensorflow_probability/python/bijectors/tanh.py#L69-L80
         """
-        return 2. * (math.log(2.) - x - F.softplus(-2. * x))
+        return 2.0 * (math.log(2.0) - x - F.softplus(-2.0 * x))
diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py
index 7c085da3156..769d60eeb23 100644
--- a/python/paddle/distribution/uniform.py
+++ b/python/paddle/distribution/uniform.py
@@ -19,12 +19,27 @@ import numpy as np
 from paddle import _C_ops, _legacy_C_ops
 from paddle.distribution import distribution
 from paddle.fluid import core
-from paddle.fluid.data_feeder import (check_dtype, check_type,
-                                      check_variable_and_dtype, convert_dtype)
-from paddle.fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph
-from paddle.fluid.layers import (control_flow, elementwise_add, elementwise_div,
-                                 elementwise_mul, elementwise_sub, nn, ops,
-                                 tensor)
+from paddle.fluid.data_feeder import (
+    check_dtype,
+    check_type,
+    check_variable_and_dtype,
+    convert_dtype,
+)
+from paddle.fluid.framework import (
+    _non_static_mode,
+    in_dygraph_mode,
+    _in_legacy_dygraph,
+)
+from paddle.fluid.layers import (
+    control_flow,
+    elementwise_add,
+    elementwise_div,
+    elementwise_mul,
+    elementwise_sub,
+    nn,
+    ops,
+    tensor,
+)
 from paddle.tensor import arange, concat, gather_nd, multinomial
 
 
@@ -91,12 +106,18 @@ class Uniform(distribution.Distribution):
 
     def __init__(self, low, high, name=None):
         if not _non_static_mode():
-            check_type(low, 'low',
-                       (int, float, np.ndarray, tensor.Variable, list, tuple),
-                       'Uniform')
-            check_type(high, 'high',
-                       (int, float, np.ndarray, tensor.Variable, list, tuple),
-                       'Uniform')
+            check_type(
+                low,
+                'low',
+                (int, float, np.ndarray, tensor.Variable, list, tuple),
+                'Uniform',
+            )
+            check_type(
+                high,
+                'high',
+                (int, float, np.ndarray, tensor.Variable, list, tuple),
+                'Uniform',
+            )
 
         self.all_arg_is_float = False
         self.batch_size_unknown = False
@@ -116,11 +137,15 @@ class Uniform(distribution.Distribution):
         else:
             if isinstance(low, float) and isinstance(high, float):
                 self.all_arg_is_float = True
-            if isinstance(low, np.ndarray) and str(
-                    low.dtype) in ['float32', 'float64']:
+            if isinstance(low, np.ndarray) and str(low.dtype) in [
+                'float32',
+                'float64',
+            ]:
                 self.dtype = low.dtype
-            elif isinstance(high, np.ndarray) and str(
-                    high.dtype) in ['float32', 'float64']:
+            elif isinstance(high, np.ndarray) and str(high.dtype) in [
+                'float32',
+                'float64',
+            ]:
                 self.dtype = high.dtype
             # pylint: disable=unbalanced-tuple-unpacking
             self.low, self.high = self._to_tensor(low, high)
@@ -148,27 +173,33 @@ class Uniform(distribution.Distribution):
         if self.batch_size_unknown:
             output_shape = shape + batch_shape
             zero_tmp = tensor.fill_constant_batch_size_like(
-                self.low + self.high, batch_shape + shape, self.dtype, 0.)
+                self.low + self.high, batch_shape + shape, self.dtype, 0.0
+            )
             uniform_random_tmp = nn.uniform_random_batch_size_like(
                 zero_tmp,
                 zero_tmp.shape,
                 dtype=self.dtype,
-                min=0.,
-                max=1.,
-                seed=seed)
+                min=0.0,
+                max=1.0,
+                seed=seed,
+            )
             zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
-            uniform_random_tmp_reshape = nn.reshape(uniform_random_tmp,
-                                                    output_shape)
-            output = uniform_random_tmp_reshape * (zero_tmp_reshape +
-                                                   self.high - self.low)
+            uniform_random_tmp_reshape = nn.reshape(
+                uniform_random_tmp, output_shape
+            )
+            output = uniform_random_tmp_reshape * (
+                zero_tmp_reshape + self.high - self.low
+            )
             output = elementwise_add(output, self.low, name=name)
             return output
         else:
             output_shape = shape + batch_shape
             output = nn.uniform_random(
-                output_shape, dtype=self.dtype, min=0., max=1.,
-                seed=seed) * (tensor.zeros(output_shape, dtype=self.dtype) +
-                              (self.high - self.low))
+                output_shape, dtype=self.dtype, min=0.0, max=1.0, seed=seed
+            ) * (
+                tensor.zeros(output_shape, dtype=self.dtype)
+                + (self.high - self.low)
+            )
             output = elementwise_add(output, self.low, name=name)
             if self.all_arg_is_float:
                 return nn.reshape(output, shape, name=name)
@@ -197,10 +228,12 @@ class Uniform(distribution.Distribution):
                 return nn.log(lb * ub) - nn.log(self.high - self.low)
 
             if _in_legacy_dygraph():
-                lb = _legacy_C_ops.cast(lb_bool, 'in_dtype', lb_bool.dtype,
-                                        'out_dtype', value.dtype)
-                ub = _legacy_C_ops.cast(ub_bool, 'in_dtype', ub_bool.dtype,
-                                        'out_dtype', value.dtype)
+                lb = _legacy_C_ops.cast(
+                    lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', value.dtype
+                )
+                ub = _legacy_C_ops.cast(
+                    ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', value.dtype
+                )
                 return nn.log(lb * ub) - nn.log(self.high - self.low)
 
         name = self.name + '_log_prob'
@@ -208,9 +241,9 @@ class Uniform(distribution.Distribution):
         ub_bool = value < self.high
         lb = tensor.cast(lb_bool, dtype=value.dtype)
         ub = tensor.cast(ub_bool, dtype=value.dtype)
-        return elementwise_sub(nn.log(lb * ub),
-                               nn.log(self.high - self.low),
-                               name=name)
+        return elementwise_sub(
+            nn.log(lb * ub), nn.log(self.high - self.low), name=name
+        )
 
     def probs(self, value):
         """Probability density/mass function.
@@ -233,10 +266,12 @@ class Uniform(distribution.Distribution):
                 return (lb * ub) / (self.high - self.low)
 
             if _in_legacy_dygraph():
-                lb = _legacy_C_ops.cast(lb_bool, 'in_dtype', lb_bool.dtype,
-                                        'out_dtype', value.dtype)
-                ub = _legacy_C_ops.cast(ub_bool, 'in_dtype', ub_bool.dtype,
-                                        'out_dtype', value.dtype)
+                lb = _legacy_C_ops.cast(
+                    lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', value.dtype
+                )
+                ub = _legacy_C_ops.cast(
+                    ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', value.dtype
+                )
                 return (lb * ub) / (self.high - self.low)
 
         name = self.name + '_probs'
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 9321068de74..cc05ec3297a 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -75,7 +75,7 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
 
 _dygraph_tracer_ = None
-_in_eager_mode_ = (os.environ.get('FLAGS_enable_eager_mode', '1') == '1')
+_in_eager_mode_ = os.environ.get('FLAGS_enable_eager_mode', '1') == '1'
 _global_expected_place_ = None
 _current_device = None
 global_prog_seed = 0
@@ -84,10 +84,12 @@ _already_patch_eager_tensor = False
 _already_patch_varbase = False
 _current_cuda_graph_mode = None
 _global_flags_ = core.globals()
-_enable_standalone_executor_ = (os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR',
-                                               None))
-_dy2st_enable_standalone_executor_ = (os.environ.get(
-    'FLAGS_DY2ST_USE_STANDALONE_EXECUTOR', 0))
+_enable_standalone_executor_ = os.environ.get(
+    'FLAGS_USE_STANDALONE_EXECUTOR', None
+)
+_dy2st_enable_standalone_executor_ = os.environ.get(
+    'FLAGS_DY2ST_USE_STANDALONE_EXECUTOR', 0
+)
 
 # Some explanation of our execution system 2022.03
 # For now we have 3 kinds of execution system, since we refactored dygraph mode to
@@ -148,6 +150,7 @@ def _update_monkey_methods(is_eager):
 
 def _switch_tensor_bind_type(is_eager):
     import paddle
+
     if is_eager:
         paddle.Tensor = core.eager.Tensor
     else:
@@ -182,8 +185,12 @@ def _fallback_legacy_dygraph():
     global _is_first_import_
     need_fallback = False
     # Only enable eager on CPU/GPU
-    is_not_support = core.is_compiled_with_xpu() or core.is_compiled_with_npu(
-    ) or core.is_compiled_with_ipu() or core.is_compiled_with_mlu()
+    is_not_support = (
+        core.is_compiled_with_xpu()
+        or core.is_compiled_with_npu()
+        or core.is_compiled_with_ipu()
+        or core.is_compiled_with_mlu()
+    )
 
     if _in_eager_mode_ and is_not_support:
         # switch into legacy dygraph mode
@@ -283,15 +290,15 @@ def ipu_shard_guard(index=-1, stage=-1):
         index(int, optional): Specify which ipu the Tensor is computed on, (such as '0, 1, 2, 3').
             The default value is -1, which means the Op only run on IPU 0.
         stage(int, optional): Specify the computation order of the sharded model(such as '0, 1, 2, 3').
-            The sharded model will be computed from small to large. The default value is -1, 
+            The sharded model will be computed from small to large. The default value is -1,
             which means no pipelining computation order and run Ops in terms of graph.
-    
+
     **Note**:
-    Only if the enable_manual_shard=True, the 'index' is able to be set not -1. Please refer 
-    to :code:`paddle.static.IpuStrategy` . 
-    Only if the enable_pipelining=True, the 'stage' is able to be set not -1. Please refer 
+    Only if the enable_manual_shard=True, the 'index' is able to be set not -1. Please refer
     to :code:`paddle.static.IpuStrategy` .
-    A index is allowed to match none stage or a stage. A stage is only allowed to match a new or 
+    Only if the enable_pipelining=True, the 'stage' is able to be set not -1. Please refer
+    to :code:`paddle.static.IpuStrategy` .
+    An index is allowed to match no stage or a stage. A stage is only allowed to match a new or
     duplicated index.
 
     Examples:
@@ -336,7 +343,7 @@ def set_ipu_shard(call_func, index=-1, stage=-1):
         index(int, optional): Specify which ipu the Tensor is computed on, (such as ‘0, 1, 2, 3’).
             The default value is -1, which means the Op only run on IPU 0.
         stage(int, optional): Specify the computation order of the sharded model(such as ‘0, 1, 2, 3’).
-            The sharded model will be computed from small to large. The default value is -1, 
+            The sharded model will be computed from small to large. The default value is -1,
             which means no pipelining computation order and run Ops in terms of graph.
 
     Returns:
@@ -357,7 +364,6 @@ def set_ipu_shard(call_func, index=-1, stage=-1):
     """
 
     def decorate(func):
-
         def wrapper(*args, **kwargs):
             with ipu_shard_guard(index=index, stage=stage):
                 return func(*args, **kwargs)
@@ -365,16 +371,17 @@ def set_ipu_shard(call_func, index=-1, stage=-1):
         return wrapper
 
     from .dygraph.layers import Layer
+
     if not isinstance(call_func, Layer):
         if callable(call_func):
             return decorate(call_func)
         else:
             raise TypeError(
-                "Unsupported type. Only accept paddle.nn.Layer or function.")
+                "Unsupported type. Only accept paddle.nn.Layer or function."
+            )
 
     # patch paddle.nn.Layer
     class BlockFn(type(call_func)):
-
         def __call__(self, *args, **kwargs):
             with ipu_shard_guard(index=index, stage=stage):
                 return super().__call__(*args, **kwargs)
@@ -386,62 +393,68 @@ def set_ipu_shard(call_func, index=-1, stage=-1):
 
 def require_version(min_version, max_version=None):
     """
-        Check if the installed version of PaddlePaddle is in [min_version, max_version],
-        if the installed version is lower than ``min_version`` or higher than ``max_version``,
-        an exception will be thrown, NO returns if the installed version is satisfied.
+    Check if the installed version of PaddlePaddle is in [min_version, max_version],
+    if the installed version is lower than ``min_version`` or higher than ``max_version``,
+    an exception will be thrown, NO returns if the installed version is satisfied.
 
-        Args:
-            min_version (str): the minimum version required (like '1.4.0').
-            max_version (str, optional): the max version required (like '1.6.0'), default is None,
-                meaning any version equal or higher than ``min_version`` is acceptable.
+    Args:
+        min_version (str): the minimum version required (like '1.4.0').
+        max_version (str, optional): the max version required (like '1.6.0'), default is None,
+            meaning any version equal or higher than ``min_version`` is acceptable.
 
-        Returns:
-            None.
+    Returns:
+        None.
 
-        Raises:
-            TypeError: if the type of ``min_version`` is not str.
-            TypeError: if the type of ``max_version`` is not str or type(None).
-            ValueError: if the value of ``min_version`` is not in version format.
-            ValueError: if the value of ``max_version`` is not in version format or None.
-            Exception: if the installed version is lower than ``min_version`` or higher than ``max_version``.
+    Raises:
+        TypeError: if the type of ``min_version`` is not str.
+        TypeError: if the type of ``max_version`` is not str or type(None).
+        ValueError: if the value of ``min_version`` is not in version format.
+        ValueError: if the value of ``max_version`` is not in version format or None.
+        Exception: if the installed version is lower than ``min_version`` or higher than ``max_version``.
 
-        Examples:
-            .. code-block:: python
+    Examples:
+        .. code-block:: python
 
-                import paddle.fluid as fluid
+            import paddle.fluid as fluid
 
-                # any version >= 0.1.0 is acceptable.
-                fluid.require_version('0.1.0')
+            # any version >= 0.1.0 is acceptable.
+            fluid.require_version('0.1.0')
 
-                # if 0.1.0 <= version <= 10.0.0, it is acceptable.
-                fluid.require_version(min_version='0.1.0', max_version='10.0.0')
-        """
+            # if 0.1.0 <= version <= 10.0.0, it is acceptable.
+            fluid.require_version(min_version='0.1.0', max_version='10.0.0')
+    """
     if not isinstance(min_version, str):
         raise TypeError(
             "The type of 'min_version' in require_version must be str, but received %s."
-            % (type(min_version)))
+            % (type(min_version))
+        )
 
     if not isinstance(max_version, (str, type(None))):
         raise TypeError(
             "The type of 'max_version' in require_version must be str or type(None), but received %s."
-            % (type(max_version)))
+            % (type(max_version))
+        )
 
     check_format = re.match(r'\d+(\.\d+){0,3}', min_version)
     if check_format is None or check_format.group() != min_version:
         raise ValueError(
             "The value of 'min_version' in require_version must be in format '\\d+(\\.\\d+){0,3}', "
-            "like '1.5.2.0', but received %s" % min_version)
+            "like '1.5.2.0', but received %s" % min_version
+        )
 
     if max_version is not None:
         check_format = re.match(r'\d+(\.\d+){0,3}', max_version)
         if check_format is None or check_format.group() != max_version:
             raise ValueError(
                 "The value of 'max_version' in require_version must be in format '\\d+(\\.\\d+){0,3}', "
-                "like '1.5.2.0', but received %s" % max_version)
+                "like '1.5.2.0', but received %s" % max_version
+            )
 
     version_installed = [
-        fluid_version.major, fluid_version.minor, fluid_version.patch,
-        fluid_version.rc
+        fluid_version.major,
+        fluid_version.minor,
+        fluid_version.patch,
+        fluid_version.rc,
     ]
     zero_version = ['0', '0', '0', '0']
 
@@ -458,75 +471,86 @@ def require_version(min_version, max_version=None):
             warnings.warn(
                 "PaddlePaddle version in [%s, %s] required, but %s installed. "
                 "Maybe you are using a develop version, "
-                "please make sure the version is good with your code." %
-                (min_version, max_version, fluid_version.full_version))
+                "please make sure the version is good with your code."
+                % (min_version, max_version, fluid_version.full_version)
+            )
         else:
             warnings.warn(
                 "PaddlePaddle version %s or higher is required, but %s installed, "
                 "Maybe you are using a develop version, "
-                "please make sure the version is good with your code." %
-                (min_version, fluid_version.full_version))
+                "please make sure the version is good with your code."
+                % (min_version, fluid_version.full_version)
+            )
         return
 
     min_version_split = min_version.split('.')
-    min_version_to_check = min_version_split + zero_version[
-        len(min_version_split):]
+    min_version_to_check = (
+        min_version_split + zero_version[len(min_version_split) :]
+    )
 
     if max_version is not None:
         max_version_split = max_version.split('.')
-        max_version_to_check = max_version_split + zero_version[
-            len(max_version_split):]
+        max_version_to_check = (
+            max_version_split + zero_version[len(max_version_split) :]
+        )
 
-        if version_cmp(version_installed,
-                       max_version_to_check) > 0 or version_cmp(
-                           version_installed, min_version_to_check) < 0:
+        if (
+            version_cmp(version_installed, max_version_to_check) > 0
+            or version_cmp(version_installed, min_version_to_check) < 0
+        ):
             raise Exception(
                 "VersionError: PaddlePaddle version in [%s, %s] required, but %s installed."
-                % (min_version, max_version, fluid_version.full_version))
+                % (min_version, max_version, fluid_version.full_version)
+            )
     else:
         if version_cmp(version_installed, min_version_to_check) < 0:
             raise Exception(
                 "VersionError: PaddlePaddle version %s or higher is required, but %s installed, "
                 "please upgrade your PaddlePaddle to %s or other higher version."
-                % (min_version, fluid_version.full_version, min_version))
+                % (min_version, fluid_version.full_version, min_version)
+            )
 
 
 def _dygraph_not_support_(func):
-
     def __impl__(*args, **kwargs):
-        assert not _non_static_mode(
-        ), "We don't support %s in dynamic graph mode" % func.__name__
+        assert not _non_static_mode(), (
+            "We don't support %s in dynamic graph mode" % func.__name__
+        )
         return func(*args, **kwargs)
 
     return __impl__
 
 
 def _dygraph_only_(func):
-
     def __impl__(*args, **kwargs):
-        assert _non_static_mode(
-        ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__
+        assert _non_static_mode(), (
+            "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode."
+            % func.__name__
+        )
         return func(*args, **kwargs)
 
     return __impl__
 
 
 def _non_static_only_(func):
-
     def __impl__(*args, **kwargs):
         from .dygraph.base import in_declarative_mode
-        assert _non_static_mode() or in_declarative_mode(
-        ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__
+
+        assert _non_static_mode() or in_declarative_mode(), (
+            "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode."
+            % func.__name__
+        )
         return func(*args, **kwargs)
 
     return __impl__
 
 
 def _static_only_(func):
-
     def __impl__(*args, **kwargs):
-        assert not _non_static_mode(
-        ), "In PaddlePaddle 2.x, we turn on dynamic graph mode by default, and '%s()' is only supported in static graph mode. So if you want to use this api, please call 'paddle.enable_static()' before this api to enter static graph mode." % func.__name__
+        assert not _non_static_mode(), (
+            "In PaddlePaddle 2.x, we turn on dynamic graph mode by default, and '%s()' is only supported in static graph mode. So if you want to use this api, please call 'paddle.enable_static()' before this api to enter static graph mode."
+            % func.__name__
+        )
         return func(*args, **kwargs)
 
     return __impl__
@@ -545,14 +569,14 @@ def _set_pipeline_stage(stage):
 # TODO(zhiqiu): We should make VarBase consistent with Variable in future, for example, by inheritting
 # same base class.
 def _fake_interface_only_(func):
-
     def __impl__(*args, **kwargs):
         raise AssertionError(
             "'%s' only can be called by `paddle.Tensor` in dynamic graph mode. Suggestions:\n"
             "  1. If you are in static graph mode, you can switch to dynamic graph mode by turning off `paddle.enable_static()` or calling `paddle.disable_static()`.\n"
             "  2. If you are using `@paddle.jit.to_static`, you can turn off ProgramTranslator by calling `paddle.jit.ProgramTranslator().enable(False)`. "
             "If you have to translate dynamic graph to static graph, please use other API to replace '%s'."
-            % (func.__name__, func.__name__))
+            % (func.__name__, func.__name__)
+        )
 
     return __impl__
 
@@ -563,13 +587,13 @@ def _fake_interface_only_(func):
 # NOTE(chenweihang): not using `wrap_decorator` here is because `wrap_decorator` will
 # move kwargs to args, which doesn't work in this decorate case
 def deprecate_stat_dict(func):
-
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
         if 'stat_dict' in kwargs:
             warnings.warn(
                 "The argument `stat_dict` has deprecated, please change it to `state_dict`.",
-                DeprecationWarning)
+                DeprecationWarning,
+            )
             kwargs['state_dict'] = kwargs['stat_dict']
             kwargs.pop('stat_dict')
         return func(*args, **kwargs)
@@ -651,12 +675,12 @@ def _set_expected_place(place):
 
 # TODO(zhiqiu): remove this function.
 def _var_base_to_np(var_base):
-    """	
-    convert VarBase tp numpy	
+    """
+    convert VarBase tp numpy
 
-    Args:	
-        var_base(VarBase) : the VarBase to convert	
-    Returns (np.ndarray): the np.ndarray contain the value of VarBase	
+    Args:
+        var_base(VarBase) : the VarBase to convert
+    Returns (np.ndarray): the np.ndarray contain the value of VarBase
     """
 
     warnings.warn(
@@ -675,7 +699,9 @@ def _cpu_num():
                 'And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.\n\n'
                 'export CPU_NUM={} # for example, set CPU_NUM as number of physical CPU core which is {}.\n\n'
                 '!!! The default number of CPU_NUM=1.\n'.format(
-                    multiprocessing.cpu_count(), multiprocessing.cpu_count()))
+                    multiprocessing.cpu_count(), multiprocessing.cpu_count()
+                )
+            )
         os.environ['CPU_NUM'] = str(1)
     cpu_num = os.environ.get('CPU_NUM')
     return int(cpu_num)
@@ -754,14 +780,14 @@ def disable_signal_handler():
     Paddle installs signal handlers at C++ level to log debug information upon failing.
     However, conflicts can happen if another python module is making use of such signal.
     Such being the case, one may disblae paddle signal handler via this interface.
-    
+
     Known frameworks that require disabling signal handler includes:
     1. TVM
     2. ADLIK
 
     Make sure you called paddle.disable_signal_handler() before using above mentioned frameworks.
 
-    Returns: None 
+    Returns: None
 
     Examples:
         .. code-block:: python
@@ -834,7 +860,7 @@ def cuda_places(device_ids=None):
 
     If :code:`device_ids` is not None, it should be the device
     ids of GPUs. For example, if :code:`device_ids=[0,1,2]`,
-    the returned list would be 
+    the returned list would be
     [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)].
 
     Parameters:
@@ -844,21 +870,20 @@ def cuda_places(device_ids=None):
         list of paddle.CUDAPlace: Created GPU place list.
 
     Examples:
-    
+
         .. code-block:: python
 
             import paddle
             import paddle.static as static
 
             # required: gpu
-            
+
             paddle.enable_static()
 
             cuda_places = static.cuda_places()
 
     """
-    assert core.is_compiled_with_cuda(), \
-        "Not compiled with CUDA"
+    assert core.is_compiled_with_cuda(), "Not compiled with CUDA"
     if device_ids is None:
         device_ids = _cuda_ids()
     elif not isinstance(device_ids, (list, tuple)):
@@ -879,9 +904,9 @@ def xpu_places(device_ids=None):
         xpu places would be returned.
         If :code:`device_ids` is not None, it should be the device
         ids of XPUs. For example, if :code:`device_ids=[0,1,2]`,
-        the returned list would be 
+        the returned list would be
         [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)].
-    
+
     Parameters:
         device_ids (list or tuple of int, optional): list of XPU device ids.
     Returns:
@@ -893,12 +918,11 @@ def xpu_places(device_ids=None):
 
             import paddle
             import paddle.static as static
-            
+
             paddle.enable_static()
             xpu_places = static.xpu_places()
     """
-    assert core.is_compiled_with_xpu(), \
-        "Not compiled with XPU"
+    assert core.is_compiled_with_xpu(), "Not compiled with XPU"
     if device_ids is None:
         device_ids = _xpu_ids()
     elif not isinstance(device_ids, (list, tuple)):
@@ -910,7 +934,7 @@ def npu_places(device_ids=None):
     """
     **Note**:
         For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device.
-    
+
     This function creates a list of :code:`paddle.NPUPlace` objects.
     If :code:`device_ids` is None, environment variable of
     :code:`FLAGS_selected_npus` would be checked first. For example, if
@@ -920,9 +944,9 @@ def npu_places(device_ids=None):
     npu places would be returned.
     If :code:`device_ids` is not None, it should be the device
     ids of NPUs. For example, if :code:`device_ids=[0,1,2]`,
-    the returned list would be 
+    the returned list would be
     [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)].
-    
+
     Parameters:
         device_ids (list or tuple of int, optional): list of NPU device ids.
     Returns:
@@ -934,12 +958,11 @@ def npu_places(device_ids=None):
 
             import paddle
             import paddle.static as static
-            
+
             paddle.enable_static()
             npu_places = static.npu_places()
     """
-    assert core.is_compiled_with_npu(), \
-        "Not compiled with NPU"
+    assert core.is_compiled_with_npu(), "Not compiled with NPU"
     if device_ids is None:
         device_ids = _npu_ids()
     elif not isinstance(device_ids, (list, tuple)):
@@ -952,7 +975,7 @@ def cpu_places(device_count=None):
     This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
 
     If :code:`device_count` is None, the device count would
-    be determined by environment variable :code:`CPU_NUM`. 
+    be determined by environment variable :code:`CPU_NUM`.
     If :code:`CPU_NUM` is not set, the default value is 1,
     i.e. CPU_NUM=1.
     :code:`CPU_NUM` indicates the number of devices used in the current task.
@@ -965,7 +988,7 @@ def cpu_places(device_count=None):
         list of paddle.CPUPlace: Created list of CPU places.
 
     Examples:
-    
+
         .. code-block:: python
 
             import paddle
@@ -986,7 +1009,7 @@ def cuda_pinned_places(device_count=None):
     This function creates a list of :code:`fluid.CUDAPinnedPlace` objects.
 
     If :code:`device_count` is None, the device count would
-    be determined by environment variable :code:`CPU_NUM`. 
+    be determined by environment variable :code:`CPU_NUM`.
     If :code:`CPU_NUM` is not set, the default value is 1,
     i.e. CPU_NUM=1.
     :code:`CPU_NUM` indicates the number of devices used in the current task.
@@ -1007,8 +1030,7 @@ def cuda_pinned_places(device_count=None):
             cuda_pinned_places = fluid.cuda_pinned_places(1)
 
     """
-    assert core.is_compiled_with_cuda(), \
-        "Not compiled with CUDA"
+    assert core.is_compiled_with_cuda(), "Not compiled with CUDA"
     if device_count is None:
         device_count = len(_cuda_ids())
     return [core.CUDAPinnedPlace()] * device_count
@@ -1047,8 +1069,7 @@ def mlu_places(device_ids=None):
             paddle.enable_static()
             mlu_places = static.mlu_places()
     """
-    assert core.is_compiled_with_mlu(), \
-        "Not compiled with MLU"
+    assert core.is_compiled_with_mlu(), "Not compiled with MLU"
     if device_ids is None:
         device_ids = _mlu_ids()
     elif not isinstance(device_ids, (list, tuple)):
@@ -1057,7 +1078,6 @@ def mlu_places(device_ids=None):
 
 
 class NameScope(object):
-
     def __init__(self, name="", parent=None):
         self._children = dict()
         self._name = name
@@ -1068,8 +1088,9 @@ class NameScope(object):
             new_child = NameScope(prefix, self)
             self._children[prefix] = [new_child]
         else:
-            new_child = NameScope(prefix + "_%d" % len(self._children[prefix]),
-                                  self)
+            new_child = NameScope(
+                prefix + "_%d" % len(self._children[prefix]), self
+            )
             self._children[prefix].append(new_child)
         return new_child
 
@@ -1089,7 +1110,7 @@ def name_scope(prefix=None):
 
     Generate hierarchical name prefix for the operators in Static Graph.
 
-    Note: 
+    Note:
         This should only used for debugging and visualization purpose.
         Don't use it for serious analysis such as graph/program transformations.
         Don't use it in dygraph, since it will cause memory leak.
@@ -1098,7 +1119,7 @@ def name_scope(prefix=None):
         prefix(str, optional): prefix. Default is none.
 
     Examples:
-    
+
         .. code-block:: python
 
           import paddle
@@ -1115,7 +1136,7 @@ def name_scope(prefix=None):
           with paddle.static.name_scope("s4"):
                 g = f - 1
 
-          # Op are created in the default main program.  
+          # Op are created in the default main program.
           for op in paddle.static.default_main_program().block(0).ops:
               # elementwise_add is created in /s1/
               if op.type == 'elementwise_add':
@@ -1159,6 +1180,7 @@ def _full_name_scope():
 
 def generate_control_dev_var_name():
     import random
+
     return CONTROL_DEP_VAR_PREFIX + "@" + str(random.random())
 
 
@@ -1226,8 +1248,9 @@ def dtype_is_floating(dtype):
         dtype = convert_np_dtype_to_dtype_(dtype)
 
     return dtype in [
-        core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32,
-        core.VarDesc.VarType.FP64
+        core.VarDesc.VarType.FP16,
+        core.VarDesc.VarType.FP32,
+        core.VarDesc.VarType.FP64,
     ]
 
 
@@ -1247,16 +1270,20 @@ def _debug_string_(proto, throw_on_error=True):
     if not proto.IsInitialized(error_fields) and throw_on_error:
         raise ValueError(
             "{0} are not initialized.\nThe message is {1}:\n".format(
-                error_fields, proto))
+                error_fields, proto
+            )
+        )
     return proto.__str__()
 
 
-def _varbase_creator(type=core.VarDesc.VarType.LOD_TENSOR,
-                     name=None,
-                     shape=None,
-                     dtype=None,
-                     persistable=None,
-                     **kwargs):
+def _varbase_creator(
+    type=core.VarDesc.VarType.LOD_TENSOR,
+    name=None,
+    shape=None,
+    dtype=None,
+    persistable=None,
+    **kwargs
+):
     if dtype is not None:
         if not isinstance(dtype, core.VarDesc.VarType):
             dtype = convert_np_dtype_to_dtype_(dtype)
@@ -1264,16 +1291,21 @@ def _varbase_creator(type=core.VarDesc.VarType.LOD_TENSOR,
     if _in_eager_mode_:
         eager_tensor = core.eager.Tensor(
             dtype if dtype else core.VarDesc.VarType.FP32,
-            list(shape) if shape else [], name,
+            list(shape) if shape else [],
+            name,
             type if type else core.VarDesc.VarType.LOD_TENSOR,
-            True if persistable else False)
+            True if persistable else False,
+        )
         eager_tensor.retain_grads()
         return eager_tensor
     else:
-        return core.VarBase(dtype if dtype else core.VarDesc.VarType.FP32,
-                            list(shape) if shape else [], name,
-                            type if type else core.VarDesc.VarType.LOD_TENSOR,
-                            True if persistable else False)
+        return core.VarBase(
+            dtype if dtype else core.VarDesc.VarType.FP32,
+            list(shape) if shape else [],
+            name,
+            type if type else core.VarDesc.VarType.LOD_TENSOR,
+            True if persistable else False,
+        )
 
 
 def _all_is_type(vals, expected_type):
@@ -1283,12 +1315,12 @@ def _all_is_type(vals, expected_type):
     NOTE: BuiltIn all() will always return True if vals is empty.
     """
     assert isinstance(vals, (list, tuple))
-    if not vals: return False
+    if not vals:
+        return False
     return all(isinstance(v, expected_type) for v in vals)
 
 
 class VariableMetaClass(type):
-
     @classmethod
     def __instancecheck__(cls, instance):
         t = type(instance)
@@ -1301,7 +1333,6 @@ class VariableMetaClass(type):
 
 
 class ParameterMetaClass(VariableMetaClass):
-
     @classmethod
     def __instancecheck__(cls, instance):
         t = type(instance)
@@ -1358,21 +1389,23 @@ class Variable(object):
 
     """
 
-    def __init__(self,
-                 block,
-                 type=core.VarDesc.VarType.LOD_TENSOR,
-                 name=None,
-                 shape=None,
-                 dtype=None,
-                 lod_level=None,
-                 capacity=None,
-                 persistable=None,
-                 error_clip=None,
-                 stop_gradient=False,
-                 is_data=False,
-                 need_check_feed=False,
-                 belong_to_optimizer=False,
-                 **kwargs):
+    def __init__(
+        self,
+        block,
+        type=core.VarDesc.VarType.LOD_TENSOR,
+        name=None,
+        shape=None,
+        dtype=None,
+        lod_level=None,
+        capacity=None,
+        persistable=None,
+        error_clip=None,
+        stop_gradient=False,
+        is_data=False,
+        need_check_feed=False,
+        belong_to_optimizer=False,
+        **kwargs
+    ):
         self.block = block
         if name is None:
             name = unique_name.generate('_generated_var')
@@ -1403,10 +1436,11 @@ class Variable(object):
         if is_new_var:
             self.desc.set_type(type)
         elif self.desc.type() != type:
-            raise ValueError("Variable '{0}' has been created before. The "
-                             "previous type is {1}, the new type is {2}. They"
-                             " are not matched".format(self.name,
-                                                       self.desc.type(), type))
+            raise ValueError(
+                "Variable '{0}' has been created before. The "
+                "previous type is {1}, the new type is {2}. They"
+                " are not matched".format(self.name, self.desc.type(), type)
+            )
 
         if shape is not None:
             if is_new_var:
@@ -1418,29 +1452,32 @@ class Variable(object):
                     raise ValueError(
                         "Variable '{0}' has been created before. The previous "
                         "shape is {1}, the new shape is {2}. They are not "
-                        "matched.".format(self.name, old_shape, shape))
+                        "matched.".format(self.name, old_shape, shape)
+                    )
         if dtype is not None:
             if is_new_var:
                 self.desc.set_dtype(dtype)
             else:
                 old_dtype = self.dtype
                 if dtype != old_dtype:
-                    raise ValueError("Variable '{0}' has been created before. "
-                                     "The previous data type is {1}, the new "
-                                     "data type is {2}. They are not "
-                                     "matched.".format(self.name, old_dtype,
-                                                       dtype))
+                    raise ValueError(
+                        "Variable '{0}' has been created before. "
+                        "The previous data type is {1}, the new "
+                        "data type is {2}. They are not "
+                        "matched.".format(self.name, old_dtype, dtype)
+                    )
 
         if lod_level is not None:
             if is_new_var:
                 self.desc.set_lod_level(lod_level)
             else:
                 if lod_level != self.lod_level:
-                    raise ValueError("Variable '{0}' has been created before. "
-                                     "The previous lod_level is {1}, the new "
-                                     "lod_level is {2}. They are not "
-                                     "matched".format(self.name, self.lod_level,
-                                                      lod_level))
+                    raise ValueError(
+                        "Variable '{0}' has been created before. "
+                        "The previous lod_level is {1}, the new "
+                        "lod_level is {2}. They are not "
+                        "matched".format(self.name, self.lod_level, lod_level)
+                    )
         if persistable is not None:
             if is_new_var:
                 self.desc.set_persistable(persistable)
@@ -1450,7 +1487,9 @@ class Variable(object):
                         "Variable '{0}' has been created before."
                         "The previous persistable is {1}, the new "
                         "persistable is {2}. They are not matched".format(
-                            self.name, self.persistable, persistable))
+                            self.name, self.persistable, persistable
+                        )
+                    )
 
         if need_check_feed and is_new_var:
             self.desc.set_need_check_feed(need_check_feed)
@@ -1491,20 +1530,22 @@ class Variable(object):
                 y = x.detach()
         """
 
-        assert self.type == core.VarDesc.VarType.SELECTED_ROWS or \
-            self.type == core.VarDesc.VarType.LOD_TENSOR, \
-            "only support a variable with SELECTED_ROWS or LOD_TENSOR to be detached"
+        assert (
+            self.type == core.VarDesc.VarType.SELECTED_ROWS
+            or self.type == core.VarDesc.VarType.LOD_TENSOR
+        ), "only support a variable with SELECTED_ROWS or LOD_TENSOR to be detached"
 
         output = self.block.create_var(
             name=unique_name.generate_with_ignorable_key("detach_" + self.name),
             dtype=self.dtype,
             type=self.type,
             persistable=self.persistable,
-            stop_gradient=True)
+            stop_gradient=True,
+        )
 
-        self.block.append_op(type='share_data',
-                             inputs={'X': [self]},
-                             outputs={'Out': [output]})
+        self.block.append_op(
+            type='share_data', inputs={'X': [self]}, outputs={'Out': [output]}
+        )
         return output
 
     @fake_interface_only
@@ -1694,14 +1735,20 @@ class Variable(object):
         """
         # VarType.LOD_TENSOR -> LOD_TENSOR
         type_str = str(self.type).split('.')[1]
-        if self.type == core.VarDesc.VarType.SELECTED_ROWS or self.type == core.VarDesc.VarType.LOD_TENSOR:
+        if (
+            self.type == core.VarDesc.VarType.SELECTED_ROWS
+            or self.type == core.VarDesc.VarType.LOD_TENSOR
+        ):
             dtype_str = str(self.dtype).split('.')[1]
-            var_str = "{name} : {type}.shape{shape}.dtype({dtype}).stop_gradient({stop_gradient})".\
-                format(name=self.name, type=type_str, shape=self.shape,
-                       dtype=dtype_str, stop_gradient=self.stop_gradient)
+            var_str = "{name} : {type}.shape{shape}.dtype({dtype}).stop_gradient({stop_gradient})".format(
+                name=self.name,
+                type=type_str,
+                shape=self.shape,
+                dtype=dtype_str,
+                stop_gradient=self.stop_gradient,
+            )
         else:
-            var_str = "{name} : {type})".\
-                format(name=self.name, type=type_str)
+            var_str = "{name} : {type})".format(name=self.name, type=type_str)
 
         if self.is_parameter:
             if self.trainable:
@@ -1714,12 +1761,16 @@ class Variable(object):
         if self.persistable:
             var_str = "persist " + var_str
 
-        from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context
+        from paddle.distributed.auto_parallel.dist_context import (
+            get_default_distributed_context,
+        )
+
         dist_context = get_default_distributed_context()
         dist_tensor = dist_context.get_dist_tensor_for_program(self)
         if dist_tensor is not None:
-            var_str += ", {name} = {value}".format(name="dist_attr",
-                                                   value=dist_tensor)
+            var_str += ", {name} = {value}".format(
+                name="dist_attr", value=dist_tensor
+            )
 
         return var_str
 
@@ -1753,15 +1804,18 @@ class Variable(object):
                 print(new_variable.to_string(True, True))
         """
         assert isinstance(throw_on_error, bool) and isinstance(
-            with_details, bool)
+            with_details, bool
+        )
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.VarDesc.FromString(six.binary_type(protostr))
         res_str = _debug_string_(proto, throw_on_error)
         if with_details:
-            additional_attr = ("error_clip", )
+            additional_attr = ("error_clip",)
             for attr_name in additional_attr:
-                res_str += "%s: %s\n" % (attr_name,
-                                         cpt.to_text(getattr(self, attr_name)))
+                res_str += "%s: %s\n" % (
+                    attr_name,
+                    cpt.to_text(getattr(self, attr_name)),
+                )
 
         return res_str
 
@@ -1770,7 +1824,7 @@ class Variable(object):
     def element_size(self):
         """
         Returns the size in bytes of an element in the Tensor.
-        
+
         Examples:
           .. code-block:: python
 
@@ -2056,27 +2110,28 @@ class Variable(object):
             dtype=self.dtype,
             type=self.type,
             persistable=False,
-            stop_gradient=False)
+            stop_gradient=False,
+        )
         input_shape = self.block.create_var(
             name=unique_name.generate_with_ignorable_key(self.name + '.tmp'),
             dtype=self.dtype,
             type=core.VarDesc.VarType.LOD_TENSOR,
             persistable=False,
-            stop_gradient=False)
-
-        self.block.append_op(type='transpose2',
-                             inputs={'X': [self]},
-                             outputs={
-                                 'Out': [out],
-                                 'XShape': [input_shape]
-                             },
-                             attrs={'axis': perm})
+            stop_gradient=False,
+        )
+
+        self.block.append_op(
+            type='transpose2',
+            inputs={'X': [self]},
+            outputs={'Out': [out], 'XShape': [input_shape]},
+            attrs={'axis': perm},
+        )
         return out
 
     def clone(self):
         """
         Returns a new static Variable, which is the clone of the original static
-        Variable. It remains in the current graph, that is, the cloned Variable 
+        Variable. It remains in the current graph, that is, the cloned Variable
         provides gradient propagation. Calling ``out = tensor.clone()`` is same
         as ``out = assign(tensor)`` .
 
@@ -2101,11 +2156,12 @@ class Variable(object):
             dtype=self.dtype,
             type=self.type,
             persistable=self.persistable,
-            stop_gradient=self.stop_gradient)
+            stop_gradient=self.stop_gradient,
+        )
 
-        self.block.append_op(type='assign',
-                             inputs={'X': [self]},
-                             outputs={'Out': [output]})
+        self.block.append_op(
+            type='assign', inputs={'X': [self]}, outputs={'Out': [output]}
+        )
         return output
 
     def _set_error_clip(self, error_clip):
@@ -2128,7 +2184,7 @@ class Variable(object):
             key(str): Key for this information.
             value(object): The value associated to the key.
 
-        Returns: 
+        Returns:
             None
         """
         if not hasattr(self, "_info"):
@@ -2142,7 +2198,7 @@ class Variable(object):
         Args:
             key(str): Key for this information.
 
-        Returns: 
+        Returns:
             object
         """
         if hasattr(self, "_info") and key in self._info:
@@ -2171,8 +2227,9 @@ class Variable(object):
             start = upper if step < 0 else lower
         else:
             start = slice.start
-            start = max(start +
-                        length, lower) if start < 0 else min(start, upper)
+            start = (
+                max(start + length, lower) if start < 0 else min(start, upper)
+            )
 
         # Compute stop.
         if slice.stop is None:
@@ -2218,11 +2275,15 @@ class Variable(object):
         for index, o in enumerate(item):
             if isinstance(o, int):
                 start = int(o)
-                if (index > 0 and index >= self.shape[index]) \
-                        or (index < 0 and (index + self.shape[index]) < 0):
+                if (index > 0 and index >= self.shape[index]) or (
+                    index < 0 and (index + self.shape[index]) < 0
+                ):
                     raise IndexError("invalid index")
-                start = max(start + self.shape[index], 0) if start < 0 else min(
-                    start, self.shape[index])
+                start = (
+                    max(start + self.shape[index], 0)
+                    if start < 0
+                    else min(start, self.shape[index])
+                )
                 starts.append(start)
                 ends.append(start + 1)
             elif isinstance(o, slice):
@@ -2240,30 +2301,31 @@ class Variable(object):
         if not copy:
             return self.block.create_var(
                 name=unique_name.generate_with_ignorable_key(self.name),
-                dtype=self.dtype)
+                dtype=self.dtype,
+            )
         else:
             return self
 
     def _sliceVar(self, axes, starts, ends):
         new_var = self._cloneVar()
-        self.block.append_op(type="slice",
-                             inputs={'Input': [self]},
-                             outputs={'Out': [new_var]},
-                             attrs={
-                                 'axes': axes,
-                                 'starts': starts,
-                                 'ends': ends
-                             })
+        self.block.append_op(
+            type="slice",
+            inputs={'Input': [self]},
+            outputs={'Out': [new_var]},
+            attrs={'axes': axes, 'starts': starts, 'ends': ends},
+        )
         return new_var
 
     def _concatVar(self, inputs, axis):
         new_var = self._cloneVar()
-        self.block.append_op(type="concat",
-                             inputs={'X': inputs},
-                             outputs={'Out': [new_var]},
-                             attrs={
-                                 'axis': axis,
-                             })
+        self.block.append_op(
+            type="concat",
+            inputs={'X': inputs},
+            outputs={'Out': [new_var]},
+            attrs={
+                'axis': axis,
+            },
+        )
         return new_var
 
     def _sliceAndConcatVar(self, item, axis):
@@ -2277,21 +2339,24 @@ class Variable(object):
                 vars = []
                 if step > 0:
                     while start < stop:
-                        vars.append(self._sliceVar([axis], [start],
-                                                   [start + 1]))
+                        vars.append(
+                            self._sliceVar([axis], [start], [start + 1])
+                        )
                         start += step
                 else:
                     while start > stop:
-                        vars.append(self._sliceVar([axis], [start],
-                                                   [start + 1]))
+                        vars.append(
+                            self._sliceVar([axis], [start], [start + 1])
+                        )
                         start += step
                 return self._concatVar(vars, axis)
         elif isinstance(item, int):
             if self.shape[axis] < 0:
                 return self._cloneVar(True)
             index = int(item)
-            if (index > 0 and index >= self.shape[axis]) \
-                    or (index < 0 and (index + self.shape[axis]) < 0):
+            if (index > 0 and index >= self.shape[axis]) or (
+                index < 0 and (index + self.shape[axis]) < 0
+            ):
                 raise IndexError("invalid index")
             return self._sliceVar([axis], [index], [index + 1])
         else:
@@ -2305,10 +2370,10 @@ class Variable(object):
 
     def get_value(self, scope=None):
         """
-        Get the value of variable in given scope. 
+        Get the value of variable in given scope.
 
         Args:
-            scope(Scope, optional) : If `scope` is None, it will be set to global scope 
+            scope(Scope, optional) : If `scope` is None, it will be set to global scope
                 obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`.
                 Default: None
 
@@ -2319,7 +2384,7 @@ class Variable(object):
             .. code-block:: python
 
                 import paddle
-                import paddle.static as static 
+                import paddle.static as static
                 import numpy as np
 
                 paddle.enable_static()
@@ -2348,38 +2413,42 @@ class Variable(object):
         # can not be imported at the begainning of this file.
         # Therefore, the above two modules are dynamically imported.
         from .executor import global_scope
+
         if scope is not None and not isinstance(scope, core._Scope):
             raise TypeError(
-                "`scope` should be None or `paddle.static.Scope` type, but received {}."
-                .format(type(scope)))
+                "`scope` should be None or `paddle.static.Scope` type, but received {}.".format(
+                    type(scope)
+                )
+            )
 
         if scope is None:
             scope = global_scope()
         var_temp = scope.find_var(self.name)
         if var_temp is None:
-            raise ValueError("Can not find Variable '{}' in the Scope.".format(
-                self.name))
+            raise ValueError(
+                "Can not find Variable '{}' in the Scope.".format(self.name)
+            )
         t = var_temp.get_tensor()
         return t
 
     def set_value(self, value, scope=None):
         '''
-        Set the value to the tensor in given scope. 
+        Set the value to the tensor in given scope.
 
         Args:
             value(Tensor/ndarray) : The value to be set.
-            scope(Scope, optional) : If `scope` is None, it will be set to global scope 
+            scope(Scope, optional) : If `scope` is None, it will be set to global scope
                 obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`.
                 Default: None
 
         Returns:
             None
-        
+
         Examples:
             .. code-block:: python
 
                 import paddle
-                import paddle.static as static 
+                import paddle.static as static
                 import numpy as np
 
                 paddle.enable_static()
@@ -2412,21 +2481,26 @@ class Variable(object):
 
         if not (isinstance(value, np.ndarray) or hasattr(value, '__array__')):
             raise TypeError(
-                "`value` should be `numpy.ndarray` or `LoDTensor`, but received {}."
-                .format(type(value)))
+                "`value` should be `numpy.ndarray` or `LoDTensor`, but received {}.".format(
+                    type(value)
+                )
+            )
 
         if scope is not None and not isinstance(scope, core._Scope):
             raise TypeError(
-                "`scope` should be None or `paddle.static.Scope` type, but received {}."
-                .format(type(scope)))
+                "`scope` should be None or `paddle.static.Scope` type, but received {}.".format(
+                    type(scope)
+                )
+            )
 
         if scope is None:
             scope = global_scope()
 
         var_temp = scope.find_var(self.name)
         if var_temp is None:
-            raise ValueError("Can not find Variable '{}' in the Scope.".format(
-                self.name))
+            raise ValueError(
+                "Can not find Variable '{}' in the Scope.".format(self.name)
+            )
 
         t = var_temp.get_tensor()
 
@@ -2437,8 +2511,10 @@ class Variable(object):
                 value_shape = value.shape
             if list(t.shape()) != list(value_shape):
                 raise ValueError(
-                    "{} expected a shape {}, but the received shape is {}.".
-                    format(self.name, list(t.shape()), list(value_shape)))
+                    "{} expected a shape {}, but the received shape is {}.".format(
+                        self.name, list(t.shape()), list(value_shape)
+                    )
+                )
 
         p = t._place()
         if p.is_cpu_place():
@@ -2487,11 +2563,12 @@ class Variable(object):
 
         output = self.block.create_var(
             name=unique_name.generate_with_ignorable_key(self.name + "_size"),
-            dtype=core.VarDesc.VarType.INT64)
+            dtype=core.VarDesc.VarType.INT64,
+        )
 
-        self.block.append_op(type='size',
-                             inputs={'Input': [self]},
-                             outputs={'Out': [output]})
+        self.block.append_op(
+            type='size', inputs={'Input': [self]}, outputs={'Out': [output]}
+        )
         return output
 
     def _set_attr(self, name, val):
@@ -2590,8 +2667,8 @@ class OpProtoHolder(object):
 
     def __init__(self):
         assert not hasattr(
-            self.__class__,
-            '_instance'), 'Please use `instance()` to get OpProtoHolder object!'
+            self.__class__, '_instance'
+        ), 'Please use `instance()` to get OpProtoHolder object!'
         op_protos = get_all_op_protos()
         self.op_proto_map = {}
         for proto in op_protos:
@@ -2627,7 +2704,7 @@ class OpProtoHolder(object):
             core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
             core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
             core.op_proto_and_checker_maker.kOpCreationCallstackAttrName(),
-            core.op_proto_and_checker_maker.kOpDeviceAttrName()
+            core.op_proto_and_checker_maker.kOpDeviceAttrName(),
         }
 
 
@@ -2674,24 +2751,44 @@ class Operator(object):
                                 inputs={"X": [var1, var2, var3]},
                                 outputs={"Out": [var1]})
     """
+
     OP_WITHOUT_KERNEL_SET = {
-        'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad',
-        'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
-        'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify',
-        'gen_bkcl_id', 'c_gen_bkcl_id', 'gen_nccl_id', 'c_gen_nccl_id',
-        'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream',
-        'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv',
-        'c_wait_comm', 'c_wait_compute', 'c_gen_hccl_id', 'c_comm_init_hccl',
-        'copy_cross_scope', 'c_gen_cncl_id'
+        'feed',
+        'fetch',
+        'recurrent',
+        'go',
+        'rnn_memory_helper_grad',
+        'conditional_block',
+        'while',
+        'send',
+        'recv',
+        'listen_and_serv',
+        'fl_listen_and_serv',
+        'ncclInit',
+        'select',
+        'checkpoint_notify',
+        'gen_bkcl_id',
+        'c_gen_bkcl_id',
+        'gen_nccl_id',
+        'c_gen_nccl_id',
+        'c_comm_init',
+        'c_sync_calc_stream',
+        'c_sync_comm_stream',
+        'queue_generator',
+        'dequeue',
+        'enqueue',
+        'heter_listen_and_serv',
+        'c_wait_comm',
+        'c_wait_compute',
+        'c_gen_hccl_id',
+        'c_comm_init_hccl',
+        'copy_cross_scope',
+        'c_gen_cncl_id',
     }
 
-    def __init__(self,
-                 block,
-                 desc,
-                 type=None,
-                 inputs=None,
-                 outputs=None,
-                 attrs=None):
+    def __init__(
+        self, block, desc, type=None, inputs=None, outputs=None, attrs=None
+    ):
         # read attr type index from op proto to avoid unexpected type
         # conversions, e.g. narrowing conversion like double to float
         try:
@@ -2705,7 +2802,8 @@ class Operator(object):
         if _non_static_mode():
             if type is None:
                 raise ValueError(
-                    "`type` to initialized an Operator can not be None.")
+                    "`type` to initialized an Operator can not be None."
+                )
             self._type = type
             self.attrs = attrs if attrs else {}
         else:
@@ -2725,11 +2823,14 @@ class Operator(object):
 
             if op_maker.kOpRoleAttrName() not in op_attrs:
                 op_attrs[
-                    op_maker.kOpRoleAttrName()] = self.block.program._op_role
+                    op_maker.kOpRoleAttrName()
+                ] = self.block.program._op_role
 
             role_var_name = op_maker.kOpRoleVarAttrName()
-            if len(self.block.program._op_role_var
-                   ) != 0 and role_var_name not in op_attrs:
+            if (
+                len(self.block.program._op_role_var) != 0
+                and role_var_name not in op_attrs
+            ):
                 op_attrs[role_var_name] = self.block.program._op_role_var
 
             if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0:
@@ -2744,16 +2845,20 @@ class Operator(object):
                 return
             if type is None:
                 raise ValueError(
-                    "`type` to initialized an Operator can not be None.")
+                    "`type` to initialized an Operator can not be None."
+                )
             else:
                 callstack_var_name = op_maker.kOpCreationCallstackAttrName()
                 op_attrs[callstack_var_name] = []
                 for frame in traceback.extract_stack():
                     op_attrs[callstack_var_name].append(
                         '  File "{}", line {}, in {}'.format(
-                            frame[0], frame[1], frame[2]))
-                    op_attrs[callstack_var_name].append('    {}'.format(
-                        frame[3]))
+                            frame[0], frame[1], frame[2]
+                        )
+                    )
+                    op_attrs[callstack_var_name].append(
+                        '    {}'.format(frame[3])
+                    )
 
             self.desc.set_type(type)
             proto = OpProtoHolder.instance().get_op_proto(type)
@@ -2769,20 +2874,25 @@ class Operator(object):
                     op_device = op_maker.kOpDeviceAttrName()
                     op_attrs[op_device] = _current_device
                 else:
-                    warnings.warn("The Op(%s) is not support to set device." %
-                                  type)
+                    warnings.warn(
+                        "The Op(%s) is not support to set device." % type
+                    )
                 if 'force_cpu' in op_attrs:
-                    if (type == 'less_than' and op_attrs['force_cpu'] != None
-                        ) or op_attrs['force_cpu'] != False:
+                    if (
+                        type == 'less_than' and op_attrs['force_cpu'] != None
+                    ) or op_attrs['force_cpu'] != False:
                         warnings.warn(
                             "The Attr(force_cpu) of Op(%s) will be deprecated in the future, "
                             "please use 'device_guard' instead. 'device_guard' has higher priority when they are "
-                            "used at the same time." % type)
+                            "used at the same time." % type
+                        )
             if _current_pipeline_stage is not None:
-                pipeline_attr_name = 'pipeline_stage' + core.kAutoParallelSuffix(
+                pipeline_attr_name = (
+                    'pipeline_stage' + core.kAutoParallelSuffix()
+                )
+                self._update_desc_attr(
+                    pipeline_attr_name, _current_pipeline_stage
                 )
-                self._update_desc_attr(pipeline_attr_name,
-                                       _current_pipeline_stage)
 
             def find_name(var_list, name):
                 for var_name in var_list:
@@ -2793,8 +2903,9 @@ class Operator(object):
             if inputs is not None:
                 for in_proto in proto.inputs:
                     found = find_name(inputs, in_proto.name)
-                    assert found or in_proto.dispensable, "Input {} not found".format(
-                        in_proto.name)
+                    assert (
+                        found or in_proto.dispensable
+                    ), "Input {} not found".format(in_proto.name)
                     if found:
                         in_args = inputs[in_proto.name]
                         if not isinstance(in_args, (list, tuple)):
@@ -2802,7 +2913,8 @@ class Operator(object):
                         if not in_proto.duplicable and len(in_args) > 1:
                             raise ValueError(
                                 "Input %s expects only one input, but %d are given."
-                                % (in_proto.name, len(in_args)))
+                                % (in_proto.name, len(in_args))
+                            )
                         in_arg_names = []
                         for index, arg in enumerate(in_args):
                             if isinstance(arg, six.string_types):
@@ -2816,8 +2928,9 @@ class Operator(object):
                                     "The type of '%s' in operator %s should be "
                                     "one of [basestring(), str, Varibale] in python2, "
                                     "or one of [str, bytes, Variable] in python3."
-                                    "but received : %s" %
-                                    (in_proto.name, type, arg))
+                                    "but received : %s"
+                                    % (in_proto.name, type, arg)
+                                )
                         self.desc.set_input(in_proto.name, in_arg_names)
                     else:
                         self.desc.set_input(in_proto.name, [])
@@ -2828,9 +2941,12 @@ class Operator(object):
                         continue
                     if not ((m.name in outputs) or m.dispensable):
                         raise ValueError(
-                            ("Incorrect setting for output(s) of "
-                             "operator \"%s\", should set: [%s].") %
-                            (type, m.name))
+                            (
+                                "Incorrect setting for output(s) of "
+                                "operator \"%s\", should set: [%s]."
+                            )
+                            % (type, m.name)
+                        )
                 for out_proto in proto.outputs:
                     if out_proto.name not in outputs:
                         continue
@@ -2840,7 +2956,8 @@ class Operator(object):
                     if not out_proto.duplicable and len(out_args) > 1:
                         raise ValueError(
                             "Output %s expects only one output, but %d are given."
-                            % (out_proto.name, len(out_args)))
+                            % (out_proto.name, len(out_args))
+                        )
                     out_arg_names = []
                     for arg in out_args:
                         if isinstance(arg, six.string_types):
@@ -2861,27 +2978,32 @@ class Operator(object):
                     raise TypeError("'attrs' should be a dict.")
                 for attr in proto.attrs:
                     attr_name = attr.name
-                    if (attr_name
-                            not in op_attrs) or (op_attrs[attr_name] is None):
+                    if (attr_name not in op_attrs) or (
+                        op_attrs[attr_name] is None
+                    ):
                         continue
                     attr_val = op_attrs[attr_name]
                     self._update_desc_attr(attr_name, attr_val)
                 for attr_name in extra_attrs_map.keys():
-                    if (attr_name
-                            not in op_attrs) or (op_attrs[attr_name] is None):
-                        self._update_desc_attr(attr_name,
-                                               extra_attrs_map[attr_name])
+                    if (attr_name not in op_attrs) or (
+                        op_attrs[attr_name] is None
+                    ):
+                        self._update_desc_attr(
+                            attr_name, extra_attrs_map[attr_name]
+                        )
                     else:
                         self._update_desc_attr(attr_name, op_attrs[attr_name])
 
             # proto.attrs doesn't include ipu_index
             if core.is_compiled_with_ipu():
                 if global_ipu_index >= 0:
-                    self._update_desc_attr(ipu_index_attr_name,
-                                           global_ipu_index)
+                    self._update_desc_attr(
+                        ipu_index_attr_name, global_ipu_index
+                    )
                 if global_ipu_stage >= 0:
-                    self._update_desc_attr(ipu_stage_attr_name,
-                                           global_ipu_stage)
+                    self._update_desc_attr(
+                        ipu_stage_attr_name, global_ipu_stage
+                    )
 
             self.desc.check_attrs()
             if self._has_kernel(type):
@@ -2940,7 +3062,8 @@ class Operator(object):
         assert isinstance(
             skip_op_callstack, bool
         ), "skip_op_callstack parameter's type is error, expect bool, received {}".format(
-            type(skip_op_callstack))
+            type(skip_op_callstack)
+        )
         outputs_str = "{"
         for i in range(0, len(self.output_names)):
             outputs_str += "{name}=".format(name=self.output_names[i])
@@ -2970,9 +3093,9 @@ class Operator(object):
             attr_type = self.desc.attr_type(name, True)
             if attr_type == core.AttrType.VAR:
                 attr_var_name = self.desc.attr(name, True).name()
-                a = "{name} = Var['{value}']".format(name=name,
-                                                     type=attr_type,
-                                                     value=attr_var_name)
+                a = "{name} = Var['{value}']".format(
+                    name=name, type=attr_type, value=attr_var_name
+                )
                 attrs_str += a
                 if i != len(attr_names) - 1:
                     attrs_str += ", "
@@ -2983,7 +3106,8 @@ class Operator(object):
                     "'%s'" % var.name() for var in self.desc.attr(name, True)
                 ]
                 a = "{name} = Vars[{value}]".format(
-                    name=name, type=attr_type, value=','.join(attr_var_names))
+                    name=name, type=attr_type, value=','.join(attr_var_names)
+                )
                 attrs_str += a
                 if i != len(attr_names) - 1:
                     attrs_str += ", "
@@ -2991,7 +3115,8 @@ class Operator(object):
 
             if attr_type == core.AttrType.BLOCK:
                 a = "{name} = block[{value}]".format(
-                    name=name, type=attr_type, value=self._block_attr_id(name))
+                    name=name, type=attr_type, value=self._block_attr_id(name)
+                )
                 attrs_str += a
                 if i != len(attr_names) - 1:
                     attrs_str += ", "
@@ -2999,17 +3124,19 @@ class Operator(object):
 
             if attr_type == core.AttrType.BLOCKS:
                 a = "{name} = blocks{value}".format(
-                    name=name,
-                    type=attr_type,
-                    value=self._blocks_attr_ids(name))
+                    name=name, type=attr_type, value=self._blocks_attr_ids(name)
+                )
                 attrs_str += a
                 if i != len(attr_names) - 1:
                     attrs_str += ", "
                 continue
 
             # it is bytes of serialized protobuf
-            if is_compiled_with_cinn(
-            ) and self.type == 'cinn_launch' and name == 'compilation_key':
+            if (
+                is_compiled_with_cinn()
+                and self.type == 'cinn_launch'
+                and name == 'compilation_key'
+            ):
                 key = self.desc.attr(name)
                 v = core.get_serialize_comile_key(key)
                 prog = Program()
@@ -3021,28 +3148,36 @@ class Operator(object):
             else:
                 value = self.desc.attr(name)
 
-            a = "{name} = {value}".format(name=name,
-                                          type=attr_type,
-                                          value=value)
+            a = "{name} = {value}".format(
+                name=name, type=attr_type, value=value
+            )
 
             attrs_str += a
             if i != len(attr_names) - 1:
                 attrs_str += ", "
 
-        from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context
+        from paddle.distributed.auto_parallel.dist_context import (
+            get_default_distributed_context,
+        )
+
         dist_context = get_default_distributed_context()
         dist_op = dist_context.get_dist_op_for_program(self)
         if dist_op is not None:
-            attrs_str += ", {name} = {value}".format(name="dist_attr",
-                                                     value=dist_op)
+            attrs_str += ", {name} = {value}".format(
+                name="dist_attr", value=dist_op
+            )
 
         if outputs_str != "{}":
-            op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\
-                format(outputs=outputs_str, op_type=self.type,
-                       inputs=inputs_str, attrs=attrs_str)
+            op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".format(
+                outputs=outputs_str,
+                op_type=self.type,
+                inputs=inputs_str,
+                attrs=attrs_str,
+            )
         else:
-            op_str = "{op_type}(inputs={inputs}, {attrs})".\
-                format(op_type=self.type, inputs=inputs_str, attrs=attrs_str)
+            op_str = "{op_type}(inputs={inputs}, {attrs})".format(
+                op_type=self.type, inputs=inputs_str, attrs=attrs_str
+            )
         return op_str
 
     def __str__(self):
@@ -3128,7 +3263,8 @@ class Operator(object):
             if op == self:
                 return i
         raise ValueError(
-            "Can't find op itself in it's block. It could be a bug of Paddle.")
+            "Can't find op itself in it's block. It could be a bug of Paddle."
+        )
 
     def has_attr(self, name):
         """
@@ -3190,8 +3326,9 @@ class Operator(object):
             self.desc.set_block_attr(name, val.desc)
         elif isinstance(val, list) and val and _all_is_type(val, Block):
             self.desc.set_blocks_attr(name, [v.desc for v in val])
-        elif isinstance(val, core.BlockDesc) or \
-                isinstance(val, core.ProgramDesc):
+        elif isinstance(val, core.BlockDesc) or isinstance(
+            val, core.ProgramDesc
+        ):
             self.desc.set_serialized_attr(name, val.serialize_to_string())
         else:
             self._update_desc_plain_attr(name, val)
@@ -3272,7 +3409,7 @@ class Operator(object):
         """
 
         id = self._block_attr_id(name)
-        assert (id >= 0 and id < len(self.block.program.blocks))
+        assert id >= 0 and id < len(self.block.program.blocks)
         return self.block.program.blocks[id]
 
     def _blocks_attr(self, name):
@@ -3287,7 +3424,7 @@ class Operator(object):
         """
         attrs = []
         for i in self._blocks_attr_ids(name):
-            assert (i >= 0 and i < len(self.block.program.blocks))
+            assert i >= 0 and i < len(self.block.program.blocks)
             attrs.append(self.block.program.blocks[i])
 
         return attrs
@@ -3316,8 +3453,11 @@ class Operator(object):
             Variable: the Variable attribute.
         """
         attr_type = self.desc.attr_type(name, True)
-        assert attr_type == core.AttrType.VAR, "Required type attr({}) is Variable, but received {}".format(
-            name, attr_type)
+        assert (
+            attr_type == core.AttrType.VAR
+        ), "Required type attr({}) is Variable, but received {}".format(
+            name, attr_type
+        )
         attr_var_name = self.desc.attr(name, True).name()
         return self.block._var_recursive(attr_var_name)
 
@@ -3332,8 +3472,11 @@ class Operator(object):
             Variables: the Variables attribute.
         """
         attr_type = self.desc.attr_type(name, True)
-        assert attr_type == core.AttrType.VARS, "Required type attr({}) is list[Variable], but received {}".format(
-            name, attr_type)
+        assert (
+            attr_type == core.AttrType.VARS
+        ), "Required type attr({}) is list[Variable], but received {}".format(
+            name, attr_type
+        )
         attr_vars = [
             self.block._var_recursive(var.name())
             for var in self.desc.attr(name, True)
@@ -3480,7 +3623,8 @@ class Block(object):
         assert isinstance(
             skip_op_callstack, bool
         ), "skip_op_callstack parameter's type is error, expect bool, received {}".format(
-            type(skip_op_callstack))
+            type(skip_op_callstack)
+        )
         block_str = "{ // block "
         block_str += "{}\n".format(self.idx)
         for var in list(self.vars.values()):
@@ -3488,7 +3632,8 @@ class Block(object):
         block_str += "\n"
         for op in self.ops:
             block_str += "    {}\n".format(
-                op._to_readable_code(skip_op_callstack))
+                op._to_readable_code(skip_op_callstack)
+            )
         block_str += "}"
         return block_str
 
@@ -3507,22 +3652,28 @@ class Block(object):
             str: The debug string.
         """
         assert isinstance(throw_on_error, bool) and isinstance(
-            with_details, bool)
+            with_details, bool
+        )
         if with_details:
             re_add_indent = re.compile(r"\n(.)")
             res_str = "blocks {\n  idx: %d\n  parent_idx: %d" % (
-                self.idx, self.parent_idx)
+                self.idx,
+                self.parent_idx,
+            )
             for var in list(self.vars.values()):
                 res_str += "\n  vars {\n    %s  }" % re_add_indent.sub(
-                    r"\n    \1", var.to_string(throw_on_error, with_details))
+                    r"\n    \1", var.to_string(throw_on_error, with_details)
+                )
             for op in self.ops:
                 res_str += "\n  ops {\n    %s  }" % re_add_indent.sub(
-                    r"\n    \1", op.to_string(throw_on_error))
+                    r"\n    \1", op.to_string(throw_on_error)
+                )
             res_str += "\n}"
         else:
             protostr = self.desc.serialize_to_string()
             proto = framework_pb2.BlockDesc.FromString(
-                six.binary_type(protostr))
+                six.binary_type(protostr)
+            )
             res_str = _debug_string_(proto, throw_on_error)
         return res_str
 
@@ -3576,8 +3727,9 @@ class Block(object):
         """
         if not isinstance(name, six.string_types):
             raise TypeError(
-                "var require string as parameter, but get %s instead." %
-                (type(name)))
+                "var require string as parameter, but get %s instead."
+                % (type(name))
+            )
         v = self.vars.get(name, None)
         if v is None:
             raise ValueError("var %s not in this block" % name)
@@ -3643,8 +3795,11 @@ class Block(object):
         return list(self.iter_parameters())
 
     def iter_parameters(self):
-        return (item[1] for item in six.iteritems(self.vars)
-                if isinstance(item[1], Parameter))
+        return (
+            item[1]
+            for item in six.iteritems(self.vars)
+            if isinstance(item[1], Parameter)
+        )
 
     def create_var(self, *args, **kwargs):
         if _non_static_mode():
@@ -3699,43 +3854,51 @@ class Block(object):
         d = self.desc.find_var(cpt.to_bytes(new_name))
         if var_type == "Parameter":
             if in_dygraph_mode():
-                var = EagerParamBase(d.shape(),
-                                     d.dtype(),
-                                     type=orig_var_type,
-                                     name=new_name,
-                                     stop_gradient=stop_gradient,
-                                     trainable=trainable,
-                                     optimize_attr=optimize_attr,
-                                     regularizer=regularizer,
-                                     error_clip=error_clip)
+                var = EagerParamBase(
+                    d.shape(),
+                    d.dtype(),
+                    type=orig_var_type,
+                    name=new_name,
+                    stop_gradient=stop_gradient,
+                    trainable=trainable,
+                    optimize_attr=optimize_attr,
+                    regularizer=regularizer,
+                    error_clip=error_clip,
+                )
             else:
                 if _in_legacy_dygraph():
-                    var = ParamBase(d.shape(),
-                                    d.dtype(),
-                                    type=orig_var_type,
-                                    name=new_name,
-                                    stop_gradient=stop_gradient,
-                                    trainable=trainable,
-                                    optimize_attr=optimize_attr,
-                                    regularizer=regularizer,
-                                    error_clip=error_clip)
+                    var = ParamBase(
+                        d.shape(),
+                        d.dtype(),
+                        type=orig_var_type,
+                        name=new_name,
+                        stop_gradient=stop_gradient,
+                        trainable=trainable,
+                        optimize_attr=optimize_attr,
+                        regularizer=regularizer,
+                        error_clip=error_clip,
+                    )
                 else:
-                    var = Parameter(self,
-                                    d.shape(),
-                                    d.dtype(),
-                                    type=orig_var_type,
-                                    name=new_name,
-                                    stop_gradient=stop_gradient,
-                                    trainable=trainable,
-                                    optimize_attr=optimize_attr,
-                                    regularizer=regularizer,
-                                    error_clip=error_clip)
+                    var = Parameter(
+                        self,
+                        d.shape(),
+                        d.dtype(),
+                        type=orig_var_type,
+                        name=new_name,
+                        stop_gradient=stop_gradient,
+                        trainable=trainable,
+                        optimize_attr=optimize_attr,
+                        regularizer=regularizer,
+                        error_clip=error_clip,
+                    )
         elif var_type == "Variable":
-            var = Variable(self,
-                           type=orig_var_type,
-                           name=new_name,
-                           error_clip=error_clip,
-                           stop_gradient=stop_gradient)
+            var = Variable(
+                self,
+                type=orig_var_type,
+                name=new_name,
+                error_clip=error_clip,
+                stop_gradient=stop_gradient,
+            )
 
         # rename the python side, _sync_with_cpp will only add
         # new vars/ops to python side.
@@ -3772,8 +3935,9 @@ class Block(object):
                         # Think of "c_broadcast" and "c_sync_comm_stream" as a special case here.
                         # NOTE: "coalesce_tensor" is a special case for rnn with cudnn support
                         if op.type in [
-                                "c_broadcast", "c_sync_comm_stream",
-                                "coalesce_tensor"
+                            "c_broadcast",
+                            "c_sync_comm_stream",
+                            "coalesce_tensor",
                         ]:
                             continue
                         init_ops.append(op)
@@ -3783,9 +3947,12 @@ class Block(object):
             init_ops = _is_inited_by(global_block, param)
             init_ops_len = len(init_ops)
             if init_ops_len > 1:
-                raise RuntimeError("param " + param.name +
-                                   " is inited by multiple init ops " +
-                                   str(init_ops))
+                raise RuntimeError(
+                    "param "
+                    + param.name
+                    + " is inited by multiple init ops "
+                    + str(init_ops)
+                )
             elif init_ops_len == 1:
                 # TODO already inited, do nothing, should log a warning
                 pass
@@ -3807,24 +3974,31 @@ class Block(object):
             warnings.warn(
                 "Op `%s` is executed through `append_op` under the dynamic mode, "
                 "the corresponding API implementation needs to be upgraded to "
-                "using `_C_ops` method." % type, DeprecationWarning)
-            op = Operator(block=self,
-                          desc=None,
-                          type=type,
-                          inputs=None,
-                          outputs=None,
-                          attrs=attrs)
+                "using `_C_ops` method." % type,
+                DeprecationWarning,
+            )
+            op = Operator(
+                block=self,
+                desc=None,
+                type=type,
+                inputs=None,
+                outputs=None,
+                attrs=attrs,
+            )
 
             # record ops in tracer rather than blocks
             #
             # TODO(minqiyang): add op stop_gradient support in static mode too.
             # currently, we only support stop_gradient in dygraph mode.
 
-            _dygraph_tracer().trace_op(type, kwargs.get("inputs", {}),
-                                       kwargs.get("outputs",
-                                                  {}), attrs if attrs else {},
-                                       kwargs.get("stop_gradient", False),
-                                       inplace_map)
+            _dygraph_tracer().trace_op(
+                type,
+                kwargs.get("inputs", {}),
+                kwargs.get("outputs", {}),
+                attrs if attrs else {},
+                kwargs.get("stop_gradient", False),
+                inplace_map,
+            )
         else:
             from paddle.fluid.dygraph.base import param_guard
 
@@ -3835,12 +4009,14 @@ class Block(object):
             inputs = kwargs.get("inputs", None)
             outputs = kwargs.get("outputs", None)
             with param_guard(inputs), param_guard(outputs):
-                op = Operator(block=self,
-                              desc=op_desc,
-                              type=kwargs.get("type", None),
-                              inputs=inputs,
-                              outputs=outputs,
-                              attrs=kwargs.get("attrs", None))
+                op = Operator(
+                    block=self,
+                    desc=op_desc,
+                    type=kwargs.get("type", None),
+                    inputs=inputs,
+                    outputs=outputs,
+                    attrs=kwargs.get("attrs", None),
+                )
 
             self.ops.append(op)
 
@@ -3861,7 +4037,7 @@ class Block(object):
 
     def _insert_op_without_sync(self, index, *args, **kwargs):
         """
-        Insert an Operator according to the giving arguments, 
+        Insert an Operator according to the given arguments,
         without sync_with_cpp to meke the compilation faster.
 
         Args:
@@ -3907,25 +4083,27 @@ class Block(object):
         if _non_static_mode():
             type = kwargs.get("type", None)
             attrs = kwargs.get("attrs", {})
-            op = Operator(self,
-                          None,
-                          type=type,
-                          inputs=None,
-                          outputs=None,
-                          attrs=attrs)
-
-            _dygraph_tracer().trace_op(type, kwargs.get("inputs", {}),
-                                       kwargs.get("outputs", {}),
-                                       attrs if attrs else {},
-                                       kwargs.get("stop_gradient", False))
+            op = Operator(
+                self, None, type=type, inputs=None, outputs=None, attrs=attrs
+            )
+
+            _dygraph_tracer().trace_op(
+                type,
+                kwargs.get("inputs", {}),
+                kwargs.get("outputs", {}),
+                attrs if attrs else {},
+                kwargs.get("stop_gradient", False),
+            )
         else:
             op_desc = self.desc._prepend_op()
-            op = Operator(self,
-                          op_desc,
-                          type=kwargs.get("type", None),
-                          inputs=kwargs.get("inputs", None),
-                          outputs=kwargs.get("outputs", None),
-                          attrs=kwargs.get("attrs", None))
+            op = Operator(
+                self,
+                op_desc,
+                type=kwargs.get("type", None),
+                inputs=kwargs.get("inputs", None),
+                outputs=kwargs.get("outputs", None),
+                attrs=kwargs.get("attrs", None),
+            )
             self.ops.insert(0, op)
 
         return op
@@ -3942,17 +4120,21 @@ class Block(object):
                 if var.has_stop_gradient():
                     is_stop_gradient = var.stop_gradient()
                 if var.has_is_parameter() and var.is_parameter():
-                    self.create_parameter(name=var.name(),
-                                          desc=var,
-                                          type=var.type(),
-                                          shape=var.shape(),
-                                          dtype=var.dtype(),
-                                          stop_gradient=is_stop_gradient)
+                    self.create_parameter(
+                        name=var.name(),
+                        desc=var,
+                        type=var.type(),
+                        shape=var.shape(),
+                        dtype=var.dtype(),
+                        stop_gradient=is_stop_gradient,
+                    )
                 else:
-                    self.create_var(name=var.name(),
-                                    desc=var,
-                                    type=var.type(),
-                                    stop_gradient=is_stop_gradient)
+                    self.create_var(
+                        name=var.name(),
+                        desc=var,
+                        type=var.type(),
+                        stop_gradient=is_stop_gradient,
+                    )
 
         # sync variables removed from c++ end
         for var in list(self.vars.keys()):
@@ -3998,9 +4180,12 @@ class Block(object):
             ops_in_cpp_index = 0
             ops_in_python_index = 0
             while ops_in_python_index < len(
-                    self.ops) and ops_in_cpp_index < len(ops_in_cpp):
-                if self.ops[ops_in_python_index].desc != ops_in_cpp[
-                        ops_in_cpp_index]:
+                self.ops
+            ) and ops_in_cpp_index < len(ops_in_cpp):
+                if (
+                    self.ops[ops_in_python_index].desc
+                    != ops_in_cpp[ops_in_cpp_index]
+                ):
                     del self.ops[ops_in_python_index]
                 else:
                     ops_in_cpp_index += 1
@@ -4026,7 +4211,8 @@ class Block(object):
         """
         if not isinstance(other, Block):
             raise TypeError(
-                "_copy_param_info_from should be invoked with Block")
+                "_copy_param_info_from should be invoked with Block"
+            )
         for p in other.iter_parameters():
             assert isinstance(p, Parameter)
             v = self.vars.get(p.name, None)
@@ -4036,28 +4222,32 @@ class Block(object):
             assert isinstance(v, Variable)
             new_p = None
             if in_dygraph_mode():
-                new_p = EagerParamBase(shape=v.shape,
-                                       dtype=v.dtype,
-                                       type=v.type,
-                                       lod_level=v.lod_level,
-                                       stop_gradient=p.stop_gradient,
-                                       trainable=p.trainable,
-                                       optimize_attr=p.optimize_attr,
-                                       regularizer=p.regularizer,
-                                       error_clip=p.error_clip,
-                                       name=v.name)
+                new_p = EagerParamBase(
+                    shape=v.shape,
+                    dtype=v.dtype,
+                    type=v.type,
+                    lod_level=v.lod_level,
+                    stop_gradient=p.stop_gradient,
+                    trainable=p.trainable,
+                    optimize_attr=p.optimize_attr,
+                    regularizer=p.regularizer,
+                    error_clip=p.error_clip,
+                    name=v.name,
+                )
             else:
                 if _in_legacy_dygraph():
-                    new_p = ParamBase(shape=v.shape,
-                                      dtype=v.dtype,
-                                      type=v.type,
-                                      lod_level=v.lod_level,
-                                      stop_gradient=p.stop_gradient,
-                                      trainable=p.trainable,
-                                      optimize_attr=p.optimize_attr,
-                                      regularizer=p.regularizer,
-                                      error_clip=p.error_clip,
-                                      name=v.name)
+                    new_p = ParamBase(
+                        shape=v.shape,
+                        dtype=v.dtype,
+                        type=v.type,
+                        lod_level=v.lod_level,
+                        stop_gradient=p.stop_gradient,
+                        trainable=p.trainable,
+                        optimize_attr=p.optimize_attr,
+                        regularizer=p.regularizer,
+                        error_clip=p.error_clip,
+                        name=v.name,
+                    )
                 else:
                     new_p = Parameter(
                         block=self,
@@ -4065,13 +4255,15 @@ class Block(object):
                         dtype=v.dtype,
                         type=v.type,
                         lod_level=v.lod_level
-                        if v.type == core.VarDesc.VarType.LOD_TENSOR else None,
+                        if v.type == core.VarDesc.VarType.LOD_TENSOR
+                        else None,
                         stop_gradient=p.stop_gradient,
                         trainable=p.trainable,
                         optimize_attr=p.optimize_attr,
                         regularizer=p.regularizer,
                         error_clip=p.error_clip,
-                        name=v.name)
+                        name=v.name,
+                    )
             self.vars[new_p.name] = new_p
 
     def _clone_variable(self, var, force_persistable=True):
@@ -4091,13 +4283,13 @@ class Block(object):
         ret_var = None
         # make STEP_SCOPES var can be safely cloned.
         if var.type == core.VarDesc.VarType.STEP_SCOPES:
-            ret_var = self.create_var(name=var.name,
-                                      persistable=var.persistable,
-                                      type=var.type)
+            ret_var = self.create_var(
+                name=var.name, persistable=var.persistable, type=var.type
+            )
         elif var.type == core.VarDesc.VarType.RAW:
-            ret_var = self.create_var(name=var.name,
-                                      persistable=var.persistable,
-                                      type=var.type)
+            ret_var = self.create_var(
+                name=var.name, persistable=var.persistable, type=var.type
+            )
         elif var.type == core.VarDesc.VarType.SELECTED_ROWS:
             ret_var = self.create_var(
                 name=var.name,
@@ -4106,7 +4298,8 @@ class Block(object):
                 type=var.type,
                 persistable=True if force_persistable else var.persistable,
                 is_data=var.is_data,
-                need_check_feed=var.desc.need_check_feed())
+                need_check_feed=var.desc.need_check_feed(),
+            )
         else:
             ret_var = self.create_var(
                 name=var.name,
@@ -4116,7 +4309,8 @@ class Block(object):
                 lod_level=var.lod_level,
                 persistable=True if force_persistable else var.persistable,
                 is_data=var.is_data,
-                need_check_feed=var.desc.need_check_feed())
+                need_check_feed=var.desc.need_check_feed(),
+            )
         return ret_var
 
 
@@ -4126,17 +4320,20 @@ class Block(object):
 # re-constructed inside this method. The underlying VarDesc(OpDesc)
 # of some old Python Variables(all old Python Operators) may have
 # been destructed.
-def _apply_pass(main_program,
-                startup_program,
-                pass_name,
-                pass_attrs={},
-                pass_attr_types={}):
+def _apply_pass(
+    main_program, startup_program, pass_name, pass_attrs={}, pass_attr_types={}
+):
     assert isinstance(pass_attrs, dict), "pass_attrs must be dict"
     assert isinstance(pass_attr_types, dict), "pass_attr_types must be dict"
     tmp_main_program = core.ProgramDesc(main_program.desc)
     tmp_startup_program = core.ProgramDesc(startup_program.desc)
-    attrs = core.apply_pass(tmp_main_program, tmp_startup_program, pass_name,
-                            pass_attrs, pass_attr_types)
+    attrs = core.apply_pass(
+        tmp_main_program,
+        tmp_startup_program,
+        pass_name,
+        pass_attrs,
+        pass_attr_types,
+    )
     main_program._rebuild_from_desc(tmp_main_program)
     startup_program._rebuild_from_desc(tmp_startup_program)
     return attrs
@@ -4154,8 +4351,9 @@ class IrNode(object):
         Args:
             node(core.Node): C++ Node.
         """
-        assert isinstance(node,
-                          core.Node), 'node must be the instance of core.Node.'
+        assert isinstance(
+            node, core.Node
+        ), 'node must be the instance of core.Node.'
         self.node = node
 
     def name(self):
@@ -4331,8 +4529,9 @@ class IrVarNode(IrNode):
         Args:
             node(core.Node): C++ Node.
         """
-        assert isinstance(node, core.Node) and node.is_var(), \
-            'node must be the instance of core.Node and it must be a variable node.'
+        assert (
+            isinstance(node, core.Node) and node.is_var()
+        ), 'node must be the instance of core.Node and it must be a variable node.'
         super(IrVarNode, self).__init__(node)
         self.node = node
 
@@ -4343,8 +4542,9 @@ class IrVarNode(IrNode):
         Args:
             shape(list): shape to be set.
         """
-        assert self.node.var() is not None, \
-            "The node variable description can not be None."
+        assert (
+            self.node.var() is not None
+        ), "The node variable description can not be None."
         self.node.var().set_shape(shape)
 
     def persistable(self):
@@ -4354,8 +4554,9 @@ class IrVarNode(IrNode):
         Returns:
             bool: indicate whether the variable is persistable.
         """
-        assert self.node.var() is not None, \
-            "The node variable description can not be None."
+        assert (
+            self.node.var() is not None
+        ), "The node variable description can not be None."
         return self.node.var().persistable()
 
     def type(self):
@@ -4365,8 +4566,9 @@ class IrVarNode(IrNode):
         Returns:
             core.VarDesc.VarType: the variable type.
         """
-        assert self.node.var() is not None, \
-            "The node variable description can not be None."
+        assert (
+            self.node.var() is not None
+        ), "The node variable description can not be None."
         return self.node.var().type()
 
     def dtype(self):
@@ -4376,8 +4578,9 @@ class IrVarNode(IrNode):
         Returns:
             core.VarDesc.VarType: the variable data type.
         """
-        assert self.node.var() is not None, \
-            "The node variable description can not be None."
+        assert (
+            self.node.var() is not None
+        ), "The node variable description can not be None."
         return self.node.var().dtype()
 
     def shape(self):
@@ -4387,8 +4590,9 @@ class IrVarNode(IrNode):
         Returns:
             list: the variable shape.
         """
-        assert self.node.var() is not None, \
-            "The node variable description can not be None."
+        assert (
+            self.node.var() is not None
+        ), "The node variable description can not be None."
         return self.node.var().shape()
 
     @property
@@ -4424,8 +4628,9 @@ class IrOpNode(IrNode):
         Args:
             node(core.Node): C++ Node.
         """
-        assert isinstance(node, core.Node) and node.is_op(), \
-            'node must be the instance of core.Node and it must be a operator node.'
+        assert (
+            isinstance(node, core.Node) and node.is_op()
+        ), 'node must be the instance of core.Node and it must be a operator node.'
         super(IrOpNode, self).__init__(node)
         self.node = node
 
@@ -4437,8 +4642,9 @@ class IrOpNode(IrNode):
             old_input_name(str): the old input name.
             new_input_name(str): the new input name.
         """
-        assert self.node.op() is not None, \
-            "The node operator description can not be None."
+        assert (
+            self.node.op() is not None
+        ), "The node operator description can not be None."
         self.node.op()._rename_input(old_input_name, new_input_name)
 
     def rename_output(self, old_output_name, new_output_name):
@@ -4449,8 +4655,9 @@ class IrOpNode(IrNode):
             old_output_name(str): the old output name.
             new_output_name(str): the new output name.
         """
-        assert self.node.op() is not None, \
-            "The node operator description can not be None."
+        assert (
+            self.node.op() is not None
+        ), "The node operator description can not be None."
         self.node.op()._rename_output(old_output_name, new_output_name)
 
     def input(self, name):
@@ -4463,8 +4670,9 @@ class IrOpNode(IrNode):
         Returns:
             list(str): the argument name list.
         """
-        assert self.node.op() is not None, \
-            "The node operator description can not be None."
+        assert (
+            self.node.op() is not None
+        ), "The node operator description can not be None."
         return self.node.op().input(name)
 
     def output(self, name):
@@ -4477,8 +4685,9 @@ class IrOpNode(IrNode):
         Returns:
             list(str): the argument name list.
         """
-        assert self.node.op() is not None, \
-            "The node operator description can not be None."
+        assert (
+            self.node.op() is not None
+        ), "The node operator description can not be None."
         return self.node.op().output(name)
 
     def set_type(self, new_type):
@@ -4488,8 +4697,9 @@ class IrOpNode(IrNode):
         Args:
             new_type(str): new operator type to be set.
         """
-        assert self.node.op() is not None, \
-            "The node operator description can not be None."
+        assert (
+            self.node.op() is not None
+        ), "The node operator description can not be None."
         return self.node.op().set_type(new_type)
 
     def set_attr(self, name, val):
@@ -4506,8 +4716,9 @@ class IrOpNode(IrNode):
         """
         Update the value of the op desc's attribute by attribute's name.
         """
-        assert self.node.op() is not None, \
-            "The node operator description can not be None."
+        assert (
+            self.node.op() is not None
+        ), "The node operator description can not be None."
         desc = self.node.op()
         if isinstance(val, Variable):
             desc.set_var_attr(name, val.desc)
@@ -4517,8 +4728,9 @@ class IrOpNode(IrNode):
             desc.set_block_attr(name, val.desc)
         elif isinstance(val, list) and val and _all_is_type(val, Block):
             desc.set_blocks_attr(name, [v.desc for v in val])
-        elif isinstance(val, core.BlockDesc) or \
-                isinstance(val, core.ProgramDesc):
+        elif isinstance(val, core.BlockDesc) or isinstance(
+            val, core.ProgramDesc
+        ):
             desc.set_serialized_attr(name, val.serialize_to_string())
         else:
             desc._set_attr(name, val)
@@ -4530,8 +4742,9 @@ class IrOpNode(IrNode):
         Returns:
             list(str): input arguments' names of this op node.
         """
-        assert self.node.op() is not None, \
-            "The node operator description can not be None."
+        assert (
+            self.node.op() is not None
+        ), "The node operator description can not be None."
         return self.node.op().input_arg_names()
 
     def output_arg_names(self):
@@ -4541,8 +4754,9 @@ class IrOpNode(IrNode):
         Returns:
             list(str): output arguments' names of this op node.
         """
-        assert self.node.op() is not None, \
-            "The node operator description can not be None."
+        assert (
+            self.node.op() is not None
+        ), "The node operator description can not be None."
         return self.node.op().output_arg_names()
 
     @property
@@ -4583,7 +4797,8 @@ class IrGraph(object):
             for_test(bool): True for the test graph and false for the train graph.
         """
         assert isinstance(
-            graph, core.Graph), 'graph must be the instance of core.Graph.'
+            graph, core.Graph
+        ), 'graph must be the instance of core.Graph.'
         self.graph = graph
         self._for_test = for_test
 
@@ -4624,8 +4839,11 @@ class IrGraph(object):
         """
         persistable_nodes = set()
         for node in self.graph.nodes():
-            if node.is_var() and node.var() is not None and node.var(
-            ).persistable():
+            if (
+                node.is_var()
+                and node.var() is not None
+                and node.var().persistable()
+            ):
                 persistable_nodes.add(node)
         return {IrVarNode(p) for p in persistable_nodes}
 
@@ -4732,13 +4950,15 @@ class IrGraph(object):
         for input_name, var_nodes in six.iteritems(inputs):
             if not isinstance(var_nodes, list):
                 var_nodes = [var_nodes]
-            op_desc.set_input(input_name,
-                              [var_node.name() for var_node in var_nodes])
+            op_desc.set_input(
+                input_name, [var_node.name() for var_node in var_nodes]
+            )
         for output_name, var_nodes in six.iteritems(outputs):
             if not isinstance(var_nodes, list):
                 var_nodes = [var_nodes]
-            op_desc.set_output(output_name,
-                               [var_node.name() for var_node in var_nodes])
+            op_desc.set_output(
+                output_name, [var_node.name() for var_node in var_nodes]
+            )
         return IrOpNode(self.graph.create_op_node(op_desc))
 
     def create_op_node_from_desc(self, op_desc):
@@ -4762,9 +4982,11 @@ class IrGraph(object):
             new_input_node(IrNode): the new input node of the giving op_node.
             op_node(IrOpNode): the operator node that is needed to update input's link.
         """
-        assert old_input_node.node in self.graph.nodes() and new_input_node.node in \
-            self.graph.nodes() and op_node.node in self.graph.nodes(), \
-            'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
+        assert (
+            old_input_node.node in self.graph.nodes()
+            and new_input_node.node in self.graph.nodes()
+            and op_node.node in self.graph.nodes()
+        ), 'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.'
         old_input_node.remove_output(op_node)
         op_node.remove_input(old_input_node)
         new_input_node.append_output(op_node)
@@ -4780,9 +5002,11 @@ class IrGraph(object):
             new_output_node(IrNode): the new output node of the giving op_node.
             op_node(IrOpNode): the operator node that is needed to update input's link.
         """
-        assert old_output_node.node in self.graph.nodes() and new_output_node.node in \
-            self.graph.nodes() and op_node.node in self.graph.nodes(), \
-            'The three arguments(old_output_node &new_output_node &op_node) must be in the graph nodes.'
+        assert (
+            old_output_node.node in self.graph.nodes()
+            and new_output_node.node in self.graph.nodes()
+            and op_node.node in self.graph.nodes()
+        ), 'The three arguments(old_output_node &new_output_node &op_node) must be in the graph nodes.'
         old_output_node.remove_input(op_node)
         op_node.remove_output(old_output_node)
         new_output_node.append_input(op_node)
@@ -4798,9 +5022,11 @@ class IrGraph(object):
             node_out(IrNode): the output node.
         """
         assert node_in.node in self.graph.nodes(), (
-            'node_in(%s) must be in the graph nodes.' % node_in.node.name())
+            'node_in(%s) must be in the graph nodes.' % node_in.node.name()
+        )
         assert node_out.node in self.graph.nodes(), (
-            'node_out(%s) must be in the graph nodes.' % node_out.node.name())
+            'node_out(%s) must be in the graph nodes.' % node_out.node.name()
+        )
         node_in.append_output(node_out)
         node_out.append_input(node_in)
 
@@ -4837,8 +5063,8 @@ class IrGraph(object):
                         ]
                     else:
                         var_nodes[each_var_name].append(
-                            self._find_node_by_name(node.outputs,
-                                                    each_var_name))
+                            self._find_node_by_name(node.outputs, each_var_name)
+                        )
         self.graph.resolve_hazard(var_nodes)
 
     def has_circle(self):
@@ -4900,13 +5126,15 @@ class IrGraph(object):
 
         def _convert_to_pdf(dot_file_path):
             pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
-            exited_code = subprocess.call('dot -Tpdf ' + dot_file_path +
-                                          ' -o ' + pdf_save_path,
-                                          shell=True)
+            exited_code = subprocess.call(
+                'dot -Tpdf ' + dot_file_path + ' -o ' + pdf_save_path,
+                shell=True,
+            )
             if exited_code != 0:
                 print('The dot command is needed for creating pdf files.')
-                print('The {} is saved as the dot filetype.'.format(
-                    dot_file_path))
+                print(
+                    'The {} is saved as the dot filetype.'.format(dot_file_path)
+                )
 
         remove_ctr_vars = set()
         if remove_ctr_var:
@@ -4963,7 +5191,8 @@ class IrGraph(object):
             if n.name() == node_name:
                 target_node = n
         assert target_node is not None, (
-            "Cannot find the target node (%s)in the giving set." % node_name)
+            "Cannot find the target node (%s)in the giving set." % node_name
+        )
         return target_node
 
     def _update_desc_attr(self, desc, name, val):
@@ -4978,8 +5207,9 @@ class IrGraph(object):
             desc.set_block_attr(name, val.desc)
         elif isinstance(val, list) and val and _all_is_type(val, Block):
             desc.set_blocks_attr(name, [v.desc for v in val])
-        elif isinstance(val, core.BlockDesc) or \
-                isinstance(val, core.ProgramDesc):
+        elif isinstance(val, core.BlockDesc) or isinstance(
+            val, core.ProgramDesc
+        ):
             desc.set_serialized_attr(name, val.serialize_to_string())
         else:
             desc._set_attr(name, val)
@@ -5081,7 +5311,8 @@ class Program(object):
 
         # identifier for auto checkpoint
         self._auto_checkpoint_name = unique_name.generate(
-            "__auto_checkpoint_program__")
+            "__auto_checkpoint_program__"
+        )
 
         # compiled program, i.e. Graph
         self._graph = None
@@ -5101,7 +5332,7 @@ class Program(object):
         all_new_vars = []
         block_num = new_desc.num_blocks()
         for idx in range(block_num):
-            if (idx > (len(self.blocks) - 1)):
+            if idx > (len(self.blocks) - 1):
                 self._create_block()
             new_block_desc = new_desc.block(idx)
             all_new_vars.append([])
@@ -5113,60 +5344,75 @@ class Program(object):
                     old_var = None
 
                 kwargs = {
-                    'type':
-                    new_var_desc.type(),
-                    'name':
-                    new_var_desc.name(),
-                    'shape':
-                    get_var_desc_attr_or_none(new_var_desc, "shape", [
-                        core.VarDesc.VarType.LOD_TENSOR,
-                        core.VarDesc.VarType.SELECTED_ROWS,
-                        core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                    ]),
-                    'dtype':
-                    get_var_desc_attr_or_none(new_var_desc, "dtype", [
-                        core.VarDesc.VarType.LOD_TENSOR,
-                        core.VarDesc.VarType.SELECTED_ROWS,
-                        core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                    ]),
-                    'lod_level':
-                    get_var_desc_attr_or_none(new_var_desc, "lod_level", [
-                        core.VarDesc.VarType.LOD_TENSOR,
-                        core.VarDesc.VarType.LOD_TENSOR_ARRAY,
-                    ]),
-                    'error_clip':
-                    old_var.error_clip if old_var is not None else None,
-                    'stop_gradient':
-                    old_var.stop_gradient if old_var is not None else False,
-                    'is_data':
-                    old_var.is_data if old_var is not None else False,
-                    'need_check_feed':
-                    new_var_desc.need_check_feed(),
-                    'belong_to_optimizer':
-                    old_var.belong_to_optimizer
-                    if old_var is not None else False,
+                    'type': new_var_desc.type(),
+                    'name': new_var_desc.name(),
+                    'shape': get_var_desc_attr_or_none(
+                        new_var_desc,
+                        "shape",
+                        [
+                            core.VarDesc.VarType.LOD_TENSOR,
+                            core.VarDesc.VarType.SELECTED_ROWS,
+                            core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                        ],
+                    ),
+                    'dtype': get_var_desc_attr_or_none(
+                        new_var_desc,
+                        "dtype",
+                        [
+                            core.VarDesc.VarType.LOD_TENSOR,
+                            core.VarDesc.VarType.SELECTED_ROWS,
+                            core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                        ],
+                    ),
+                    'lod_level': get_var_desc_attr_or_none(
+                        new_var_desc,
+                        "lod_level",
+                        [
+                            core.VarDesc.VarType.LOD_TENSOR,
+                            core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                        ],
+                    ),
+                    'error_clip': old_var.error_clip
+                    if old_var is not None
+                    else None,
+                    'stop_gradient': old_var.stop_gradient
+                    if old_var is not None
+                    else False,
+                    'is_data': old_var.is_data
+                    if old_var is not None
+                    else False,
+                    'need_check_feed': new_var_desc.need_check_feed(),
+                    'belong_to_optimizer': old_var.belong_to_optimizer
+                    if old_var is not None
+                    else False,
                 }
 
                 if isinstance(old_var, Parameter):
-                    kwargs.update({
-                        'trainable': old_var.trainable,
-                        'optimize_attr': old_var.optimize_attr,
-                        'regularizer': old_var.regularizer,
-                        'do_model_average': old_var.do_model_average,
-                        'need_clip': old_var.need_clip,
-                        'is_distributed': old_var.is_distributed,
-                        'is_parameter': old_var.is_parameter,
-                    })
-                    block_new_vars.append({
-                        'class': Parameter,
-                        'kwargs': copy.deepcopy(kwargs),
-                    })
+                    kwargs.update(
+                        {
+                            'trainable': old_var.trainable,
+                            'optimize_attr': old_var.optimize_attr,
+                            'regularizer': old_var.regularizer,
+                            'do_model_average': old_var.do_model_average,
+                            'need_clip': old_var.need_clip,
+                            'is_distributed': old_var.is_distributed,
+                            'is_parameter': old_var.is_parameter,
+                        }
+                    )
+                    block_new_vars.append(
+                        {
+                            'class': Parameter,
+                            'kwargs': copy.deepcopy(kwargs),
+                        }
+                    )
                 else:
                     kwargs['persistable'] = new_var_desc.persistable()
-                    block_new_vars.append({
-                        'class': Variable,
-                        'kwargs': copy.deepcopy(kwargs),
-                    })
+                    block_new_vars.append(
+                        {
+                            'class': Variable,
+                            'kwargs': copy.deepcopy(kwargs),
+                        }
+                    )
 
         return all_new_vars
 
@@ -5398,7 +5644,8 @@ class Program(object):
         assert isinstance(
             skip_op_callstack, bool
         ), "skip_op_callstack parameter's type is error, expect bool, received {}".format(
-            type(skip_op_callstack))
+            type(skip_op_callstack)
+        )
         program_str = ""
         for block in self.blocks:
             program_str += block._to_readable_code(skip_op_callstack)
@@ -5440,11 +5687,13 @@ class Program(object):
         assert isinstance(
             throw_on_error, bool
         ), "The type of throw_on_error parameter is wrong, expected bool, but received {}.".format(
-            type(throw_on_error))
+            type(throw_on_error)
+        )
         assert isinstance(
             with_details, bool
         ), "The type of with_details parameter is wrong, expected bool, but received {}.".format(
-            type(with_details))
+            type(with_details)
+        )
 
         if with_details:
             res_str = ""
@@ -5453,7 +5702,8 @@ class Program(object):
         else:
             protostr = self.desc.serialize_to_string()
             proto = framework_pb2.ProgramDesc.FromString(
-                six.binary_type(protostr))
+                six.binary_type(protostr)
+            )
             res_str = _debug_string_(proto, throw_on_error)
         return res_str
 
@@ -5473,8 +5723,8 @@ class Program(object):
     def clone(self, for_test=False):
         """
         .. note:::
-            1. :code:`Program.clone()` method DOES NOT clone :ref:`api_paddle_io_DataLoader` . 
-            2. Recommend you to use :code:`clone` before using :code:`Opimizer.minimize` . 
+            1. :code:`Program.clone()` method DOES NOT clone :ref:`api_paddle_io_DataLoader` .
+            2. Recommend you to use :code:`clone` before using :code:`Optimizer.minimize` .
             3. This API has no effect in Dygraph Mode.
 
         Create a new Program with forward content of original one when ``for_test=True``.
@@ -5651,7 +5901,8 @@ class Program(object):
         if for_test:
             forward_prog = Program()
             forward_prog.desc, pruned_origin_block_id_map = core.prune_backward(
-                self.desc)
+                self.desc
+            )
             forward_prog.blocks = [
                 Block(forward_prog, i)
                 for i in six.moves.range(forward_prog.desc.num_blocks())
@@ -5702,8 +5953,8 @@ class Program(object):
     def _prune_with_input(self, feeded_var_names, targets):
         """
         Prune operators and variables which are not needed to generate
-        :code:`targets`. Prune operators and variables which are needed 
-        to generate feeded_var 
+        :code:`targets`. Prune operators and variables which are needed
+        to generate feeded_var
 
         Notes: This is a very low level API. Users should not use this API
         directly. This API is in flux and not stable.
@@ -5731,7 +5982,8 @@ class Program(object):
             if not isinstance(var, six.string_types):
                 raise ValueError(
                     "All feeded_var_names of Program._prune_with_input() can only be "
-                    "str, but received %s." % type(var))
+                    "str, but received %s." % type(var)
+                )
 
         # find out all variables that can be generated or updated with given feed
         generatable_vars = set()
@@ -5759,7 +6011,8 @@ class Program(object):
                 else:
                     raise ValueError(
                         "All targets of Program._prune_with_input() can only be "
-                        "Variable or Operator, but received %s." % type(t))
+                        "Variable or Operator, but received %s." % type(t)
+                    )
 
                 # NOTEZ(zhiqiu): For variable to be fed in fetch_list, there two cases:
                 # (1) the variable is leaf, it has no op that generates it;
@@ -5793,7 +6046,8 @@ class Program(object):
 
         res = Program()
         res.desc, pruned_origin_block_id_map = core.prune(
-            self.desc, set(feeded_var_names), targets_idx)
+            self.desc, set(feeded_var_names), targets_idx
+        )
         res.blocks = [
             Block(res, i) for i in six.moves.range(res.desc.num_blocks())
         ]
@@ -5834,8 +6088,10 @@ class Program(object):
         root_block = res.desc.block(0)
         if prune_read_op:
             while True:
-                if read_op_idx >= root_block.op_size() or root_block.op(
-                        read_op_idx).type() == 'read':
+                if (
+                    read_op_idx >= root_block.op_size()
+                    or root_block.op(read_op_idx).type() == 'read'
+                ):
                     break
                 read_op_idx += 1
             if read_op_idx < root_block.op_size():
@@ -5931,14 +6187,22 @@ class Program(object):
                 # for name in remove_output_list:
                 #     op.remove_output(name)
 
-                op_quant_name = core.op_proto_and_checker_maker.kOpWithQuantAttrName(
+                op_quant_name = (
+                    core.op_proto_and_checker_maker.kOpWithQuantAttrName()
+                )
+                quant = (
+                    bool(op.attr(op_quant_name))
+                    if op_quant_name in op.attr_names()
+                    else False
                 )
-                quant = bool(op.attr(op_quant_name)
-                             ) if op_quant_name in op.attr_names() else False
                 quant_attrs = [
-                    op_quant_name, "quantization_type", "skip_quant",
-                    "activation_bits", "bit_length", "quantize_weight_bits",
-                    "weight_quant_scale"
+                    op_quant_name,
+                    "quantization_type",
+                    "skip_quant",
+                    "activation_bits",
+                    "bit_length",
+                    "quantize_weight_bits",
+                    "weight_quant_scale",
                 ]
                 for extra_attr_name in extra_attrs_map.keys():
                     op.remove_attr(extra_attr_name)
@@ -5969,7 +6233,7 @@ class Program(object):
     def parse_from_string(binary_str):
         """
         .. note::
-            1. All information about parameters will be lost after serialization; 
+            1. All information about parameters will be lost after serialization;
             2. This API has no effect in Dygraph mode.
 
         Deserialize a Program from  `protobuf <https://en.wikipedia.org/wiki/Protocol_Buffers>`_  binary string.
@@ -6034,7 +6298,7 @@ class Program(object):
         The default random seed for random operators in Program. ``0`` means get
         the random seed from random device.
 
-        .. note:: 
+        .. note::
             It must be set before the operators have been added.
 
         Returns:
@@ -6072,7 +6336,7 @@ class Program(object):
         """
         The number of :ref:`api_guide_Block_en`  in this Program.
 
-        .. note:: 
+        .. note::
             This API has no effect in Dygraph mode.
 
         Returns:
@@ -6101,7 +6365,8 @@ class Program(object):
         if not isinstance(seed, int):
             raise ValueError(
                 "Program.random_seed's input seed must be an integer, but received %s."
-                % type(seed))
+                % type(seed)
+            )
         self._seed = seed
 
     def __repr__(self):
@@ -6198,8 +6463,11 @@ class Program(object):
             Block: The new block.
         """
         new_block_idx = len(self.blocks)
-        parent = self.current_block() if parent_idx is None else self.block(
-            parent_idx)
+        parent = (
+            self.current_block()
+            if parent_idx is None
+            else self.block(parent_idx)
+        )
         self.desc.append_block(parent.desc)
         self.current_block_idx = new_block_idx
         self.blocks.append(Block(self, self.current_block_idx))
@@ -6245,7 +6513,8 @@ class Program(object):
         if not isinstance(other, Program):
             raise TypeError(
                 "Function Program._copy_param_info_from() needs to pass in a source Program, but received %s"
-                % type(other))
+                % type(other)
+            )
 
         self.global_block()._copy_param_info_from(other.global_block())
 
@@ -6262,7 +6531,8 @@ class Program(object):
         if not isinstance(other, Program):
             raise TypeError(
                 "Function Program._copy_param_info_from() needs to pass in a source Program, but received %s"
-                % type(other))
+                % type(other)
+            )
         self._is_distributed = other._is_distributed
         self._is_chief = other._is_chief
         self._parameters_on_pservers = other._parameters_on_pservers
@@ -6280,8 +6550,8 @@ class Program(object):
         Args:
             other(Program): Other program
             pruned_origin_block_id_map(dict{int:int}): A dict which maps the block id in program
-            self to the block id in program other. For example, {0:0, 1:1, 2:3} means block 0 in self is 
-            cloned from block 0 in other, etc. Default is None, which means default mapped, 
+            self to the block id in program other. For example, {0:0, 1:1, 2:3} means block 0 in self is
+            cloned from block 0 in other, etc. Default is None, which means default mapped,
             {0:0, 1:1,..., n:n}.
 
         Returns:
@@ -6290,12 +6560,12 @@ class Program(object):
         if not isinstance(other, Program):
             raise TypeError(
                 "Function Program._copy_param_info_from() needs to pass in a source Program, but received %s"
-                % type(other))
+                % type(other)
+            )
 
         if not pruned_origin_block_id_map:
             pruned_origin_block_id_map = {
-                i: i
-                for i in six.moves.range(self.desc.num_blocks())
+                i: i for i in six.moves.range(self.desc.num_blocks())
             }
 
         # NOTE(zhiqiu): All vars in cloned program exist in original program.
@@ -6387,12 +6657,12 @@ class Program(object):
             This function MUST called after run start_up_program
 
         Args:
-            mode(str, optional): Source of the obtained parameters and buffers. 
-                    'opt' :  The return value only contains the variable in the optimizer. 
-                    'param' : The return value only contains the variable in the network, not the variable in the optimizer.  
+            mode(str, optional): Source of the obtained parameters and buffers.
+                    'opt' :  The return value only contains the variable in the optimizer.
+                    'param' : The return value only contains the variable in the network, not the variable in the optimizer.
                     'all' : The return value contains the variable in the network and optimizer.
                     Default: 'all'
-            scope(Scope, optional) : If scope is None, state_dict will be set to global scope 
+            scope(Scope, optional) : If scope is None, state_dict will be set to global scope
                 obtained through 'paddle.static.global_scope()'. Otherwise, value will be set to scope.
                 Default: None
 
@@ -6423,10 +6693,13 @@ class Program(object):
         # can not be imported at the begainning of this file.
         # Therefore, the above two modules are dynamically imported.
         from .executor import global_scope
+
         if scope is not None and not isinstance(scope, core._Scope):
             raise TypeError(
-                "`scope` should be None or `paddle.static.Scope'` type, but received {}."
-                .format(type(scope)))
+                "`scope` should be None or `paddle.static.Scope'` type, but received {}.".format(
+                    type(scope)
+                )
+            )
 
         if scope is None:
             scope = global_scope()
@@ -6434,15 +6707,19 @@ class Program(object):
         if not isinstance(mode, str):
             raise TypeError(
                 "Type of `mode` should be string, but received {}.".format(
-                    type(mode)))
+                    type(mode)
+                )
+            )
 
         def is_parameter(var):
             return isinstance(var, Parameter)
 
         def is_persistable(var):
-            if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                var.desc.type() == core.VarDesc.VarType.READER:
+            if (
+                var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH
+                or var.desc.type() == core.VarDesc.VarType.FETCH_LIST
+                or var.desc.type() == core.VarDesc.VarType.READER
+            ):
                 return False
             return var.persistable
 
@@ -6461,8 +6738,10 @@ class Program(object):
                 return is_parameter(var) or is_belong_to_optimizer(var)
             else:
                 raise ValueError(
-                    "`mode` string should be 'param', 'opt' or 'all', but received {}."
-                    .format(mode))
+                    "`mode` string should be 'param', 'opt' or 'all', but received {}.".format(
+                        mode
+                    )
+                )
 
         var_list = filter(condition, self.list_vars())
 
@@ -6471,28 +6750,30 @@ class Program(object):
             var_temp = scope.find_var(var.name)
             if var_temp is None:
                 raise ValueError(
-                    "Can not find Variable '{}' in the scope. Make sure it is initialized"
-                    .format(var.name))
+                    "Can not find Variable '{}' in the scope. Make sure it is initialized".format(
+                        var.name
+                    )
+                )
             state_dict[var.name] = var_temp.get_tensor()
 
         return state_dict
 
     def set_state_dict(self, state_dict, scope=None):
         """
-        Set parameters and persistable buffers in state_dict to program. 
+        Set parameters and persistable buffers in state_dict to program.
         An exception will throw if shape or dtype of the parameters is not match.
-        
+
         .. note::
             This function MUST called after run start_up_program
 
         Args:
-            state_dict(dict): the dict store parameters and persistable buffers. 
+            state_dict(dict): the dict that stores parameters and persistable buffers.
                 The key is the name of the parameter or the name of the buffer.
                 The value is the tensor of this variable in the given scope.
-            scope(Scope, optional) : If scope is None, state_dict will be set to global scope 
+            scope(Scope, optional) : If scope is None, state_dict will be set to global scope
                 obtained through 'paddle.static.global_scope()'. Otherwise, value will be set to scope.
                 Default: None
-        
+
         Returns:
             None
 
@@ -6522,10 +6803,14 @@ class Program(object):
         if not isinstance(state_dict, dict):
             raise TypeError(
                 "Type of `state_dict` should be dict, but received {}.".format(
-                    type(state_dict)))
+                    type(state_dict)
+                )
+            )
 
         vars_dict = {var.name: var for var in self.list_vars()}
-        condition = True if 'StructuredToParameterName@@' in state_dict else False
+        condition = (
+            True if 'StructuredToParameterName@@' in state_dict else False
+        )
         for name, value in state_dict.items():
             if condition:
                 if name == "StructuredToParameterName@@":
@@ -6537,14 +6822,20 @@ class Program(object):
                     vars_dict[name].set_value(value, scope)
                 except ValueError as err:
                     warnings.warn(
-                        ("Skip loading for '{}'. ".format(name) + str(err)))
+                        ("Skip loading for '{}'. ".format(name) + str(err))
+                    )
                 except TypeError as err:
                     warnings.warn(
-                        ("Skip loading for '{}'. ".format(name) + str(err)))
+                        ("Skip loading for '{}'. ".format(name) + str(err))
+                    )
             else:
                 warnings.warn(
-                    ("Skip loading for '{0}'. Because '{0}' not in the program."
-                     .format(name)))
+                    (
+                        "Skip loading for '{0}'. Because '{0}' not in the program.".format(
+                            name
+                        )
+                    )
+                )
 
 
 @six.add_metaclass(ParameterMetaClass)
@@ -6568,16 +6859,18 @@ class Parameter(Variable):
             be applied on the parameter. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this parameter.
-        need_clip (bool): Whether the parameter gradient need to be cliped 
+        need_clip (bool): Whether the parameter gradient needs to be clipped
             in optimizer. Default is True.
     """
 
-    def __init__(self,
-                 block,
-                 shape,
-                 dtype,
-                 type=core.VarDesc.VarType.LOD_TENSOR,
-                 **kwargs):
+    def __init__(
+        self,
+        block,
+        shape,
+        dtype,
+        type=core.VarDesc.VarType.LOD_TENSOR,
+        **kwargs
+    ):
         if shape is None:
             raise ValueError("The shape of Parameter should not be None")
         if dtype is None:
@@ -6585,21 +6878,25 @@ class Parameter(Variable):
 
         if len(shape) == 0:
             raise ValueError(
-                "The dimensions of shape for Parameter must be greater than 0")
+                "The dimensions of shape for Parameter must be greater than 0"
+            )
 
         for each in shape:
             if each < 0:
                 raise ValueError(
                     "Each dimension of shape for Parameter must be greater than 0, but received %s"
-                    % list(shape))
-
-        Variable.__init__(self,
-                          block,
-                          persistable=True,
-                          shape=shape,
-                          dtype=dtype,
-                          type=type,
-                          **kwargs)
+                    % list(shape)
+                )
+
+        Variable.__init__(
+            self,
+            block,
+            persistable=True,
+            shape=shape,
+            dtype=dtype,
+            type=type,
+            **kwargs
+        )
         self.trainable = kwargs.get('trainable', True)
 
         self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0})
@@ -6640,14 +6937,22 @@ class Parameter(Variable):
                 print(debug_str)
         """
         assert isinstance(throw_on_error, bool) and isinstance(
-            with_details, bool)
+            with_details, bool
+        )
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
-            additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "do_model_average", "need_clip")
+            additional_attr = (
+                "trainable",
+                "optimize_attr",
+                "regularizer",
+                "do_model_average",
+                "need_clip",
+            )
             for attr_name in additional_attr:
-                res_str += "%s: %s\n" % (attr_name,
-                                         cpt.to_text(getattr(self, attr_name)))
+                res_str += "%s: %s\n" % (
+                    attr_name,
+                    cpt.to_text(getattr(self, attr_name)),
+                )
         else:
             res_str = Variable.to_string(self, throw_on_error, False)
         return res_str
@@ -6657,8 +6962,8 @@ class Parameter(Variable):
 
 class ParamBase(core.VarBase):
     """
-    ParamBase is derived from Tensor( Which is the concept in Dygraph Mode). 
-    A ParamBase is a persistable Tensor, and will be updated by optimizers 
+    ParamBase is derived from Tensor( Which is the concept in Dygraph Mode).
+    A ParamBase is a persistable Tensor, and will be updated by optimizers
     after each iteration.
     The training of a neural network is essentially the updating of
     its ParamBase.
@@ -6676,7 +6981,7 @@ class ParamBase(core.VarBase):
             be applied on the ParamBase. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this ParamBase.
-        need_clip (bool): Whether the parameter gradient need to be cliped 
+        need_clip (bool): Whether the parameter gradient need to be cliped
             in optimizer. Default is True.
     """
 
@@ -6689,13 +6994,15 @@ class ParamBase(core.VarBase):
 
         if len(shape) == 0:
             raise ValueError(
-                "The dimensions of shape for Parameter must be greater than 0")
+                "The dimensions of shape for Parameter must be greater than 0"
+            )
 
         for each in shape:
             if each < 0:
                 raise ValueError(
                     "Each dimension of shape for Parameter must be greater than 0, but received %s"
-                    % list(shape))
+                    % list(shape)
+                )
 
         if dtype is not None:
             if not isinstance(dtype, core.VarDesc.VarType):
@@ -6703,10 +7010,13 @@ class ParamBase(core.VarBase):
 
         name = kwargs.get('name', unique_name.generate('_param_base'))
 
-        super(ParamBase,
-              self).__init__(dtype if dtype else core.VarDesc.VarType.FP32,
-                             list(shape) if shape else [], name,
-                             core.VarDesc.VarType.LOD_TENSOR, True)
+        super(ParamBase, self).__init__(
+            dtype if dtype else core.VarDesc.VarType.FP32,
+            list(shape) if shape else [],
+            name,
+            core.VarDesc.VarType.LOD_TENSOR,
+            True,
+        )
 
         trainable = kwargs.get('trainable', True)
         self.stop_gradient = not trainable
@@ -6733,7 +7043,8 @@ class ParamBase(core.VarBase):
         else:
             raise ValueError(
                 "The type of trainable MUST be bool, but the type is ",
-                type(trainable))
+                type(trainable),
+            )
 
     def __str__(self):
         """
@@ -6754,7 +7065,8 @@ class ParamBase(core.VarBase):
                 #         [-0.54217887,  0.48439729,  0.34082305]])
         """
         return "Parameter containing:\n{tensor}".format(
-            tensor=super(ParamBase, self).__str__())
+            tensor=super(ParamBase, self).__str__()
+        )
 
     def __deepcopy__(self, memo):
         """
@@ -6803,8 +7115,8 @@ else:
 
 class EagerParamBase(_core_eager_eagertensor):
     """
-    EagerParamBase is derived from Tensor( Which is the concept in Eager-Dygraph Mode). 
-    A EagerParamBase is a persistable Tensor, and will be updated by optimizers 
+    EagerParamBase is derived from Tensor( Which is the concept in Eager-Dygraph Mode).
+    A EagerParamBase is a persistable Tensor, and will be updated by optimizers
     after each iteration.
     The training of a neural network is essentially the updating of
     its EagerParamBase.
@@ -6822,7 +7134,7 @@ class EagerParamBase(_core_eager_eagertensor):
             be applied on the EagerParamBase. Default: None
         do_model_average(bool): True if the model average strategy will
             be applied on this EagerParamBase.
-        need_clip (bool): Whether the parameter gradient need to be cliped 
+        need_clip (bool): Whether the parameter gradient need to be cliped
             in optimizer. Default is True.
     """
 
@@ -6835,13 +7147,15 @@ class EagerParamBase(_core_eager_eagertensor):
 
         if len(shape) == 0:
             raise ValueError(
-                "The dimensions of shape for Parameter must be greater than 0")
+                "The dimensions of shape for Parameter must be greater than 0"
+            )
 
         for each in shape:
             if each < 0:
                 raise ValueError(
                     "Each dimension of shape for Parameter must be greater than 0, but received %s"
-                    % list(shape))
+                    % list(shape)
+                )
 
         if dtype is not None:
             if not isinstance(dtype, core.VarDesc.VarType):
@@ -6852,10 +7166,13 @@ class EagerParamBase(_core_eager_eagertensor):
         if isinstance(shape, core.eager.Tensor):
             shape = shape.numpy()
 
-        super(EagerParamBase,
-              self).__init__(dtype if dtype else core.VarDesc.VarType.FP32,
-                             list(shape) if shape else [], name,
-                             core.VarDesc.VarType.LOD_TENSOR, True)
+        super(EagerParamBase, self).__init__(
+            dtype if dtype else core.VarDesc.VarType.FP32,
+            list(shape) if shape else [],
+            name,
+            core.VarDesc.VarType.LOD_TENSOR,
+            True,
+        )
         self.retain_grads()
 
         trainable = kwargs.get('trainable', True)
@@ -6879,7 +7196,9 @@ class EagerParamBase(_core_eager_eagertensor):
 
     @dygraph_only
     def initialize(self):
-        assert self._init_func is not None, "Required self._init_func is not None, but received None."
+        assert (
+            self._init_func is not None
+        ), "Required self._init_func is not None, but received None."
         self._init_func()
         # clear function handle to release resource
         self._init_func = None
@@ -6895,13 +7214,16 @@ class EagerParamBase(_core_eager_eagertensor):
         else:
             raise ValueError(
                 "The type of trainable MUST be bool, but the type is ",
-                type(trainable))
+                type(trainable),
+            )
 
     def _create_init_op(self, block):
         """
         Call init_op_creator function to create initializer operation in block.
         """
-        assert self._init_op_creator is not None, "Required self._init_op_creator is not None, but received None."
+        assert (
+            self._init_op_creator is not None
+        ), "Required self._init_op_creator is not None, but received None."
         self._init_op_creator(block)
 
     def __str__(self):
@@ -6923,7 +7245,8 @@ class EagerParamBase(_core_eager_eagertensor):
                 #         [-0.54217887,  0.48439729,  0.34082305]])
         """
         return "Parameter containing:\n{tensor}".format(
-            tensor=super(EagerParamBase, self).__str__())
+            tensor=super(EagerParamBase, self).__str__()
+        )
 
     def __deepcopy__(self, memo):
         """
@@ -6975,7 +7298,7 @@ def default_startup_program():
     Get default/global startup program.
 
     The :code:`paddle.nn` function will append the initialization operators into startup program.
-    The :code:`startup_program` will initialize the parameters by the OPs. 
+    The :code:`startup_program` will initialize the parameters by the OPs.
 
     This method will return the default or the current startup program. Users can use
     :ref:`api_paddle_fluid_framework_program_guard`  to switch :ref:`api_paddle_fluid_framework_Program` .
@@ -6983,7 +7306,7 @@ def default_startup_program():
     Returns:
         Program: current default startup program.
 
-    Returns type: 
+    Returns type:
 
     Examples:
         .. code-block:: python
@@ -7001,13 +7324,13 @@ def default_startup_program():
 
 def default_main_program():
     """
-    This API can be used to get ``default main program`` which store the 
+    This API can be used to get ``default main program`` which store the
     descriptions of Ops and tensors.
 
-    For example ``z = paddle.add(x, y)`` will create a new ``add`` 
-    Op and a new ``z`` tensor, and they will be recorded in ``default main program`` . 
+    For example ``z = paddle.add(x, y)`` will create a new ``add``
+    Op and a new ``z`` tensor, and they will be recorded in ``default main program`` .
 
-    The ``default main program`` is the default value for ``Program`` parameter in 
+    The ``default main program`` is the default value for ``Program`` parameter in
     a lot of APIs. For example, the :code:`Executor.run()` will execute the
     :code:`default_main_program` when the program is not specified.
 
@@ -7077,8 +7400,8 @@ def program_guard(main_program, startup_program=None):
 
     Args:
         main_program(Program): New main program inside ``with`` statement.
-        startup_program(Program, optional): New startup program inside ``with`` 
-            statement. :code:`None` means not changing startup program, 
+        startup_program(Program, optional): New startup program inside ``with``
+            statement. :code:`None` means not changing startup program,
             default_startup_program is still used.
             Default: None.
 
@@ -7110,12 +7433,18 @@ def program_guard(main_program, startup_program=None):
 
     """
     from .data_feeder import check_type
-    check_type(main_program, 'main_program', Program,
-               'paddle.static.program_guard')
+
+    check_type(
+        main_program, 'main_program', Program, 'paddle.static.program_guard'
+    )
     main_program = switch_main_program(main_program)
     if startup_program is not None:
-        check_type(startup_program, 'startup_program', Program,
-                   'paddle.static.program_guard')
+        check_type(
+            startup_program,
+            'startup_program',
+            Program,
+            'paddle.static.program_guard',
+        )
         # Tag the program __is_start_up as True
         startup_program._is_start_up_program_ = True
         startup_program = switch_startup_program(startup_program)
@@ -7185,7 +7514,7 @@ def switch_device(device):
 @signature_safe_contextmanager
 def device_guard(device=None):
     """
-    
+
     Note:
         The API only supports static mode.
 
@@ -7193,7 +7522,7 @@ def device_guard(device=None):
 
     Args:
         device(str|None): Specify the device to use in the context. It should be ``cpu``,
-            ``gpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. 
+            ``gpu`` or ``gpu:x``, where ``x`` is the index of the GPUs.
             When it is set to 'cpu' or 'gpu', all OPs created in the context will be
             placed on CPUPlace or CUDAPlace. When 'gpu' is set and the program runs on
             single-card, the device index will be the same as the device on which the
@@ -7201,9 +7530,9 @@ def device_guard(device=None):
             assigned devices.
 
     Examples:
-    
+
         .. code-block:: python
-            
+
             # required: gpu
             import paddle
 
@@ -7238,7 +7567,8 @@ def device_guard(device=None):
     if device not in ['cpu', 'gpu', 'npu', 'xpu', 'mlu', '', None]:
         raise ValueError(
             "The Attr(device) should be 'cpu' 'npu' 'xpu' 'mlu' or 'gpu', and it can also be empty string or None "
-            "when there is no need to specify device. But received %s" % device)
+            "when there is no need to specify device. But received %s" % device
+        )
     if index:
         device = ":".join([device, index])
     pre_device = switch_device(device)
@@ -7268,9 +7598,11 @@ def _cuda_graph_guard(cuda_graph_attr=None):
         cuda_graph_attr(str|None): The cuda graph attr with the format of:
                                    cuda_graph_capture_mode;memory_pool_id;cuda_graph_id
     """
-    assert not _non_static_mode(
+    assert (
+        not _non_static_mode()
     ), "cuda_graph_guard only works under static mode"
-    assert core.is_compiled_with_cuda(
+    assert (
+        core.is_compiled_with_cuda()
     ), "cuda_graph_guard context can be only used when Paddle is compiled with cuda"
     pre_mode = _switch_cuda_graph_mode(cuda_graph_attr)
     try:
@@ -7300,7 +7632,8 @@ def set_flags(flags):
             _global_flags()[key] = value
         else:
             raise ValueError(
-                "Flag %s cannot set its value through this function." % (key))
+                "Flag %s cannot set its value through this function." % (key)
+            )
 
 
 def get_flags(flags):
@@ -7327,22 +7660,24 @@ def get_flags(flags):
     flags_value = {}
     if isinstance(flags, (list, tuple)):
         for key in flags:
-            if (_global_flags().is_public(key)):
+            if _global_flags().is_public(key):
                 value = _global_flags()[key]
                 temp = {key: value}
                 flags_value.update(temp)
             else:
                 raise ValueError(
-                    'Flag %s cannot get its value through this function.' %
-                    (key))
+                    'Flag %s cannot get its value through this function.'
+                    % (key)
+                )
     elif isinstance(flags, str):
-        if (_global_flags().is_public(flags)):
+        if _global_flags().is_public(flags):
             value = _global_flags()[flags]
             temp = {flags: value}
             flags_value.update(temp)
         else:
             raise ValueError(
-                'Flag %s cannot get its value through this function.' % (flags))
+                'Flag %s cannot get its value through this function.' % (flags)
+            )
     else:
         raise TypeError('Flags in get_flags should be a list, tuple or string.')
     return flags_value
@@ -7352,20 +7687,32 @@ def _get_paddle_place(place):
     "convert the string to paddle Place"
     if place is None:
         return place
-    if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace,
-                          core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace,
-                          core.IPUPlace, core.MLUPlace, core.CustomPlace)):
+    if isinstance(
+        place,
+        (
+            core.Place,
+            core.XPUPlace,
+            core.CPUPlace,
+            core.CUDAPinnedPlace,
+            core.CUDAPlace,
+            core.NPUPlace,
+            core.IPUPlace,
+            core.MLUPlace,
+            core.CustomPlace,
+        ),
+    ):
         return place
 
     if not isinstance(place, str):
         raise ValueError(
-            "place only support string which is 'Place' and so on.")
+            "place only support string which is 'Place' and so on."
+        )
 
     place = place.lower()
-    if (place == "cpu"):
+    if place == "cpu":
         return core.CPUPlace()
 
-    if (place == "device"):
+    if place == "device":
         return core.Place()
 
     # GPU
@@ -7373,8 +7720,9 @@ def _get_paddle_place(place):
     if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place:
         if not core.is_compiled_with_cuda():
             raise ValueError(
-                "The device should not be {}, since PaddlePaddle is " \
-                "not compiled with CUDA".format(avaliable_gpu_place))
+                "The device should not be {}, since PaddlePaddle is "
+                "not compiled with CUDA".format(avaliable_gpu_place)
+            )
         if place == "gpu_pinned":
             return core.CUDAPinnedPlace()
         elif place == "gpu":
@@ -7390,8 +7738,9 @@ def _get_paddle_place(place):
     if avaliable_xpu_place:
         if not core.is_compiled_with_xpu():
             raise ValueError(
-                "The device should not be {}, since PaddlePaddle is " \
-                "not compiled with XPU".format(avaliable_xpu_place))
+                "The device should not be {}, since PaddlePaddle is "
+                "not compiled with XPU".format(avaliable_xpu_place)
+            )
         place_info_list = place.split(':', 1)
         device_id = place_info_list[1]
         device_id = int(device_id)
@@ -7402,8 +7751,9 @@ def _get_paddle_place(place):
     if avaliable_npu_place:
         if not core.is_compiled_with_npu():
             raise ValueError(
-                "The device should not be {}, since PaddlePaddle is " \
-                "not compiled with NPU".format(avaliable_npu_place))
+                "The device should not be {}, since PaddlePaddle is "
+                "not compiled with NPU".format(avaliable_npu_place)
+            )
         place_info_list = place.split(':', 1)
         device_id = place_info_list[1]
         device_id = int(device_id)
@@ -7414,8 +7764,9 @@ def _get_paddle_place(place):
     if avaliable_ipu_place:
         if not core.is_compiled_with_ipu():
             raise ValueError(
-                "The device should not be {}, since PaddlePaddle is " \
-                "not compiled with IPU".format(avaliable_ipu_place))
+                "The device should not be {}, since PaddlePaddle is "
+                "not compiled with IPU".format(avaliable_ipu_place)
+            )
         place_info_list = place.split(':', 1)
         device_id = place_info_list[1]
         device_id = int(device_id)
@@ -7426,16 +7777,19 @@ def _get_paddle_place(place):
     if avaliable_mlu_place:
         if not core.is_compiled_with_mlu():
             raise ValueError(
-                "The device should not be {}, since PaddlePaddle is " \
-                "not compiled with MLU".format(avaliable_mlu_place))
+                "The device should not be {}, since PaddlePaddle is "
+                "not compiled with MLU".format(avaliable_mlu_place)
+            )
         place_info_list = place.split(':', 1)
         device_id = place_info_list[1]
         device_id = int(device_id)
         return core.MLUPlace(device_id)
 
     raise ValueError(
-        "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace, MLUPlace and NPUPlace, but received {}."
-        .format(place))
+        "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace, MLUPlace and NPUPlace, but received {}.".format(
+            place
+        )
+    )
 
 
 def _get_paddle_place_list(places):
diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py
index 41fd0c0703b..cbbdfe92628 100644
--- a/python/paddle/framework/framework.py
+++ b/python/paddle/framework/framework.py
@@ -46,11 +46,16 @@ def set_default_dtype(d):
         else:
             raise TypeError(
                 "set_default_dtype only supports [float16, float32, float64] "
-                ", but received %s" % d.__name__)
+                ", but received %s" % d.__name__
+            )
     else:
         if d in [
-                'float16', 'float32', 'float64', u'float16', u'float32',
-                u'float64'
+            'float16',
+            'float32',
+            'float64',
+            u'float16',
+            u'float32',
+            u'float64',
         ]:
             # this code is a little bit dangerous, since error could happen
             # when casting no-ascii code to str in python2.
@@ -61,7 +66,8 @@ def set_default_dtype(d):
         else:
             raise TypeError(
                 "set_default_dtype only supports [float16, float32, float64] "
-                ", but received %s" % str(d))
+                ", but received %s" % str(d)
+            )
 
     LayerHelperBase.set_default_dtype(d)
 
@@ -94,7 +100,7 @@ def set_grad_enabled(mode):
 
     Examples:
         .. code-block:: python
-            
+
             import paddle
             x = paddle.ones([3, 2])
             x.stop_gradient = False
@@ -127,9 +133,9 @@ def is_grad_enabled():
 
     Examples:
         .. code-block:: python
-            
+
             import paddle
-            
+
             # Dygraph gradient calculation mode is enabled by default.
             paddle.is_grad_enabled() # True
 
diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py
index 09f3c512401..103f7fa4dd3 100644
--- a/python/paddle/framework/io.py
+++ b/python/paddle/framework/io.py
@@ -26,14 +26,35 @@ import paddle
 # deprecated module import
 from paddle import fluid
 from paddle.fluid import core
-from paddle.fluid.io import _unpack_saved_dict, _pack_loaded_dict, _pickle_loads_mac
+from paddle.fluid.io import (
+    _unpack_saved_dict,
+    _pack_loaded_dict,
+    _pickle_loads_mac,
+)
 from paddle.fluid.io import _legacy_save as _legacy_static_save
 from paddle.fluid.io import _open_file_buffer, _is_file_path, _is_memory_buffer
 
-from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer, _non_static_mode, ParamBase, EagerParamBase, _current_expected_place, Program
+from paddle.fluid.framework import (
+    Variable,
+    _varbase_creator,
+    _dygraph_tracer,
+    _non_static_mode,
+    ParamBase,
+    EagerParamBase,
+    _current_expected_place,
+    Program,
+)
 from paddle.fluid.dygraph.jit import _SaveLoadConfig
-from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers
-from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX
+from paddle.fluid.dygraph.io import (
+    _construct_program_holders,
+    _construct_params_and_buffers,
+)
+from paddle.fluid.dygraph.io import (
+    INFER_MODEL_SUFFIX,
+    INFER_PARAMS_SUFFIX,
+    INFER_PARAMS_INFO_SUFFIX,
+)
+
 try:
     from collections.abc import Iterable
 except:
@@ -70,7 +91,8 @@ def _load_state_dict_from_save_inference_model(model_path, config):
     # 2. load layer parameters & buffers
     with fluid.dygraph.guard():
         persistable_var_dict = _construct_params_and_buffers(
-            model_path, programs, config.params_filename, append_suffix=False)
+            model_path, programs, config.params_filename, append_suffix=False
+        )
 
         # 3. construct state_dict
         load_param_dict = dict()
@@ -86,10 +108,15 @@ def _load_state_dict_from_save_inference_model(model_path, config):
             structured_para_dict = dict()
             for var_name in load_param_dict:
                 structured_name = extra_var_info[var_name].get(
-                    'structured_name', None)
-                assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name
+                    'structured_name', None
+                )
+                assert structured_name is not None, (
+                    "Cannot find saved variable (%s)'s structured name in saved model."
+                    % var_name
+                )
                 structured_para_dict[structured_name] = load_param_dict[
-                    var_name]
+                    var_name
+                ]
             load_param_dict = structured_para_dict
 
     return load_param_dict
@@ -117,7 +144,8 @@ def _load_state_dict_from_save_params(model_path):
                 type='load',
                 inputs={},
                 outputs={'Out': new_var},
-                attrs={'file_path': os.path.join(model_path, name)})
+                attrs={'file_path': os.path.join(model_path, name)},
+            )
             load_var_list.append(new_var)
 
     # 3. construct state_dict
@@ -153,7 +181,8 @@ def _build_load_path_and_config(path, config):
         raise ValueError(
             "The %s.pdmodel and %s directory exist at the same time, "
             "don't know which one to load, please make sure that the specified target "
-            "of ``path`` is unique." % (path, path))
+            "of ``path`` is unique." % (path, path)
+        )
     elif not prefix_format_exist and not directory_format_exist:
         error_msg = "The ``path`` (%s) to load model not exists."
         # if current path is a prefix, and the path.pdparams or path.pdopt
@@ -162,10 +191,12 @@ def _build_load_path_and_config(path, config):
         params_file_path = path + ".pdparams"
         opti_file_path = path + ".pdopt"
         if os.path.exists(params_file_path) or os.path.exists(opti_file_path):
-            error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \
-                "please specify the full file name, not just the file name prefix. For " \
-                "example, it should be written as `paddle.load('model.pdparams')` instead of " \
+            error_msg += (
+                " If you want to load the results saved by `fluid.save_dygraph`, "
+                "please specify the full file name, not just the file name prefix. For "
+                "example, it should be written as `paddle.load('model.pdparams')` instead of "
                 "`paddle.load('model')`."
+            )
         raise ValueError(error_msg % path)
     else:
         if prefix_format_exist:
@@ -175,13 +206,15 @@ def _build_load_path_and_config(path, config):
                 warnings.warn(
                     "When loading the result saved with the "
                     "specified file prefix, the ``model_filename`` config does "
-                    "not take effect.")
+                    "not take effect."
+                )
             config.model_filename = file_prefix + INFER_MODEL_SUFFIX
             if config.params_filename is not None:
                 warnings.warn(
                     "When loading the result saved with the "
                     "specified file prefix, the ``params_filename`` config does "
-                    "not take effect.")
+                    "not take effect."
+                )
             config.params_filename = file_prefix + INFER_PARAMS_SUFFIX
         else:
             # Compatible with the old save_inference_model format
@@ -192,7 +225,10 @@ def _build_load_path_and_config(path, config):
 
 def _parse_load_config(configs):
     supported_configs = [
-        'model_filename', 'params_filename', 'keep_name_table', 'return_numpy'
+        'model_filename',
+        'params_filename',
+        'keep_name_table',
+        'return_numpy',
     ]
 
     # input check
@@ -200,7 +236,8 @@ def _parse_load_config(configs):
         if key not in supported_configs:
             raise ValueError(
                 "The additional config (%s) of `paddle.load` is not supported."
-                % key)
+                % key
+            )
 
     # construct inner config
     inner_config = _SaveLoadConfig()
@@ -220,7 +257,8 @@ def _parse_save_config(configs):
         if key not in supported_configs:
             raise ValueError(
                 "The additional config (%s) of `paddle.save` is not supported."
-                % key)
+                % key
+            )
 
     # construct inner config
     inner_config = _SaveLoadConfig()
@@ -233,19 +271,22 @@ def _parse_save_config(configs):
 def _pickle_save(obj, f, protocol):
     # TODO(weixin):add support for BytesIO.
     if not isinstance(protocol, int):
-        raise ValueError("The 'protocol' MUST be `int`, but received {}".format(
-            type(protocol)))
+        raise ValueError(
+            "The 'protocol' MUST be `int`, but received {}".format(
+                type(protocol)
+            )
+        )
 
     if protocol < 2 or protocol > 4:
         raise ValueError(
-            "Expected 1<'protocol'<5, but received protocol={}".format(
-                protocol))
+            "Expected 1<'protocol'<5, but received protocol={}".format(protocol)
+        )
 
     def reduce_varbase(self):
         data = self.numpy()
         name = self.name
 
-        return (tuple, ((name, data), ))
+        return (tuple, ((name, data),))
 
     def reduce_LoDTensor(self):
         data = np.array(self)
@@ -254,7 +295,8 @@ def _pickle_save(obj, f, protocol):
 
     def reduce_Layer(self):
         raise ValueError(
-            "paddle do not support saving `paddle.nn.Layer` object.")
+            "paddle do not support saving `paddle.nn.Layer` object."
+        )
 
     dispatch_table_layer = dict()
 
@@ -262,8 +304,9 @@ def _pickle_save(obj, f, protocol):
         dispatch_table_layer[layer.__class__] = reduce_Layer
         return layer
 
-    _parse_every_object(obj, lambda v: isinstance(v, fluid.Layer),
-                        create_layer_dispatch_table)
+    _parse_every_object(
+        obj, lambda v: isinstance(v, fluid.Layer), create_layer_dispatch_table
+    )
 
     def add_dispatch_table():
         # This is not a good method, because the pickle module has been modified.
@@ -291,7 +334,7 @@ def _pickle_save(obj, f, protocol):
 
         max_bytes = 2**30
         for i in range(0, len(pickle_bytes), max_bytes):
-            f.write(pickle_bytes[i:i + max_bytes])
+            f.write(pickle_bytes[i : i + max_bytes])
     else:
         pickler = pickle.Pickler(f, protocol)
         pickler.dispatch_table = copyreg.dispatch_table.copy()
@@ -308,7 +351,8 @@ def _pickle_save(obj, f, protocol):
 def _contain_x(obj, condition_func):
     if isinstance(obj, core.SelectedRows):
         raise NotImplementedError(
-            "`paddle.save` do not support saving 'SelectedRows'.")
+            "`paddle.save` do not support saving 'SelectedRows'."
+        )
 
     if condition_func(obj):
         return True
@@ -332,8 +376,16 @@ def _is_state_dict(obj):
 
         def condition(obj):
             return isinstance(
-                obj, (fluid.Layer, Program, core.VarBase, core.eager.Tensor,
-                      core.LoDTensor, core.SelectedRows))
+                obj,
+                (
+                    fluid.Layer,
+                    Program,
+                    core.VarBase,
+                    core.eager.Tensor,
+                    core.LoDTensor,
+                    core.SelectedRows,
+                ),
+            )
 
         # If the value of a dict is a core.VarBase/LoDTensor or a dict
         # that does not contain a paddle type(Layer, Program, VarBase, LoDTensor, SelectedRows),
@@ -344,7 +396,8 @@ def _is_state_dict(obj):
                     if _contain_x(v, condition):
                         return False
             elif not isinstance(
-                    value, (core.VarBase, core.eager.Tensor, core.LoDTensor)):
+                value, (core.VarBase, core.eager.Tensor, core.LoDTensor)
+            ):
                 return False
         return True
 
@@ -372,8 +425,10 @@ def _transformed_from_lodtensor(obj):
 def _to_LodTensor(ndarray):
     if not isinstance(ndarray, np.ndarray):
         raise TypeError(
-            'Type of `ndarray` should be numpy.ndarray, but received {}.'.
-            format(type(ndarray)))
+            'Type of `ndarray` should be numpy.ndarray, but received {}.'.format(
+                type(ndarray)
+            )
+        )
     t = core.LoDTensor()
     place = _current_expected_place()
     t.set(ndarray, place)
@@ -420,26 +475,30 @@ def _parse_every_object(obj, condition_func, convert_func):
             if condition_func(obj[key]):
                 obj[key] = convert_func(obj[key])
             else:
-                obj[key] = _parse_every_object(obj[key], condition_func,
-                                               convert_func)
+                obj[key] = _parse_every_object(
+                    obj[key], condition_func, convert_func
+                )
         return obj
     elif type(obj) == tuple:
         return tuple(
-            _parse_every_object(list(obj), condition_func, convert_func))
+            _parse_every_object(list(obj), condition_func, convert_func)
+        )
     elif type(obj) == set:
         return set(_parse_every_object(list(obj), condition_func, convert_func))
     else:
         if isinstance(obj, Iterable) and not isinstance(
-                obj,
-            (str, np.ndarray, core.VarBase, core.eager.Tensor, core.LoDTensor)):
+            obj,
+            (str, np.ndarray, core.VarBase, core.eager.Tensor, core.LoDTensor),
+        ):
             raise NotImplementedError(
-                "The iteratable objects supported are tuple, list, dict, OrderedDict, string. But received {}."
-                .format(type(obj)))
+                "The iteratable objects supported are tuple, list, dict, OrderedDict, string. But received {}.".format(
+                    type(obj)
+                )
+            )
         return obj
 
 
 def _parse_load_result(obj, return_numpy):
-
     def is_layer(obj):
         return isinstance(obj, fluid.Layer)
 
@@ -465,13 +524,15 @@ def _parse_load_result(obj, return_numpy):
     # tuple(name, ndarry) was converted from varbase of paddle2.1,
     # and all tuple(name, ndarry) are converted to tensor.
     if _contain_x(obj, _transformed_from_varbase):
-        return _parse_every_object(obj, _transformed_from_varbase,
-                                   tuple_to_tensor)
+        return _parse_every_object(
+            obj, _transformed_from_varbase, tuple_to_tensor
+        )
     # If there is no tuple(name, ndary), it is considered to be saved by paddle2.0
     # or converted from LoDTensor, and all ndarrays are converted to tensor.
     else:
-        return _parse_every_object(obj, _transformed_from_lodtensor,
-                                   ndarray_to_tensor)
+        return _parse_every_object(
+            obj, _transformed_from_lodtensor, ndarray_to_tensor
+        )
 
 
 def _save_lod_tensor(tensor, file_name):
@@ -492,8 +553,10 @@ def _save_lod_tensor(tensor, file_name):
 
     else:
         raise NotImplementedError(
-            'Only supports saving objects to file or BytesIO, but received {}'.
-            format(type(file_name)))
+            'Only supports saving objects to file or BytesIO, but received {}'.format(
+                type(file_name)
+            )
+        )
     return _seek
 
 
@@ -511,8 +574,10 @@ def _load_lod_tensor(file_name):
 
     else:
         raise NotImplementedError(
-            'Only supports load objects from file or BytesIO, but received {}'.
-            format(type(file_name)))
+            'Only supports load objects from file or BytesIO, but received {}'.format(
+                type(file_name)
+            )
+        )
 
     return temp_t, _seek
 
@@ -531,8 +596,10 @@ def _save_selected_rows(selected_rows, file_name):
             _seek = f.tell()
     else:
         raise NotImplementedError(
-            'Only supports saving objects to file or BytesIO, but received {}'.
-            format(type(file_name)))
+            'Only supports saving objects to file or BytesIO, but received {}'.format(
+                type(file_name)
+            )
+        )
     return _seek
 
 
@@ -546,13 +613,16 @@ def _load_selected_rows(file_name):
         with _open_file_buffer(file_name, 'rb') as f:
             selected_rows_bytes = f.read()
             paddle.fluid.core.load_selected_rows_from_memory(
-                temp_sr, selected_rows_bytes)
+                temp_sr, selected_rows_bytes
+            )
         _seek = f.tell()
 
     else:
         raise NotImplementedError(
-            'Only supports load objects from file or BytesIO, but received {}'.
-            format(type(file_name)))
+            'Only supports load objects from file or BytesIO, but received {}'.format(
+                type(file_name)
+            )
+        )
 
     return temp_sr, _seek
 
@@ -567,34 +637,36 @@ def _save_binary_var(obj, path):
     else:
         # Since the concept of 'Tensor' is only exposed to users, the error message can only contain tensor instead of 'LoDTensor' or 'SelectedRows'
         raise NotImplementedError(
-            "When use_binary_format = True, `paddle.save`  expected Tensor, but received {}."
-            .format(type(obj)))
+            "When use_binary_format = True, `paddle.save`  expected Tensor, but received {}.".format(
+                type(obj)
+            )
+        )
 
 
 def save(obj, path, protocol=4, **configs):
     '''
     Save an object to the specified path.
-    
+
     .. note::
         Now supports saving ``state_dict`` of Layer/Optimizer, Tensor and nested structure containing Tensor, Program.
 
     .. note::
-        Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, 
-        there is no need to distinguish multiple saved files by adding a suffix. The argument ``path`` 
-        of ``paddle.save`` will be directly used as the saved file name instead of a prefix. 
+        Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file,
+        there is no need to distinguish multiple saved files by adding a suffix. The argument ``path``
+        of ``paddle.save`` will be directly used as the saved file name instead of a prefix.
         In order to unify the saved file name format, we recommend using the paddle standard suffix:
-        1. for ``Layer.state_dict`` , recommend to use ``.pdparams`` ; 
-        2. for ``Optimizer.state_dict`` , recommend to use ``.pdopt`` . 
+        1. for ``Layer.state_dict`` , recommend to use ``.pdparams`` ;
+        2. for ``Optimizer.state_dict`` , recommend to use ``.pdopt`` .
         For specific examples, please refer to API code examples.
-    
+
     Args:
         obj(Object) : The object to be saved.
-        path(str|BytesIO) : The path/buffer of the object to be saved. 
-          If saved in the current directory, the input path string will be used as the file name. 
+        path(str|BytesIO) : The path/buffer of the object to be saved.
+          If saved in the current directory, the input path string will be used as the file name.
         protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5.
                                  Default: 4
         **configs(dict, optional): optional keyword arguments. The following options are currently supported:
-          use_binary_format(bool): When the saved object is static graph variable, you can specify ``use_binary_for_var``. 
+          use_binary_format(bool): When the saved object is a static graph variable, you can specify ``use_binary_format``.
           If True, save the file in the c++ binary format when saving a single static graph variable; otherwise, save it in pickle format.
           Default: False
 
@@ -687,7 +759,7 @@ def save(obj, path, protocol=4, **configs):
             paddle.save(state_dict, byio)
             tensor = paddle.randn([2, 3], dtype='float32')
             paddle.save(tensor, byio)
-    
+
     '''
     if _is_file_path(path):
         # 1. input check
@@ -696,7 +768,8 @@ def save(obj, path, protocol=4, **configs):
             raise ValueError(
                 "The input path MUST be format of dirname/filename "
                 "[dirname\\filename in Windows system], but received "
-                "filename is empty string.")
+                "filename is empty string."
+            )
 
         # 2. save object
         dirname = os.path.dirname(path)
@@ -704,15 +777,19 @@ def save(obj, path, protocol=4, **configs):
             os.makedirs(dirname)
     elif not _is_memory_buffer(path):
         raise ValueError(
-            "only supports saving objects to file and `BytesIO`, but got {}".
-            format(type(path)))
+            "only supports saving objects to file and `BytesIO`, but got {}".format(
+                type(path)
+            )
+        )
 
     config = _parse_save_config(configs)
 
     if not isinstance(config.use_binary_format, bool):
         raise TypeError(
-            "Type of `use_binary_format` should be bool, but received {}.".
-            format(type(config.use_binary_format)))
+            "Type of `use_binary_format` should be bool, but received {}.".format(
+                type(config.use_binary_format)
+            )
+        )
 
     if config.use_binary_format:
         _save_binary_var(obj, path)
@@ -744,19 +821,23 @@ def _legacy_save(obj, path, protocol=2):
     if not isinstance(obj, dict):
         raise NotImplementedError(
             "Now only supports save state_dict of Layer or Optimizer, "
-            "expect dict, but received %s." % type(obj))
+            "expect dict, but received %s." % type(obj)
+        )
 
     if len(obj) == 0:
         warnings.warn("The input state dict is empty, no need to save.")
 
     if not isinstance(protocol, int):
-        raise ValueError("The 'protocol' MUST be `int`, but received {}".format(
-            type(protocol)))
+        raise ValueError(
+            "The 'protocol' MUST be `int`, but received {}".format(
+                type(protocol)
+            )
+        )
 
     if protocol < 2 or protocol > 4:
         raise ValueError(
-            "Expected 1<'protocol'<5, but received protocol={}".format(
-                protocol))
+            "Expected 1<'protocol'<5, but received protocol={}".format(protocol)
+        )
 
     if _is_file_path(path):
         filename = os.path.basename(path)
@@ -764,7 +845,8 @@ def _legacy_save(obj, path, protocol=2):
             raise ValueError(
                 "The input path MUST be format of dirname/filename "
                 "[dirname\\filename in Windows system], but received "
-                "filename is empty string.")
+                "filename is empty string."
+            )
         # 2. save object
         dirname = os.path.dirname(path)
         if dirname and not os.path.exists(dirname):
@@ -776,13 +858,16 @@ def _legacy_save(obj, path, protocol=2):
     saved_obj = _unpack_saved_dict(saved_obj, protocol)
 
     # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3'
-    if _is_file_path(
-            path) and sys.platform == 'darwin' and sys.version_info.major == 3:
+    if (
+        _is_file_path(path)
+        and sys.platform == 'darwin'
+        and sys.version_info.major == 3
+    ):
         pickle_bytes = pickle.dumps(saved_obj, protocol=protocol)
         with open(path, 'wb') as f:
             max_bytes = 2**30
             for i in range(0, len(pickle_bytes), max_bytes):
-                f.write(pickle_bytes[i:i + max_bytes])
+                f.write(pickle_bytes[i : i + max_bytes])
     else:
         with _open_file_buffer(path, 'wb') as f:
             pickle.dump(saved_obj, f, protocol=protocol)
@@ -796,42 +881,42 @@ def load(path, **configs):
         Now supports loading ``state_dict`` of Layer/Optimizer, Tensor and nested structure containing Tensor, Program.
 
     .. note::
-        In order to use the model parameters saved by paddle more efficiently, 
-        ``paddle.load`` supports loading ``state_dict`` of Layer from the result of 
-        other save APIs except ``paddle.save`` , but the argument ``path`` format is 
+        In order to use the model parameters saved by paddle more efficiently,
+        ``paddle.load`` supports loading ``state_dict`` of Layer from the result of
+        other save APIs except ``paddle.save`` , but the argument ``path`` format is
         different:
-        1. loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` ,  
-        ``path`` needs to be a complete file name, such as ``model.pdparams`` or 
-        ``model.pdopt`` ; 
-        2. loading from ``paddle.jit.save`` or ``paddle.static.save_inference_model`` 
-        or ``paddle.Model().save(training=False)`` , ``path`` need to be a file prefix, 
-        such as ``model/mnist``, and ``paddle.load`` will get information from 
+        1. loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` ,
+        ``path`` needs to be a complete file name, such as ``model.pdparams`` or
+        ``model.pdopt`` ;
+        2. loading from ``paddle.jit.save`` or ``paddle.static.save_inference_model``
+        or ``paddle.Model().save(training=False)`` , ``path`` needs to be a file prefix,
+        such as ``model/mnist``, and ``paddle.load`` will get information from
         ``mnist.pdmodel`` and ``mnist.pdiparams`` ;
-        3. loading from paddle 1.x APIs ``paddle.fluid.io.save_inference_model`` or 
-        ``paddle.fluid.io.save_params/save_persistables`` , ``path`` need to be a 
+        3. loading from paddle 1.x APIs ``paddle.fluid.io.save_inference_model`` or
+        ``paddle.fluid.io.save_params/save_persistables`` , ``path`` needs to be a
         directory, such as ``model`` and model is a directory.
 
     .. note::
-        If you load ``state_dict`` from the saved result of static mode API such as 
-        ``paddle.static.save`` or ``paddle.static.save_inference_model`` , 
-        the structured variable name in dynamic mode will cannot be restored. 
-        You need to set the argument ``use_structured_name=False`` when using 
+        If you load ``state_dict`` from the saved result of static mode API such as
+        ``paddle.static.save`` or ``paddle.static.save_inference_model`` ,
+        the structured variable name in dynamic mode cannot be restored.
+        You need to set the argument ``use_structured_name=False`` when using
         ``Layer.set_state_dict`` later.
 
     Args:
-        path(str|BytesIO) : The path/buffer to load the target object. Generally, the path is the target 
-            file path. When loading state_dict from the saved result of the API used to save 
+        path(str|BytesIO) : The path/buffer to load the target object. Generally, the path is the target
+            file path. When loading state_dict from the saved result of the API used to save
             the inference model, the path may be a file prefix or directory.
-        **configs (dict, optional): other load configuration options for compatibility. We do not 
-            recommend using these configurations, they may be removed in the future. If not necessary, 
+        **configs (dict, optional): other load configuration options for compatibility. We do not
+            recommend using these configurations, they may be removed in the future. If not necessary,
             DO NOT use them. Default None.
             The following options are currently supported:
-            (1) model_filename (str): The inference model file name of the paddle 1.x 
-            ``save_inference_model`` save format. Default file name is :code:`__model__` . 
-            (2) params_filename (str): The persistable variables file name of the paddle 1.x 
-            ``save_inference_model`` save format. No default file name, save variables separately 
-            by default.            
-            (3) return_numpy(bool): If specified as True, return tensor as numpy.ndarray, otherwise return tensor as paddle.Tensor. 
+            (1) model_filename (str): The inference model file name of the paddle 1.x
+            ``save_inference_model`` save format. Default file name is :code:`__model__` .
+            (2) params_filename (str): The persistable variables file name of the paddle 1.x
+            ``save_inference_model`` save format. No default file name, save variables separately
+            by default.
+            (3) return_numpy(bool): If specified as True, return tensor as numpy.ndarray, otherwise return tensor as paddle.Tensor.
             Default False.
 
     Returns:
@@ -949,9 +1034,11 @@ def load(path, **configs):
         try:
             with _open_file_buffer(path, 'rb') as f:
                 # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3'
-                if _is_file_path(
-                        path
-                ) and sys.platform == 'darwin' and sys.version_info.major == 3:
+                if (
+                    _is_file_path(path)
+                    and sys.platform == 'darwin'
+                    and sys.version_info.major == 3
+                ):
                     load_result = _pickle_loads_mac(path, f)
                 else:
                     load_result = pickle.load(f, encoding='latin1')
@@ -965,18 +1052,24 @@ def load(path, **configs):
                         for key in load_result["StructuredToParameterName@@"]:
                             if isinstance(load_result[key], np.ndarray):
                                 load_result[key] = _ndarray_to_tensor(
-                                    load_result[key], config.return_numpy)
+                                    load_result[key], config.return_numpy
+                                )
 
-                        if not config.keep_name_table and "StructuredToParameterName@@" in load_result:
+                        if (
+                            not config.keep_name_table
+                            and "StructuredToParameterName@@" in load_result
+                        ):
                             del load_result["StructuredToParameterName@@"]
                     else:
                         # paddle2.1 static.save/load
                         load_result = _parse_load_result(
-                            load_result, config.return_numpy)
+                            load_result, config.return_numpy
+                        )
 
                 else:
-                    load_result = _parse_load_result(load_result,
-                                                     config.return_numpy)
+                    load_result = _parse_load_result(
+                        load_result, config.return_numpy
+                    )
 
         except exception_type as msg_pickle:
             try:
@@ -996,12 +1089,15 @@ def load(path, **configs):
                         with _open_file_buffer(path, "rb") as f:
                             program_desc_str = f.read()
                             program = Program.parse_from_string(
-                                program_desc_str)
+                                program_desc_str
+                            )
                             return program
                     except:
                         raise ValueError(
                             "`paddle.load` can not parse the file:{}.".format(
-                                path))
+                                path
+                            )
+                        )
 
     else:
         load_result = _legacy_load(path, **configs)
@@ -1018,7 +1114,10 @@ def _legacy_load(path, **configs):
         with _open_file_buffer(path, 'rb') as f:
             load_result = pickle.load(f, encoding='latin1')
         load_result = _pack_loaded_dict(load_result)
-        if not config.keep_name_table and "StructuredToParameterName@@" in load_result:
+        if (
+            not config.keep_name_table
+            and "StructuredToParameterName@@" in load_result
+        ):
             del load_result["StructuredToParameterName@@"]
     else:
         # file prefix and directory are compatible cases
@@ -1039,7 +1138,8 @@ def _legacy_load(path, **configs):
             # the user to configure the `use_structured_name` argument when `set_state_dict`
             # NOTE(chenweihang): `jit.save` doesn't save optimizer state
             load_result = _load_state_dict_from_save_inference_model(
-                model_path, config)
+                model_path, config
+            )
         else:
             # load state dict by `io.save_params/persistables` save format
             # TODO(chenweihang): [ Now only supports loading parameters separately ]
diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py
index ba7a2537df1..9e8739b9da3 100644
--- a/python/paddle/incubate/autograd/primapi.py
+++ b/python/paddle/incubate/autograd/primapi.py
@@ -28,8 +28,8 @@ def forward_grad(outputs, inputs, grad_inputs=None):
     Args:
         outputs(Tensor|Sequence[Tensor]): The output tensor or tensors.
         inputs(Tensor|Sequence[Tensor]): The input tensor or tensors.
-        grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or 
-            Tensors of inputs which has the same shape with inputs, Defaults to 
+        grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or
+            Tensors of inputs which has the same shape with inputs, Defaults to
             None, in this case is equivalent to all ones.
 
     Returns:
@@ -50,7 +50,7 @@ def forward_grad(outputs, inputs, grad_inputs=None):
 
             with paddle.static.program_guard(main_program, startup_program):
                 x = paddle.static.data('x', shape=[1], dtype='float32')
-                y = x * x 
+                y = x * x
                 y_grad = paddle.incubate.autograd.forward_grad(y, x)
                 paddle.incubate.autograd.prim2orig()
 
@@ -64,25 +64,35 @@ def forward_grad(outputs, inputs, grad_inputs=None):
             paddle.disable_static()
     """
     if not utils.prim_enabled():
-        raise RuntimeError('forward_grad must be running on primitive'
-                           'operators, use enable_prim to turn it on.')
+        raise RuntimeError(
+            'forward_grad must be running on primitive'
+            'operators, use enable_prim to turn it on.'
+        )
 
     if not isinstance(outputs, (framework.Variable, typing.Sequence)):
-        raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], '
-                        f'but got {type(outputs)}.')
+        raise TypeError(
+            f'Expected outputs is Tensor|Sequence[Tesnor], '
+            f'but got {type(outputs)}.'
+        )
 
     if not isinstance(inputs, (framework.Variable, typing.Sequence)):
-        raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], '
-                        f'but got {type(inputs)}.')
+        raise TypeError(
+            f'Expected inputs is Tensor|Sequence[Tesnor], '
+            f'but got {type(inputs)}.'
+        )
 
-    ys, xs, xs_dot = utils.as_tensors(outputs), utils.as_tensors(
-        inputs), utils.as_tensors(grad_inputs)
+    ys, xs, xs_dot = (
+        utils.as_tensors(outputs),
+        utils.as_tensors(inputs),
+        utils.as_tensors(grad_inputs),
+    )
 
     block = framework.default_main_program().current_block()
     if any(x.block != block for x in xs + ys):
         raise RuntimeError(
             'Variable in inputs and targets should exist in current block of '
-            'main program.')
+            'main program.'
+        )
 
     primx.orig2prim(block)
     ad = primx.Transform(ys[0].block)
@@ -101,12 +111,12 @@ def grad(outputs, inputs, grad_outputs=None):
     Args:
         outputs(Tensor|Sequence[Tensor]): The output Tensor or Tensors.
         inputs(Tensor|Sequence[Tensor]): The input Tensor or Tensors.
-        grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or 
-            Tensors of outputs which has the same shape with outputs, Defaults 
+        grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or
+            Tensors of outputs which has the same shape with outputs, Defaults
             to None, in this case is equivalent to all ones.
 
     Returns:
-        grad_inputs(Tensor|Tensors): The gradients for inputs. 
+        grad_inputs(Tensor|Tensors): The gradients for inputs.
 
     Examples:
 
@@ -123,7 +133,7 @@ def grad(outputs, inputs, grad_outputs=None):
             with paddle.static.program_guard(main_program, startup_program):
                 x = paddle.static.data('x', shape=[1], dtype='float32')
                 x.stop_gradients = False
-                y = x * x 
+                y = x * x
                 x_grad = paddle.incubate.autograd.grad(y, x)
                 paddle.incubate.autograd.prim2orig()
 
@@ -132,7 +142,7 @@ def grad(outputs, inputs, grad_outputs=None):
             x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad])
             print(x_grad)
             # [array([4.], dtype=float32)]
-            
+
             paddle.incubate.autograd.disable_prim()
             paddle.disable_static()
     """
@@ -141,22 +151,32 @@ def grad(outputs, inputs, grad_outputs=None):
         # backward.gradients returns a list though the inputs is a signle Tensor.
         # The follow code snippet fixes the problem by return the first element
         # of grad_inputs when the inputs is a signle Tensor.
-        if isinstance(inputs, framework.Variable) and isinstance(
-                grad_inputs, typing.Sequence) and len(grad_inputs) > 0:
+        if (
+            isinstance(inputs, framework.Variable)
+            and isinstance(grad_inputs, typing.Sequence)
+            and len(grad_inputs) > 0
+        ):
             return grad_inputs[0]
         else:
             return grad_inputs
 
     if not isinstance(outputs, (framework.Variable, typing.Sequence)):
-        raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], '
-                        f'but got {type(outputs)}.')
+        raise TypeError(
+            f'Expected outputs is Tensor|Sequence[Tesnor], '
+            f'but got {type(outputs)}.'
+        )
 
     if not isinstance(inputs, (framework.Variable, typing.Sequence)):
-        raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], '
-                        f'but got {type(inputs)}.')
+        raise TypeError(
+            f'Expected inputs is Tensor|Sequence[Tesnor], '
+            f'but got {type(inputs)}.'
+        )
 
-    ys, xs, ys_bar = utils.as_tensors(outputs), utils.as_tensors(
-        inputs), utils.as_tensors(grad_outputs)
+    ys, xs, ys_bar = (
+        utils.as_tensors(outputs),
+        utils.as_tensors(inputs),
+        utils.as_tensors(grad_outputs),
+    )
     block = framework.default_main_program().current_block()
     if any((x is not None and x.block != block) for x in xs + ys):
         raise RuntimeError(
diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py
index 565fcb0b4ed..7fdbb08bfcb 100644
--- a/python/paddle/incubate/autograd/primx.py
+++ b/python/paddle/incubate/autograd/primx.py
@@ -21,17 +21,25 @@ from paddle.fluid.framework import Operator, default_main_program
 from paddle.incubate.autograd.utils import as_tensors
 
 from .primops import add, fill_const
-from .primreg import (lookup_orig2prim, lookup_prim2orig, op_position_inputs,
-                      op_position_output)
+from .primreg import (
+    lookup_orig2prim,
+    lookup_prim2orig,
+    op_position_inputs,
+    op_position_output,
+)
 from .primrules import _jvp, _orig2prim, _prim2orig, _transpose
-from .utils import (flatten, flatten_and_remove_none, get_input_var_list,
-                    get_output_var_list)
+from .utils import (
+    flatten,
+    flatten_and_remove_none,
+    get_input_var_list,
+    get_output_var_list,
+)
 
 
 def topo_path(xs, ys, block=None):
-    """ Returns the list of ops on the path from `xs` to `ys` in topological 
+    """Returns the list of ops on the path from `xs` to `ys` in topological
     order.
-    
+
     TODO(Tongxin): supporting control flow and nested blocks.
     Args:
         xs: a list|tuple of vars as source
@@ -51,13 +59,16 @@ def topo_path(xs, ys, block=None):
 
     # Initialize reached vars
     for x in xs:
-        assert x is None or x.block == block, f'x is not None and x.block != block'
+        assert (
+            x is None or x.block == block
+        ), f'x is not None and x.block != block'
         reached_vars[id(x)] = x
 
     # Reaching test, returning whether an op is reached from the given input
     reaching = lambda op: any(
         id(v) in reached_vars
-        for v in flatten_and_remove_none(get_input_var_list(op)))
+        for v in flatten_and_remove_none(get_input_var_list(op))
+    )
 
     # block.ops are supposedly in the order that preserves correct data
     # dependence.
@@ -71,7 +82,8 @@ def topo_path(xs, ys, block=None):
     used_vars = OrderedDict((id(y), y) for y in ys if id(y) in reached_vars)
     back_reaching = lambda op: any(
         id(out) in used_vars
-        for out in flatten_and_remove_none(get_output_var_list(op)))
+        for out in flatten_and_remove_none(get_output_var_list(op))
+    )
 
     # Backward pass to find all used variables
     for op in reversed(path):
@@ -87,9 +99,9 @@ def topo_path(xs, ys, block=None):
 
 
 def output_vars_on_path(path):
-    """ Returns the output variables of all the ops on the path from `xs`
+    """Returns the output variables of all the ops on the path from `xs`
     to `ys`.
-    
+
     Args:
         path: a list of ops on which to find the output variables
 
@@ -105,8 +117,8 @@ def output_vars_on_path(path):
 
 
 class VarMap(object):
-    """ A general map data structure for linking variables to variables.
-    
+    """A general map data structure for linking variables to variables.
+
     An example is linking variables to their gradients.
     """
 
@@ -126,7 +138,8 @@ class VarMap(object):
         if isinstance(key_vars, paddle.fluid.framework.Variable):
             if not isinstance(value_vars, paddle.fluid.framework.Variable):
                 raise TypeError(
-                    f'value_vars must be Variable, but got {type(value_vars)}')
+                    f'value_vars must be Variable, but got {type(value_vars)}'
+                )
             self.tab[id(key_vars)] = id(value_vars)
         else:
             assert len(key_vars) == len(value_vars), (
@@ -169,11 +182,12 @@ class VarMap(object):
 
 # TODO(lml): supporting control flow, nested blocks, and block other than current block of main program.
 class Transform(object):
-    """ An object that maintains the state of transformations applied to a 
-    primitve program. """
+    """An object that maintains the state of transformations applied to a
+    primitive program."""
 
     def __init__(self, block):
-        assert block == default_main_program().current_block(
+        assert (
+            block == default_main_program().current_block()
         ), f'only support transform on current block of main program.'
         self.block = block
         self.vars = self.init_vars(block)
@@ -225,7 +239,7 @@ class Transform(object):
         block._sync_with_cpp()
 
     def var2dot_rec(self, vars):
-        """ Lookup var2dot recursively."""
+        """Lookup var2dot recursively."""
         if isinstance(vars, paddle.fluid.framework.Variable):
             dot = self.var2dot.lookup(vars)
             return dot
@@ -244,9 +258,9 @@ class Transform(object):
         return bars
 
     def linearize(self, xs, ys, xs_dot=None):
-        """ Performs the linearization transform, a.k.a, forward mode AD 
+        """Performs the linearization transform, a.k.a, forward mode AD
         transform, on a primitive lowered program.
-        
+
         Args:
             xs: a list of input variables
             ys: a list of output variables
@@ -256,9 +270,9 @@ class Transform(object):
 
         Returns:
             (xs_dot, ys_dot): a tuple of two lists. `xs_dot` is the list of
-            gradient inputs of the resulting linearized program. `ys_dot` is 
+            gradient inputs of the resulting linearized program. `ys_dot` is
             the list gradient outputs of the resulting linearized program
-            
+
         """
         if xs_dot is None:
             xs_dot = [fill_const(1.0, shape=x.shape, dtype=x.dtype) for x in xs]
@@ -266,15 +280,18 @@ class Transform(object):
         else:
             assert len(xs) == len(xs_dot), (
                 f'len(xs) should be equal to len(xs_dot), '
-                f'but len(xs)={len(xs)} and len(xs_dot)={len(xs_dot)}')
+                f'but len(xs)={len(xs)} and len(xs_dot)={len(xs_dot)}'
+            )
 
         for x, dot in zip(xs, xs_dot):
             assert x.dtype == dot.dtype, (
                 f'x.dtype should be equal to dot.dtype, '
-                f'but x.dtype={x.dtype} and dot.dtype={dot.dtype}')
+                f'but x.dtype={x.dtype} and dot.dtype={dot.dtype}'
+            )
             assert x.shape == dot.shape, (
                 f'x.shape should be equal to dot.shape, '
-                f'but x.shape={x.shape} and dot.shape={dot.shape}')
+                f'but x.shape={x.shape} and dot.shape={dot.shape}'
+            )
             self.var2dot.add(x, dot)
 
         path, unused_xs, _ = topo_path(xs, ys, self.block)
@@ -300,23 +317,23 @@ class Transform(object):
         return xs_dot, ys_dot
 
     def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False):
-        """ Performs the transpose transform, a.k.a, reverse mode AD 
+        """Performs the transpose transform, a.k.a, reverse mode AD
         transform, on a linearized primitive program.
 
         Note, `transpose` is supposed to be used in couple with `linearize`.
-        
+
         Args:
             ys_dot: a list of outputs of the linearized program.
             xs_dot: a list of inputs of the linearized program.
-            ys_bar: optional, a list of inputs of the resulting transposed 
+            ys_bar: optional, a list of inputs of the resulting transposed
                 program. The list size must be equal to `len(ys_dot)`. The shape
                 and dtype of each element must be the same as in `ys_dot`
 
         Returns:
             (ys_bar, xs_bar): a tuple of two lists. `ys_bar` is the list of
-            inputs of the resulting transposed program. `xs_bar` is 
+            inputs of the resulting transposed program. `xs_bar` is
             the list outputs of the resulting transposed program
-            
+
         """
         assert all(v is not None for v in xs_dot), f'`xs_dot` includes None.'
         assert all(v is not None for v in ys_dot), f'`ys_dot` includes None.'
@@ -329,7 +346,8 @@ class Transform(object):
         else:
             assert len(ys_dot) == len(ys_bar), (
                 f'len(ys_dot) should be equal to len(ys_bar), '
-                f'but len(ys_dot)={len(ys_dot)} and len(ys_bar)={len(ys_bar)}')
+                f'but len(ys_dot)={len(ys_dot)} and len(ys_bar)={len(ys_bar)}'
+            )
             for y_dot, y_bar in zip(ys_dot, ys_bar):
                 assert y_dot.shape == y_bar.shape, (
                     f'y_dot.shape should be equal to y_bar.shape, '
@@ -373,7 +391,8 @@ class Transform(object):
             ins = flatten(op_position_inputs(op))
             assert len(ins) == len(ins_bar), (
                 f'len(ins) should be equal to len(ins_bar), '
-                f'but len(ins)={len(ins)} and len(ins_bar)={len(ins_bar)}')
+                f'but len(ins)={len(ins)} and len(ins_bar)={len(ins_bar)}'
+            )
 
             for dot, bar in zip(ins, ins_bar):
                 if bar is not None:
@@ -392,7 +411,8 @@ class Transform(object):
             vars_to_remove = set()
             for op in path:
                 vars_to_remove.update(
-                    flatten_and_remove_none(get_output_var_list(op)))
+                    flatten_and_remove_none(get_output_var_list(op))
+                )
 
             op_indexes = []
 
@@ -460,10 +480,12 @@ def _lower(block, reverse, blacklist):
             bind(input_args, to_bind, value_table)
 
             for orig_out, new_out in zip(
-                    expand_nested_list(get_output_var_list(op)),
-                    expand_nested_list(as_tensors(lower_fn(op, *input_args)))):
+                expand_nested_list(get_output_var_list(op)),
+                expand_nested_list(as_tensors(lower_fn(op, *input_args))),
+            ):
                 assert not (orig_out is None) ^ (
-                    new_out is None), "orig_out and new_out should match."
+                    new_out is None
+                ), "orig_out and new_out should match."
                 vars_to_remove.add(new_out.name)
                 value_table[new_out.name] = new_out
                 to_bind[orig_out.name] = new_out.name
@@ -472,7 +494,8 @@ def _lower(block, reverse, blacklist):
             inputs = {}
             for i in range(len(op.input_names)):
                 inputs[op.input_names[i]] = bind_name(
-                    op.input(op.input_names[i]), to_bind)
+                    op.input(op.input_names[i]), to_bind
+                )
 
             outputs = {}
             for i in range(len(op.output_names)):
@@ -482,14 +505,17 @@ def _lower(block, reverse, blacklist):
             for name in sorted(op.attr_names):
                 attrs[name] = op.attr(name)
             from paddle.fluid.dygraph.base import param_guard
+
             new_op_desc = block.desc.append_op()
             with param_guard(inputs), param_guard(outputs):
-                op = Operator(block=block,
-                              desc=new_op_desc,
-                              type=op.type,
-                              inputs=inputs,
-                              outputs=outputs,
-                              attrs=attrs)
+                op = Operator(
+                    block=block,
+                    desc=new_op_desc,
+                    type=op.type,
+                    inputs=inputs,
+                    outputs=outputs,
+                    attrs=attrs,
+                )
             block.ops.append(op)
 
     # Step3: Do some post-processing work
@@ -509,8 +535,9 @@ def _lower(block, reverse, blacklist):
                 op._rename_output(out_name, to_bind_rev[out_name])
 
     for var_name in sorted(vars_to_remove):
-        assert var_name in to_bind_rev, 'var_name "{}" is not in to_bind_rev.'.format(
-            var_name)
+        assert (
+            var_name in to_bind_rev
+        ), 'var_name "{}" is not in to_bind_rev.'.format(var_name)
         if var_name != to_bind_rev[var_name]:
             block.desc._remove_var(cpt.to_bytes(var_name))
             del block.vars[var_name]
@@ -519,7 +546,7 @@ def _lower(block, reverse, blacklist):
 
 @framework.static_only
 def orig2prim(block=None):
-    """ 
+    """
     .. note::
         **This API is ONLY available in the static mode.**
         **Args block must be None or current block of main program.**
@@ -528,7 +555,7 @@ def orig2prim(block=None):
     If it is an original operator, it will be transformed into
     one or a series of automatic differential basic operators with
     equivalent function.
-    
+
     Args:
         block(paddle.static.Block|None, optional): The
             target block to process on. Default None, and will
@@ -536,7 +563,8 @@ def orig2prim(block=None):
     """
 
     block = default_main_program().current_block() if block is None else block
-    assert block == default_main_program().current_block(
+    assert (
+        block == default_main_program().current_block()
     ), f'block is neither None nor current block of main program'
     _lower(block, reverse=False, blacklist=[])
 
@@ -552,7 +580,7 @@ def prim2orig(block=None, blacklist=None):
     If it is an automatic differential basic operator, it will be
     transformed into one or a series of original operators with
     equivalent function to support execution.
-    
+
     Args:
         block(paddle.static.Block|None, optional): The
             target block to process on. Default None, and will
@@ -568,10 +596,10 @@ def prim2orig(block=None, blacklist=None):
 
             import paddle
             from paddle.incubate.autograd import enable_prim, prim_enabled, prim2orig
-            
+
             paddle.enable_static()
             enable_prim()
-            
+
             x = paddle.ones(shape=[2, 2], dtype='float32')
             x.stop_gradients = False
             y = x * x
@@ -581,7 +609,8 @@ def prim2orig(block=None, blacklist=None):
     """
 
     block = default_main_program().current_block() if block is None else block
-    assert block == default_main_program().current_block(
+    assert (
+        block == default_main_program().current_block()
     ), f'block is neither None nor current block of main program'
     blacklist = [] if blacklist is None else blacklist
     _lower(block, reverse=True, blacklist=blacklist)
diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py
index 96faf7f7440..5a5bb333a2d 100644
--- a/python/paddle/incubate/autograd/utils.py
+++ b/python/paddle/incubate/autograd/utils.py
@@ -18,7 +18,6 @@ from paddle.fluid import framework as framework
 
 
 class PrimOption(object):
-
     def __init__(self):
         self.enable_prim = False
 
@@ -38,9 +37,9 @@ def prim_enabled():
     .. note::
         **ONLY available in the static mode.**
 
-    Shows whether the automatic differentiation mechanism based on 
+    Shows whether the automatic differentiation mechanism based on
     automatic differential basic operators is ON. Defaults to OFF.
-     
+
     Returns:
         flag(bool): Whether the automatic differentiation mechanism based on automatic differential basic operators is ON.
 
@@ -50,7 +49,7 @@ def prim_enabled():
 
             import paddle
             from paddle.incubate.autograd import enable_prim, disable_prim, prim_enabled
-            
+
             paddle.enable_static()
             enable_prim()
 
@@ -69,16 +68,16 @@ def enable_prim():
     .. note::
         **ONLY available in the static mode.**
 
-    Turns ON automatic differentiation mechanism based on automatic 
+    Turns ON automatic differentiation mechanism based on automatic
     differential basic operators.
-    
+
     Examples:
 
         .. code-block:: python
 
             import paddle
             from paddle.incubate.autograd import enable_prim, prim_enabled
-            
+
             paddle.enable_static()
             enable_prim()
 
@@ -93,16 +92,16 @@ def disable_prim():
     .. note::
         **ONLY available in the static mode.**
 
-    Turns OFF automatic differentiation mechanism based on automatic 
+    Turns OFF automatic differentiation mechanism based on automatic
     differential basic operators.
-    
+
     Examples:
 
         .. code-block:: python
 
             import paddle
             from paddle.incubate.autograd import enable_prim, disable_prim, prim_enabled
-            
+
             paddle.enable_static()
             enable_prim()
 
@@ -175,7 +174,7 @@ def flatten_and_remove_none(inp):
 
 def as_tensors(xs):
     if isinstance(xs, framework.Variable):
-        return (xs, )
+        return (xs,)
     elif isinstance(xs, typing.Sequence):
         return tuple(xs)
     else:
diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py
index b1d759b6953..6a36715324c 100644
--- a/python/paddle/incubate/nn/functional/fused_transformer.py
+++ b/python/paddle/incubate/nn/functional/fused_transformer.py
@@ -28,26 +28,28 @@ def _verify_dropout_rate(dropout_rate):
         raise ValueError("dropout_rate argument should between 0 and 1")
 
 
-def fused_feedforward(x,
-                      linear1_weight,
-                      linear2_weight,
-                      linear1_bias=None,
-                      linear2_bias=None,
-                      ln1_scale=None,
-                      ln1_bias=None,
-                      ln2_scale=None,
-                      ln2_bias=None,
-                      dropout1_rate=0.5,
-                      dropout2_rate=0.5,
-                      activation="relu",
-                      ln1_epsilon=1e-5,
-                      ln2_epsilon=1e-5,
-                      pre_layer_norm=False,
-                      training=True,
-                      mode='upscale_in_train',
-                      ring_id=-1,
-                      add_residual=True,
-                      name=None):
+def fused_feedforward(
+    x,
+    linear1_weight,
+    linear2_weight,
+    linear1_bias=None,
+    linear2_bias=None,
+    ln1_scale=None,
+    ln1_bias=None,
+    ln2_scale=None,
+    ln2_bias=None,
+    dropout1_rate=0.5,
+    dropout2_rate=0.5,
+    activation="relu",
+    ln1_epsilon=1e-5,
+    ln2_epsilon=1e-5,
+    pre_layer_norm=False,
+    training=True,
+    mode='upscale_in_train',
+    ring_id=-1,
+    add_residual=True,
+    name=None,
+):
     r"""
     This is a fusion operator to compute feed forward layer in transformer model architecture.
     This operator only supports running on GPU. The function of the operator is consistent with
@@ -126,112 +128,161 @@ def fused_feedforward(x,
         raise ValueError(
             "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
         )
-    mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
+    mode = (
+        'downgrade_in_infer' if mode == 'downscale_in_infer' else mode
+    )  # semantic transfer
 
     if _non_static_mode():
         if default_main_program().random_seed != 0:
             seed = default_main_program().random_seed
         out, _, _, _, _, _, _, _, _, _, _ = _legacy_C_ops.fused_feedforward(
-            x, None, None, linear1_weight, linear1_bias, linear2_weight,
-            linear2_bias, ln1_scale, ln1_bias, ln2_scale, ln2_bias,
-            'pre_layer_norm', pre_layer_norm, 'ln1_epsilon', ln1_epsilon,
-            'ln2_epsilon', ln2_epsilon, 'act_method', activation,
-            'dropout1_rate', dropout1_rate, 'dropout2_rate', dropout2_rate,
-            "is_test", not training, "dropout1_fix_seed", seed is not None,
-            "dropout2_fix_seed", seed is not None, "dropout1_seed",
-            seed if seed is not None else 0, "dropout2_seed",
-            seed if seed is not None else 0, 'dropout1_implementation', mode,
-            'dropout2_implementation', mode, 'add_residual', add_residual,
-            'ring_id', ring_id)
+            x,
+            None,
+            None,
+            linear1_weight,
+            linear1_bias,
+            linear2_weight,
+            linear2_bias,
+            ln1_scale,
+            ln1_bias,
+            ln2_scale,
+            ln2_bias,
+            'pre_layer_norm',
+            pre_layer_norm,
+            'ln1_epsilon',
+            ln1_epsilon,
+            'ln2_epsilon',
+            ln2_epsilon,
+            'act_method',
+            activation,
+            'dropout1_rate',
+            dropout1_rate,
+            'dropout2_rate',
+            dropout2_rate,
+            "is_test",
+            not training,
+            "dropout1_fix_seed",
+            seed is not None,
+            "dropout2_fix_seed",
+            seed is not None,
+            "dropout1_seed",
+            seed if seed is not None else 0,
+            "dropout2_seed",
+            seed if seed is not None else 0,
+            'dropout1_implementation',
+            mode,
+            'dropout2_implementation',
+            mode,
+            'add_residual',
+            add_residual,
+            'ring_id',
+            ring_id,
+        )
         return out
 
     helper = LayerHelper("fused_feedforward")
     dtype = x.dtype
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'fused_feedforward')
-    check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'],
-                'fused_feedforward')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'fused_feedforward'
+    )
+    check_dtype(
+        dtype, 'dtype', ['float16', 'float32', 'float64'], 'fused_feedforward'
+    )
 
     out = helper.create_variable_for_type_inference(x.dtype)
     dropout1_mask = helper.create_variable_for_type_inference(
-        'uint8', stop_gradient=True)
+        'uint8', stop_gradient=True
+    )
     dropout2_mask = helper.create_variable_for_type_inference(
-        'uint8', stop_gradient=True)
-    ln1_mean = helper.create_variable_for_type_inference(x.dtype,
-                                                         stop_gradient=True)
-    ln1_variance = helper.create_variable_for_type_inference(x.dtype,
-                                                             stop_gradient=True)
-    ln2_mean = helper.create_variable_for_type_inference(x.dtype,
-                                                         stop_gradient=True)
-    ln2_variance = helper.create_variable_for_type_inference(x.dtype,
-                                                             stop_gradient=True)
-    linear1_out = helper.create_variable_for_type_inference(x.dtype,
-                                                            stop_gradient=True)
-    ln1_out = helper.create_variable_for_type_inference(x.dtype,
-                                                        stop_gradient=True)
-    dropout1_out = helper.create_variable_for_type_inference(x.dtype,
-                                                             stop_gradient=True)
-    dropout2_out = helper.create_variable_for_type_inference(x.dtype,
-                                                             stop_gradient=True)
+        'uint8', stop_gradient=True
+    )
+    ln1_mean = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True
+    )
+    ln1_variance = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True
+    )
+    ln2_mean = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True
+    )
+    ln2_variance = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True
+    )
+    linear1_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True
+    )
+    ln1_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True
+    )
+    dropout1_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True
+    )
+    dropout2_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True
+    )
 
     if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
         seed = helper.main_program.random_seed
 
-    helper.append_op(type='fused_feedforward',
-                     inputs={
-                         'X': x,
-                         'Linear1Weight': linear1_weight,
-                         'Linear1Bias': linear1_bias,
-                         'Linear2Weight': linear2_weight,
-                         'Linear2Bias': linear2_bias,
-                         'Ln1Scale': ln1_scale,
-                         'Ln1Bias': ln1_bias,
-                         'Ln2Scale': ln2_scale,
-                         'Ln2Bias': ln2_bias,
-                     },
-                     outputs={
-                         'Out': out,
-                         'Dropout1Mask': dropout1_mask,
-                         'Dropout2Mask': dropout2_mask,
-                         'Ln1Mean': ln1_mean,
-                         'Ln1Variance': ln1_variance,
-                         'Ln2Mean': ln2_mean,
-                         'Ln2Variance': ln2_variance,
-                         'Linear1Out': linear1_out,
-                         'Ln1Out': ln1_out,
-                         'Dropout1Out': dropout1_out,
-                         'Dropout2Out': dropout2_out,
-                     },
-                     attrs={
-                         'dropout1_rate': dropout1_rate,
-                         'dropout2_rate': dropout2_rate,
-                         'act_method': activation,
-                         'pre_layer_norm': pre_layer_norm,
-                         'ln1_epsilon': ln1_epsilon,
-                         'ln2_epsilon': ln2_epsilon,
-                         'is_test': not training,
-                         'dropout1_fix_seed': seed is not None,
-                         'dropout2_fix_seed': seed is not None,
-                         'dropout1_seed': seed if seed is not None else 0,
-                         'dropout2_seed': seed if seed is not None else 0,
-                         'dropout1_implementation': mode,
-                         'dropout2_implementation': mode,
-                         'add_residual': add_residual,
-                         'ring_id': ring_id,
-                     })
+    helper.append_op(
+        type='fused_feedforward',
+        inputs={
+            'X': x,
+            'Linear1Weight': linear1_weight,
+            'Linear1Bias': linear1_bias,
+            'Linear2Weight': linear2_weight,
+            'Linear2Bias': linear2_bias,
+            'Ln1Scale': ln1_scale,
+            'Ln1Bias': ln1_bias,
+            'Ln2Scale': ln2_scale,
+            'Ln2Bias': ln2_bias,
+        },
+        outputs={
+            'Out': out,
+            'Dropout1Mask': dropout1_mask,
+            'Dropout2Mask': dropout2_mask,
+            'Ln1Mean': ln1_mean,
+            'Ln1Variance': ln1_variance,
+            'Ln2Mean': ln2_mean,
+            'Ln2Variance': ln2_variance,
+            'Linear1Out': linear1_out,
+            'Ln1Out': ln1_out,
+            'Dropout1Out': dropout1_out,
+            'Dropout2Out': dropout2_out,
+        },
+        attrs={
+            'dropout1_rate': dropout1_rate,
+            'dropout2_rate': dropout2_rate,
+            'act_method': activation,
+            'pre_layer_norm': pre_layer_norm,
+            'ln1_epsilon': ln1_epsilon,
+            'ln2_epsilon': ln2_epsilon,
+            'is_test': not training,
+            'dropout1_fix_seed': seed is not None,
+            'dropout2_fix_seed': seed is not None,
+            'dropout1_seed': seed if seed is not None else 0,
+            'dropout2_seed': seed if seed is not None else 0,
+            'dropout1_implementation': mode,
+            'dropout2_implementation': mode,
+            'add_residual': add_residual,
+            'ring_id': ring_id,
+        },
+    )
     return out
 
 
-def fused_bias_dropout_residual_layer_norm(x,
-                                           residual,
-                                           bias=None,
-                                           ln_scale=None,
-                                           ln_bias=None,
-                                           dropout_rate=0.5,
-                                           ln_epsilon=1e-5,
-                                           training=True,
-                                           mode='upscale_in_train',
-                                           name=None):
+def fused_bias_dropout_residual_layer_norm(
+    x,
+    residual,
+    bias=None,
+    ln_scale=None,
+    ln_bias=None,
+    dropout_rate=0.5,
+    ln_epsilon=1e-5,
+    training=True,
+    mode='upscale_in_train',
+    name=None,
+):
     r"""
     The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows:
 
@@ -291,37 +342,72 @@ def fused_bias_dropout_residual_layer_norm(x,
         raise ValueError(
             "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
         )
-    mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
+    mode = (
+        'downgrade_in_infer' if mode == 'downscale_in_infer' else mode
+    )  # semantic transfer
 
     if ln_scale is not None:
-        assert len(ln_scale.shape
-                   ) == 1, "The dims of the shape of ln_scale should be 1."
-        assert x.shape[len(x.shape) - 1] == ln_scale.shape[
-            0], "The dim of ln_scale must equal to the last dim of x."
+        assert (
+            len(ln_scale.shape) == 1
+        ), "The dims of the shape of ln_scale should be 1."
+        assert (
+            x.shape[len(x.shape) - 1] == ln_scale.shape[0]
+        ), "The dim of ln_scale must equal to the last dim of x."
     if ln_bias is not None:
-        assert len(
-            ln_bias.shape) == 1, "The dims of the shape of ln_bias should be 1."
-        assert x.shape[len(x.shape) - 1] == ln_bias.shape[
-            0], "The dim of ln_bias must equal to the last dim of x."
+        assert (
+            len(ln_bias.shape) == 1
+        ), "The dims of the shape of ln_bias should be 1."
+        assert (
+            x.shape[len(x.shape) - 1] == ln_bias.shape[0]
+        ), "The dim of ln_bias must equal to the last dim of x."
 
     if _non_static_mode():
         if default_main_program().random_seed != 0:
             seed = default_main_program().random_seed
-        _, _, _, _, final_out = _legacy_C_ops.fused_bias_dropout_residual_layer_norm(
-            x, residual, bias, ln_scale, ln_bias, 'dropout_rate', dropout_rate,
-            'ln_epsilon', ln_epsilon, 'is_test', not training,
-            'dropout_fix_seed', seed is not None, 'dropout_seed',
-            seed if seed is not None else 0, 'dropout_implementation', mode)
+        (
+            _,
+            _,
+            _,
+            _,
+            final_out,
+        ) = _legacy_C_ops.fused_bias_dropout_residual_layer_norm(
+            x,
+            residual,
+            bias,
+            ln_scale,
+            ln_bias,
+            'dropout_rate',
+            dropout_rate,
+            'ln_epsilon',
+            ln_epsilon,
+            'is_test',
+            not training,
+            'dropout_fix_seed',
+            seed is not None,
+            'dropout_seed',
+            seed if seed is not None else 0,
+            'dropout_implementation',
+            mode,
+        )
         return final_out
     else:
-        helper = LayerHelper('fused_bias_dropout_residual_layer_norm',
-                             **locals())
+        helper = LayerHelper(
+            'fused_bias_dropout_residual_layer_norm', **locals()
+        )
         dtype = x.dtype
         # check dtypes
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 'fused_bias_dropout_residual_layer_norm')
-        check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'],
-                    'fused_bias_dropout_residual_layer_norm')
+        check_variable_and_dtype(
+            x,
+            'x',
+            ['float16', 'float32', 'float64'],
+            'fused_bias_dropout_residual_layer_norm',
+        )
+        check_dtype(
+            dtype,
+            'dtype',
+            ['float16', 'float32', 'float64'],
+            'fused_bias_dropout_residual_layer_norm',
+        )
         # set inputs
         inputs = dict()
         inputs['X'] = [x]
@@ -345,50 +431,57 @@ def fused_bias_dropout_residual_layer_norm(x,
         }
         # set outputs
         dropout_mask_out = helper.create_variable_for_type_inference(
-            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True
+        )
         ln_mean_out = helper.create_variable_for_type_inference(
-            dtype=dtype, stop_gradient=True)
+            dtype=dtype, stop_gradient=True
+        )
         ln_variance_out = helper.create_variable_for_type_inference(
-            dtype=dtype, stop_gradient=True)
+            dtype=dtype, stop_gradient=True
+        )
         bias_dropout_residual_out = helper.create_variable_for_type_inference(
-            dtype=dtype)
+            dtype=dtype
+        )
         final_out = helper.create_variable_for_type_inference(dtype=dtype)
 
-        helper.append_op(type='fused_bias_dropout_residual_layer_norm',
-                         inputs=inputs,
-                         outputs={
-                             "BiasDropoutResidualOut":
-                             bias_dropout_residual_out,
-                             "DropoutMaskOut": dropout_mask_out,
-                             "LnMean": ln_mean_out,
-                             "LnVariance": ln_variance_out,
-                             'Y': final_out,
-                         },
-                         attrs=attrs)
+        helper.append_op(
+            type='fused_bias_dropout_residual_layer_norm',
+            inputs=inputs,
+            outputs={
+                "BiasDropoutResidualOut": bias_dropout_residual_out,
+                "DropoutMaskOut": dropout_mask_out,
+                "LnMean": ln_mean_out,
+                "LnVariance": ln_variance_out,
+                'Y': final_out,
+            },
+            attrs=attrs,
+        )
         return final_out
 
 
-def fused_multi_head_attention(x,
-                               qkv_weight,
-                               linear_weight,
-                               pre_layer_norm=False,
-                               pre_ln_scale=None,
-                               pre_ln_bias=None,
-                               ln_scale=None,
-                               ln_bias=None,
-                               pre_ln_epsilon=1e-05,
-                               qkv_bias=None,
-                               linear_bias=None,
-                               cache_kv=None,
-                               attn_mask=None,
-                               dropout_rate=0.5,
-                               attn_dropout_rate=0.5,
-                               ln_epsilon=1e-05,
-                               training=True,
-                               mode='upscale_in_train',
-                               ring_id=-1,
-                               add_residual=True,
-                               name=None):
+def fused_multi_head_attention(
+    x,
+    qkv_weight,
+    linear_weight,
+    pre_layer_norm=False,
+    pre_ln_scale=None,
+    pre_ln_bias=None,
+    ln_scale=None,
+    ln_bias=None,
+    pre_ln_epsilon=1e-05,
+    qkv_bias=None,
+    linear_bias=None,
+    cache_kv=None,
+    attn_mask=None,
+    dropout_rate=0.5,
+    attn_dropout_rate=0.5,
+    ln_epsilon=1e-05,
+    training=True,
+    mode='upscale_in_train',
+    ring_id=-1,
+    add_residual=True,
+    name=None,
+):
     r"""
     Attention mapps queries and a set of key-value pairs to outputs, and
     Multi-Head Attention performs multiple parallel attention to jointly attending
@@ -512,7 +605,9 @@ def fused_multi_head_attention(x,
         raise ValueError(
             "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
         )
-    mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
+    mode = (
+        'downgrade_in_infer' if mode == 'downscale_in_infer' else mode
+    )  # semantic transfer
 
     if _non_static_mode():
         if default_main_program().random_seed != 0:
@@ -520,29 +615,83 @@ def fused_multi_head_attention(x,
         # pre_ln_mean, pre_ln_variance, pre_ln_out, qkv_out, qkv_bias_out, transpose_out, qk_out,
         # qktv_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, attn_mask_out, fmha_out,
         # linear_out, dropout_mask_out, ln_mean_out, ln_var_out, bias_dropout_residual_out, final_out
-        assert len(qkv_weight.shape
-                   ) == 4, "The dims of the shape of qkv_weight should be 4."
-        assert qkv_weight.shape[
-            0] == 3, "The shape of qkv_weight should be [3, num_head, head_dim, embed_dim]."
-        assert qkv_weight.shape[3] == x.shape[
-            2], "The 3rd dim of qkv_weight and 2nd dim of x should be the same, i.e., embed_dim."
+        assert (
+            len(qkv_weight.shape) == 4
+        ), "The dims of the shape of qkv_weight should be 4."
+        assert (
+            qkv_weight.shape[0] == 3
+        ), "The shape of qkv_weight should be [3, num_head, head_dim, embed_dim]."
+        assert (
+            qkv_weight.shape[3] == x.shape[2]
+        ), "The 3rd dim of qkv_weight and 2nd dim of x should be the same, i.e., embed_dim."
         if ring_id == -1:
             # under mp, the num head will be split, this equation will not hold
-            assert qkv_weight.shape[1] * qkv_weight.shape[2] == qkv_weight.shape[
-                3], "embed_dim must be divisible by num_heads."
-
-        _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, cache_kv_out, final_out = _legacy_C_ops.fused_attention(
-            x, pre_ln_scale, pre_ln_bias, qkv_weight, qkv_bias, cache_kv,
-            attn_mask, linear_weight, linear_bias, ln_scale, ln_bias,
-            'pre_layer_norm', pre_layer_norm, 'epsilon', pre_ln_epsilon,
-            'dropout_rate', dropout_rate, 'attn_dropout_rate',
-            attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'is_test',
-            not training, 'attn_dropout_fix_seed', seed is not None,
-            'dropout_fix_seed', seed is not None, 'attn_dropout_seed',
-            seed if seed is not None else 0, 'dropout_seed',
-            seed if seed is not None else 0, 'attn_dropout_implementation',
-            mode, 'dropout_implementation', mode, 'add_residual', add_residual,
-            'ring_id', ring_id)
+            assert (
+                qkv_weight.shape[1] * qkv_weight.shape[2] == qkv_weight.shape[3]
+            ), "embed_dim must be divisible by num_heads."
+
+        (
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            _,
+            cache_kv_out,
+            final_out,
+        ) = _legacy_C_ops.fused_attention(
+            x,
+            pre_ln_scale,
+            pre_ln_bias,
+            qkv_weight,
+            qkv_bias,
+            cache_kv,
+            attn_mask,
+            linear_weight,
+            linear_bias,
+            ln_scale,
+            ln_bias,
+            'pre_layer_norm',
+            pre_layer_norm,
+            'epsilon',
+            pre_ln_epsilon,
+            'dropout_rate',
+            dropout_rate,
+            'attn_dropout_rate',
+            attn_dropout_rate,
+            'ln_epsilon',
+            ln_epsilon,
+            'is_test',
+            not training,
+            'attn_dropout_fix_seed',
+            seed is not None,
+            'dropout_fix_seed',
+            seed is not None,
+            'attn_dropout_seed',
+            seed if seed is not None else 0,
+            'dropout_seed',
+            seed if seed is not None else 0,
+            'attn_dropout_implementation',
+            mode,
+            'dropout_implementation',
+            mode,
+            'add_residual',
+            add_residual,
+            'ring_id',
+            ring_id,
+        )
         if cache_kv is not None:
             return final_out, cache_kv_out
         return final_out
@@ -550,10 +699,18 @@ def fused_multi_head_attention(x,
         helper = LayerHelper('fused_multi_head_attention', **locals())
         dtype = x.dtype
         # check dtypes
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 'fused_multihead_attention')
-        check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'],
-                    'fused_multi_head_attention')
+        check_variable_and_dtype(
+            x,
+            'x',
+            ['float16', 'float32', 'float64'],
+            'fused_multihead_attention',
+        )
+        check_dtype(
+            dtype,
+            'dtype',
+            ['float16', 'float32', 'float64'],
+            'fused_multi_head_attention',
+        )
 
         # set inputs
         inputs = dict()
@@ -573,7 +730,8 @@ def fused_multi_head_attention(x,
             inputs['Ln2Scale'] = [ln_scale]
         if ln_bias:
             inputs['Ln2Bias'] = [ln_bias]
-        if cache_kv: inputs['CacheKV'] = [cache_kv]
+        if cache_kv:
+            inputs['CacheKV'] = [cache_kv]
 
         if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
             seed = helper.main_program.random_seed
@@ -593,14 +751,16 @@ def fused_multi_head_attention(x,
             'attn_dropout_implementation': mode,
             'dropout_implementation': mode,
             'add_residual': add_residual,
-            'ring_id': ring_id
+            'ring_id': ring_id,
         }
 
         # set outputs
         pre_ln_mean_out = helper.create_variable_for_type_inference(
-            dtype=dtype, stop_gradient=True)
+            dtype=dtype, stop_gradient=True
+        )
         pre_ln_variance_out = helper.create_variable_for_type_inference(
-            dtype=dtype, stop_gradient=True)
+            dtype=dtype, stop_gradient=True
+        )
         pre_ln_out = helper.create_variable_for_type_inference(dtype=dtype)
 
         qkv_out = helper.create_variable_for_type_inference(dtype=dtype)
@@ -611,78 +771,87 @@ def fused_multi_head_attention(x,
         qktv_out = helper.create_variable_for_type_inference(dtype=dtype)
         softmax_out = helper.create_variable_for_type_inference(dtype=dtype)
         attn_dropout_mask_out = helper.create_variable_for_type_inference(
-            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True
+        )
         attn_dropout_out = helper.create_variable_for_type_inference(
-            dtype=dtype)
+            dtype=dtype
+        )
         attn_mask_out = helper.create_variable_for_type_inference(dtype=dtype)
         fmha_out = helper.create_variable_for_type_inference(dtype=dtype)
         out_linear_out = helper.create_variable_for_type_inference(dtype=dtype)
         dropout_mask_out = helper.create_variable_for_type_inference(
-            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True
+        )
         ln_mean_out = helper.create_variable_for_type_inference(
-            dtype=dtype, stop_gradient=True)
+            dtype=dtype, stop_gradient=True
+        )
         ln_variance_out = helper.create_variable_for_type_inference(
-            dtype=dtype, stop_gradient=True)
+            dtype=dtype, stop_gradient=True
+        )
         bias_dropout_residual_out = helper.create_variable_for_type_inference(
-            dtype=dtype)
+            dtype=dtype
+        )
         final_out = helper.create_variable_for_type_inference(dtype=dtype)
         cache_kv_out = helper.create_variable_for_type_inference(dtype=dtype)
 
-        helper.append_op(type='fused_attention',
-                         inputs=inputs,
-                         outputs={
-                             "LnMean": pre_ln_mean_out,
-                             "LnVariance": pre_ln_variance_out,
-                             "LnOut": pre_ln_out,
-                             "QKVOut": qkv_out,
-                             "QKVBiasOut": qkv_bias_out,
-                             "TransposeOut2": transpose_out,
-                             "QKOut": qk_out,
-                             "QKTVOut": qktv_out,
-                             "SoftmaxOut": softmax_out,
-                             "AttnDropoutMaskOut": attn_dropout_mask_out,
-                             "AttnDropoutOut": attn_dropout_out,
-                             "SrcMaskOut": attn_mask_out,
-                             "FMHAOut": fmha_out,
-                             "OutLinearOut": out_linear_out,
-                             "DropoutMaskOut": dropout_mask_out,
-                             "Ln2Mean": ln_mean_out,
-                             "Ln2Variance": ln_variance_out,
-                             "BiasDropoutResidualOut":
-                             bias_dropout_residual_out,
-                             'Y': final_out,
-                             'CacheKVOut': cache_kv_out
-                         },
-                         attrs=attrs)
+        helper.append_op(
+            type='fused_attention',
+            inputs=inputs,
+            outputs={
+                "LnMean": pre_ln_mean_out,
+                "LnVariance": pre_ln_variance_out,
+                "LnOut": pre_ln_out,
+                "QKVOut": qkv_out,
+                "QKVBiasOut": qkv_bias_out,
+                "TransposeOut2": transpose_out,
+                "QKOut": qk_out,
+                "QKTVOut": qktv_out,
+                "SoftmaxOut": softmax_out,
+                "AttnDropoutMaskOut": attn_dropout_mask_out,
+                "AttnDropoutOut": attn_dropout_out,
+                "SrcMaskOut": attn_mask_out,
+                "FMHAOut": fmha_out,
+                "OutLinearOut": out_linear_out,
+                "DropoutMaskOut": dropout_mask_out,
+                "Ln2Mean": ln_mean_out,
+                "Ln2Variance": ln_variance_out,
+                "BiasDropoutResidualOut": bias_dropout_residual_out,
+                'Y': final_out,
+                'CacheKVOut': cache_kv_out,
+            },
+            attrs=attrs,
+        )
 
         return (final_out, cache_kv_out) if cache_kv else final_out
 
 
-def fused_multi_transformer(x,
-                            ln_scales,
-                            ln_biases,
-                            qkv_weights,
-                            qkv_biases,
-                            linear_weights,
-                            linear_biases,
-                            ffn_ln_scales,
-                            ffn_ln_biases,
-                            ffn1_weights,
-                            ffn1_biases,
-                            ffn2_weights,
-                            ffn2_biases,
-                            pre_layer_norm=True,
-                            epsilon=1e-05,
-                            cache_kvs=None,
-                            time_step=None,
-                            attn_mask=None,
-                            dropout_rate=0.0,
-                            activation="gelu",
-                            training=False,
-                            mode='upscale_in_train',
-                            trans_qkvw=True,
-                            ring_id=-1,
-                            name=None):
+def fused_multi_transformer(
+    x,
+    ln_scales,
+    ln_biases,
+    qkv_weights,
+    qkv_biases,
+    linear_weights,
+    linear_biases,
+    ffn_ln_scales,
+    ffn_ln_biases,
+    ffn1_weights,
+    ffn1_biases,
+    ffn2_weights,
+    ffn2_biases,
+    pre_layer_norm=True,
+    epsilon=1e-05,
+    cache_kvs=None,
+    time_step=None,
+    attn_mask=None,
+    dropout_rate=0.0,
+    activation="gelu",
+    training=False,
+    mode='upscale_in_train',
+    trans_qkvw=True,
+    ring_id=-1,
+    name=None,
+):
     r"""
     This is a fusion operator to compute multi transformer layers in transformer model architecture.
     This operator only supports running on GPU. The function of the transformer layer is consistent
@@ -821,17 +990,46 @@ def fused_multi_transformer(x,
         raise ValueError(
             "mode argument should be 'downscale_in_infer' or 'upscale_in_train'"
         )
-    mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
+    mode = (
+        'downgrade_in_infer' if mode == 'downscale_in_infer' else mode
+    )  # semantic transfer
 
     if _non_static_mode():
         cache_kv_out, final_out = _legacy_C_ops.fused_multi_transformer(
-            x, ln_scales, ln_biases, qkv_weights, qkv_biases, cache_kvs,
-            time_step, attn_mask, linear_weights, linear_biases, ffn_ln_scales,
-            ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases,
-            cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon,
-            'dropout_rate', dropout_rate, 'is_test', not training,
-            'dropout_implementation', mode, 'act_method', activation,
-            'trans_qkvw', trans_qkvw, 'ring_id', ring_id)
+            x,
+            ln_scales,
+            ln_biases,
+            qkv_weights,
+            qkv_biases,
+            cache_kvs,
+            time_step,
+            attn_mask,
+            linear_weights,
+            linear_biases,
+            ffn_ln_scales,
+            ffn_ln_biases,
+            ffn1_weights,
+            ffn1_biases,
+            ffn2_weights,
+            ffn2_biases,
+            cache_kvs,
+            'pre_layer_norm',
+            pre_layer_norm,
+            'epsilon',
+            epsilon,
+            'dropout_rate',
+            dropout_rate,
+            'is_test',
+            not training,
+            'dropout_implementation',
+            mode,
+            'act_method',
+            activation,
+            'trans_qkvw',
+            trans_qkvw,
+            'ring_id',
+            ring_id,
+        )
         if cache_kvs is not None:
             return final_out, cache_kv_out
         return final_out
@@ -839,10 +1037,12 @@ def fused_multi_transformer(x,
         helper = LayerHelper('fused_multi_transformer', **locals())
         dtype = x.dtype
         # check dtypes
-        check_variable_and_dtype(x, 'x', ['float16', 'float32'],
-                                 'fused_multi_transformer')
-        check_dtype(dtype, 'dtype', ['float16', 'float32'],
-                    'fused_multi_transformer')
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32'], 'fused_multi_transformer'
+        )
+        check_dtype(
+            dtype, 'dtype', ['float16', 'float32'], 'fused_multi_transformer'
+        )
 
         # set inputs
         inputs = dict()
@@ -880,7 +1080,7 @@ def fused_multi_transformer(x,
             'dropout_implementation': mode,
             'act_method': activation,
             'trans_qkvw': trans_qkvw,
-            'ring_id': ring_id
+            'ring_id': ring_id,
         }
 
         outputs = dict()
@@ -890,9 +1090,11 @@ def fused_multi_transformer(x,
             # NOTE: inplace
             outputs['CacheKVOut'] = cache_kvs
 
-        helper.append_op(type='fused_multi_transformer',
-                         inputs=inputs,
-                         outputs=outputs,
-                         attrs=attrs)
+        helper.append_op(
+            type='fused_multi_transformer',
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs,
+        )
 
         return (final_out, cache_kvs) if cache_kvs else final_out
diff --git a/python/paddle/incubate/optimizer/lookahead.py b/python/paddle/incubate/optimizer/lookahead.py
index 8f70f321c0d..98acfff7fa4 100644
--- a/python/paddle/incubate/optimizer/lookahead.py
+++ b/python/paddle/incubate/optimizer/lookahead.py
@@ -14,7 +14,14 @@
 
 from paddle.optimizer import Optimizer
 from paddle.fluid import core, framework, layers, unique_name
-from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
+from paddle.fluid.framework import (
+    Program,
+    Variable,
+    name_scope,
+    default_main_program,
+    default_startup_program,
+    device_guard,
+)
 from paddle.fluid.layer_helper import LayerHelper
 import paddle
 import numpy as np
@@ -29,18 +36,18 @@ class LookAhead(Optimizer):
     paper : https://arxiv.org/abs/1907.08610.
 
     Lookahead keeps two sets of params: the fast_params and
-    the slow_params. inner_optimizer update fast_params every 
-    training step. Lookahead updates the slow_params and fast_params 
+    the slow_params. inner_optimizer updates fast_params every
+    training step. Lookahead updates the slow_params and fast_params
     every k training steps as follows:
 
     .. math::
-        
+
         slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1})
-	    
+
         fast\_param_t &=  slow\_param_t
 
     Args:
-        inner_optimizer (Optimizer): The optimizer that update fast params step by step. 
+        inner_optimizer (Optimizer): The optimizer that updates fast params step by step.
         alpha (float, optinal): The learning rate of Lookahead. The default value is 0.5.
         k (int, optinal): The slow params is updated every k steps. The default value is 5.
         name (str, optional): Normally there is no need for user to set this property.
@@ -50,7 +57,7 @@ class LookAhead(Optimizer):
     Examples:
 
         .. code-block:: python
-        
+
             import numpy as np
             import paddle
             import paddle.nn as nn
@@ -109,31 +116,34 @@ class LookAhead(Optimizer):
                 shuffle=True,
                 drop_last=True,
                 num_workers=2)
-            
+
             train(layer, loader, loss_fn, lookahead)
 
     """
     _slow_str = "slow"
 
     def __init__(self, inner_optimizer, alpha=0.5, k=5, name=None):
-        assert (inner_optimizer is not None), "inner optimizer can not be None"
+        assert inner_optimizer is not None, "inner optimizer can not be None"
         assert (
             0.0 <= alpha <= 1.0
         ), "alpha should be larger or equal to 0.0, and less or equal than 1.0"
-        assert (isinstance(k, int) and k > 0), "k should be a positive integer"
+        assert isinstance(k, int) and k > 0, "k should be a positive integer"
 
         self.inner_optimizer = inner_optimizer
         if self.inner_optimizer._parameter_list is None:
-            parameters = framework.default_main_program().global_block(
-            ).all_parameters()
+            parameters = (
+                framework.default_main_program().global_block().all_parameters()
+            )
         else:
             parameters = self.inner_optimizer._parameter_list
 
-        super(LookAhead, self).__init__(learning_rate=alpha,
-                                        parameters=parameters,
-                                        weight_decay=None,
-                                        grad_clip=None,
-                                        name=name)
+        super(LookAhead, self).__init__(
+            learning_rate=alpha,
+            parameters=parameters,
+            weight_decay=None,
+            grad_clip=None,
+            name=name,
+        )
 
         self.alpha = alpha
         self.k = k
@@ -147,7 +157,7 @@ class LookAhead(Optimizer):
     def step(self):
         """
         Execute the optimizer and update parameters once.
-        
+
         Returns:
             None
 
@@ -179,9 +189,9 @@ class LookAhead(Optimizer):
                 grad_var = param._grad_ivar()
                 params_grads.append((param, grad_var))
 
-        self._apply_optimize(loss=None,
-                             startup_program=None,
-                             params_grads=params_grads)
+        self._apply_optimize(
+            loss=None, startup_program=None, params_grads=params_grads
+        )
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -196,24 +206,28 @@ class LookAhead(Optimizer):
                 shape=[1],
                 value=0,
                 dtype='int32',
-                persistable=True)
+                persistable=True,
+            )
 
-        self.helper.append_op(type='increment',
-                              inputs={'X': [self._global_step_var]},
-                              outputs={'Out': [self._global_step_var]},
-                              attrs={'step': 1.0})
+        self.helper.append_op(
+            type='increment',
+            inputs={'X': [self._global_step_var]},
+            outputs={'Out': [self._global_step_var]},
+            attrs={'step': 1.0},
+        )
 
     def _append_optimize_op(self, block, param_and_grad):
         one_var = paddle.ones(shape=[1], dtype='int32', name='lookahead_ones')
-        zero_var = paddle.zeros(shape=[1],
-                                dtype='int32',
-                                name='lookahead_zeros')
+        zero_var = paddle.zeros(
+            shape=[1], dtype='int32', name='lookahead_zeros'
+        )
         k_var = layers.create_global_var(
             name=unique_name.generate("lookahead_k"),
             shape=[1],
             value=self.k,
             dtype='int32',
-            persistable=True)
+            persistable=True,
+        )
 
         mod = paddle.remainder(self._global_step_var, k_var)
 
@@ -236,11 +250,9 @@ class LookAhead(Optimizer):
         paddle.assign(tmp_var_1, slow_var)
 
     @imperative_base.no_grad
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameters=None,
-                 no_grad_set=None):
+    def minimize(
+        self, loss, startup_program=None, parameters=None, no_grad_set=None
+    ):
         """
         Add operations to minimize ``loss`` by updating ``parameters``.
 
@@ -259,8 +271,8 @@ class LookAhead(Optimizer):
             tuple: tuple (optimize_ops, params_grads), A list of operators appended
             by minimize and a list of (param, grad) tensor pairs, param is
             ``Parameter``, grad is the gradient value corresponding to the parameter.
-            In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to 
-            indicate program pruning. If so, the program will be pruned by ``feed`` and 
+            In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
+            indicate program pruning. If so, the program will be pruned by ``feed`` and
             ``fetch_list`` before run, see details in ``Executor``.
 
         Examples:
@@ -287,12 +299,13 @@ class LookAhead(Optimizer):
             loss,
             startup_program=startup_program,
             parameters=parameters,
-            no_grad_set=no_grad_set)
+            no_grad_set=no_grad_set,
+        )
 
         self._increment_global_var()
 
-        _ = self._apply_optimize(loss,
-                                 startup_program=startup_program,
-                                 params_grads=params_grads)
+        _ = self._apply_optimize(
+            loss, startup_program=startup_program, params_grads=params_grads
+        )
 
         return optimize_ops, params_grads
diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py
index 67be022c288..ec455a41460 100644
--- a/python/paddle/incubate/optimizer/modelaverage.py
+++ b/python/paddle/incubate/optimizer/modelaverage.py
@@ -163,17 +163,21 @@ class ModelAverage(Optimizer):
   
     """
 
-    def __init__(self,
-                 average_window_rate,
-                 parameters=None,
-                 min_average_window=10000,
-                 max_average_window=10000,
-                 name=None):
-        super(ModelAverage, self).__init__(learning_rate=0.0,
-                                           parameters=parameters,
-                                           weight_decay=None,
-                                           grad_clip=None,
-                                           name=name)
+    def __init__(
+        self,
+        average_window_rate,
+        parameters=None,
+        min_average_window=10000,
+        max_average_window=10000,
+        name=None,
+    ):
+        super(ModelAverage, self).__init__(
+            learning_rate=0.0,
+            parameters=parameters,
+            weight_decay=None,
+            grad_clip=None,
+            name=name,
+        )
 
         self.helper = LayerHelper(self.__class__.__name__)
         self.average_window = average_window_rate
@@ -183,7 +187,8 @@ class ModelAverage(Optimizer):
 
         if not framework._non_static_mode():
             global_block = framework.default_main_program().global_block()
-            all_parameters = parameters if parameters else global_block.all_parameters(
+            all_parameters = (
+                parameters if parameters else global_block.all_parameters()
             )
 
             self._create_accumulators(global_block, all_parameters)
@@ -208,18 +213,15 @@ class ModelAverage(Optimizer):
             self._add_accumulator('sum_2', param)
             self._add_accumulator('sum_3', param)
             self._add_accumulator('restore', param)
-            self._add_accumulator('num_accumulates',
-                                  param,
-                                  dtype='int64',
-                                  shape=[1])
-            self._add_accumulator('old_num_accumulates',
-                                  param,
-                                  dtype='int64',
-                                  shape=[1])
-            self._add_accumulator('num_updates',
-                                  param,
-                                  dtype='int64',
-                                  shape=[1])
+            self._add_accumulator(
+                'num_accumulates', param, dtype='int64', shape=[1]
+            )
+            self._add_accumulator(
+                'old_num_accumulates', param, dtype='int64', shape=[1]
+            )
+            self._add_accumulator(
+                'num_updates', param, dtype='int64', shape=[1]
+            )
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -227,26 +229,50 @@ class ModelAverage(Optimizer):
         sum_1 = self._get_accumulator('sum_1', param_and_grad[0])
         sum_2 = self._get_accumulator('sum_2', param_and_grad[0])
         sum_3 = self._get_accumulator('sum_3', param_and_grad[0])
-        num_accumulates = self._get_accumulator('num_accumulates',
-                                                param_and_grad[0])
-        old_num_accumulates = self._get_accumulator('old_num_accumulates',
-                                                    param_and_grad[0])
+        num_accumulates = self._get_accumulator(
+            'num_accumulates', param_and_grad[0]
+        )
+        old_num_accumulates = self._get_accumulator(
+            'old_num_accumulates', param_and_grad[0]
+        )
         num_updates = self._get_accumulator('num_updates', param_and_grad[0])
 
         if in_dygraph_mode():
             _, _, _, _, _, _ = _C_ops.average_accumulates_(
-                param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
-                old_num_accumulates, num_updates, self.average_window,
-                self.max_average_window, self.min_average_window)
+                param_and_grad[0],
+                sum_1,
+                sum_2,
+                sum_3,
+                num_accumulates,
+                old_num_accumulates,
+                num_updates,
+                self.average_window,
+                self.max_average_window,
+                self.min_average_window,
+            )
             return None
         elif framework._non_static_mode():
             _, _, _, _, _, _ = _legacy_C_ops.average_accumulates(
-                param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
-                old_num_accumulates, num_updates, sum_1, sum_2, sum_3,
-                num_accumulates, old_num_accumulates, num_updates,
-                'average_window', self.average_window, 'min_average_window',
-                self.min_average_window, 'max_average_window',
-                self.max_average_window)
+                param_and_grad[0],
+                sum_1,
+                sum_2,
+                sum_3,
+                num_accumulates,
+                old_num_accumulates,
+                num_updates,
+                sum_1,
+                sum_2,
+                sum_3,
+                num_accumulates,
+                old_num_accumulates,
+                num_updates,
+                'average_window',
+                self.average_window,
+                'min_average_window',
+                self.min_average_window,
+                'max_average_window',
+                self.max_average_window,
+            )
             return None
 
         block = framework.default_main_program().global_block()
@@ -263,7 +289,7 @@ class ModelAverage(Optimizer):
             "in_sum_3": sum_3,
             "in_num_accumulates": num_accumulates,
             "in_old_num_accumulates": old_num_accumulates,
-            "in_num_updates": num_updates
+            "in_num_updates": num_updates,
         }
 
         outputs = {
@@ -275,23 +301,23 @@ class ModelAverage(Optimizer):
             "out_num_updates": num_updates,
         }
 
-        average_accumulates_op = block.append_op(type=self.type,
-                                                 inputs=inputs,
-                                                 outputs=outputs,
-                                                 attrs=attrs,
-                                                 stop_gradient=True)
+        average_accumulates_op = block.append_op(
+            type=self.type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs,
+            stop_gradient=True,
+        )
 
         return average_accumulates_op
 
     @imperative_base.no_grad
-    def minimize(self,
-                 loss,
-                 startup_program=None,
-                 parameters=None,
-                 no_grad_set=None):
+    def minimize(
+        self, loss, startup_program=None, parameters=None, no_grad_set=None
+    ):
         """
         Add operations to minimize ``loss`` by updating ``parameters``.
-        
+
         Args:
             loss (Tensor): A ``Tensor`` containing the value to minimize.
             startup_program (Program, optional): :ref:`api_fluid_Program` for
@@ -302,17 +328,17 @@ class ModelAverage(Optimizer):
                 will be updated.
             no_grad_set (set, optional): Set of ``Tensor``  or ``Tensor.name`` that don't need
                 to be updated. The default value is None.
-        
+
         Returns:
             tuple: tuple (optimize_ops, params_grads), A list of operators appended
             by minimize and a list of (param, grad) tensor pairs, param is
             ``Parameter``, grad is the gradient value corresponding to the parameter.
-            In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to 
-            indicate program pruning. If so, the program will be pruned by ``feed`` and 
+            In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
+            indicate program pruning. If so, the program will be pruned by ``feed`` and
             ``fetch_list`` before run, see details in ``Executor``.
-        
+
         Examples:
-        
+
             .. code-block:: python
 
                 import paddle
@@ -343,7 +369,7 @@ class ModelAverage(Optimizer):
     def step(self):
         """
         Execute the optimizer and update parameters once.
-        
+
         Returns:
             None
 
@@ -414,7 +440,7 @@ class ModelAverage(Optimizer):
                                                             max_average_window=4)
                 sgd.step()
                 modelaverage.step()
-                
+
                 with modelaverage.apply():
                     for param in linear.parameters():
                         print(param)
@@ -424,10 +450,12 @@ class ModelAverage(Optimizer):
         """
         if framework._non_static_mode():
             for param in self._parameter_list:
-                num_accumulates = self._get_accumulator('num_accumulates',
-                                                        param)
+                num_accumulates = self._get_accumulator(
+                    'num_accumulates', param
+                )
                 old_num_accumulates = self._get_accumulator(
-                    'old_num_accumulates', param)
+                    'old_num_accumulates', param
+                )
                 sum_1 = self._get_accumulator('sum_1', param)
                 sum_2 = self._get_accumulator('sum_2', param)
                 sum_3 = self._get_accumulator('sum_3', param)
@@ -437,8 +465,9 @@ class ModelAverage(Optimizer):
                 total_param = sum_1 + sum_2 + sum_3
                 total_accumulates = num_accumulates + old_num_accumulates
                 total_param = paddle.cast(total_param, dtype='float32')
-                total_accumulates = paddle.cast(total_accumulates,
-                                                dtype='float32')
+                total_accumulates = paddle.cast(
+                    total_accumulates, dtype='float32'
+                )
                 average_param = total_param / total_accumulates
                 paddle.assign(average_param, param)
             try:
@@ -449,7 +478,8 @@ class ModelAverage(Optimizer):
             return
         if executor is None:
             raise RuntimeError(
-                "Executor should not be None in static graph mode.")
+                "Executor should not be None in static graph mode."
+            )
         executor.run(self.apply_program)
         try:
             yield
@@ -461,7 +491,7 @@ class ModelAverage(Optimizer):
     def restore(self, executor=None):
         """
         Restore ``Parameter`` values of current model.
-        
+
         Args:
             executor(Executor): The network executor in static-graph mode. The default value is None in dygraph mode
 
@@ -485,7 +515,7 @@ class ModelAverage(Optimizer):
                                                             max_average_window=4)
                 sgd.step()
                 modelaverage.step()
-                
+
                 with modelaverage.apply(need_restore=False):
                     for param in linear.parameters():
                         print(param)
@@ -505,7 +535,8 @@ class ModelAverage(Optimizer):
             return
         if executor is None:
             raise RuntimeError(
-                "Executor should not be None in static graph mode.")
+                "Executor should not be None in static graph mode."
+            )
         executor.run(self.restore_program)
 
     def _add_average_apply_op(self, block, param):
@@ -515,18 +546,22 @@ class ModelAverage(Optimizer):
         sum_2 = block._clone_variable(self._get_accumulator('sum_2', param))
         sum_3 = block._clone_variable(self._get_accumulator('sum_3', param))
         num_accumulates = block._clone_variable(
-            self._get_accumulator('num_accumulates', param))
+            self._get_accumulator('num_accumulates', param)
+        )
         old_num_accumulates = block._clone_variable(
-            self._get_accumulator('old_num_accumulates', param))
+            self._get_accumulator('old_num_accumulates', param)
+        )
         # backup param value to grad
         layers.assign(input=param, output=grad)
         # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
         tmp = layers.sum(x=[num_accumulates, old_num_accumulates])
         sum = layers.sum(x=[sum_1, sum_2, sum_3])
         tmp = layers.cast(
-            x=tmp, dtype='float32' if self._dtype is None else self._dtype)
+            x=tmp, dtype='float32' if self._dtype is None else self._dtype
+        )
         sum = layers.cast(
-            x=sum, dtype='float32' if self._dtype is None else self._dtype)
+            x=sum, dtype='float32' if self._dtype is None else self._dtype
+        )
         layers.ops._elementwise_div(x=sum, y=tmp, out=param)
 
     def _add_average_restore_op(self, block, param):
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 7cebcbbfcab..7647b7d8bf5 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -23,7 +23,11 @@ from ...tensor.math import multiply
 import warnings
 from ...fluid.layer_helper import LayerHelper
 from ...fluid.framework import convert_np_dtype_to_dtype_
-from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode
+from ...fluid.framework import (
+    _in_legacy_dygraph,
+    in_dygraph_mode,
+    _non_static_mode,
+)
 from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
 import paddle
 from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode
@@ -71,10 +75,12 @@ def celu(x, alpha=1.0, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu')
     helper = LayerHelper("celu", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='celu',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'alpha': alpha})
+    helper.append_op(
+        type='celu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'alpha': alpha},
+    )
     return out
 
 
@@ -122,10 +128,12 @@ def elu(x, alpha=1.0, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu')
     helper = LayerHelper("elu", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='elu',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'alpha': alpha})
+    helper.append_op(
+        type='elu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'alpha': alpha},
+    )
     return out
 
 
@@ -135,7 +143,7 @@ def elu_(x, alpha=1.0, name=None):
     Inplace version of ``elu`` API, the output Tensor will be inplaced with input ``x``.
     Please refer to :ref:`api_nn_cn_elu`.
     """
-    assert alpha >= 0., "elu_ only support alpha >= 0, please use elu instead."
+    assert alpha >= 0.0, "elu_ only support alpha >= 0, please use elu instead."
     if in_dygraph_mode():
         return _C_ops.elu_(x, alpha)
     return _legacy_C_ops.elu_(x, 'alpha', alpha)
@@ -190,10 +198,12 @@ def gelu(x, approximate=False, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'gelu')
     helper = LayerHelper("gelu", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='gelu',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'approximate': approximate})
+    helper.append_op(
+        type='gelu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'approximate': approximate},
+    )
     return out
 
 
@@ -237,14 +247,17 @@ def hardshrink(x, threshold=0.5, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.hard_shrink(x, 'threshold', threshold)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'hardshrink')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'hardshrink'
+    )
     helper = LayerHelper('hardshrink', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='hard_shrink',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'threshold': threshold})
+    helper.append_op(
+        type='hard_shrink',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'threshold': threshold},
+    )
     return out
 
 
@@ -290,18 +303,18 @@ def hardtanh(x, min=-1.0, max=1.0, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.brelu(x, 't_min', min, 't_max', max)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'hardtanh')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'hardtanh'
+    )
 
     helper = LayerHelper('hardtanh', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type='brelu',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={
-                         't_min': min,
-                         't_max': max
-                     })
+    helper.append_op(
+        type='brelu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'t_min': min, 't_max': max},
+    )
     return out
 
 
@@ -349,18 +362,18 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'hardsigmoid')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'hardsigmoid'
+    )
 
     helper = LayerHelper('hardsigmoid', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='hard_sigmoid',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={
-                         'slope': slope,
-                         'offset': offset
-                     })
+    helper.append_op(
+        type='hard_sigmoid',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'slope': slope, 'offset': offset},
+    )
     return out
 
 
@@ -406,8 +419,9 @@ def hardswish(x, name=None):
     if in_dygraph_mode():
         return _C_ops.hard_swish(x, 6, 6, 3)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'hardswish')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'hardswish'
+    )
 
     helper = LayerHelper('hardswish', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
@@ -456,14 +470,17 @@ def leaky_relu(x, negative_slope=0.01, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.leaky_relu(x, 'alpha', negative_slope)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'leaky_relu')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'leaky_relu'
+    )
     helper = LayerHelper('leaky_relu', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type='leaky_relu',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'alpha': negative_slope})
+    helper.append_op(
+        type='leaky_relu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'alpha': negative_slope},
+    )
     return out
 
 
@@ -511,60 +528,68 @@ def prelu(x, weight, data_format="NCHW", name=None):
             #    [ 6.  ,  7.  ,  8.  ,  9.  ]]]]
     """
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'prelu')
-    check_variable_and_dtype(weight, 'weight',
-                             ['float16', 'float32', 'float64'], 'prelu')
+    check_variable_and_dtype(
+        weight, 'weight', ['float16', 'float32', 'float64'], 'prelu'
+    )
 
-    assert len(weight.shape
-               ) == 1, "The dim count of weight shape should be 1 in prelu()."
+    assert (
+        len(weight.shape) == 1
+    ), "The dim count of weight shape should be 1 in prelu()."
 
     mode = 'all'
     if weight.shape[0] > 1:
 
         true_data_format = [
-            'NC', 'NCL', 'NCHW', 'NCDHW', 'NLC', 'NHWC', 'NDHWC'
+            'NC',
+            'NCL',
+            'NCHW',
+            'NCDHW',
+            'NLC',
+            'NHWC',
+            'NDHWC',
         ]
         if data_format not in true_data_format:
             raise ValueError(
                 "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', "
-                "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format))
+                "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)
+            )
 
         data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC'
 
-        assert len(
-            x.shape
-        ) > 1, "The dim count of x should be equal or larger than 2 in prelu() when weight shape is not [1]."
+        assert (
+            len(x.shape) > 1
+        ), "The dim count of x should be equal or larger than 2 in prelu() when weight shape is not [1]."
 
-        #NOTE(GuoxiaWang): support NHWC data format
+        # NOTE(GuoxiaWang): support NHWC data format
         if data_format == 'NHWC':
-            assert weight.shape[0] == x.shape[
-                -1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]."
+            assert (
+                weight.shape[0] == x.shape[-1]
+            ), "The weight size should be equal to x input channel in prelu() when weight shape is not [1]."
         else:
-            assert weight.shape[0] == x.shape[
-                1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]."
+            assert (
+                weight.shape[0] == x.shape[1]
+            ), "The weight size should be equal to x input channel in prelu() when weight shape is not [1]."
         mode = 'channel'
 
     if in_dygraph_mode():
         return _C_ops.prelu(x, weight, data_format, mode)
     if _in_legacy_dygraph():
-        return _legacy_C_ops.prelu(x, weight, 'mode', mode, 'data_format',
-                                   data_format)
+        return _legacy_C_ops.prelu(
+            x, weight, 'mode', mode, 'data_format', data_format
+        )
 
     helper = LayerHelper('prelu', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type="prelu",
-                     inputs={
-                         "X": x,
-                         "Alpha": weight
-                     },
-                     outputs={"Out": out},
-                     attrs={
-                         "mode": mode,
-                         "data_format": data_format
-                     })
+    helper.append_op(
+        type="prelu",
+        inputs={"X": x, "Alpha": weight},
+        outputs={"Out": out},
+        attrs={"mode": mode, "data_format": data_format},
+    )
     return out
 
 
-def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None):
+def rrelu(x, lower=1.0 / 8.0, upper=1.0 / 3.0, training=True, name=None):
     r"""
     rrelu activation.
 
@@ -637,47 +662,56 @@ def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None):
     """
 
     if not in_dynamic_mode():
-        check_variable_and_dtype(x, 'X', ['float16', 'float32', 'float64'],
-                                 'rrelu')
+        check_variable_and_dtype(
+            x, 'X', ['float16', 'float32', 'float64'], 'rrelu'
+        )
 
     if not isinstance(lower, float) or not isinstance(upper, float):
         raise TypeError(
-            "The lower and upper values must be float type. Received: lower {}, upper {}."
-            .format(lower, upper))
+            "The lower and upper values must be float type. Received: lower {}, upper {}.".format(
+                lower, upper
+            )
+        )
 
     if lower < 0 or lower > 1:
         raise ValueError(
-            "The lower value must be no less than zero or greater than one. Received: {}."
-            .format(lower))
+            "The lower value must be no less than zero or greater than one. Received: {}.".format(
+                lower
+            )
+        )
 
     if upper < lower:
         raise ValueError(
-            "The upper value must be greater than lower value. Received: lower {}, upper {}."
-            .format(lower, upper))
+            "The upper value must be greater than lower value. Received: lower {}, upper {}.".format(
+                lower, upper
+            )
+        )
 
     if upper > 1:
         raise ValueError(
             "The upper value must be no greater than one. Received: {}.".format(
-                upper))
+                upper
+            )
+        )
 
     is_test = not training
 
     if _in_legacy_dygraph():
-        out, noise = _legacy_C_ops.rrelu(x, 'lower', lower, 'upper', upper,
-                                         'is_test', is_test)
+        out, noise = _legacy_C_ops.rrelu(
+            x, 'lower', lower, 'upper', upper, 'is_test', is_test
+        )
         return out
 
     helper = LayerHelper('rrelu', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     noise = helper.create_variable_for_type_inference(dtype=x.dtype)
     attrs = {'lower': lower, 'upper': upper, 'is_test': is_test}
-    helper.append_op(type='rrelu',
-                     inputs={"X": x},
-                     outputs={
-                         "Out": out,
-                         "Noise": noise
-                     },
-                     attrs=attrs)
+    helper.append_op(
+        type='rrelu',
+        inputs={"X": x},
+        outputs={"Out": out, "Noise": noise},
+        attrs=attrs,
+    )
     return out
 
 
@@ -764,8 +798,9 @@ def log_sigmoid(x, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.logsigmoid(x)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'log_sigmoid')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'log_sigmoid'
+    )
     helper = LayerHelper("log_sigmoid", **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(type='logsigmoid', inputs={'X': x}, outputs={'Out': out})
@@ -835,19 +870,19 @@ def maxout(x, groups, axis=1, name=None):
     if axis not in [1, -1, 3]:
         raise ValueError(
             "Attr(axis) should be 1 when data format is NCHW, -1 or 3 when data format is NHWC. Received "
-            "Attr(axis): %s." % str(axis))
+            "Attr(axis): %s." % str(axis)
+        )
     if axis == -1:
         axis = 3
 
     helper = LayerHelper('maxout', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='maxout',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={
-                         'groups': groups,
-                         'axis': axis
-                     })
+    helper.append_op(
+        type='maxout',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'groups': groups, 'axis': axis},
+    )
     return out
 
 
@@ -887,17 +922,21 @@ def relu6(x, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu6')
     helper = LayerHelper('relu6', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='relu6',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'threshold': threshold})
+    helper.append_op(
+        type='relu6',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'threshold': threshold},
+    )
     return out
 
 
-def selu(x,
-         scale=1.0507009873554804934193349852946,
-         alpha=1.6732632423543772848170429916717,
-         name=None):
+def selu(
+    x,
+    scale=1.0507009873554804934193349852946,
+    alpha=1.6732632423543772848170429916717,
+    name=None,
+):
     r"""
     selu activation
 
@@ -934,11 +973,13 @@ def selu(x,
     """
     if scale <= 1.0:
         raise ValueError(
-            "The scale must be greater than 1.0. Received: {}.".format(scale))
+            "The scale must be greater than 1.0. Received: {}.".format(scale)
+        )
 
     if alpha < 0:
         raise ValueError(
-            "The alpha must be no less than zero. Received: {}.".format(alpha))
+            "The alpha must be no less than zero. Received: {}.".format(alpha)
+        )
 
     if in_dygraph_mode():
         return _C_ops.selu(x, scale, alpha)
@@ -948,13 +989,12 @@ def selu(x,
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'selu')
     helper = LayerHelper('selu', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='selu',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={
-                         'scale': scale,
-                         'alpha': alpha
-                     })
+    helper.append_op(
+        type='selu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'scale': scale, 'alpha': alpha},
+    )
     return out
 
 
@@ -965,21 +1005,21 @@ def silu(x, name=None):
     .. math::
 
         silu(x) = \frac{x}{1 + e^{-x}}
-    
+
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Returns:
         A Tensor with the same data type and shape as ``x`` .
-    
+
     Examples:
         .. code-block:: python
 
             import paddle
             import paddle.nn.functional as F
-            
+
             x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
             out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ]
     """
@@ -1117,44 +1157,50 @@ def softmax(x, axis=-1, dtype=None, name=None):
     use_cudnn = True
 
     if in_dygraph_mode():
-        outs_cast = x if dtype is None \
-            else _C_ops.cast(x, dtype)
+        outs_cast = x if dtype is None else _C_ops.cast(x, dtype)
         return _C_ops.softmax(outs_cast, axis)
 
     if _in_legacy_dygraph():
-        outs_cast = x if dtype is None \
+        outs_cast = (
+            x
+            if dtype is None
             else _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
-        return _legacy_C_ops.softmax(outs_cast, 'axis', axis, 'use_cudnn',
-                                     use_cudnn)
+        )
+        return _legacy_C_ops.softmax(
+            outs_cast, 'axis', axis, 'use_cudnn', use_cudnn
+        )
 
     if dtype is None:
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 'softmax')
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], 'softmax'
+        )
     else:
         check_dtype(
-            dtype, 'dtype', ['float32', 'float64'], 'softmax',
-            'If dtype is not None, it only support float32 or float64.')
+            dtype,
+            'dtype',
+            ['float32', 'float64'],
+            'softmax',
+            'If dtype is not None, it only support float32 or float64.',
+        )
 
     helper = LayerHelper("softmax", **locals())
     outs_cast = x
     if dtype is not None:
         outs_cast = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(type='cast',
-                         inputs={'X': x},
-                         outputs={'Out': outs_cast},
-                         attrs={
-                             'in_dtype': x.dtype,
-                             'out_dtype': dtype
-                         })
+        helper.append_op(
+            type='cast',
+            inputs={'X': x},
+            outputs={'Out': outs_cast},
+            attrs={'in_dtype': x.dtype, 'out_dtype': dtype},
+        )
 
     outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype)
-    helper.append_op(type='softmax',
-                     inputs={'X': outs_cast},
-                     outputs={'Out': outs_softmax},
-                     attrs={
-                         'axis': axis,
-                         'use_cudnn': use_cudnn
-                     })
+    helper.append_op(
+        type='softmax',
+        inputs={'X': outs_cast},
+        outputs={'Out': outs_softmax},
+        attrs={'axis': axis, 'use_cudnn': use_cudnn},
+    )
 
     return outs_softmax
 
@@ -1170,15 +1216,22 @@ def softmax_(x, axis=-1, dtype=None, name=None):
     use_cudnn = True
 
     if in_dygraph_mode():
-        outs_cast = x if dtype is None \
+        outs_cast = (
+            x
+            if dtype is None
             else _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
+        )
         return _C_ops.softmax_(outs_cast, axis)
 
     if _in_legacy_dygraph():
-        outs_cast = x if dtype is None \
+        outs_cast = (
+            x
+            if dtype is None
             else _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
-        return _legacy_C_ops.softmax_(outs_cast, 'axis', axis, 'use_cudnn',
-                                      use_cudnn)
+        )
+        return _legacy_C_ops.softmax_(
+            outs_cast, 'axis', axis, 'use_cudnn', use_cudnn
+        )
 
 
 def softplus(x, beta=1, threshold=20, name=None):
@@ -1217,17 +1270,17 @@ def softplus(x, beta=1, threshold=20, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.softplus(x, 'beta', beta, 'threshold', threshold)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'softplus')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'softplus'
+    )
     helper = LayerHelper('softplus', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='softplus',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={
-                         'beta': beta,
-                         'threshold': threshold
-                     })
+    helper.append_op(
+        type='softplus',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'beta': beta, 'threshold': threshold},
+    )
     return out
 
 
@@ -1268,21 +1321,26 @@ def softshrink(x, threshold=0.5, name=None):
     if threshold < 0:
         raise ValueError(
             "The threshold must be no less than zero. Received: {}.".format(
-                threshold))
+                threshold
+            )
+        )
 
     if in_dygraph_mode():
         return _C_ops.soft_shrink(x, threshold)
     if _in_legacy_dygraph():
         return _legacy_C_ops.softshrink(x, 'lambda', threshold)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'softshrink')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'softshrink'
+    )
     helper = LayerHelper('softshrink', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='softshrink',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'lambda': threshold})
+    helper.append_op(
+        type='softshrink',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'lambda': threshold},
+    )
     return out
 
 
@@ -1317,8 +1375,9 @@ def softsign(x, name=None):
     if in_dynamic_mode():
         return _legacy_C_ops.softsign(x)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'softsign')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'softsign'
+    )
     helper = LayerHelper('softsign', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(type='softsign', inputs={'X': x}, outputs={'Out': out})
@@ -1359,10 +1418,9 @@ def swish(x, name=None):
     check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish')
     helper = LayerHelper('swish', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='swish',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'beta': 1.0})
+    helper.append_op(
+        type='swish', inputs={'X': x}, outputs={'Out': out}, attrs={'beta': 1.0}
+    )
     return out
 
 
@@ -1440,8 +1498,9 @@ def tanhshrink(x, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.tanh_shrink(x)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'tanhshrink')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'tanhshrink'
+    )
     helper = LayerHelper('tanh_shrink', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     helper.append_op(type='tanh_shrink', inputs={'X': x}, outputs={'Out': out})
@@ -1489,14 +1548,17 @@ def thresholded_relu(x, threshold=1.0, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.thresholded_relu(x, 'threshold', threshold)
 
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'thresholded_relu')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'thresholded_relu'
+    )
     helper = LayerHelper('thresholded_relu', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='thresholded_relu',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'threshold': threshold})
+    helper.append_op(
+        type='thresholded_relu',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'threshold': threshold},
+    )
     return out
 
 
@@ -1570,37 +1632,43 @@ def log_softmax(x, axis=-1, dtype=None, name=None):
         return _legacy_C_ops.log_softmax(x, 'axis', axis)
 
     if dtype is None:
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 'log_softmax')
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], 'log_softmax'
+        )
     else:
         check_dtype(
-            dtype, 'dtype', ['float32', 'float64'], 'log_softmax',
-            'If dtype is not None, it only support float32 or float64.')
+            dtype,
+            'dtype',
+            ['float32', 'float64'],
+            'log_softmax',
+            'If dtype is not None, it only support float32 or float64.',
+        )
 
     helper = LayerHelper("log_softmax", **locals())
     out_cast = x
     if dtype is not None:
         out_cast = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(type='cast',
-                         inputs={'X': x},
-                         outputs={'Out': out_cast},
-                         attrs={
-                             'in_dtype': x.dtype,
-                             'out_dtype': dtype
-                         })
+        helper.append_op(
+            type='cast',
+            inputs={'X': x},
+            outputs={'Out': out_cast},
+            attrs={'in_dtype': x.dtype, 'out_dtype': dtype},
+        )
 
     out = helper.create_variable_for_type_inference(out_cast.dtype)
-    helper.append_op(type='log_softmax',
-                     inputs={'X': out_cast},
-                     outputs={'Out': out},
-                     attrs={'axis': axis})
+    helper.append_op(
+        type='log_softmax',
+        inputs={'X': out_cast},
+        outputs={'Out': out},
+        attrs={'axis': axis},
+    )
 
     return out
 
 
 def glu(x, axis=-1, name=None):
     r"""
-    The gated linear unit. The input is evenly splited into 2 parts along a 
+    The gated linear unit. The input is evenly split into 2 parts along a
     given axis. The first part is used as the content, and the second part is
     passed through a sigmoid function then used as the gate. The output is a
     elementwise multiplication of the content and the gate.
@@ -1611,23 +1679,23 @@ def glu(x, axis=-1, name=None):
 
     Parameters:
         x (Tensor): The input Tensor with data type float32, float64.
-        axis (int, optional): The axis along which split the input tensor. It 
-            should be in range [-D, D), where D is the dimensions of ``x`` . 
-            If ``axis`` < 0, it works the same way as :math:`axis + D` . 
+        axis (int, optional): The axis along which split the input tensor. It
+            should be in range [-D, D), where D is the dimensions of ``x`` .
+            If ``axis`` < 0, it works the same way as :math:`axis + D` .
             Default is -1.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Returns:
-        A Tensor with the same data type as x. The size of the given aixs is 
+        A Tensor with the same data type as x. The size of the given axis is
         halved.
-    
+
     Examples:
         .. code-block:: python
-        
+
             import paddle
             from paddle.nn import functional as F
-            
+
             x = paddle.to_tensor(
                 [[-0.22014759, -1.76358426,  0.80566144,  0.04241343],
                  [-1.94900405, -1.89956081,  0.17134808, -1.11280477]]
@@ -1635,10 +1703,11 @@ def glu(x, axis=-1, name=None):
             print(F.glu(x).numpy())
             # array([[-0.15216254, -0.9004892 ],
             #        [-1.0577879 , -0.46985325]], dtype=float32)
-        
+
     """
-    check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
-                             "glu")
+    check_variable_and_dtype(
+        x, 'input', ['float16', 'float32', 'float64'], "glu"
+    )
     a, b = chunk(x, 2, axis=axis, name=name)
     gate = sigmoid(b, name=name)
     out = paddle.multiply(a, gate, name=name)
@@ -1668,24 +1737,24 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None):
         gumbel\_softmax(v_i)=\frac{e^{v_i/t}}{\sum_{j=1}^n{e^{v_j/t}}},i=1,2,3...n
 
     Parameters:
-        x (Tensor): An N-D Tensor, the first N - 1 dimensions index into a batch 
-            of independent distributions and the last dimension represents 
+        x (Tensor): An N-D Tensor, the first N - 1 dimensions index into a batch
+            of independent distributions and the last dimension represents
             a vector of probabilities with datatype float32, float64.
         temperature (float, optional): non-negative scalar temperature.
             Default is 1.0.
-        hard (bool, optional): if True, the returned samples will be discretized as 
-            one-hot vectors, but will be differentiated as if it is the soft sample 
+        hard (bool, optional): if True, the returned samples will be discretized as
+            one-hot vectors, but will be differentiated as if it is the soft sample
             in autograd. Default is False.
-        axis (int, optional): The axis along will be calculated softmax value. 
+        axis (int, optional): The axis along which the softmax value is calculated.
             Default is -1.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-    
+
     Returns:
-        Sampled tensor of same shape as ``x`` from the Gumbel-Softmax distribution. 
-        If ``hard = True``, the returned samples will be one-hot, otherwise they will be 
+        Sampled tensor of same shape as ``x`` from the Gumbel-Softmax distribution.
+        If ``hard = True``, the returned samples will be one-hot, otherwise they will be
         probability distributions that sum to 1 across ``axis``.
-    
+
     Examples:
         .. code-block:: python
 
@@ -1701,24 +1770,23 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None):
             # [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 1.        ],
             # [0.00000062, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.99999940],
             # [0.00000000, 0.00000000, 0.00000000, 0.00001258, 0.99998736, 0.00000000]]
-        
+
     """
     if in_dygraph_mode():
         return _C_ops.gumbel_softmax(x, temperature, hard, axis)
 
     if in_dynamic_mode():
-        return _legacy_C_ops.gumbel_softmax(x, 'temperature', temperature,
-                                            'hard', hard, 'axis', axis)
+        return _legacy_C_ops.gumbel_softmax(
+            x, 'temperature', temperature, 'hard', hard, 'axis', axis
+        )
 
     helper = LayerHelper("gumbel_softmax", **locals())
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'gumbel_softmax')
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='gumbel_softmax',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={
-                         'temperature': temperature,
-                         'hard': hard,
-                         'axis': axis
-                     })
+    helper.append_op(
+        type='gumbel_softmax',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'temperature': temperature, 'hard': hard, 'axis': axis},
+    )
     return out
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 178bc6727ec..ffe64c8e44a 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -20,14 +20,24 @@ from ...tensor import concat
 from ...tensor.creation import zeros
 from paddle.static import Variable
 from ...fluid import dygraph_utils
+
 # TODO: define the common functions to build a neural network
 from ...tensor.manipulation import squeeze
 from ...tensor.manipulation import unsqueeze
 from ...tensor import clip
 from ...tensor import sum
 from ...tensor import sqrt
-from ...fluid.data_feeder import check_variable_and_dtype, check_dtype, check_type
-from ...fluid.framework import _varbase_creator, _in_legacy_dygraph, in_dygraph_mode, _non_static_mode
+from ...fluid.data_feeder import (
+    check_variable_and_dtype,
+    check_dtype,
+    check_type,
+)
+from ...fluid.framework import (
+    _varbase_creator,
+    _in_legacy_dygraph,
+    in_dygraph_mode,
+    _non_static_mode,
+)
 
 from ...fluid import dygraph_utils
 
@@ -112,26 +122,28 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
 
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold')
 
-    assert len(x.shape) == 4, \
-            "input should be the format of [N, C, H, W]"
+    assert len(x.shape) == 4, "input should be the format of [N, C, H, W]"
 
     if isinstance(kernel_sizes, int):
         kernel_sizes = [kernel_sizes, kernel_sizes]
     else:
-        assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \
-            "kernel_sizes should either be an integer or a list of two integers"
+        assert isinstance(kernel_sizes, list) and (
+            len(kernel_sizes) == 2
+        ), "kernel_sizes should either be an integer or a list of two integers"
 
     if isinstance(strides, int):
         strides = [strides, strides]
     else:
-        assert isinstance(strides, list) and (len(strides) == 2), \
-            "strides should either be an integer or a list of two integers"
+        assert isinstance(strides, list) and (
+            len(strides) == 2
+        ), "strides should either be an integer or a list of two integers"
 
     if isinstance(dilations, int):
         dilations = [dilations, dilations]
     else:
-        assert isinstance(dilations, list) and (len(dilations) == 2), \
-            "dilations should either be an integer or a list of two integers"
+        assert isinstance(dilations, list) and (
+            len(dilations) == 2
+        ), "dilations should either be an integer or a list of two integers"
 
     if isinstance(paddings, int):
         paddings = [paddings] * 4
@@ -147,32 +159,37 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
     else:
         raise ValueError(
             "Unexpected type of paddings, it should be either an integer or a list"
-            "of 2 or 4 integers")
+            "of 2 or 4 integers"
+        )
 
     if in_dygraph_mode():
         return _C_ops.unfold(x, kernel_sizes, strides, paddings, dilations)
 
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type="unfold",
-                     inputs={"X": x},
-                     outputs={"Y": out},
-                     attrs={
-                         "kernel_sizes": kernel_sizes,
-                         "strides": strides,
-                         "paddings": paddings,
-                         "dilations": dilations
-                     })
+    helper.append_op(
+        type="unfold",
+        inputs={"X": x},
+        outputs={"Y": out},
+        attrs={
+            "kernel_sizes": kernel_sizes,
+            "strides": strides,
+            "paddings": paddings,
+            "dilations": dilations,
+        },
+    )
     return out
 
 
-def interpolate(x,
-                size=None,
-                scale_factor=None,
-                mode='nearest',
-                align_corners=False,
-                align_mode=0,
-                data_format='NCHW',
-                name=None):
+def interpolate(
+    x,
+    size=None,
+    scale_factor=None,
+    mode='nearest',
+    align_corners=False,
+    align_mode=0,
+    data_format='NCHW',
+    name=None,
+):
     """
 
     This API resizes a batch of images.
@@ -191,9 +208,9 @@ def interpolate(x,
         'bicubic' : Bicubic interpolation
         'area': Area interpolation
 
-    Linear interpolation is the method of using a line connecting two known quantities 
-    to determine the value of an unknown quantity between the two known quantities. 
-    
+    Linear interpolation is the method of using a line connecting two known quantities
+    to determine the value of an unknown quantity between the two known quantities.
+
     Nearest neighbor interpolation is to perform nearest neighbor interpolation
     in both the 3rd dimension(in height direction) and the 4th dimension(in width
     direction) on input tensor.
@@ -218,8 +235,8 @@ def interpolate(x,
 
     Area interpolation is to perform area interpolation
     in both the 3rd dimension(in height direction) , the 4th dimension(in width
-    direction) and the 5th dimension(in depth direction) on input tensor. Set to 
-    area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or 
+    direction) and the 5th dimension(in depth direction) on input tensor. Set to
+    area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or
     `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`.
 
     Example:
@@ -242,7 +259,7 @@ def interpolate(x,
                 input : (N,C,W_in)
                 output: (N,C,W_out) where:
                 W_out = W_{in} * scale_{factor}
-        
+
         Nearest neighbor interpolation:
 
               align_corners = False
@@ -294,25 +311,25 @@ def interpolate(x,
 
     For details of linear interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Linear_interpolation.
-    
+
     For details of nearest neighbor interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
-    
+
     For details of bilinear interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Bilinear_interpolation.
-    
+
     For details of trilinear interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Trilinear_interpolation.
-    
+
     For details of bicubic interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Bicubic_interpolation
-    
+
     Parameters:
         x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
                           its data format is specified by :attr:`data_format`.
         size (list|tuple|Tensor|None): Output shape of image resize
-             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
-             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
+             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w)
+             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor.
              Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1].
              If a Tensor, its dimensions size should be a 1.
         scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At
@@ -345,23 +362,23 @@ def interpolate(x,
     Examples:
         .. code-block:: python
 
-		import paddle
-		import paddle.nn.functional as F
+                import paddle
+                import paddle.nn.functional as F
 
-		input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32)
-		output_1 = F.interpolate(x=input_data, size=[12,12])
-		print(output_1.shape)
-		    # [2L, 3L, 12L, 12L]
+                input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32)
+                output_1 = F.interpolate(x=input_data, size=[12,12])
+                print(output_1.shape)
+                    # [2L, 3L, 12L, 12L]
 
-		# given scale
-		output_2 = F.interpolate(x=input_data, scale_factor=[2,1])
-		print(output_2.shape)
-		# [2L, 3L, 12L, 10L]
+                # given scale
+                output_2 = F.interpolate(x=input_data, scale_factor=[2,1])
+                print(output_2.shape)
+                # [2L, 3L, 12L, 10L]
 
-		# bilinear interp
-		output_3 = F.interpolate(x=input_data, scale_factor=[2,1], mode="bilinear")
-		print(output_2.shape)
-		# [2L, 3L, 12L, 10L]
+                # bilinear interp
+                output_3 = F.interpolate(x=input_data, scale_factor=[2,1], mode="bilinear")
+                print(output_3.shape)
+                # [2L, 3L, 12L, 10L]
     """
     data_format = data_format.upper()
     resample = mode.upper()
@@ -378,7 +395,8 @@ def interpolate(x,
     if resample not in resample_methods:
         raise ValueError(
             "The 'resample' of image_resize can only be 'area', 'linear', 'bilinear', 'trilinear', "
-            " 'bicubic' or 'nearest' currently.")
+            " 'bicubic' or 'nearest' currently."
+        )
 
     if resample in ['LINEAR'] and len(x.shape) != 3:
         raise ValueError("'linear' only support 3-D tensor.")
@@ -405,8 +423,11 @@ def interpolate(x,
         )
 
     if resample == 'AREA':
-        if isinstance(size, list) or isinstance(size, tuple) or isinstance(
-                size, Variable):
+        if (
+            isinstance(size, list)
+            or isinstance(size, tuple)
+            or isinstance(size, Variable)
+        ):
             if len(size) == 0:
                 raise ValueError("output size can not be empty")
         if len(x.shape) == 3:
@@ -420,19 +441,25 @@ def interpolate(x,
     dtype = helper.input_dtype(input_param_name='x')
     if len(x.shape) == 3 and data_format not in ['NCW', 'NWC']:
         raise ValueError(
-            "Got wrong value for param `data_format`: " + data_format +
-            " received but only `NCW` or `NWC` supported for 3-D input.")
+            "Got wrong value for param `data_format`: "
+            + data_format
+            + " received but only `NCW` or `NWC` supported for 3-D input."
+        )
     elif len(x.shape) == 4 and data_format not in ['NCHW', 'NHWC']:
         raise ValueError(
-            "Got wrong value for param `data_format`: " + data_format +
-            " received but only `NCHW` or `NHWC` supported for 4-D input.")
+            "Got wrong value for param `data_format`: "
+            + data_format
+            + " received but only `NCHW` or `NHWC` supported for 4-D input."
+        )
     elif len(x.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']:
         raise ValueError(
-            "Got wrong value for param `data_format`: " + data_format +
-            " received but only `NCDHW` or `NDHWC` supported for 5-D input.")
+            "Got wrong value for param `data_format`: "
+            + data_format
+            + " received but only `NCDHW` or `NDHWC` supported for 5-D input."
+        )
 
     def _is_list_or_turple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
+        return isinstance(data, list) or isinstance(data, tuple)
 
     if data_format == 'NCHW' or data_format == 'NCDHW' or data_format == 'NCW':
         data_layout = 'NCHW'
@@ -450,7 +477,7 @@ def interpolate(x,
         "interp_method": resample_type,
         "align_corners": align_corners,
         "align_mode": align_mode,
-        "data_layout": data_layout
+        "data_layout": data_layout,
     }
 
     out_shape = size
@@ -478,9 +505,9 @@ def interpolate(x,
                 if isinstance(dim_size, Variable):
                     contain_var = True
                     continue
-                assert dim_size > 0, (
-                    "Each dimension size given in out_shape must be greater than 0."
-                )
+                assert (
+                    dim_size > 0
+                ), "Each dimension size given in out_shape must be greater than 0."
 
             if contain_var:
                 new_size_tensor = []
@@ -491,14 +518,13 @@ def interpolate(x,
                         new_size_tensor.append(dim)
                         size_list.append(-1)
                     else:
-                        assert (isinstance(dim, int))
+                        assert isinstance(dim, int)
                         temp_out = helper.create_variable_for_type_inference(
-                            'int32')
-                        fill_constant([1],
-                                      'int32',
-                                      dim,
-                                      force_cpu=True,
-                                      out=temp_out)
+                            'int32'
+                        )
+                        fill_constant(
+                            [1], 'int32', dim, force_cpu=True, out=temp_out
+                        )
                         new_size_tensor.append(temp_out)
                         size_list.append(dim)
                 inputs['SizeTensor'] = new_size_tensor
@@ -506,7 +532,8 @@ def interpolate(x,
             if len(x.shape) == 3:
                 if len(out_shape) != 1:
                     raise ValueError(
-                        "size length should be 2 for input 3-D tensor")
+                        "size length should be 2 for input 3-D tensor"
+                    )
                 if contain_var:
                     attrs['out_w'] = size_list[0]
                 else:
@@ -514,8 +541,9 @@ def interpolate(x,
                     attrs['out_w'] = out_shape[0]
             if len(x.shape) == 4:
                 if len(out_shape) != 2:
-                    raise ValueError("size length should be 2 for "
-                                     "input 4-D tensor.")
+                    raise ValueError(
+                        "size length should be 2 for " "input 4-D tensor."
+                    )
                 if contain_var:
                     attrs['out_h'] = size_list[0]
                     attrs['out_w'] = size_list[1]
@@ -525,8 +553,9 @@ def interpolate(x,
                     attrs['out_w'] = out_shape[1]
             if len(x.shape) == 5:
                 if len(out_shape) != 3:
-                    raise ValueError("size length should be 3 for "
-                                     "input 5-D tensor.")
+                    raise ValueError(
+                        "size length should be 3 for " "input 5-D tensor."
+                    )
                 if contain_var:
                     attrs['out_d'] = size_list[0]
                     attrs['out_h'] = size_list[1]
@@ -552,9 +581,10 @@ def interpolate(x,
             attrs['scale'] = list(map(float, scale_list))
         elif isinstance(scale, list) or isinstance(scale, tuple):
             if len(scale) != len(x.shape) - 2:
-                raise ValueError("scale_shape length should be {} for "
-                                 "input {}-D tensor.".format(
-                                     len(x.shape) - 2, len(x.shape)))
+                raise ValueError(
+                    "scale_shape length should be {} for "
+                    "input {}-D tensor.".format(len(x.shape) - 2, len(x.shape))
+                )
             for value in scale:
                 if value <= 0:
                     raise ValueError("Attr(scale) should be greater than zero.")
@@ -574,80 +604,114 @@ def interpolate(x,
         if resample_type == "linear":
             if in_dygraph_mode():
                 out = _C_ops.linear_interp(
-                    x, inputs['OutSize'] if 'OutSize' in inputs else None,
+                    x,
+                    inputs['OutSize'] if 'OutSize' in inputs else None,
                     inputs['SizeTensor'] if 'SizeTensor' in inputs else None,
                     inputs['Scale'] if 'Scale' in inputs else None,
-                    attrs['data_layout'], attrs['out_d'], attrs['out_h'],
-                    attrs['out_w'], attrs['scale'] if 'scale' in attrs else [],
-                    attrs['interp_method'], attrs['align_corners'],
-                    attrs['align_mode'])
+                    attrs['data_layout'],
+                    attrs['out_d'],
+                    attrs['out_h'],
+                    attrs['out_w'],
+                    attrs['scale'] if 'scale' in attrs else [],
+                    attrs['interp_method'],
+                    attrs['align_corners'],
+                    attrs['align_mode'],
+                )
             else:
                 out = _legacy_C_ops.linear_interp_v2(x, *dy_attr)
         elif resample_type == "bilinear":
             if in_dygraph_mode():
                 out = _C_ops.bilinear_interp(
-                    x, inputs['OutSize'] if 'OutSize' in inputs else None,
+                    x,
+                    inputs['OutSize'] if 'OutSize' in inputs else None,
                     inputs['SizeTensor'] if 'SizeTensor' in inputs else None,
                     inputs['Scale'] if 'Scale' in inputs else None,
-                    attrs['data_layout'], attrs['out_d'], attrs['out_h'],
-                    attrs['out_w'], attrs['scale'] if 'scale' in attrs else [],
-                    attrs['interp_method'], attrs['align_corners'],
-                    attrs['align_mode'])
+                    attrs['data_layout'],
+                    attrs['out_d'],
+                    attrs['out_h'],
+                    attrs['out_w'],
+                    attrs['scale'] if 'scale' in attrs else [],
+                    attrs['interp_method'],
+                    attrs['align_corners'],
+                    attrs['align_mode'],
+                )
             else:
                 out = _legacy_C_ops.bilinear_interp_v2(x, *dy_attr)
         elif resample_type == "trilinear":
             if in_dygraph_mode():
                 out = _C_ops.trilinear_interp(
-                    x, inputs['OutSize'] if 'OutSize' in inputs else None,
+                    x,
+                    inputs['OutSize'] if 'OutSize' in inputs else None,
                     inputs['SizeTensor'] if 'SizeTensor' in inputs else None,
                     inputs['Scale'] if 'Scale' in inputs else None,
-                    attrs['data_layout'], attrs['out_d'], attrs['out_h'],
-                    attrs['out_w'], attrs['scale'] if 'scale' in attrs else [],
-                    attrs['interp_method'], attrs['align_corners'],
-                    attrs['align_mode'])
+                    attrs['data_layout'],
+                    attrs['out_d'],
+                    attrs['out_h'],
+                    attrs['out_w'],
+                    attrs['scale'] if 'scale' in attrs else [],
+                    attrs['interp_method'],
+                    attrs['align_corners'],
+                    attrs['align_mode'],
+                )
             else:
                 out = _legacy_C_ops.trilinear_interp_v2(x, *dy_attr)
         elif resample_type == "nearest":
             if in_dygraph_mode():
                 out = _C_ops.nearest_interp(
-                    x, inputs['OutSize'] if 'OutSize' in inputs else None,
+                    x,
+                    inputs['OutSize'] if 'OutSize' in inputs else None,
                     inputs['SizeTensor'] if 'SizeTensor' in inputs else None,
                     inputs['Scale'] if 'Scale' in inputs else None,
-                    attrs['data_layout'], attrs['out_d'], attrs['out_h'],
-                    attrs['out_w'], attrs['scale'] if 'scale' in attrs else [],
-                    attrs['interp_method'], attrs['align_corners'],
-                    attrs['align_mode'])
+                    attrs['data_layout'],
+                    attrs['out_d'],
+                    attrs['out_h'],
+                    attrs['out_w'],
+                    attrs['scale'] if 'scale' in attrs else [],
+                    attrs['interp_method'],
+                    attrs['align_corners'],
+                    attrs['align_mode'],
+                )
             else:
                 out = _legacy_C_ops.nearest_interp_v2(x, *dy_attr)
         elif resample_type == "bicubic":
             if in_dygraph_mode():
                 out = _C_ops.bicubic_interp(
-                    x, inputs['OutSize'] if 'OutSize' in inputs else None,
+                    x,
+                    inputs['OutSize'] if 'OutSize' in inputs else None,
                     inputs['SizeTensor'] if 'SizeTensor' in inputs else None,
                     inputs['Scale'] if 'Scale' in inputs else None,
-                    attrs['data_layout'], attrs['out_d'], attrs['out_h'],
-                    attrs['out_w'], attrs['scale'] if 'scale' in attrs else [],
-                    attrs['interp_method'], attrs['align_corners'],
-                    attrs['align_mode'])
+                    attrs['data_layout'],
+                    attrs['out_d'],
+                    attrs['out_h'],
+                    attrs['out_w'],
+                    attrs['scale'] if 'scale' in attrs else [],
+                    attrs['interp_method'],
+                    attrs['align_corners'],
+                    attrs['align_mode'],
+                )
             else:
                 out = _legacy_C_ops.bicubic_interp_v2(x, *dy_attr)
         return out
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type='{}_interp_v2'.format(resample_type),
-                     inputs=inputs,
-                     outputs={"Out": out},
-                     attrs=attrs)
+    helper.append_op(
+        type='{}_interp_v2'.format(resample_type),
+        inputs=inputs,
+        outputs={"Out": out},
+        attrs=attrs,
+    )
     return out
 
 
-def upsample(x,
-             size=None,
-             scale_factor=None,
-             mode='nearest',
-             align_corners=False,
-             align_mode=0,
-             data_format='NCHW',
-             name=None):
+def upsample(
+    x,
+    size=None,
+    scale_factor=None,
+    mode='nearest',
+    align_corners=False,
+    align_mode=0,
+    data_format='NCHW',
+    name=None,
+):
     """
     This API resizes a batch of images.
 
@@ -664,9 +728,9 @@ def upsample(x,
         'trilinear' : Trilinear interpolation
         'nearest' : Nearest neighbor interpolation
         'bicubic' : Bicubic interpolation
-    Linear interpolation is the method of using a line connecting two known quantities 
-    to determine the value of an unknown quantity between the two known quantities. 
-    
+    Linear interpolation is the method of using a line connecting two known quantities
+    to determine the value of an unknown quantity between the two known quantities.
+
     Nearest neighbor interpolation is to perform nearest neighbor interpolation
     in both the 3rd dimension(in height direction) and the 4th dimension(in width
     direction) on input tensor.
@@ -675,7 +739,7 @@ def upsample(x,
     W-direction in this op) on a rectilinear 2D grid. The key idea is
     to perform linear interpolation first in one direction, and then
     again in the other direction.
-    
+
     Bicubic interpolation is an extension of cubic interpolation for interpolating
     data points on a two-dimensional regular grid. The interpolated surface is
     smoother than corresponding surfaces obtained by bilinear interpolation or
@@ -697,7 +761,7 @@ def upsample(x,
 
     Example:
     .. code-block:: text
-    
+
         For scale_factor:
             if align_corners = True && out_size > 1 :
               scale_factor = (in_size-1.0)/(out_size-1.0)
@@ -726,7 +790,7 @@ def upsample(x,
               output: (N,C,H_out,W_out) where:
               H_out = round(H_{in} * scale_{factor})
               W_out = round(W_{in} * scale_{factor})
-        
+
         Bilinear interpolation:
           if:
               align_corners = False , align_mode = 0
@@ -767,30 +831,30 @@ def upsample(x,
               W_out = W_{in} * scale_{factor}
     https://en.wikipedia.org/wiki/Linear_interpolation.
     For details of linear interpolation, please refer to Wikipedia:
-    
+
     For details of nearest neighbor interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
-    
+
     For details of bilinear interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Bilinear_interpolation.
-    
+
     For details of bicubic interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Bicubic_interpolation
-    
+
     For details of trilinear interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Trilinear_interpolation.
-    
+
     Parameters:
         x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
                           its data format is specified by :attr:`data_format`.
         size (list|tuple|Tensor|None, optional): Output shape of image resize
-             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
-             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
+             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w)
+             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor.
              Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1].
              If a Tensor , its dimensions size should be a 1.
         scale_factor (float|Tensor|list|tuple|None, optional): The multiplier for the input height or width. At
              least one of :attr:`size` or :attr:`scale_factor` must be set.
-             And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if 
+             And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if
              it is either a list or a tuple or a Tensor.
              Default: None.
         mode (str, optional): The resample method. It supports 'linear', 'nearest', 'bilinear',
@@ -817,20 +881,21 @@ def upsample(x,
 
         Examples:
         .. code-block:: python
-	
-		import paddle
-		import paddle.nn as nn
 
-		input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32)
-		upsample_out = paddle.nn.Upsample(size=[12,12])
+                import paddle
+                import paddle.nn as nn
+
+                input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32)
+                upsample_out = paddle.nn.Upsample(size=[12,12])
 
-		output = upsample_out(x=input_data)
-		print(output.shape)
-		# [2L, 3L, 12L, 12L]
+                output = upsample_out(x=input_data)
+                print(output.shape)
+                # [2, 3, 12, 12]
 
     """
-    return interpolate(x, size, scale_factor, mode, align_corners, align_mode,
-                       data_format)
+    return interpolate(
+        x, size, scale_factor, mode, align_corners, align_mode, data_format
+    )
 
 
 def bilinear(x1, x2, weight, bias=None, name=None):
@@ -853,17 +918,17 @@ def bilinear(x1, x2, weight, bias=None, name=None):
     Examples:
        .. code-block:: python
 
-		import paddle
-		import paddle.nn.functional as F
+                import paddle
+                import paddle.nn.functional as F
 
-		x1 = paddle.randn((5, 5)).astype(paddle.float32)
-		x2 = paddle.randn((5, 4)).astype(paddle.float32)
-		w = paddle.randn((1000, 5, 4)).astype(paddle.float32)
-		b = paddle.randn((1, 1000)).astype(paddle.float32)
+                x1 = paddle.randn((5, 5)).astype(paddle.float32)
+                x2 = paddle.randn((5, 4)).astype(paddle.float32)
+                w = paddle.randn((1000, 5, 4)).astype(paddle.float32)
+                b = paddle.randn((1, 1000)).astype(paddle.float32)
 
-		result = F.bilinear(x1, x2, w, b)
-		print(result.shape)
-		# [5, 1000]
+                result = F.bilinear(x1, x2, w, b)
+                print(result.shape)
+                # [5, 1000]
     """
 
     if in_dygraph_mode():
@@ -881,19 +946,16 @@ def bilinear(x1, x2, weight, bias=None, name=None):
     helper = LayerHelper("bilinear", **locals())
     out = helper.create_variable_for_type_inference(dtype=x1.dtype)
 
-    helper.append_op(type="bilinear_tensor_product",
-                     inputs=inputs,
-                     outputs={"Out": out})
+    helper.append_op(
+        type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out}
+    )
 
     return out
 
 
-def dropout(x,
-            p=0.5,
-            axis=None,
-            training=True,
-            mode="upscale_in_train",
-            name=None):
+def dropout(
+    x, p=0.5, axis=None, training=True, mode="upscale_in_train", name=None
+):
     """
     Dropout is a regularization technique for reducing overfitting by preventing
     neuron co-adaption during training. The dropout operator randomly sets the
@@ -1005,38 +1067,38 @@ def dropout(x,
 
         .. code-block:: python
 
-		import paddle
-
-		x = paddle.to_tensor([[1,2,3], [4,5,6]]).astype(paddle.float32)
-		y_train = paddle.nn.functional.dropout(x, 0.5)
-		y_test = paddle.nn.functional.dropout(x, 0.5, training=False)
-		y_0 = paddle.nn.functional.dropout(x, axis=0)
-		y_1 = paddle.nn.functional.dropout(x, axis=1)
-		y_01 = paddle.nn.functional.dropout(x, axis=[0,1])
-		print(x)
-		# Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
-		#        [[1., 2., 3.],
-		#         [4., 5., 6.]])
-		print(y_train)
-		# Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
-		#        [[2. , 0. , 6. ],
-		#         [8. , 0. , 12.]])
-		print(y_test)
-		# Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
-		#        [[1., 2., 3.],
-		#         [4., 5., 6.]])
-		print(y_0)
-		# Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
-		#        [[0. , 0. , 0. ],
-		#         [8. , 10., 12.]])
-		print(y_1)
-		# Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
-		#        [[2. , 0. , 6. ],
-		#         [8. , 0. , 12.]])
-		print(y_01)
-		# Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
-		#        [[0. , 0. , 0. ],
-		#         [8. , 0. , 12.]])
+                import paddle
+
+                x = paddle.to_tensor([[1,2,3], [4,5,6]]).astype(paddle.float32)
+                y_train = paddle.nn.functional.dropout(x, 0.5)
+                y_test = paddle.nn.functional.dropout(x, 0.5, training=False)
+                y_0 = paddle.nn.functional.dropout(x, axis=0)
+                y_1 = paddle.nn.functional.dropout(x, axis=1)
+                y_01 = paddle.nn.functional.dropout(x, axis=[0,1])
+                print(x)
+                # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[1., 2., 3.],
+                #         [4., 5., 6.]])
+                print(y_train)
+                # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[2. , 0. , 6. ],
+                #         [8. , 0. , 12.]])
+                print(y_test)
+                # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[1., 2., 3.],
+                #         [4., 5., 6.]])
+                print(y_0)
+                # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[0. , 0. , 0. ],
+                #         [8. , 10., 12.]])
+                print(y_1)
+                # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[2. , 0. , 6. ],
+                #         [8. , 0. , 12.]])
+                print(y_01)
+                # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[0. , 0. , 0. ],
+                #         [8. , 0. , 12.]])
 
     """
     if not isinstance(p, (float, int, Variable)):
@@ -1044,7 +1106,8 @@ def dropout(x,
 
     if isinstance(p, (int, float)):
         # fast return for p == 0
-        if p == 0: return x
+        if p == 0:
+            return x
         elif p < 0 or p > 1:
             raise ValueError("p argument should between 0 and 1")
     if mode not in ('downscale_in_infer', 'upscale_in_train'):
@@ -1056,41 +1119,63 @@ def dropout(x,
 
     if axis == None:  # commonly used dropout
         seed = None
-        mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  #semantic transfer
+        mode = (
+            'downgrade_in_infer' if mode == 'downscale_in_infer' else mode
+        )  # semantic transfer
 
         if _non_static_mode():
             if default_main_program().random_seed != 0:
                 seed = default_main_program().random_seed
 
             if in_dygraph_mode():
-                out, mask = _C_ops.dropout( x, None, p, not training, mode, \
-                    seed if seed is not None else 0, seed is not None)
+                out, mask = _C_ops.dropout(
+                    x,
+                    None,
+                    p,
+                    not training,
+                    mode,
+                    seed if seed is not None else 0,
+                    seed is not None,
+                )
 
                 return out
-            out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', p, 'is_test',
-                                              not training, 'fix_seed', seed
-                                              is not None, 'seed',
-                                              seed if seed is not None else 0,
-                                              'dropout_implementation', mode)
+            out, mask = _legacy_C_ops.dropout(
+                x,
+                'dropout_prob',
+                p,
+                'is_test',
+                not training,
+                'fix_seed',
+                seed is not None,
+                'seed',
+                seed if seed is not None else 0,
+                'dropout_implementation',
+                mode,
+            )
             return out
 
         helper = LayerHelper('dropout', **locals())
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 'dropout')
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], 'dropout'
+        )
 
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
         mask = helper.create_variable_for_type_inference(
-            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+            dtype=core.VarDesc.VarType.UINT8, stop_gradient=True
+        )
 
         def get_attrs(prog, dropout_prob, is_test, seed):
             if (seed is None or seed == 0) and prog.random_seed != 0:
                 seed = prog.random_seed
 
-            if isinstance(dropout_prob,
-                          Variable) and not dropout_prob.shape != [1]:
+            if isinstance(
+                dropout_prob, Variable
+            ) and not dropout_prob.shape != [1]:
                 raise TypeError(
-                    "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}"
-                    .format(p.shape))
+                    "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}".format(
+                        p.shape
+                    )
+                )
             attrs = {
                 'dropout_prob': dropout_prob,
                 'is_test': is_test,
@@ -1102,38 +1187,45 @@ def dropout(x,
 
         attrs = get_attrs(helper.main_program, p, not training, seed)
 
-        helper.append_op(type='dropout',
-                         inputs={'X': [x]},
-                         outputs={
-                             'Out': [out],
-                             'Mask': [mask]
-                         },
-                         attrs=attrs)
+        helper.append_op(
+            type='dropout',
+            inputs={'X': [x]},
+            outputs={'Out': [out], 'Mask': [mask]},
+            attrs=attrs,
+        )
         return out
-    else:  #sometimes called dropout_nd #TODO: optimize with c++
+    else:  # sometimes called dropout_nd #TODO: optimize with c++
         if not in_dynamic_mode():
             check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'dropout')
         dtype = x.dtype
         keep_prob = 1 - p
         if training:
-            if in_dynamic_mode() and p == 1.:
-                return paddle.scale(x, scale=0.)
+            if in_dynamic_mode() and p == 1.0:
+                return paddle.scale(x, scale=0.0)
 
-            scale_input = paddle.scale(
-                x, scale=1 / keep_prob) if mode == 'upscale_in_train' else x
+            scale_input = (
+                paddle.scale(x, scale=1 / keep_prob)
+                if mode == 'upscale_in_train'
+                else x
+            )
 
-            #get mask shape
+            # get mask shape
             input_shape = x.shape
             if not in_dynamic_mode():
                 input_shape_tensor = paddle.shape(x)
             drop_axes = [axis] if isinstance(axis, int) else list(axis)
             if min(drop_axes) < 0 or max(drop_axes) > len(input_shape) - 1:
-                raise ValueError("axis value should be greater than or equal to 0 and less than dimensions of x:{}, but get axis value:{} " \
-                                 .format(len(input_shape), max(drop_axes)))
+                raise ValueError(
+                    "axis value should be greater than or equal to 0 and less than dimensions of x:{}, but get axis value:{} ".format(
+                        len(input_shape), max(drop_axes)
+                    )
+                )
             if len(drop_axes) > len(input_shape):
                 raise ValueError(
-                    "length of axis should not be greater than dimensions of x:{}, but get length of axis: {}"
-                    .format(len(input_shape), len(drop_axes)))
+                    "length of axis should not be greater than dimensions of x:{}, but get length of axis: {}".format(
+                        len(input_shape), len(drop_axes)
+                    )
+                )
             mask_shape = [1] * len(input_shape)
             if not in_dynamic_mode():
                 for i in drop_axes:
@@ -1142,11 +1234,10 @@ def dropout(x,
                 for i in drop_axes:
                     mask_shape[i] = input_shape[i]
 
-            #get mask
-            random_tensor = paddle.uniform(mask_shape,
-                                           dtype='float32',
-                                           min=0.,
-                                           max=1.0)
+            # get mask
+            random_tensor = paddle.uniform(
+                mask_shape, dtype='float32', min=0.0, max=1.0
+            )
             p = full(shape=[1], fill_value=p, dtype='float32')
             keep_mask = paddle.greater_equal(random_tensor, p)
 
@@ -1155,8 +1246,11 @@ def dropout(x,
             ret = paddle.multiply(scale_input, keep_mask, name=name)
             return ret
         else:  # test
-            ret = paddle.scale(
-                x, scale=keep_prob) if mode == 'downscale_in_infer' else x
+            ret = (
+                paddle.scale(x, scale=keep_prob)
+                if mode == 'downscale_in_infer'
+                else x
+            )
             return ret
 
 
@@ -1197,20 +1291,26 @@ def dropout2d(x, p=0.5, training=True, data_format='NCHW', name=None):
     """
     input_shape = x.shape
     if len(input_shape) != 4:
-        raise ValueError("dimensions of x should be 4, but received {} != 4"\
-        .format(len(input_shape)))
+        raise ValueError(
+            "dimensions of x should be 4, but received {} != 4".format(
+                len(input_shape)
+            )
+        )
 
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
-            "Attr(data_format): %s." % str(data_format))
+            "Attr(data_format): %s." % str(data_format)
+        )
 
-    return dropout(x,
-                   p=p,
-                   axis=[0, 1] if data_format == 'NCHW' else [0, 3],
-                   training=training,
-                   mode="upscale_in_train",
-                   name=name)
+    return dropout(
+        x,
+        p=p,
+        axis=[0, 1] if data_format == 'NCHW' else [0, 3],
+        training=training,
+        mode="upscale_in_train",
+        name=name,
+    )
 
 
 def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None):
@@ -1236,33 +1336,39 @@ def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None):
     Examples:
         .. code-block:: python
 
-		import paddle
+                import paddle
 
-		x = paddle.randn(shape=(2, 3, 4, 5, 6)).astype(paddle.float32)
-		y_train = paddle.nn.functional.dropout3d(x)  #train
-		y_test = paddle.nn.functional.dropout3d(x, training=False) #test
-		print(x[0,0,:,:,:])
-		print(y_train[0,0,:,:,:]) # may all 0
-		print(y_test[0,0,:,:,:])
+                x = paddle.randn(shape=(2, 3, 4, 5, 6)).astype(paddle.float32)
+                y_train = paddle.nn.functional.dropout3d(x)  #train
+                y_test = paddle.nn.functional.dropout3d(x, training=False) #test
+                print(x[0,0,:,:,:])
+                print(y_train[0,0,:,:,:]) # may be all 0
+                print(y_test[0,0,:,:,:])
 
     """
 
     input_shape = x.shape
     if len(input_shape) != 5:
-        raise ValueError("dimensions of x should be 5, but received {} != 5" \
-        .format(len(input_shape)))
+        raise ValueError(
+            "dimensions of x should be 5, but received {} != 5".format(
+                len(input_shape)
+            )
+        )
 
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
-            "Attr(data_format): %s." % str(data_format))
+            "Attr(data_format): %s." % str(data_format)
+        )
 
-    return dropout(x,
-                   p=p,
-                   axis=[0, 1] if data_format == 'NCDHW' else [0, 4],
-                   training=training,
-                   mode="upscale_in_train",
-                   name=name)
+    return dropout(
+        x,
+        p=p,
+        axis=[0, 1] if data_format == 'NCDHW' else [0, 4],
+        training=training,
+        mode="upscale_in_train",
+        name=name,
+    )
 
 
 def alpha_dropout(x, p=0.5, training=True, name=None):
@@ -1284,19 +1390,19 @@ def alpha_dropout(x, p=0.5, training=True, name=None):
     Examples:
         .. code-block:: python
 
-		import paddle
-
-		x = paddle.to_tensor([[-1, 1], [-1, 1]]).astype(paddle.float32)
-		y_train = paddle.nn.functional.alpha_dropout(x, 0.5)
-		y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False)
-		print(y_train)
-		# Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
-		#        [[-0.10721093, -0.77919382],
-		#         [-0.10721093,  1.66559887]]) (randomly)
-		print(y_test)
-		# Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
-		#        [[-1.,  1.],
-		#         [-1.,  1.]])
+                import paddle
+
+                x = paddle.to_tensor([[-1, 1], [-1, 1]]).astype(paddle.float32)
+                y_train = paddle.nn.functional.alpha_dropout(x, 0.5)
+                y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False)
+                print(y_train)
+                # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[-0.10721093, -0.77919382],
+                #         [-0.10721093,  1.66559887]]) (randomly)
+                print(y_test)
+                # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[-1.,  1.],
+                #         [-1.,  1.]])
     """
     if not isinstance(p, (float, int)):
         raise TypeError("p argument should be a float or int")
@@ -1304,37 +1410,40 @@ def alpha_dropout(x, p=0.5, training=True, name=None):
         raise ValueError("p argument should between 0 and 1")
 
     if not in_dynamic_mode():
-        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
-                                 'alpha_dropout')
+        check_variable_and_dtype(
+            x, 'x', ['float32', 'float64'], 'alpha_dropout'
+        )
 
     if training:
         if p == 1:
-            return paddle.scale(x, scale=0.)
-        #get transformation params
+            return paddle.scale(x, scale=0.0)
+        # get transformation params
         alpha = 1.6732632423543772848170429916717
         scale = 1.0507009873554804934193349852946
         alpha_p = -alpha * scale
-        a = ((1 - p) * (1 + p * alpha_p**2))**-0.5
+        a = ((1 - p) * (1 + p * alpha_p**2)) ** -0.5
         b = -a * alpha_p * p
 
         dtype = x.dtype
         input_shape = x.shape
 
-        #get mask
-        random_tensor = paddle.uniform(input_shape,
-                                       dtype='float32',
-                                       min=0.,
-                                       max=1.0)
+        # get mask
+        random_tensor = paddle.uniform(
+            input_shape, dtype='float32', min=0.0, max=1.0
+        )
         p = full(shape=[1], fill_value=p, dtype='float32')
         keep_mask = paddle.greater_equal(random_tensor, p)
         keep_mask = paddle.cast(keep_mask, dtype)
         drop_mask = paddle.subtract(
-            full(shape=input_shape, fill_value=1., dtype=dtype), keep_mask)
+            full(shape=input_shape, fill_value=1.0, dtype=dtype), keep_mask
+        )
 
-        #apply mask
+        # apply mask
         b = full(shape=[1], fill_value=b, dtype=dtype)
-        y = paddle.add(paddle.multiply(x, keep_mask),
-                       paddle.scale(drop_mask, scale=alpha_p))
+        y = paddle.add(
+            paddle.multiply(x, keep_mask),
+            paddle.scale(drop_mask, scale=alpha_p),
+        )
         res = paddle.add(paddle.scale(y, scale=a), b, name=name)
         return res
     else:  # test
@@ -1353,11 +1462,11 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
     Parameters:
         x (Tensor): The input tensor with data type float32/double/int32/int64_t.
         pad (Tensor|list[int]|tuple[int]): The padding size with data type int.
-            If mode is 'constant' and length of pad is twice as length of x dimension, then x will 
+            If mode is 'constant' and length of pad is twice as length of x dimension, then x will
             be padded from the first  dimension to the last dimension.
             Else: 1. If input dimension is 3, then the pad has the form (pad_left,
-            pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, 
-            pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form 
+            pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right,
+            pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form
             (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back).
         mode (str, optional): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. Default is 'constant'
 
@@ -1370,12 +1479,12 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
         data_format (str, optional): An string from: "NCL", "NLC", NHWC", "NCHW", "NCDHW", "NDHWC". Specify the data format of
            the input data. Default is "NCHW"，
         name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
-                    
-    Returns: 
+
+    Returns:
         Tensor, a Tensor padded according to pad and mode and data type is same as input.
 
     Example:
-    
+
         .. code-block:: text
 
             x = [[[[[1., 2., 3.],
@@ -1428,21 +1537,21 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
 
             import paddle
             import paddle.nn.functional as F
-            
+
             # example 1
             x_shape = (1, 1, 3)
             x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1
             y = F.pad(x, [0, 0, 0, 0, 2, 3], value=1, mode='constant', data_format="NCL")
             print(y)
             # [[[1. 1. 1. 2. 3. 1. 1. 1.]]]
-            
+
             # example 2
             x_shape = (1, 1, 3)
             x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1
             y = F.pad(x, [2, 3], value=1, mode='constant', data_format="NCL")
             print(y)
             # [[[1. 1. 1. 2. 3. 1. 1. 1.]]]
-            
+
             # example 3
             x_shape = (1, 1, 2, 3)
             x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1
@@ -1453,18 +1562,28 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
             #    [6. 4. 5. 6. 4. 5.]
             #    [3. 1. 2. 3. 1. 2.]]]]
     """
-    assert mode in ['reflect', 'replicate', 'constant', 'circular'], \
-            "mode should be one of constant, reflect, replicate, circular, but got {}.".format(mode)
+    assert mode in [
+        'reflect',
+        'replicate',
+        'constant',
+        'circular',
+    ], "mode should be one of constant, reflect, replicate, circular, but got {}.".format(
+        mode
+    )
 
     data_format = data_format.upper()
-    assert data_format in ["NCL", "NCHW", "NCDHW", "NLC", "NHWC", "NDHWC"], \
-        "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " \
+    assert data_format in ["NCL", "NCHW", "NCDHW", "NLC", "NHWC", "NDHWC"], (
+        "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], "
         "but got {}".format(data_format)
+    )
 
     x_dim = len(x.shape)
 
-    if mode == "constant" and isinstance(
-            pad, (list, tuple)) and len(pad) == x_dim * 2:
+    if (
+        mode == "constant"
+        and isinstance(pad, (list, tuple))
+        and len(pad) == x_dim * 2
+    ):
         paddings = pad
         pad_value = value
 
@@ -1472,10 +1591,20 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
             out = _C_ops.pad(x, paddings, float(pad_value))
             return out
 
-        check_variable_and_dtype(x, 'x', [
-            'float16', 'float32', 'float64', 'int32', 'int64', 'complex64',
-            'complex128'
-        ], "pad")
+        check_variable_and_dtype(
+            x,
+            'x',
+            [
+                'float16',
+                'float32',
+                'float64',
+                'int32',
+                'int64',
+                'complex64',
+                'complex128',
+            ],
+            "pad",
+        )
 
         check_type(pad_value, 'pad_value', (float, int, Variable), 'pad')
         if isinstance(pad_value, int):
@@ -1484,17 +1613,18 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
         helper = LayerHelper('pad', **locals())
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype)
-        helper.append_op(type='pad',
-                         inputs={'X': x},
-                         outputs={'Out': out},
-                         attrs={
-                             'paddings': paddings,
-                             'pad_value': pad_value
-                         })
+        helper.append_op(
+            type='pad',
+            inputs={'X': x},
+            outputs={'Out': out},
+            attrs={'paddings': paddings, 'pad_value': pad_value},
+        )
         return out
 
     assert x_dim in [
-        3, 4, 5
+        3,
+        4,
+        5,
     ], "input tesor dimension must be in [3, 4, 5] but got {}".format(x_dim)
 
     supported_format_map = {
@@ -1502,9 +1632,11 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
         4: ["NCHW", "NHWC"],
         5: ["NCDHW", "NDHWC"],
     }
-    assert data_format in supported_format_map[x_dim], \
-    "input tensor dimension is {}, it's data format should be in {} but got {}".format(
-        x_dim, supported_format_map[x_dim], data_format)
+    assert (
+        data_format in supported_format_map[x_dim]
+    ), "input tensor dimension is {}, it's data format should be in {} but got {}".format(
+        x_dim, supported_format_map[x_dim], data_format
+    )
 
     unsqueezed_dim = []
 
@@ -1512,21 +1644,21 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
         if data_format in ["NCL", "NCHW", "NCDHW"]:
             data_format = "NCDHW"
             if x_dim == 3:
-                pad = concat([zeros((4, ), dtype="int32"), pad], axis=0)
+                pad = concat([zeros((4,), dtype="int32"), pad], axis=0)
                 unsqueezed_dim = [3, 4]
                 x = unsqueeze(x, axis=unsqueezed_dim)
             elif x_dim == 4:
-                pad = concat([pad, zeros((2, ), dtype="int32")], axis=0)
+                pad = concat([pad, zeros((2,), dtype="int32")], axis=0)
                 unsqueezed_dim = [2]
                 x = unsqueeze(x, axis=unsqueezed_dim)
         elif data_format in ["NLC", "NHWC", "NDHWC"]:
             data_format = "NDHWC"
             if x_dim == 3:
-                pad = concat([zeros((4, ), dtype="int32"), pad], axis=0)
+                pad = concat([zeros((4,), dtype="int32"), pad], axis=0)
                 unsqueezed_dim = [2, 3]
                 x = unsqueeze(x, axis=unsqueezed_dim)
             elif x_dim == 4:
-                pad = concat([pad, zeros((2, ), dtype="int32")], axis=0)
+                pad = concat([pad, zeros((2,), dtype="int32")], axis=0)
                 unsqueezed_dim = [1]
                 x = unsqueeze(x, axis=unsqueezed_dim)
     else:
@@ -1560,9 +1692,19 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
         if _in_legacy_dygraph():
             if isinstance(pad, Variable):
                 pad = pad.numpy().tolist()
-            out = _legacy_C_ops.pad3d(x, "paddings", pad, "mode", mode, "value",
-                                      value, "data_format", data_format, "name",
-                                      name)
+            out = _legacy_C_ops.pad3d(
+                x,
+                "paddings",
+                pad,
+                "mode",
+                mode,
+                "value",
+                value,
+                "data_format",
+                data_format,
+                "name",
+                name,
+            )
         else:
             attrs = {'mode': mode, 'value': value, 'data_format': data_format}
             inputs = {'X': [x]}
@@ -1576,10 +1718,9 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None):
 
             dtype = helper.input_dtype(input_param_name='input')
             out = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type='pad3d',
-                             inputs=inputs,
-                             outputs={"Out": out},
-                             attrs=attrs)
+            helper.append_op(
+                type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs
+            )
 
     if len(unsqueezed_dim) != 0:
         out = squeeze(out, axis=unsqueezed_dim)
@@ -1601,7 +1742,7 @@ def zeropad2d(x, padding, data_format="NCHW", name=None):
         name(str, optional): The default value is None. Normally there is no need for user
             to set this property.
 
-    Returns: 
+    Returns:
         Tensor, padded with 0 according to pad and data type is same as input.
 
     Examples:
@@ -1620,12 +1761,14 @@ def zeropad2d(x, padding, data_format="NCHW", name=None):
             #    [0. 0. 0. 0. 0. 0.]]]]
     """
 
-    return pad(x,
-               pad=padding,
-               mode='constant',
-               value=0,
-               data_format=data_format,
-               name=name)
+    return pad(
+        x,
+        pad=padding,
+        mode='constant',
+        value=0,
+        data_format=data_format,
+        name=name,
+    )
 
 
 def cosine_similarity(x1, x2, axis=1, eps=1e-8):
@@ -1637,8 +1780,8 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8):
         x2 (Tensor): Second input. float32/double.
         axis (int, optional): Dimension of vectors to compute cosine similarity. Default is 1.
         eps(float, optional): Small value to avoid division by zero. Default is 1e-8.
-                    
-    Returns: 
+
+    Returns:
         Tensor, a Tensor representing cosine similarity between x1 and x2 along axis.
 
     Examples:
@@ -1670,7 +1813,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8):
             result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0)
             print(result)
             # [0.97689527,  0.99996042, -0.55138415]
-            
+
     """
     w12 = sum(paddle.multiply(x1, x2), axis=axis)
     w1 = sum(paddle.multiply(x1, x1), axis=axis)
@@ -1696,7 +1839,7 @@ def linear(x, weight, bias=None, name=None):
     input should be a multi-dimensional tensor of shape
     :math:`[batch\_size, *, in\_features]` , where :math:`*` means any number of
     additional dimensions. The linear operator multiplies input tensor with
-    weight and produces an output tensor of shape :math:`[batch\_size, *, out\_features]` , 
+    weight and produces an output tensor of shape :math:`[batch\_size, *, out\_features]` ,
     If :math:`bias` is not None, the bias should be a 1-D tensor of shape
     :math:`[out\_features]` and will be added to the output.
 
@@ -1714,9 +1857,9 @@ def linear(x, weight, bias=None, name=None):
 
     Examples:
         .. code-block:: python
-          
+
           import paddle
-          
+
           x = paddle.randn((3, 2), dtype="float32")
           # x: [[-0.32342386 -1.200079  ]
           #     [ 0.7979031  -0.90978354]
@@ -1732,12 +1875,13 @@ def linear(x, weight, bias=None, name=None):
           #     [2.1077576  2.1077576  2.1077576  2.1077576 ]]
     """
     if in_dygraph_mode():
-        #TODO(jiabin): using addmm for fast forward route
+        # TODO(jiabin): using addmm for fast forward route
         return _C_ops.linear(x, weight, bias)
     else:
         if _in_legacy_dygraph():
-            pre_bias = _legacy_C_ops.matmul_v2(x, weight, 'trans_x', False,
-                                               'trans_y', False)
+            pre_bias = _legacy_C_ops.matmul_v2(
+                x, weight, 'trans_x', False, 'trans_y', False
+            )
 
             if bias is None:
                 return pre_bias
@@ -1747,27 +1891,30 @@ def linear(x, weight, bias=None, name=None):
             helper = LayerHelper('linear', **locals())
             dtype = x.dtype
 
-            check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                     'linear')
-            check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'],
-                        'linear')
+            check_variable_and_dtype(
+                x, 'x', ['float16', 'float32', 'float64'], 'linear'
+            )
+            check_dtype(
+                dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear'
+            )
 
             inputs = {'X': [x], 'Y': [weight]}
             attrs = {'trans_x': False, 'trans_y': False}
             tmp = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type='matmul_v2',
-                             inputs=inputs,
-                             outputs={'Out': tmp},
-                             attrs=attrs)
+            helper.append_op(
+                type='matmul_v2',
+                inputs=inputs,
+                outputs={'Out': tmp},
+                attrs=attrs,
+            )
             if bias is not None:
                 res = helper.create_variable_for_type_inference(dtype)
-                helper.append_op(type='elementwise_add',
-                                 inputs={
-                                     'X': [tmp],
-                                     'Y': [bias]
-                                 },
-                                 outputs={'Out': [res]},
-                                 attrs={'axis': len(x.shape) - 1})
+                helper.append_op(
+                    type='elementwise_add',
+                    inputs={'X': [tmp], 'Y': [bias]},
+                    outputs={'Out': [res]},
+                    attrs={'axis': len(x.shape) - 1},
+                )
             else:
                 res = tmp
             return res
@@ -1819,7 +1966,7 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None):
 
             import paddle
             import numpy as np
-            
+
             x_data = np.array([[[0, 1, 0],
                                 [ 1,  0, 1]]]).astype("float32")
             print(x_data.shape)
@@ -1827,52 +1974,55 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None):
             x = paddle.to_tensor(x_data, stop_gradient=False)
             output = paddle.nn.functional.label_smooth(x)
             print(output)
-            
+
             #[[[0.03333334 0.93333334 0.03333334]
             #  [0.93333334 0.03333334 0.93333334]]]
     """
-    if epsilon > 1. or epsilon < 0.:
+    if epsilon > 1.0 or epsilon < 0.0:
         raise ValueError("The value of epsilon must be between 0 and 1.")
 
     if in_dygraph_mode():
         return _C_ops.label_smooth(label, prior_dist, float(epsilon))
 
     elif paddle.in_dynamic_mode():
-        return _legacy_C_ops.label_smooth(label, prior_dist, 'epsilon',
-                                          float(epsilon))
+        return _legacy_C_ops.label_smooth(
+            label, prior_dist, 'epsilon', float(epsilon)
+        )
 
-    check_variable_and_dtype(label, 'label', ['float32', 'float64'],
-                             'label_smooth')
+    check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'label_smooth'
+    )
 
     helper = LayerHelper("label_smooth", **locals())
     label.stop_gradient = True
     smooth_label = helper.create_variable_for_type_inference(label.dtype)
-    helper.append_op(type="label_smooth",
-                     inputs={
-                         "X": label,
-                         "PriorDist": prior_dist
-                     } if prior_dist else {"X": label},
-                     outputs={"Out": smooth_label},
-                     attrs={"epsilon": float(epsilon)})
+    helper.append_op(
+        type="label_smooth",
+        inputs={"X": label, "PriorDist": prior_dist}
+        if prior_dist
+        else {"X": label},
+        outputs={"Out": smooth_label},
+        attrs={"epsilon": float(epsilon)},
+    )
     return smooth_label
 
 
 def class_center_sample(label, num_classes, num_samples, group=None):
     """
     Class center sample method is proposed from the paper PartialFC that only sample a subset of the class centers.
-    The process of sampling subset class centers is straightforward: 
+    The process of sampling subset class centers is straightforward:
 
     1. First select the positive class centers;
     2. Then randomly sample negative class centers.
 
-    Specifically, given a label tensor, shape [batch_size], select all the positive class centers and randomly 
+    Specifically, given a label tensor, shape [batch_size], select all the positive class centers and randomly
     sample negative class centers, then remap the input label tensor using the sampled class centers.
 
     For more information, Partial FC: Training 10 Million Identities on a Single Machine
     arxiv: https://arxiv.org/abs/2010.05222
-    
+
     .. hint::
-        If the number of the positive class centers is greater than the input num_samples, it keeps all the positive 
+        If the number of the positive class centers is greater than the input num_samples, it keeps all the positive
         class centers and the shape of sampled_class_center will be [num_positive_class_centers].
 
         The API supports CPU, single GPU and multi GPU.
@@ -1886,7 +2036,7 @@ def class_center_sample(label, num_classes, num_samples, group=None):
         num_classes (int): A positive integer to specify the number of classes at local rank.
             Note that num_classes of each GPU can be different.
         num_samples (int): A positive integer to specify the number of class center to sample.
-        group (Group, optional): The group instance return by paddle.distributed.new_group 
+        group (Group, optional): The group instance returned by paddle.distributed.new_group
             or ``None`` for global default group or ``False`` for data parallel (do not communication cross ranks).
             Default is ``None``.
 
@@ -1952,7 +2102,7 @@ def class_center_sample(label, num_classes, num_samples, group=None):
         #       [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ])
         #Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
         #       [0, 2, 4, 8, 9, 3])
-        
+
         # rank 1 output:
         #Tensor(shape=[20], dtype=int64, place=CUDAPlace(1), stop_gradient=True,
         #       [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ])
@@ -1964,7 +2114,10 @@ def class_center_sample(label, num_classes, num_samples, group=None):
     if not (group == False or group is None or hasattr(group, 'is_member')):
         raise ValueError(
             'Expected group is False, None or instance of paddle.distributed.collective.Group \
-             (got group: {})'.format(group))
+             (got group: {})'.format(
+                group
+            )
+        )
         return
 
     if hasattr(group, 'is_member') and not group.is_member():
@@ -1977,76 +2130,112 @@ def class_center_sample(label, num_classes, num_samples, group=None):
         if core.is_compiled_with_dist():
             parallel_env = paddle.distributed.ParallelEnv()
             global_rank = parallel_env.rank
-            rank = global_rank if group is None else group.get_group_rank(
-                global_rank)
+            rank = (
+                global_rank
+                if group is None
+                else group.get_group_rank(global_rank)
+            )
             nranks = parallel_env.world_size if group is None else group.nranks
 
     if num_samples > num_classes:
         raise ValueError(
-            'Expected num_samples less than or equal to {}, got num_samples {}'.
-            format(num_classes, num_samples))
+            'Expected num_samples less than or equal to {}, got num_samples {}'.format(
+                num_classes, num_samples
+            )
+        )
 
     label_size = 1
     for dim in list(label.shape):
         label_size *= dim
     if label_size != -1 and label_size < 1:
-        raise ValueError('Expected label_size > 0 \
-             (got label_size: {})'.format(label_size))
+        raise ValueError(
+            'Expected label_size > 0 \
+             (got label_size: {})'.format(
+                label_size
+            )
+        )
 
     label_dims = len(list(label.shape))
     if label_dims != 1:
-        raise ValueError('Expected label_dims == 1 \
-             (got label_dims: {})'.format(label_dims))
+        raise ValueError(
+            'Expected label_dims == 1 \
+             (got label_dims: {})'.format(
+                label_dims
+            )
+        )
 
     seed = None
     if (seed is None or seed == 0) and default_main_program().random_seed != 0:
         seed = default_main_program().random_seed
 
     if in_dygraph_mode():
-        return _C_ops.class_center_sample(label, num_classes, num_samples,
-                                          ring_id, rank, nranks, seed
-                                          is not None,
-                                          seed if seed is not None else 0)
+        return _C_ops.class_center_sample(
+            label,
+            num_classes,
+            num_samples,
+            ring_id,
+            rank,
+            nranks,
+            seed is not None,
+            seed if seed is not None else 0,
+        )
     elif paddle.in_dynamic_mode():
-        remapped_label, sampled_class_center = _legacy_C_ops.class_center_sample(
-            label, 'num_classes', num_classes, 'num_samples', num_samples,
-            'ring_id', ring_id, 'nranks', nranks, 'rank', rank, 'fix_seed', seed
-            is not None, 'seed', seed if seed is not None else 0)
+        (
+            remapped_label,
+            sampled_class_center,
+        ) = _legacy_C_ops.class_center_sample(
+            label,
+            'num_classes',
+            num_classes,
+            'num_samples',
+            num_samples,
+            'ring_id',
+            ring_id,
+            'nranks',
+            nranks,
+            'rank',
+            rank,
+            'fix_seed',
+            seed is not None,
+            'seed',
+            seed if seed is not None else 0,
+        )
         return remapped_label, sampled_class_center
 
-    check_variable_and_dtype(label, 'label', ['int64', 'int32'],
-                             'class_center_sample')
+    check_variable_and_dtype(
+        label, 'label', ['int64', 'int32'], 'class_center_sample'
+    )
     op_type = 'class_center_sample'
     helper = LayerHelper(op_type, **locals())
     remapped_label = helper.create_variable_for_type_inference(
-        dtype=label.dtype)
+        dtype=label.dtype
+    )
     sampled_class_center = helper.create_variable_for_type_inference(
-        dtype=label.dtype)
-    helper.append_op(type=op_type,
-                     inputs={'Label': label},
-                     outputs={
-                         'RemappedLabel': remapped_label,
-                         'SampledLocalClassCenter': sampled_class_center
-                     },
-                     attrs={
-                         'num_classes': num_classes,
-                         'num_samples': num_samples,
-                         'ring_id': ring_id,
-                         'nranks': nranks,
-                         'rank': rank,
-                         'fix_seed': seed is not None,
-                         'seed': seed if seed is not None else 0
-                     })
+        dtype=label.dtype
+    )
+    helper.append_op(
+        type=op_type,
+        inputs={'Label': label},
+        outputs={
+            'RemappedLabel': remapped_label,
+            'SampledLocalClassCenter': sampled_class_center,
+        },
+        attrs={
+            'num_classes': num_classes,
+            'num_samples': num_samples,
+            'ring_id': ring_id,
+            'nranks': nranks,
+            'rank': rank,
+            'fix_seed': seed is not None,
+            'seed': seed if seed is not None else 0,
+        },
+    )
     return remapped_label, sampled_class_center
 
 
-def fold(x,
-         output_sizes,
-         kernel_sizes,
-         strides=1,
-         paddings=0,
-         dilations=1,
-         name=None):
+def fold(
+    x, output_sizes, kernel_sizes, strides=1, paddings=0, dilations=1, name=None
+):
     r"""
     
     Combines an array of sliding local blocks into a large containing
@@ -2109,35 +2298,38 @@ def fold(x,
 
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'fold')
 
-    assert len(x.shape) == 3, \
-            "input should be the format of [N, C, L]"
+    assert len(x.shape) == 3, "input should be the format of [N, C, L]"
 
     def _is_list_or_turple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
+        return isinstance(data, list) or isinstance(data, tuple)
 
     if isinstance(output_sizes, int):
         output_sizes = [output_sizes, output_sizes]
     else:
-        assert _is_list_or_turple_(output_sizes) and (len(output_sizes) == 2), \
-            "output_sizes should either be an integer or a list/tuple of two integers"
+        assert _is_list_or_turple_(output_sizes) and (
+            len(output_sizes) == 2
+        ), "output_sizes should either be an integer or a list/tuple of two integers"
 
     if isinstance(kernel_sizes, int):
         kernel_sizes = [kernel_sizes, kernel_sizes]
     else:
-        assert _is_list_or_turple_(kernel_sizes) and (len(kernel_sizes) == 2), \
-            "kernel_sizes should either be an integer or a list/tuple of two integers"
+        assert _is_list_or_turple_(kernel_sizes) and (
+            len(kernel_sizes) == 2
+        ), "kernel_sizes should either be an integer or a list/tuple of two integers"
 
     if isinstance(strides, int):
         strides = [strides, strides]
     else:
-        assert _is_list_or_turple_(strides) and (len(strides) == 2), \
-            "strides should either be an integer or a list/tuple of two integers"
+        assert _is_list_or_turple_(strides) and (
+            len(strides) == 2
+        ), "strides should either be an integer or a list/tuple of two integers"
 
     if isinstance(dilations, int):
         dilations = [dilations, dilations]
     else:
-        assert _is_list_or_turple_(dilations) and (len(dilations) == 2), \
-            "dilations should either be an integer or a list/tuple of two integers"
+        assert _is_list_or_turple_(dilations) and (
+            len(dilations) == 2
+        ), "dilations should either be an integer or a list/tuple of two integers"
 
     if isinstance(paddings, int):
         paddings = [paddings] * 4
@@ -2153,26 +2345,39 @@ def fold(x,
     else:
         raise ValueError(
             "Unexpected type of paddings, it should be either an integer or a list"
-            "of 2 or 4 integers")
+            "of 2 or 4 integers"
+        )
 
     if in_dygraph_mode():
-        out = _C_ops.fold(x, output_sizes, kernel_sizes, strides, paddings,
-                          dilations)
+        out = _C_ops.fold(
+            x, output_sizes, kernel_sizes, strides, paddings, dilations
+        )
     elif in_dynamic_mode():
-        out = _legacy_C_ops.fold(x, "output_sizes", output_sizes,
-                                 "kernel_sizes", kernel_sizes, "strides",
-                                 strides, "paddings", paddings, "dilations",
-                                 dilations)
+        out = _legacy_C_ops.fold(
+            x,
+            "output_sizes",
+            output_sizes,
+            "kernel_sizes",
+            kernel_sizes,
+            "strides",
+            strides,
+            "paddings",
+            paddings,
+            "dilations",
+            dilations,
+        )
     else:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(type="fold",
-                         inputs={"X": x},
-                         outputs={"Y": out},
-                         attrs={
-                             "output_sizes": output_sizes,
-                             "kernel_sizes": kernel_sizes,
-                             "strides": strides,
-                             "paddings": paddings,
-                             "dilations": dilations
-                         })
+        helper.append_op(
+            type="fold",
+            inputs={"X": x},
+            outputs={"Y": out},
+            attrs={
+                "output_sizes": output_sizes,
+                "kernel_sizes": kernel_sizes,
+                "strides": strides,
+                "paddings": paddings,
+                "dilations": dilations,
+            },
+        )
     return out
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index f4434f69b5a..9ca0683ee64 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -17,7 +17,12 @@ import numpy as np
 from ...device import get_cudnn_version
 from ...static import Variable
 from ...fluid import dygraph_utils
-from ...fluid.layers.utils import convert_to_list, _is_symmetric_padding, _contain_var, _convert_to_tensor_list
+from ...fluid.layers.utils import (
+    convert_to_list,
+    _is_symmetric_padding,
+    _contain_var,
+    _convert_to_tensor_list,
+)
 from ...fluid.data_feeder import check_variable_and_dtype, check_dtype
 from ...framework import ParamAttr
 from ...fluid.layer_helper import LayerHelper
@@ -62,8 +67,10 @@ def _update_padding_nd(padding, channel_last, num_dims):
         padding = padding.upper()
         if padding not in ["SAME", "VALID"]:
             raise ValueError(
-                "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".
-                format(padding))
+                "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".format(
+                    padding
+                )
+            )
         if padding == "VALID":
             padding_algorithm = "VALID"
             padding = [0] * num_dims
@@ -78,10 +85,12 @@ def _update_padding_nd(padding, channel_last, num_dims):
             if not _zero_padding_in_batch_and_channel(padding, channel_last):
                 raise ValueError(
                     "Non-zero padding({}) in the batch or channel dimensions "
-                    "is not supported.".format(padding))
+                    "is not supported.".format(padding)
+                )
             padding_algorithm = "EXPLICIT"
             padding = _exclude_padding_in_batch_and_channel(
-                padding, channel_last)
+                padding, channel_last
+            )
             if _is_symmetric_padding(padding, num_dims):
                 padding = padding[0::2]
         # for padding like [pad_before, pad_after, pad_before, pad_after, ...]
@@ -102,42 +111,60 @@ def _update_padding_nd(padding, channel_last, num_dims):
         padding = convert_to_list(padding, num_dims, 'padding')
     if not all([p >= 0 for p in padding]):
         raise ValueError(
-            "Invalid padding, all value should be larger than or equal to 0, but received: {}"
-            .format(padding))
+            "Invalid padding, all value should be larger than or equal to 0, but received: {}".format(
+                padding
+            )
+        )
     return padding, padding_algorithm
 
 
-def _conv_nd(x,
-             weight,
-             bias=None,
-             stride=1,
-             padding=0,
-             padding_algorithm=None,
-             dilation=1,
-             groups=1,
-             data_format="NCHW",
-             channel_dim=1,
-             op_type="conv2d",
-             use_cudnn=True,
-             use_mkldnn=False,
-             name=None):
+def _conv_nd(
+    x,
+    weight,
+    bias=None,
+    stride=1,
+    padding=0,
+    padding_algorithm=None,
+    dilation=1,
+    groups=1,
+    data_format="NCHW",
+    channel_dim=1,
+    op_type="conv2d",
+    use_cudnn=True,
+    use_mkldnn=False,
+    name=None,
+):
 
     # Due to the poor performance of NHWC, we transpose the input to NCHW.
     if in_dygraph_mode() and op_type == "conv2d":
-        pre_bias = _C_ops.conv2d(x, weight, stride, padding, padding_algorithm,
-                                 groups, dilation, data_format, False, -1,
-                                 False)
+        pre_bias = _C_ops.conv2d(
+            x,
+            weight,
+            stride,
+            padding,
+            padding_algorithm,
+            groups,
+            dilation,
+            data_format,
+            False,
+            -1,
+            False,
+        )
         if bias is not None:
-            channel_dim = channel_dim + len(
-                x.shape) if channel_dim < 0 else channel_dim
+            channel_dim = (
+                channel_dim + len(x.shape) if channel_dim < 0 else channel_dim
+            )
             if isinstance(x, tuple):
                 x = x[0]
             if isinstance(bias, tuple):
                 bias = bias[0]
             if len(bias.shape) < len(x.shape):
                 tmp_bias = _C_ops.reshape(
-                    bias, [1 for i in range(channel_dim)] + bias.shape +
-                    [1 for i in range(len(x.shape) - channel_dim - 1)])
+                    bias,
+                    [1 for i in range(channel_dim)]
+                    + bias.shape
+                    + [1 for i in range(len(x.shape) - channel_dim - 1)],
+                )
                 return _C_ops.add(pre_bias, tmp_bias)
             else:
                 return _C_ops.add(pre_bias, bias)
@@ -145,40 +172,82 @@ def _conv_nd(x,
             return pre_bias
 
     if in_dygraph_mode() and op_type == "depthwise_conv2d":
-        pre_bias = _C_ops.depthwise_conv2d(x, weight, stride, padding,
-                                           padding_algorithm, groups, dilation,
-                                           data_format, False, -1, False, False,
-                                           use_cudnn)
+        pre_bias = _C_ops.depthwise_conv2d(
+            x,
+            weight,
+            stride,
+            padding,
+            padding_algorithm,
+            groups,
+            dilation,
+            data_format,
+            False,
+            -1,
+            False,
+            False,
+            use_cudnn,
+        )
         if bias is not None:
-            channel_dim = channel_dim + len(
-                x.shape) if channel_dim < 0 else channel_dim
+            channel_dim = (
+                channel_dim + len(x.shape) if channel_dim < 0 else channel_dim
+            )
             tmp_bias = _C_ops.reshape(
-                bias, [1 for i in range(channel_dim)] + bias.shape +
-                [1 for i in range(len(x.shape) - channel_dim - 1)])
+                bias,
+                [1 for i in range(channel_dim)]
+                + bias.shape
+                + [1 for i in range(len(x.shape) - channel_dim - 1)],
+            )
             return _C_ops.add(pre_bias, tmp_bias)
         else:
             return pre_bias
 
     if in_dygraph_mode() and op_type == "conv3d":
-        pre_bias = _C_ops.conv3d(x, weight, stride, padding, padding_algorithm,
-                                 groups, dilation, data_format, False, -1,
-                                 False)
+        pre_bias = _C_ops.conv3d(
+            x,
+            weight,
+            stride,
+            padding,
+            padding_algorithm,
+            groups,
+            dilation,
+            data_format,
+            False,
+            -1,
+            False,
+        )
         if bias is not None:
-            channel_dim = channel_dim + len(
-                x.shape) if channel_dim < 0 else channel_dim
+            channel_dim = (
+                channel_dim + len(x.shape) if channel_dim < 0 else channel_dim
+            )
             tmp_bias = _C_ops.reshape(
                 bias,
-                bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)])
+                bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)],
+            )
             return _C_ops.add(pre_bias, tmp_bias)
         else:
             return pre_bias
 
     if in_dynamic_mode():
-        attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
-                 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn',
-                 use_mkldnn, 'fuse_relu_before_depthwise_conv', False,
-                 "padding_algorithm", padding_algorithm, "data_format",
-                 data_format)
+        attrs = (
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            'dilations',
+            dilation,
+            'groups',
+            groups,
+            'use_cudnn',
+            use_cudnn,
+            'use_mkldnn',
+            use_mkldnn,
+            'fuse_relu_before_depthwise_conv',
+            False,
+            "padding_algorithm",
+            padding_algorithm,
+            "data_format",
+            data_format,
+        )
         pre_bias = getattr(_legacy_C_ops, op_type)(x, weight, *attrs)
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
@@ -195,44 +264,42 @@ def _conv_nd(x,
             'use_mkldnn': use_mkldnn,
             'fuse_relu_before_depthwise_conv': False,
             "padding_algorithm": padding_algorithm,
-            "data_format": data_format
+            "data_format": data_format,
         }
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 op_type)
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], op_type
+        )
         helper = LayerHelper(op_type, **locals())
         dtype = helper.input_dtype(input_param_name='x')
         pre_bias = helper.create_variable_for_type_inference(dtype)
         outputs = {"Output": [pre_bias]}
-        helper.append_op(type=op_type,
-                         inputs=inputs,
-                         outputs=outputs,
-                         attrs=attrs)
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
+        )
         if bias is not None:
             out = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type='elementwise_add',
-                             inputs={
-                                 'X': [pre_bias],
-                                 'Y': [bias]
-                             },
-                             outputs={'Out': [out]},
-                             attrs={
-                                 'axis': channel_dim,
-                                 'use_mkldnn': use_mkldnn
-                             })
+            helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [pre_bias], 'Y': [bias]},
+                outputs={'Out': [out]},
+                attrs={'axis': channel_dim, 'use_mkldnn': use_mkldnn},
+            )
         else:
             out = pre_bias
     return out
 
 
-def conv1d(x,
-           weight,
-           bias=None,
-           stride=1,
-           padding=0,
-           dilation=1,
-           groups=1,
-           data_format='NCL',
-           name=None):
+def conv1d(
+    x,
+    weight,
+    bias=None,
+    stride=1,
+    padding=0,
+    dilation=1,
+    groups=1,
+    data_format='NCL',
+    name=None,
+):
     r"""
     The convolution1D layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input and
@@ -279,10 +346,10 @@ def conv1d(x,
             L_{out} = \frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1
 
     Args:
+        x (Tensor): The input is a 3-D Tensor with shape [N, C, L], the data type
+        x (Tensor): The input is 3-D Tensor with shape [N, C, L], the data type
             of input is float16 or float32 or float64.
         weight (Tensor): The convolution kernel with shape [M, C/g, K], where M is
-            the number of output channels, g is the number of groups, K is the kernel's size. 
+            the number of output channels, g is the number of groups, K is the kernel's size.
         bias (Tensor, optional): The bias with shape [M,]. Default: None.
         stride (int|list|tuple, optional): The stride size. If stride is a list/tuple, it must
             contain one integers, (stride_size). Default: 1.
@@ -300,23 +367,23 @@ def conv1d(x,
             the first half of the filters is only connected to the first half
             of the input channels, while the second half of the filters is only
             connected to the second half of the input channels. Default: 1.
-        data_format (str, optional): Specify the data format of the input, and the data format of the output 
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
             will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`.
             The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of:
             `[batch_size, input_channels, feature_length]`.
-        name(str, optional): For detailed information, please refer 
-           to :ref:`api_guide_Name`. Usually name is no need to set and 
+        name(str, optional): For detailed information, please refer
+           to :ref:`api_guide_Name`. Usually name does not need to be set and is
            None by default.
 
     Returns:
-        A tensor representing the conv1d, whose data type is the 
+        A tensor representing the conv1d, whose data type is the
         same with input.
 
     Raises:
         ValueError: If the channel dimension of the input is less than or equal to zero.
         ValueError: If `data_format` is not "NCL" or "NLC".
         ValueError: If `padding` is a string, but not "SAME" or "VALID".
-        ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 
+        ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0
             or the element corresponding to the input's channel is not 0.
         ShapeError: If the input is not 3-D Tensor.
         ShapeError: If the input's dimension size and filter's dimension size not equal.
@@ -340,13 +407,13 @@ def conv1d(x,
            [[0, 3, 4],
             [2, 9, 7],
             [5, 6, 8]]]).astype(np.float32)
-          
+
           x_var = paddle.to_tensor(x)
           w_var = paddle.to_tensor(w)
           y_var = F.conv1d(x_var, w_var)
           y_np = y_var.numpy()
           print(y_np)
-          
+
           # [[[133. 238.]
           #   [160. 211.]]]
     """
@@ -357,36 +424,45 @@ def conv1d(x,
         use_cudnn = False
 
     if data_format not in ["NCL", "NLC"]:
-        raise ValueError("Attr(data_format) should be 'NCL' or 'NLC'. "
-                         "Received Attr(data_format): {}.".format(data_format))
+        raise ValueError(
+            "Attr(data_format) should be 'NCL' or 'NLC'. "
+            "Received Attr(data_format): {}.".format(data_format)
+        )
 
-    channel_last = (data_format == "NLC")
+    channel_last = data_format == "NLC"
     channel_dim = -1 if channel_last else 1
     conv2d_data_format = "NHWC" if channel_last else "NCHW"
     if len(x.shape) != 3:
         raise ValueError(
-            "Input x should be 3D tensor, but received x with the shape of {}".
-            format(x.shape))
+            "Input x should be 3D tensor, but received x with the shape of {}".format(
+                x.shape
+            )
+        )
     num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
-        raise ValueError("The channel dimension of the input({}) "
-                         "should be defined. Received: {}.".format(
-                             x.shape, num_channels))
+        raise ValueError(
+            "The channel dimension of the input({}) "
+            "should be defined. Received: {}.".format(x.shape, num_channels)
+        )
     if groups <= 0:
         raise ValueError(
-            "The groups of conv1d should be greater than 0. Received groups: {}"
-            .format(groups))
+            "The groups of conv1d should be greater than 0. Received groups: {}".format(
+                groups
+            )
+        )
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
             "received: the channel of input is {}, the shape of input is {}"
-            ", the groups is {}".format(num_channels, x.shape, groups))
+            ", the groups is {}".format(num_channels, x.shape, groups)
+        )
     if num_filters % groups != 0:
         raise ValueError(
             "the number of filters must be divisible by groups,"
             "received: the number of filters is {}, the shape of weight is {}"
-            ", the groups is {}".format(num_filters, weight.shape, groups))
+            ", the groups is {}".format(num_filters, weight.shape, groups)
+        )
 
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
@@ -397,8 +473,10 @@ def conv1d(x,
         padding = [0] + padding
     else:
         raise ValueError(
-            "The size of padding's dimension should be 1 or 2. But got padding={}"
-            .format(padding))
+            "The size of padding's dimension should be 1 or 2. But got padding={}".format(
+                padding
+            )
+        )
     stride = [1] + convert_to_list(stride, 1, 'stride')
     dilation = [1] + convert_to_list(dilation, 1, 'dilation')
     weight = unsqueeze(weight, axis=[-2])
@@ -406,14 +484,18 @@ def conv1d(x,
     l_type = "conv2d"
 
     # When "groups==num_channels and num_filters% num_channels == 0" using depthwise_conv2d has better performance
-    if (is_compiled_with_cuda() and num_channels == groups and num_channels != 1
-            and num_filters % num_channels == 0):
+    if (
+        is_compiled_with_cuda()
+        and num_channels == groups
+        and num_channels != 1
+        and num_filters % num_channels == 0
+    ):
         l_type = 'depthwise_conv2d'
         use_cudnn = False
 
     # NPU only supports depthwise_conv2d when  "input_channel = output_channel = groups"
     if is_compiled_with_npu():
-        if (num_channels == groups and num_channels == num_filters):
+        if num_channels == groups and num_channels == num_filters:
             l_type = 'depthwise_conv2d'
         else:
             l_type = 'conv2d'
@@ -422,17 +504,44 @@ def conv1d(x,
     x = unsqueeze(x, axis=[squeeze_aixs])
 
     if in_dygraph_mode():
-        out = getattr(_C_ops,
-                      l_type)(x, weight, stride, padding, padding_algorithm,
-                              groups, dilation, conv2d_data_format, False, -1,
-                              False, False, use_cudnn)
+        out = getattr(_C_ops, l_type)(
+            x,
+            weight,
+            stride,
+            padding,
+            padding_algorithm,
+            groups,
+            dilation,
+            conv2d_data_format,
+            False,
+            -1,
+            False,
+            False,
+            use_cudnn,
+        )
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
     elif _in_legacy_dygraph():
-        attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
-                 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False,
-                 'fuse_relu_before_depthwise_conv', False, "padding_algorithm",
-                 padding_algorithm, "data_format", conv2d_data_format)
+        attrs = (
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            'dilations',
+            dilation,
+            'groups',
+            groups,
+            'use_cudnn',
+            use_cudnn,
+            'use_mkldnn',
+            False,
+            'fuse_relu_before_depthwise_conv',
+            False,
+            "padding_algorithm",
+            padding_algorithm,
+            "data_format",
+            conv2d_data_format,
+        )
         out = getattr(_legacy_C_ops, l_type)(x, weight, *attrs)
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
@@ -447,33 +556,35 @@ def conv1d(x,
             'use_mkldnn': False,
             'fuse_relu_before_depthwise_conv': False,
             "padding_algorithm": padding_algorithm,
-            "data_format": conv2d_data_format
+            "data_format": conv2d_data_format,
         }
-        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
-                                 'conv2d')
+        check_variable_and_dtype(
+            x, 'input', ['float16', 'float32', 'float64'], 'conv2d'
+        )
         helper = LayerHelper(l_type, **locals())
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype)
         outputs = {"Output": [out]}
-        helper.append_op(type=l_type,
-                         inputs=inputs,
-                         outputs=outputs,
-                         attrs=attrs)
+        helper.append_op(
+            type=l_type, inputs=inputs, outputs=outputs, attrs=attrs
+        )
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
     out = squeeze(out, axis=[squeeze_aixs])
     return out
 
 
-def conv2d(x,
-           weight,
-           bias=None,
-           stride=1,
-           padding=0,
-           dilation=1,
-           groups=1,
-           data_format="NCHW",
-           name=None):
+def conv2d(
+    x,
+    weight,
+    bias=None,
+    stride=1,
+    padding=0,
+    dilation=1,
+    groups=1,
+    data_format="NCHW",
+    name=None,
+):
     r"""
 
     The convolution2D layer calculates the output based on the input, filter
@@ -594,40 +705,52 @@ def conv2d(x,
     """
     # entry checks
     if data_format not in ["NCHW", "NHWC"]:
-        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. "
-                         "Received Attr(data_format): {}.".format(data_format))
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. "
+            "Received Attr(data_format): {}.".format(data_format)
+        )
 
-    channel_last = (data_format == "NHWC")
+    channel_last = data_format == "NHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 4:
         raise ValueError(
-            "Input x should be 4D tensor, but received x with the shape of {}".
-            format(x.shape))
+            "Input x should be 4D tensor, but received x with the shape of {}".format(
+                x.shape
+            )
+        )
     num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
-        raise ValueError("The channel dimension of the input({}) "
-                         "should be defined. Received: {}.".format(
-                             x.shape, num_channels))
+        raise ValueError(
+            "The channel dimension of the input({}) "
+            "should be defined. Received: {}.".format(x.shape, num_channels)
+        )
     if groups <= 0:
         raise ValueError(
-            "The groups of conv2d should be greater than 0. Received groups: {}"
-            .format(groups))
+            "The groups of conv2d should be greater than 0. Received groups: {}".format(
+                groups
+            )
+        )
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
             "received: the channel of input is {}, the shape of input is {}"
-            ", the groups is {}".format(num_channels, x.shape, groups))
+            ", the groups is {}".format(num_channels, x.shape, groups)
+        )
     if num_filters % groups != 0:
         raise ValueError(
             "the number of filters must be divisible by groups,"
             "received: the number of filters is {}, the shape of weight is {}"
-            ", the groups is {}".format(num_filters, weight.shape, groups))
+            ", the groups is {}".format(num_filters, weight.shape, groups)
+        )
 
     cudnn_version = get_cudnn_version()
 
-    use_cudnn = True if (is_compiled_with_cuda()
-                         and cudnn_version is not None) else False
+    use_cudnn = (
+        True
+        if (is_compiled_with_cuda() and cudnn_version is not None)
+        else False
+    )
 
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
@@ -635,8 +758,11 @@ def conv2d(x,
     dilation = convert_to_list(dilation, 2, 'dilation')
 
     l_type = "conv2d"
-    if (num_channels == groups and num_channels != 1
-            and num_filters % num_channels == 0):
+    if (
+        num_channels == groups
+        and num_channels != 1
+        and num_filters % num_channels == 0
+    ):
         l_type = 'depthwise_conv2d'
         if is_compiled_with_rocm():
             use_cudnn = True
@@ -644,9 +770,19 @@ def conv2d(x,
             use_cudnn = False
     else:
         if in_dygraph_mode():
-            pre_bias = _C_ops.conv2d(x, weight, stride, padding,
-                                     padding_algorithm, groups, dilation,
-                                     data_format, False, -1, False)
+            pre_bias = _C_ops.conv2d(
+                x,
+                weight,
+                stride,
+                padding,
+                padding_algorithm,
+                groups,
+                dilation,
+                data_format,
+                False,
+                -1,
+                False,
+            )
             if bias is not None:
                 out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
                 return out
@@ -657,31 +793,50 @@ def conv2d(x,
 
     # NPU only supports depthwise_conv2d when  "input_channel = output_channel = groups"
     if is_compiled_with_npu():
-        if (num_channels == groups and num_channels == num_filters):
+        if num_channels == groups and num_channels == num_filters:
             l_type = 'depthwise_conv2d'
         else:
             l_type = 'conv2d'
 
-    if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")
-        ["FLAGS_conv2d_disable_cudnn"]):
+    if (
+        is_compiled_with_cuda()
+        and get_flags("FLAGS_conv2d_disable_cudnn")[
+            "FLAGS_conv2d_disable_cudnn"
+        ]
+    ):
         use_cudnn = False
 
-    return _conv_nd(x, weight, bias, stride, padding, padding_algorithm,
-                    dilation, groups, data_format, channel_dim, l_type,
-                    use_cudnn, use_mkldnn, name)
-
-
-def conv1d_transpose(x,
-                     weight,
-                     bias=None,
-                     stride=1,
-                     padding=0,
-                     output_padding=0,
-                     groups=1,
-                     dilation=1,
-                     output_size=None,
-                     data_format="NCL",
-                     name=None):
+    return _conv_nd(
+        x,
+        weight,
+        bias,
+        stride,
+        padding,
+        padding_algorithm,
+        dilation,
+        groups,
+        data_format,
+        channel_dim,
+        l_type,
+        use_cudnn,
+        use_mkldnn,
+        name,
+    )
+
+
+def conv1d_transpose(
+    x,
+    weight,
+    bias=None,
+    stride=1,
+    padding=0,
+    output_padding=0,
+    groups=1,
+    dilation=1,
+    output_size=None,
+    data_format="NCL",
+    name=None,
+):
     r"""
     The 1-D convolution transpose layer calculates the output based on the input,
     filter, and dilation, stride, padding. Input(Input) and output(Output)
@@ -823,28 +978,36 @@ def conv1d_transpose(x,
         raise ValueError(
             "Attr(data_format) of conv2d_transpose got wrong value: "
             "received {}, but only 'NCL' or 'NLC' are supported.".format(
-                data_format))
-    channel_last = (data_format == "NLC")
+                data_format
+            )
+        )
+    channel_last = data_format == "NLC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 3:
         raise ValueError(
-            "Input x should be 3D tensor, but received x with the shape of {}".
-            format(x.shape))
+            "Input x should be 3D tensor, but received x with the shape of {}".format(
+                x.shape
+            )
+        )
 
     num_channels = x.shape[channel_dim]
     if num_channels < 0:
-        raise ValueError("The channel dimension of the input({}) "
-                         "should be defined. Received: {}.".format(
-                             x.shape, num_channels))
+        raise ValueError(
+            "The channel dimension of the input({}) "
+            "should be defined. Received: {}.".format(x.shape, num_channels)
+        )
     if groups <= 0:
         raise ValueError(
-            "The groups of conv1d_transpose should be greater than 0. Received groups: {}"
-            .format(groups))
+            "The groups of conv1d_transpose should be greater than 0. Received groups: {}".format(
+                groups
+            )
+        )
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
             "received: the channel of input is {}, the shape of input is {}"
-            ", the groups is {}".format(num_channels, x.shape, groups))
+            ", the groups is {}".format(num_channels, x.shape, groups)
+        )
 
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1)
@@ -855,8 +1018,10 @@ def conv1d_transpose(x,
         padding = padding + [0]
     else:
         raise ValueError(
-            "The size of padding's dimension should 1 or 2. But got padding={}".
-            format(padding))
+            "The size of padding's dimension should 1 or 2. But got padding={}".format(
+                padding
+            )
+        )
 
     stride = convert_to_list(stride, 1, 'stride') + [1]
     dilation = convert_to_list(dilation, 1, 'dilation') + [1]
@@ -865,30 +1030,40 @@ def conv1d_transpose(x,
         output_size = []
     else:
         if output_padding != 0:
-            raise ValueError('output_padding option is mutually exclusive with '
-                             'output_size')
+            raise ValueError(
+                'output_padding option is mutually exclusive with '
+                'output_size'
+            )
         if isinstance(output_size, (list, tuple, int)):
             output_size = convert_to_list(output_size, 1, 'output_size') + [1]
         else:
             raise ValueError(
-                "output_size should be int, or list, tuple of ints")
+                "output_size should be int, or list, tuple of ints"
+            )
 
     if output_padding == 0:
         output_padding = []
     else:
-        output_padding = convert_to_list(output_padding, 1,
-                                         'output_padding') + [0]
+        output_padding = convert_to_list(
+            output_padding, 1, 'output_padding'
+        ) + [0]
 
     if len(output_padding) > 0 and output_padding[0] > stride[0]:
         raise ValueError(
             "The size of output_padding should not be greater than stride."
             "But got output_padding={} and stride={}".format(
-                output_padding[0], stride[0]))
+                output_padding[0], stride[0]
+            )
+        )
 
     op_type = 'conv2d_transpose'
     num_filters = weight.shape[1]
-    if (num_channels == groups and num_channels != 1 and num_filters == 1
-            and not use_cudnn):
+    if (
+        num_channels == groups
+        and num_channels != 1
+        and num_filters == 1
+        and not use_cudnn
+    ):
         op_type = 'depthwise_conv2d_transpose'
         use_cudnn = False
 
@@ -899,17 +1074,41 @@ def conv1d_transpose(x,
     weight = unsqueeze(weight, axis=[-1])
 
     if in_dygraph_mode():
-        out = getattr(_C_ops,
-                      op_type)(x, weight, stride, padding, output_padding,
-                               output_size, padding_algorithm, groups, dilation,
-                               conv2d_data_format)
+        out = getattr(_C_ops, op_type)(
+            x,
+            weight,
+            stride,
+            padding,
+            output_padding,
+            output_size,
+            padding_algorithm,
+            groups,
+            dilation,
+            conv2d_data_format,
+        )
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
     elif _in_legacy_dygraph():
-        attrs = ('output_padding', output_padding, 'output_size', output_size,
-                 'strides', stride, 'paddings', padding, 'padding_algorithm',
-                 padding_algorithm, 'dilations', dilation, 'groups', groups,
-                 'use_cudnn', use_cudnn, 'data_format', conv2d_data_format)
+        attrs = (
+            'output_padding',
+            output_padding,
+            'output_size',
+            output_size,
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            'padding_algorithm',
+            padding_algorithm,
+            'dilations',
+            dilation,
+            'groups',
+            groups,
+            'use_cudnn',
+            use_cudnn,
+            'data_format',
+            conv2d_data_format,
+        )
         out = getattr(_legacy_C_ops, op_type)(x, weight, *attrs)
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
@@ -924,18 +1123,18 @@ def conv1d_transpose(x,
             'dilations': dilation,
             'groups': groups,
             'use_cudnn': use_cudnn,
-            'data_format': conv2d_data_format
+            'data_format': conv2d_data_format,
         }
-        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
-                                 'conv2d_transpose')
+        check_variable_and_dtype(
+            x, 'input', ['float16', 'float32', 'float64'], 'conv2d_transpose'
+        )
         helper = LayerHelper(op_type, **locals())
         dtype = helper.input_dtype(input_param_name='x')
         out = helper.create_variable_for_type_inference(dtype)
         outputs = {"Output": [out]}
-        helper.append_op(type=op_type,
-                         inputs=inputs,
-                         outputs=outputs,
-                         attrs=attrs)
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
+        )
         if bias is not None:
             out = nn.elementwise_add(out, bias, axis=channel_dim)
 
@@ -943,17 +1142,19 @@ def conv1d_transpose(x,
     return out
 
 
-def conv2d_transpose(x,
-                     weight,
-                     bias=None,
-                     stride=1,
-                     padding=0,
-                     output_padding=0,
-                     dilation=1,
-                     groups=1,
-                     output_size=None,
-                     data_format='NCHW',
-                     name=None):
+def conv2d_transpose(
+    x,
+    weight,
+    bias=None,
+    stride=1,
+    padding=0,
+    output_padding=0,
+    dilation=1,
+    groups=1,
+    output_size=None,
+    data_format='NCHW',
+    name=None,
+):
     r"""
 
     The convolution2D transpose layer calculates the output based on the input,
@@ -1094,32 +1295,43 @@ def conv2d_transpose(x,
         raise ValueError(
             "Attr(data_format) of conv2d_transpose got wrong value: "
             "received {}, but only 'NCHW' or 'NHWC' are supported.".format(
-                data_format))
-    channel_last = (data_format == "NHWC")
+                data_format
+            )
+        )
+    channel_last = data_format == "NHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 4:
         raise ValueError(
-            "Input x should be 4D tensor, but received x with the shape of {}".
-            format(x.shape))
+            "Input x should be 4D tensor, but received x with the shape of {}".format(
+                x.shape
+            )
+        )
     num_channels = x.shape[channel_dim]
     if num_channels < 0:
-        raise ValueError("The channel dimension of the input({}) "
-                         "should be defined. Received: {}.".format(
-                             x.shape, num_channels))
+        raise ValueError(
+            "The channel dimension of the input({}) "
+            "should be defined. Received: {}.".format(x.shape, num_channels)
+        )
     if groups <= 0:
         raise ValueError(
-            "The groups of conv2d_transpose should be greater than 0. Received groups: {}"
-            .format(groups))
+            "The groups of conv2d_transpose should be greater than 0. Received groups: {}".format(
+                groups
+            )
+        )
     if num_channels % groups != 0:
         raise ValueError(
             "the channel of input must be divisible by groups,"
             "received: the channel of input is {}, the shape of input is {}"
-            ", the groups is {}".format(num_channels, x.shape, groups))
+            ", the groups is {}".format(num_channels, x.shape, groups)
+        )
 
     cudnn_version = get_cudnn_version()
 
-    use_cudnn = True if (is_compiled_with_cuda()
-                         and cudnn_version is not None) else False
+    use_cudnn = (
+        True
+        if (is_compiled_with_cuda() and cudnn_version is not None)
+        else False
+    )
 
     # update attrs
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2)
@@ -1130,8 +1342,10 @@ def conv2d_transpose(x,
         output_size = []
     else:
         if output_padding != 0:
-            raise ValueError('output_padding option is mutually exclusive with '
-                             'output_size')
+            raise ValueError(
+                'output_padding option is mutually exclusive with '
+                'output_size'
+            )
         if isinstance(output_size, (list, tuple)):
             if _contain_var(output_size):
                 output_size = _convert_to_tensor_list(output_size)
@@ -1140,15 +1354,21 @@ def conv2d_transpose(x,
         elif isinstance(output_size, int):
             output_size = convert_to_list(output_size, 2, 'output_size')
         elif isinstance(output_size, Variable):
-            check_dtype(output_size.dtype, 'output_size', ['int32', 'int64'],
-                        'conv2d_transpose')
-            if len(output_size.shape) == 1 and (output_size.shape[0] == 1
-                                                or output_size.shape[0] == 2):
+            check_dtype(
+                output_size.dtype,
+                'output_size',
+                ['int32', 'int64'],
+                'conv2d_transpose',
+            )
+            if len(output_size.shape) == 1 and (
+                output_size.shape[0] == 1 or output_size.shape[0] == 2
+            ):
                 if output_size.shape[0] == 1:
                     output_size = [output_size, output_size]
             else:
                 raise ValueError(
-                    "output_size must contain one or two integers.")
+                    "output_size must contain one or two integers."
+                )
         else:
             raise ValueError(
                 "output_size should be int or Tensor or list, tuple of ints or Tensor"
@@ -1161,24 +1381,54 @@ def conv2d_transpose(x,
 
     op_type = 'conv2d_transpose'
     num_filters = weight.shape[1]
-    if (num_channels == groups and num_channels != 1 and num_filters == 1):
+    if num_channels == groups and num_channels != 1 and num_filters == 1:
         op_type = 'depthwise_conv2d_transpose'
         use_cudnn = False
 
     if in_dygraph_mode():
-        op = _C_ops.conv2d_transpose if op_type == 'conv2d_transpose' else _C_ops.depthwise_conv2d_transpose
-        pre_bias = op(x, weight, stride, padding, output_padding, output_size,
-                      padding_algorithm, groups, dilation, data_format)
+        op = (
+            _C_ops.conv2d_transpose
+            if op_type == 'conv2d_transpose'
+            else _C_ops.depthwise_conv2d_transpose
+        )
+        pre_bias = op(
+            x,
+            weight,
+            stride,
+            padding,
+            output_padding,
+            output_size,
+            padding_algorithm,
+            groups,
+            dilation,
+            data_format,
+        )
         if bias is not None:
             return nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
             return pre_bias
 
     if _in_legacy_dygraph():
-        attrs = ('output_padding', output_padding, 'output_size', output_size,
-                 'strides', stride, 'paddings', padding, 'padding_algorithm',
-                 padding_algorithm, 'dilations', dilation, 'groups', groups,
-                 'use_cudnn', use_cudnn, 'data_format', data_format)
+        attrs = (
+            'output_padding',
+            output_padding,
+            'output_size',
+            output_size,
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            'padding_algorithm',
+            padding_algorithm,
+            'dilations',
+            dilation,
+            'groups',
+            groups,
+            'use_cudnn',
+            use_cudnn,
+            'data_format',
+            data_format,
+        )
         pre_bias = getattr(_legacy_C_ops, op_type)(x, weight, *attrs)
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
@@ -1195,17 +1445,17 @@ def conv2d_transpose(x,
             'dilations': dilation,
             'groups': groups,
             'use_cudnn': use_cudnn,
-            'data_format': data_format
+            'data_format': data_format,
         }
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 'conv2d_transpose')
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], 'conv2d_transpose'
+        )
         helper = LayerHelper(op_type, **locals())
         pre_bias = helper.create_variable_for_type_inference(x.dtype)
         outputs = {"Output": [pre_bias]}
-        helper.append_op(type=op_type,
-                         inputs=inputs,
-                         outputs=outputs,
-                         attrs=attrs)
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
+        )
 
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
@@ -1215,15 +1465,17 @@ def conv2d_transpose(x,
     return out
 
 
-def conv3d(x,
-           weight,
-           bias=None,
-           stride=1,
-           padding=0,
-           dilation=1,
-           groups=1,
-           data_format="NCDHW",
-           name=None):
+def conv3d(
+    x,
+    weight,
+    bias=None,
+    stride=1,
+    padding=0,
+    dilation=1,
+    groups=1,
+    data_format="NCDHW",
+    name=None,
+):
     r"""
 
     The convolution3D layer calculates the output based on the input, filter
@@ -1331,60 +1583,88 @@ def conv3d(x,
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
-            "Attr(data_format): {}.".format(data_format))
+            "Attr(data_format): {}.".format(data_format)
+        )
 
-    channel_last = (data_format == "NDHWC")
+    channel_last = data_format == "NDHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 5:
         raise ValueError(
-            "Input x should be 5D tensor, but received x with the shape of {}".
-            format(x.shape))
+            "Input x should be 5D tensor, but received x with the shape of {}".format(
+                x.shape
+            )
+        )
     num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
         raise ValueError(
             "The channel dimension of the input({}) should be defined. "
-            "Received: {}.".format(x.shape, num_channels))
+            "Received: {}.".format(x.shape, num_channels)
+        )
     if groups <= 0:
         raise ValueError(
-            "The groups of conv3d should be greater than 0. Received groups: {}"
-            .format(groups))
+            "The groups of conv3d should be greater than 0. Received groups: {}".format(
+                groups
+            )
+        )
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
             "Received: number of channels({}), groups({}).".format(
-                num_channels, groups))
+                num_channels, groups
+            )
+        )
     if num_filters % groups != 0:
         raise ValueError(
             "The number of filters must be divisible by Attr(groups). "
             "Received: number of filters({}), groups({}).".format(
-                num_filters, groups))
+                num_filters, groups
+            )
+        )
 
     cudnn_version = get_cudnn_version()
-    use_cudnn = True if (is_compiled_with_cuda()
-                         and cudnn_version is not None) else False
+    use_cudnn = (
+        True
+        if (is_compiled_with_cuda() and cudnn_version is not None)
+        else False
+    )
 
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
     stride = convert_to_list(stride, 3, 'stride')
     dilation = convert_to_list(dilation, 3, 'dilation')
     op_type = "conv3d"
 
-    return _conv_nd(x, weight, bias, stride, padding, padding_algorithm,
-                    dilation, groups, data_format, channel_dim, op_type,
-                    use_cudnn, False, name)
-
-
-def conv3d_transpose(x,
-                     weight,
-                     bias=None,
-                     stride=1,
-                     padding=0,
-                     output_padding=0,
-                     groups=1,
-                     dilation=1,
-                     output_size=None,
-                     data_format='NCDHW',
-                     name=None):
+    return _conv_nd(
+        x,
+        weight,
+        bias,
+        stride,
+        padding,
+        padding_algorithm,
+        dilation,
+        groups,
+        data_format,
+        channel_dim,
+        op_type,
+        use_cudnn,
+        False,
+        name,
+    )
+
+
+def conv3d_transpose(
+    x,
+    weight,
+    bias=None,
+    stride=1,
+    padding=0,
+    output_padding=0,
+    groups=1,
+    dilation=1,
+    output_size=None,
+    data_format='NCDHW',
+    name=None,
+):
     r"""
     The convolution3d transpose layer calculates the output based on the input,
     filter, and dilations, strides, paddings. Input(Input) and output(Output)
@@ -1530,29 +1810,37 @@ def conv3d_transpose(x,
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
-            "Attr(data_format): {}.".format(data_format))
+            "Attr(data_format): {}.".format(data_format)
+        )
 
-    channel_last = (data_format == "NDHWC")
+    channel_last = data_format == "NDHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 5:
         raise ValueError(
-            "Input x should be 5D tensor, but received x with the shape of {}".
-            format(x.shape))
+            "Input x should be 5D tensor, but received x with the shape of {}".format(
+                x.shape
+            )
+        )
     num_channels = x.shape[channel_dim]
     num_filters = weight.shape[1]
     if num_channels < 0:
         raise ValueError(
             "The channel dimension of the input({}) should be defined. "
-            "Received: {}.".format(x.shape, num_channels))
+            "Received: {}.".format(x.shape, num_channels)
+        )
     if groups <= 0:
         raise ValueError(
-            "The groups of conv3d_transpose should be greater than 0. Received groups: {}"
-            .format(groups))
+            "The groups of conv3d_transpose should be greater than 0. Received groups: {}".format(
+                groups
+            )
+        )
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
             "Received: number of channels({}), groups({}).".format(
-                num_channels, groups))
+                num_channels, groups
+            )
+        )
 
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
     stride = convert_to_list(stride, 3, 'stride')
@@ -1561,13 +1849,16 @@ def conv3d_transpose(x,
         output_size = []
     else:
         if output_padding != 0:
-            raise ValueError('output_padding option is mutually exclusive with '
-                             'output_size')
+            raise ValueError(
+                'output_padding option is mutually exclusive with '
+                'output_size'
+            )
         if isinstance(output_size, (list, tuple, int)):
             output_size = convert_to_list(output_size, 3, 'output_size')
         else:
             raise ValueError(
-                "output_size should be int, or list, tuple of ints")
+                "output_size should be int, or list, tuple of ints"
+            )
 
     if output_padding == 0:
         output_padding = []
@@ -1576,28 +1867,55 @@ def conv3d_transpose(x,
 
     cudnn_version = get_cudnn_version()
 
-    #TODO(LielinJiang): whether to use cudnn according to the version of cudnn
-    use_cudnn = True if (is_compiled_with_cuda()
-                         and cudnn_version is not None) else False
+    # TODO(LielinJiang): decide whether to use cudnn based on the cudnn version
+    use_cudnn = (
+        True
+        if (is_compiled_with_cuda() and cudnn_version is not None)
+        else False
+    )
 
     op_type = 'conv3d_transpose'
     data_format_ = "NHWC" if channel_last else "NCHW"
 
     if in_dygraph_mode():
-        pre_bias = _C_ops.conv3d_transpose(x, weight, stride, padding,
-                                           output_padding, output_size,
-                                           padding_algorithm, groups, dilation,
-                                           data_format_)
+        pre_bias = _C_ops.conv3d_transpose(
+            x,
+            weight,
+            stride,
+            padding,
+            output_padding,
+            output_size,
+            padding_algorithm,
+            groups,
+            dilation,
+            data_format_,
+        )
         if bias is not None:
             return nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
             return pre_bias
 
     if _in_legacy_dygraph():
-        attrs = ('output_padding', output_padding, 'output_size', output_size,
-                 'paddings', padding, "padding_algorithm", padding_algorithm,
-                 'strides', stride, 'dilations', dilation, 'groups', groups,
-                 'use_cudnn', use_cudnn, "data_format", data_format_)
+        attrs = (
+            'output_padding',
+            output_padding,
+            'output_size',
+            output_size,
+            'paddings',
+            padding,
+            "padding_algorithm",
+            padding_algorithm,
+            'strides',
+            stride,
+            'dilations',
+            dilation,
+            'groups',
+            groups,
+            'use_cudnn',
+            use_cudnn,
+            "data_format",
+            data_format_,
+        )
         pre_bias = getattr(_legacy_C_ops, op_type)(x, weight, *attrs)
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
@@ -1614,19 +1932,19 @@ def conv3d_transpose(x,
             'dilations': dilation,
             'groups': groups,
             'use_cudnn': use_cudnn,
-            "data_format": data_format_
+            "data_format": data_format_,
         }
         helper = LayerHelper(op_type, **locals())
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 'conv3d')
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], 'conv3d'
+        )
 
         pre_bias = helper.create_variable_for_type_inference(x.dtype)
         outputs = {"Output": [pre_bias]}
 
-        helper.append_op(type=op_type,
-                         inputs=inputs,
-                         outputs=outputs,
-                         attrs=attrs)
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
+        )
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=channel_dim)
         else:
diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py
index 602175c98f5..631516b7032 100644
--- a/python/paddle/nn/functional/extension.py
+++ b/python/paddle/nn/functional/extension.py
@@ -23,7 +23,11 @@ from ...fluid import dygraph_utils
 from ...tensor.layer_function_generator import templatedoc
 from paddle import in_dynamic_mode
 from paddle import _C_ops, _legacy_C_ops
-from ...fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode
+from ...fluid.framework import (
+    _non_static_mode,
+    _in_legacy_dygraph,
+    in_dygraph_mode,
+)
 from ...fluid.data_feeder import check_variable_and_dtype, check_type
 from ...framework import core
 from ...common_ops_import import convert_np_dtype_to_dtype_
@@ -33,8 +37,8 @@ __all__ = []
 
 def diag_embed(input, offset=0, dim1=-2, dim2=-1):
     """
-    This OP creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2) 
-    are filled by ``input``. By default, a 2D plane formed by the last two dimensions 
+    This OP creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2)
+    are filled by ``input``. By default, a 2D plane formed by the last two dimensions
     of the returned tensor will be selected.
 
     The argument ``offset`` determines which diagonal is generated:
@@ -48,16 +52,16 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1):
         offset(int, optional): Which diagonal to consider. Default: 0 (main diagonal).
         dim1(int, optional): The first dimension with respect to which to take diagonal. Default: -2.
         dim2(int, optional): The second dimension with respect to which to take diagonal. Default: -1.
-    
+
     Returns:
         Tensor, the output data type is the same as input data type.
-    
+
     Examples:
         .. code-block:: python
 
             import paddle.nn.functional as F
             import numpy as np
-            
+
             diag_embed = np.random.randn(2, 3).astype('float32')
             # [[ 0.7545889 , -0.25074545,  0.5929117 ],
             #  [-0.6097662 , -0.01753256,  0.619769  ]]
@@ -104,50 +108,55 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1):
     if in_dygraph_mode():
         return _C_ops.diag_embed(input, offset, dim1, dim2)
     elif in_dynamic_mode():
-        return _legacy_C_ops.diag_embed(input, "offset", offset, "dim1", dim1,
-                                        "dim2", dim2)
+        return _legacy_C_ops.diag_embed(
+            input, "offset", offset, "dim1", dim1, "dim2", dim2
+        )
 
     inputs = {'Input': [input]}
     attrs = {'offset': offset, 'dim1': dim1, 'dim2': dim2}
 
     def __check_input(input, offset, dim1, dim2):
-        check_dtype(input.dtype, 'Input',
-                    ['int32', 'int64', 'float16', 'float32', 'float64'],
-                    'diag_embed')
+        check_dtype(
+            input.dtype,
+            'Input',
+            ['int32', 'int64', 'float16', 'float32', 'float64'],
+            'diag_embed',
+        )
 
         input_shape = list(input.shape)
-        assert len(input_shape) >= 1,                     \
-                "Input must be at least 1-dimensional, "   \
-                "But received Input's dimensional: %s.\n" %  \
-                len(input_shape)
+        assert len(input_shape) >= 1, (
+            "Input must be at least 1-dimensional, "
+            "But received Input's dimensional: %s.\n" % len(input_shape)
+        )
 
-        assert np.abs(dim1) <= len(input_shape),    \
-            "Dim1 is out of range (expected to be in range of [%d, %d], but got %d).\n"  \
+        assert np.abs(dim1) <= len(input_shape), (
+            "Dim1 is out of range (expected to be in range of [%d, %d], but got %d).\n"
             % (-(len(input_shape) + 1), len(input_shape), dim1)
+        )
 
-        assert np.abs(dim2) <= len(input_shape),      \
-            "Dim2 is out of range (expected to be in range of [%d, %d], but got %d).\n"  \
+        assert np.abs(dim2) <= len(input_shape), (
+            "Dim2 is out of range (expected to be in range of [%d, %d], but got %d).\n"
             % (-(len(input_shape) + 1), len(input_shape), dim2)
+        )
 
         dim1_ = dim1 if dim1 >= 0 else len(input_shape) + dim1 + 1
         dim2_ = dim2 if dim2 >= 0 else len(input_shape) + dim2 + 1
-        assert dim1_ != dim2_,       \
-               "dim1 and dim2 cannot be the same dimension." \
-                "But received dim1 = %d, dim2 = %d\n"%(dim1, dim2)
+        assert dim1_ != dim2_, (
+            "dim1 and dim2 cannot be the same dimension."
+            "But received dim1 = %d, dim2 = %d\n" % (dim1, dim2)
+        )
 
     __check_input(input, offset, dim1, dim2)
     helper = LayerHelper("diag_embed", **locals())
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
 
-    helper.append_op(type='diag_embed',
-                     inputs={'Input': [input]},
-                     attrs={
-                         'offset': offset,
-                         'dim1': dim1,
-                         'dim2': dim2
-                     },
-                     outputs={'Out': [out]})
+    helper.append_op(
+        type='diag_embed',
+        inputs={'Input': [input]},
+        attrs={'offset': offset, 'dim1': dim1, 'dim2': dim2},
+        outputs={'Out': [out]},
+    )
     out.stop_gradient = True
     return out
 
@@ -235,10 +244,9 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
         else:
             attrs['maxlen'] = maxlen
 
-    helper.append_op(type='sequence_mask',
-                     inputs=inputs,
-                     outputs={'Y': out},
-                     attrs=attrs)
+    helper.append_op(
+        type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs
+    )
 
     out.stop_gradient = True
     return out
@@ -319,18 +327,19 @@ def gather_tree(ids, parents):
             return _legacy_C_ops.gather_tree(ids, parents)
         else:
             helper = LayerHelper('gather_tree', **locals())
-            check_variable_and_dtype(ids, 'ids', ['int32', 'int64'],
-                                     'gather_tree')
-            check_variable_and_dtype(parents, 'parents', ['int32', 'int64'],
-                                     'gather_tree')
+            check_variable_and_dtype(
+                ids, 'ids', ['int32', 'int64'], 'gather_tree'
+            )
+            check_variable_and_dtype(
+                parents, 'parents', ['int32', 'int64'], 'gather_tree'
+            )
             out = helper.create_variable_for_type_inference(dtype=ids.dtype)
 
-            helper.append_op(type="gather_tree",
-                             inputs={
-                                 "Ids": ids,
-                                 "Parents": parents
-                             },
-                             outputs={"Out": out})
+            helper.append_op(
+                type="gather_tree",
+                inputs={"Ids": ids, "Parents": parents},
+                outputs={"Out": out},
+            )
 
             return out
 
@@ -370,14 +379,22 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"):
             out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
     """
     if data_format not in ["NCHW", "NHWC"]:
-        raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. "
-                         "Received Attr(data_format): {}.".format(data_format))
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW' or 'NHWC'. "
+            "Received Attr(data_format): {}.".format(data_format)
+        )
     if in_dygraph_mode():
         return _C_ops.temporal_shift(x, seg_num, shift_ratio, data_format)
     if _non_static_mode():
-        return _legacy_C_ops.temporal_shift(x, 'seg_num', seg_num,
-                                            'shift_ratio', shift_ratio,
-                                            'data_format', data_format)
+        return _legacy_C_ops.temporal_shift(
+            x,
+            'seg_num',
+            seg_num,
+            'shift_ratio',
+            shift_ratio,
+            'data_format',
+            data_format,
+        )
 
     helper = LayerHelper("temporal_shift", **locals())
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift')
@@ -389,12 +406,14 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"):
     if not isinstance(seg_num, int):
         raise TypeError("seg_num must be int type.")
 
-    helper.append_op(type="temporal_shift",
-                     inputs={"X": x},
-                     outputs={"Out": out},
-                     attrs={
-                         "seg_num": seg_num,
-                         "shift_ratio": shift_ratio,
-                         "data_format": data_format
-                     })
+    helper.append_op(
+        type="temporal_shift",
+        inputs={"X": x},
+        outputs={"Out": out},
+        attrs={
+            "seg_num": seg_num,
+            "shift_ratio": shift_ratio,
+            "data_format": data_format,
+        },
+    )
     return out
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index fb9c4a56ab4..8658f999116 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -29,7 +29,12 @@ from paddle.utils import deprecated
 from paddle import _C_ops, _legacy_C_ops
 from paddle import in_dynamic_mode
 from paddle.framework import core, _non_static_mode
-from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode, _current_expected_place
+from ...fluid.framework import (
+    _in_legacy_dygraph,
+    in_dygraph_mode,
+    _non_static_mode,
+    _current_expected_place,
+)
 
 __all__ = []
 
@@ -78,25 +83,32 @@ def dice_loss(input, label, epsilon=0.00001, name=None):
     """
     assert input.dtype in (paddle.float32, paddle.float64)
     assert label.dtype in (paddle.int32, paddle.int64)
-    assert len(input.shape) >= 2, \
-        "The rank of input should be greater than or equal to 2."
-    assert len(input.shape) == len(
-        label.shape), ("The rank of input and label should be equal, "
-                       "but received input: %d, label: %d." %
-                       (len(input.shape), len(label.shape)))
-    assert label.shape[-1] == 1, ("The last dimension of label should be 1, "
-                                  "but received %d." % label.shape[-1])
-    assert input.shape[:-1] == label.shape[:-1], (
-        "All dimensions should be equal except the last one.")
-    assert input.numel() > 0 and label.numel() > 0, \
-        "Any dimension of input and label cannot be equal to 0."
+    assert (
+        len(input.shape) >= 2
+    ), "The rank of input should be greater than or equal to 2."
+    assert len(input.shape) == len(label.shape), (
+        "The rank of input and label should be equal, "
+        "but received input: %d, label: %d."
+        % (len(input.shape), len(label.shape))
+    )
+    assert label.shape[-1] == 1, (
+        "The last dimension of label should be 1, "
+        "but received %d." % label.shape[-1]
+    )
+    assert (
+        input.shape[:-1] == label.shape[:-1]
+    ), "All dimensions should be equal except the last one."
+    assert (
+        input.numel() > 0 and label.numel() > 0
+    ), "Any dimension of input and label cannot be equal to 0."
 
     label = paddle.squeeze(label, [-1])
     label = paddle.nn.functional.one_hot(label, input.shape[-1])
     reduce_dim = list(range(1, len(input.shape)))
     inse = paddle.sum(input * label, axis=reduce_dim)
     dice_denominator = paddle.sum(input, axis=reduce_dim) + paddle.sum(
-        label, axis=reduce_dim)
+        label, axis=reduce_dim
+    )
     dice_score = 1 - inse * 2 / (dice_denominator + epsilon)
     return paddle.mean(dice_score)
 
@@ -147,23 +159,24 @@ def log_loss(input, label, epsilon=1e-4, name=None):
 
     loss = helper.create_variable_for_type_inference(dtype=input.dtype)
 
-    helper.append_op(type='log_loss',
-                     inputs={
-                         'Predicted': [input],
-                         'Labels': [label]
-                     },
-                     outputs={'Loss': [loss]},
-                     attrs={'epsilon': epsilon})
+    helper.append_op(
+        type='log_loss',
+        inputs={'Predicted': [input], 'Labels': [label]},
+        outputs={'Loss': [loss]},
+        attrs={'epsilon': epsilon},
+    )
     return loss
 
 
-def fluid_softmax_with_cross_entropy(logits,
-                                     label,
-                                     soft_label=False,
-                                     ignore_index=-100,
-                                     numeric_stable_mode=True,
-                                     return_softmax=False,
-                                     axis=-1):
+def fluid_softmax_with_cross_entropy(
+    logits,
+    label,
+    soft_label=False,
+    ignore_index=-100,
+    numeric_stable_mode=True,
+    return_softmax=False,
+    axis=-1,
+):
     r"""
 
     This operator implements the cross entropy loss function with softmax. This function 
@@ -253,19 +266,41 @@ def fluid_softmax_with_cross_entropy(logits,
     if _non_static_mode():
         if core.is_compiled_with_npu():
             softmax, backprop, loss = _legacy_C_ops.softmax_with_cross_entropy(
-                logits, label, 'soft_label', soft_label, 'ignore_index',
-                ignore_index, 'numeric_stable_mode', numeric_stable_mode,
-                'axis', axis)
+                logits,
+                label,
+                'soft_label',
+                soft_label,
+                'ignore_index',
+                ignore_index,
+                'numeric_stable_mode',
+                numeric_stable_mode,
+                'axis',
+                axis,
+            )
         else:
             if in_dygraph_mode():
                 softmax, loss = _C_ops.cross_entropy_with_softmax(
-                    logits, label, soft_label, True, numeric_stable_mode,
-                    ignore_index, axis)
+                    logits,
+                    label,
+                    soft_label,
+                    True,
+                    numeric_stable_mode,
+                    ignore_index,
+                    axis,
+                )
             if _in_legacy_dygraph():
                 softmax, loss = _legacy_C_ops.softmax_with_cross_entropy(
-                    logits, label, 'soft_label', soft_label, 'ignore_index',
-                    ignore_index, 'numeric_stable_mode', numeric_stable_mode,
-                    'axis', axis)
+                    logits,
+                    label,
+                    'soft_label',
+                    soft_label,
+                    'ignore_index',
+                    ignore_index,
+                    'numeric_stable_mode',
+                    numeric_stable_mode,
+                    'axis',
+                    axis,
+                )
         if not return_softmax:
             return loss
         else:
@@ -275,7 +310,7 @@ def fluid_softmax_with_cross_entropy(logits,
         'soft_label': soft_label,
         'ignore_index': ignore_index,
         'numeric_stable_mode': numeric_stable_mode,
-        'axis': axis
+        'axis': axis,
     }
     helper = LayerHelper('softmax_with_cross_entropy', **locals())
     softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
@@ -285,13 +320,12 @@ def fluid_softmax_with_cross_entropy(logits,
     if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
         backprop = helper.create_variable_for_type_inference(dtype=logits.dtype)
         outputs['Backprop'] = backprop
-    helper.append_op(type='softmax_with_cross_entropy',
-                     inputs={
-                         'Logits': logits,
-                         'Label': label
-                     },
-                     outputs=outputs,
-                     attrs=attrs)
+    helper.append_op(
+        type='softmax_with_cross_entropy',
+        inputs={'Logits': logits, 'Label': label},
+        outputs=outputs,
+        attrs=attrs,
+    )
 
     if return_softmax:
         return loss, softmax
@@ -300,71 +334,74 @@ def fluid_softmax_with_cross_entropy(logits,
 
 
 def npair_loss(anchor, positive, labels, l2_reg=0.002):
-    """ 
-  
+    """
+
     Npair loss requires paired data. Npair loss has two parts: the first part is L2
     regularizer on the embedding vector; the second part is cross entropy loss which
     takes the similarity matrix of anchor and positive as logits.
-  
+
     For more information, please refer to:
     `Improved Deep Metric Learning with Multi class N pair Loss Objective <http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf>`_
-  
+
     Args:
-      anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], 
+      anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims],
                         the data type is float32 or float64.
-      positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], 
+      positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims],
                         the data type is float32 or float64.
       labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64.
       l2_reg(float32): L2 regularization term on embedding vector, default: 0.002.
 
-  
+
     Returns:
       A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1].
-  
+
     Examples:
 
       .. code-block:: python
-  
+
           import paddle
-          
+
           DATATYPE = "float32"
-  
+
           anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE)
           positive = paddle.rand(shape=(18, 6), dtype=DATATYPE)
           labels = paddle.rand(shape=(18,), dtype=DATATYPE)
-          
+
           npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002)
           print(npair_loss)
-  
+
     """
-    check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'],
-                             'npair_loss')
-    check_variable_and_dtype(positive, 'positive', ['float32', 'float64'],
-                             'positive')
-    check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'],
-                             'labels')
+    check_variable_and_dtype(
+        anchor, 'anchor', ['float32', 'float64'], 'npair_loss'
+    )
+    check_variable_and_dtype(
+        positive, 'positive', ['float32', 'float64'], 'positive'
+    )
+    check_variable_and_dtype(
+        labels, 'labels', ['float32', 'float64', 'int64'], 'labels'
+    )
     Beta = 0.25
     batch_size = labels.shape[0]
 
     labels = paddle.reshape(labels, shape=[batch_size, 1])
     labels = paddle.tile(labels, repeat_times=[1, batch_size])
 
-    labels = paddle.equal(labels, paddle.transpose(labels,
-                                                   perm=[1,
-                                                         0])).astype('float32')
+    labels = paddle.equal(labels, paddle.transpose(labels, perm=[1, 0])).astype(
+        'float32'
+    )
     labels = labels / paddle.sum(labels, axis=1, keepdim=True)
 
-    l2loss = paddle.mean(paddle.sum(paddle.square(anchor), 1)) \
-             + paddle.mean(paddle.sum(paddle.square(positive), 1))
+    l2loss = paddle.mean(paddle.sum(paddle.square(anchor), 1)) + paddle.mean(
+        paddle.sum(paddle.square(positive), 1)
+    )
     l2loss = l2loss * Beta * l2_reg
 
-    similarity_matrix = paddle.matmul(anchor,
-                                      positive,
-                                      transpose_x=False,
-                                      transpose_y=True)
-    softmax_ce = fluid_softmax_with_cross_entropy(logits=similarity_matrix,
-                                                  label=labels,
-                                                  soft_label=True)
+    similarity_matrix = paddle.matmul(
+        anchor, positive, transpose_x=False, transpose_y=True
+    )
+    softmax_ce = fluid_softmax_with_cross_entropy(
+        logits=similarity_matrix, label=labels, soft_label=True
+    )
     cross_entropy = paddle.sum(labels * softmax_ce, 0)
     celoss = paddle.mean(cross_entropy)
 
@@ -412,32 +449,35 @@ def square_error_cost(input, label):
         square_out = _legacy_C_ops.square(minus_out)
         return square_out
 
-    check_variable_and_dtype(input, "input", ['float32', 'float64'],
-                             'square_error_cost')
-    check_variable_and_dtype(label, "label", ['float32', 'float64'],
-                             'square_error_cost')
+    check_variable_and_dtype(
+        input, "input", ['float32', 'float64'], 'square_error_cost'
+    )
+    check_variable_and_dtype(
+        label, "label", ['float32', 'float64'], 'square_error_cost'
+    )
     helper = LayerHelper('square_error_cost', **locals())
     minus_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(type='elementwise_sub',
-                     inputs={
-                         'X': [input],
-                         'Y': [label]
-                     },
-                     outputs={'Out': [minus_out]})
+    helper.append_op(
+        type='elementwise_sub',
+        inputs={'X': [input], 'Y': [label]},
+        outputs={'Out': [minus_out]},
+    )
 
     square_out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(type='square',
-                     inputs={'X': [minus_out]},
-                     outputs={'Out': [square_out]})
+    helper.append_op(
+        type='square', inputs={'X': [minus_out]}, outputs={'Out': [square_out]}
+    )
     return square_out
 
 
-def edit_distance(input,
-                  label,
-                  normalized=True,
-                  ignored_tokens=None,
-                  input_length=None,
-                  label_length=None):
+def edit_distance(
+    input,
+    label,
+    normalized=True,
+    ignored_tokens=None,
+    input_length=None,
+    label_length=None,
+):
     """
     This op computes the edit distances, also called Levenshtein distance, between a batch of
     hypothesis strings and their references. It measures how dissimilar two strings are by counting
@@ -472,7 +512,7 @@ def edit_distance(input,
         NOTE: This Api is different from fluid.metrics.EditDistance
 
     Returns:
-	Tuple:
+        Tuple:
 
         distance(Tensor): edit distance result, its data type is float32, and its shape is (batch_size, 1).
         sequence_num(Tensor): sequence number, its data type is float32, and its shape is (1,).
@@ -514,21 +554,26 @@ def edit_distance(input,
         erased_input = helper.create_variable_for_type_inference(dtype="int64")
         erased_label = helper.create_variable_for_type_inference(dtype="int64")
 
-        helper.append_op(type="sequence_erase",
-                         inputs={"X": [input]},
-                         outputs={"Out": [erased_input]},
-                         attrs={"tokens": ignored_tokens})
+        helper.append_op(
+            type="sequence_erase",
+            inputs={"X": [input]},
+            outputs={"Out": [erased_input]},
+            attrs={"tokens": ignored_tokens},
+        )
         input = erased_input
 
-        helper.append_op(type="sequence_erase",
-                         inputs={"X": [label]},
-                         outputs={"Out": [erased_label]},
-                         attrs={"tokens": ignored_tokens})
+        helper.append_op(
+            type="sequence_erase",
+            inputs={"X": [label]},
+            outputs={"Out": [erased_label]},
+            attrs={"tokens": ignored_tokens},
+        )
         label = erased_label
 
     if in_dygraph_mode():
-        return _C_ops.edit_distance(input, label, input_length, label_length,
-                                    normalized)
+        return _C_ops.edit_distance(
+            input, label, input_length, label_length, normalized
+        )
 
     this_inputs = {"Hyps": [input], "Refs": [label]}
     if input_length is not None and label_length is not None:
@@ -538,22 +583,19 @@ def edit_distance(input,
     # edit distance op
     edit_distance_out = helper.create_variable_for_type_inference(dtype="int64")
     sequence_num = helper.create_variable_for_type_inference(dtype="int64")
-    helper.append_op(type="edit_distance",
-                     inputs=this_inputs,
-                     outputs={
-                         "Out": [edit_distance_out],
-                         "SequenceNum": [sequence_num]
-                     },
-                     attrs={"normalized": normalized})
+    helper.append_op(
+        type="edit_distance",
+        inputs=this_inputs,
+        outputs={"Out": [edit_distance_out], "SequenceNum": [sequence_num]},
+        attrs={"normalized": normalized},
+    )
 
     return edit_distance_out, sequence_num
 
 
-def binary_cross_entropy(input,
-                         label,
-                         weight=None,
-                         reduction='mean',
-                         name=None):
+def binary_cross_entropy(
+    input, label, weight=None, reduction='mean', name=None
+):
     """
     This op measures the binary_cross_entropy loss between input predictions ``input``
     and target labels ``label`` . The binary_cross_entropy loss can be described as:
@@ -621,8 +663,9 @@ def binary_cross_entropy(input,
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "The value of 'reduction' in binary_cross_entropy should be 'sum', "
-            "'mean' or 'none', but received %s, which is not allowed." %
-            reduction)
+            "'mean' or 'none', but received %s, which is not allowed."
+            % reduction
+        )
 
     if in_dygraph_mode():
         out = _C_ops.bce_loss(input, label)
@@ -642,27 +685,32 @@ def binary_cross_entropy(input,
             if weight is not None:
                 out = _legacy_C_ops.elementwise_mul(out, weight, 'axis', -1)
             if reduction == 'sum':
-                return _legacy_C_ops.reduce_sum(out, 'dim', [0], 'keep_dim',
-                                                False, "reduce_all", True)
+                return _legacy_C_ops.reduce_sum(
+                    out, 'dim', [0], 'keep_dim', False, "reduce_all", True
+                )
             elif reduction == 'mean':
                 return _legacy_C_ops.mean(out)
             else:
                 return out
         else:
-            check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                     'binary_cross_entropy')
-            check_variable_and_dtype(label, 'label', ['float32', 'float64'],
-                                     'binary_cross_entropy')
+            check_variable_and_dtype(
+                input, 'input', ['float32', 'float64'], 'binary_cross_entropy'
+            )
+            check_variable_and_dtype(
+                label, 'label', ['float32', 'float64'], 'binary_cross_entropy'
+            )
 
             sub_name = name if weight is None and reduction == 'none' else None
             helper = LayerHelper("binary_cross_entropy", name=sub_name)
             out = helper.create_variable_for_type_inference(dtype=input.dtype)
-            helper.append_op(type='bce_loss',
-                             inputs={
-                                 'X': [input],
-                                 'Label': [label],
-                             },
-                             outputs={'Out': [out]})
+            helper.append_op(
+                type='bce_loss',
+                inputs={
+                    'X': [input],
+                    'Label': [label],
+                },
+                outputs={'Out': [out]},
+            )
 
             if weight is not None:
                 if isinstance(weight, paddle.static.Variable):
@@ -670,7 +718,8 @@ def binary_cross_entropy(input,
                     out = paddle.multiply(out, weight, name=weight_name)
                 else:
                     raise ValueError(
-                        "The weight is not a Tensor, please convert to Tensor.")
+                        "The weight is not a Tensor, please convert to Tensor."
+                    )
 
             if reduction == 'sum':
                 return paddle.sum(out, name=name)
@@ -680,12 +729,9 @@ def binary_cross_entropy(input,
                 return out
 
 
-def binary_cross_entropy_with_logits(logit,
-                                     label,
-                                     weight=None,
-                                     reduction='mean',
-                                     pos_weight=None,
-                                     name=None):
+def binary_cross_entropy_with_logits(
+    logit, label, weight=None, reduction='mean', pos_weight=None, name=None
+):
     r"""
     This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
     Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
@@ -767,16 +813,23 @@ def binary_cross_entropy_with_logits(logit,
         raise ValueError(
             "The value of 'reduction' in binary_cross_entropy_with_logits "
             "should be 'sum', 'mean' or 'none', but received %s, which is not allowed."
-            % reduction)
+            % reduction
+        )
 
     if in_dygraph_mode():
-        one = _C_ops.full([1], float(1.0), core.VarDesc.VarType.FP32,
-                          _current_expected_place())
-        out = _C_ops.sigmoid_cross_entropy_with_logits(logit, label, False,
-                                                       -100)
+        one = _C_ops.full(
+            [1],
+            float(1.0),
+            core.VarDesc.VarType.FP32,
+            _current_expected_place(),
+        )
+        out = _C_ops.sigmoid_cross_entropy_with_logits(
+            logit, label, False, -100
+        )
         if pos_weight is not None:
             log_weight = _C_ops.add(
-                _C_ops.multiply(label, _C_ops.subtract(pos_weight, one)), one)
+                _C_ops.multiply(label, _C_ops.subtract(pos_weight, one)), one
+            )
             out = _C_ops.multiply(out, log_weight)
         if weight is not None:
             out = _C_ops.multiply(out, weight)
@@ -789,14 +842,27 @@ def binary_cross_entropy_with_logits(logit,
             return out
     elif _in_legacy_dygraph():
         one = _varbase_creator(dtype=logit.dtype)
-        _legacy_C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu',
-                                    False, 'dtype', one.dtype, 'str_value',
-                                    '1.0', 'shape', [1])
+        _legacy_C_ops.fill_constant(
+            one,
+            'value',
+            float(1.0),
+            'force_cpu',
+            False,
+            'dtype',
+            one.dtype,
+            'str_value',
+            '1.0',
+            'shape',
+            [1],
+        )
         out = _legacy_C_ops.sigmoid_cross_entropy_with_logits(logit, label)
         if pos_weight is not None:
             log_weight = _legacy_C_ops.elementwise_add(
                 _legacy_C_ops.elementwise_mul(
-                    label, _legacy_C_ops.elementwise_sub(pos_weight, one)), one)
+                    label, _legacy_C_ops.elementwise_sub(pos_weight, one)
+                ),
+                one,
+            )
             out = _legacy_C_ops.elementwise_mul(out, log_weight)
         if weight is not None:
             out = _legacy_C_ops.elementwise_mul(out, weight)
@@ -808,30 +874,49 @@ def binary_cross_entropy_with_logits(logit,
         else:
             return out
 
-    check_variable_and_dtype(logit, 'logit', ['float32', 'float64'],
-                             'binary_cross_entropy_with_logits')
-    check_variable_and_dtype(label, 'label', ['float32', 'float64'],
-                             'binary_cross_entropy_with_logits')
+    check_variable_and_dtype(
+        logit,
+        'logit',
+        ['float32', 'float64'],
+        'binary_cross_entropy_with_logits',
+    )
+    check_variable_and_dtype(
+        label,
+        'label',
+        ['float32', 'float64'],
+        'binary_cross_entropy_with_logits',
+    )
     sigmoid_name = None
     if reduction == 'none' and pos_weight is None and weight is None:
         sigmoid_name = name
 
     out = paddle.fluid.layers.sigmoid_cross_entropy_with_logits(
-        logit, label, name=sigmoid_name)
+        logit, label, name=sigmoid_name
+    )
 
     one = paddle.full(shape=[1], fill_value=1.0, dtype=logit.dtype)
     if pos_weight is not None:
-        check_variable_and_dtype(pos_weight, 'pos_weight',
-                                 ['float32', 'float64'],
-                                 'binary_cross_entropy_with_logits')
+        check_variable_and_dtype(
+            pos_weight,
+            'pos_weight',
+            ['float32', 'float64'],
+            'binary_cross_entropy_with_logits',
+        )
         log_weight = paddle.add(
-            paddle.multiply(label, paddle.subtract(pos_weight, one)), one)
-        pos_weight_name = name if reduction == 'none' and weight is None else None
+            paddle.multiply(label, paddle.subtract(pos_weight, one)), one
+        )
+        pos_weight_name = (
+            name if reduction == 'none' and weight is None else None
+        )
         out = paddle.multiply(out, log_weight, name=pos_weight_name)
 
     if weight is not None:
-        check_variable_and_dtype(weight, 'weight', ['float32', 'float64'],
-                                 'binary_cross_entropy_with_logits')
+        check_variable_and_dtype(
+            weight,
+            'weight',
+            ['float32', 'float64'],
+            'binary_cross_entropy_with_logits',
+        )
         weight_name = name if reduction == 'none' else None
         out = paddle.multiply(out, weight, name=weight_name)
 
@@ -842,15 +927,17 @@ def binary_cross_entropy_with_logits(logit,
     return out
 
 
-def hsigmoid_loss(input,
-                  label,
-                  num_classes,
-                  weight,
-                  bias=None,
-                  path_table=None,
-                  path_code=None,
-                  is_sparse=False,
-                  name=None):
+def hsigmoid_loss(
+    input,
+    label,
+    num_classes,
+    weight,
+    bias=None,
+    path_table=None,
+    path_code=None,
+    is_sparse=False,
+    name=None,
+):
     """
     The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
     and speed up the model training, especially the training of language model.
@@ -929,36 +1016,63 @@ def hsigmoid_loss(input,
             #  [1.92374969]]
     """
     if in_dygraph_mode():
-        out, _, _ = _C_ops.hierarchical_sigmoid(input, weight, label,
-                                                path_table, path_code, bias,
-                                                num_classes, is_sparse, 0, [],
-                                                [], [], is_sparse)
+        out, _, _ = _C_ops.hierarchical_sigmoid(
+            input,
+            weight,
+            label,
+            path_table,
+            path_code,
+            bias,
+            num_classes,
+            is_sparse,
+            0,
+            [],
+            [],
+            [],
+            is_sparse,
+        )
         return out
     elif _in_legacy_dygraph():
         out, _, _ = _legacy_C_ops.hierarchical_sigmoid(
-            input, weight, label, path_table, path_code, bias, 'num_classes',
-            num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse)
+            input,
+            weight,
+            label,
+            path_table,
+            path_code,
+            bias,
+            'num_classes',
+            num_classes,
+            'is_sparse',
+            is_sparse,
+            'remote_prefetch',
+            is_sparse,
+        )
         return out
 
-    check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                             'hsigmoid_loss')
+    check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'hsigmoid_loss'
+    )
     check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid_loss')
-    check_variable_and_dtype(weight, 'weight', ['float32', 'float64'],
-                             'hsigmoid_loss')
+    check_variable_and_dtype(
+        weight, 'weight', ['float32', 'float64'], 'hsigmoid_loss'
+    )
     if bias is not None:
-        check_variable_and_dtype(bias, 'bias', ['float32', 'float64'],
-                                 'hsigmoid_loss')
+        check_variable_and_dtype(
+            bias, 'bias', ['float32', 'float64'], 'hsigmoid_loss'
+        )
     if path_table is not None:
-        check_variable_and_dtype(path_table, 'path_table', ['int64'],
-                                 'hsigmoid_loss')
+        check_variable_and_dtype(
+            path_table, 'path_table', ['int64'], 'hsigmoid_loss'
+        )
     if path_code is not None:
-        check_variable_and_dtype(path_code, 'path_code', ['int64'],
-                                 'hsigmoid_loss')
+        check_variable_and_dtype(
+            path_code, 'path_code', ['int64'], 'hsigmoid_loss'
+        )
 
     attrs = {
         "num_classes": num_classes,
         "is_sparse": is_sparse,
-        "remote_prefetch": is_sparse
+        "remote_prefetch": is_sparse,
     }
 
     inputs = {
@@ -967,7 +1081,7 @@ def hsigmoid_loss(input,
         "Bias": bias,
         "PathTable": path_table,
         "PathCode": path_code,
-        "Label": label
+        "Label": label,
     }
 
     helper = LayerHelper('hsigmoid_loss', **locals())
@@ -975,10 +1089,9 @@ def hsigmoid_loss(input,
     pre_out = helper.create_variable_for_type_inference(input.dtype)
     outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight}
 
-    helper.append_op(type="hierarchical_sigmoid",
-                     inputs=inputs,
-                     outputs=outputs,
-                     attrs=attrs)
+    helper.append_op(
+        type="hierarchical_sigmoid", inputs=inputs, outputs=outputs, attrs=attrs
+    )
     return out
 
 
@@ -1038,34 +1151,35 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None):
             output = paddle.nn.functional.smooth_l1_loss(input, label)
             print(output)
     """
-    check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                             'smooth_l1_loss')
-    check_variable_and_dtype(label, 'label', ['float32', 'float64'],
-                             'smooth_l1_loss')
+    check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'smooth_l1_loss'
+    )
+    check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'smooth_l1_loss'
+    )
 
     if in_dygraph_mode():
         out, residual = _C_ops.huber_loss(input, label, delta)
     else:
         helper = LayerHelper('huber_loss', **locals())
         residual = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
+            dtype=helper.input_dtype()
+        )
         out = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
-        helper.append_op(type='huber_loss',
-                         inputs={
-                             'X': input,
-                             'Y': label
-                         },
-                         outputs={
-                             'Out': out,
-                             'Residual': residual
-                         },
-                         attrs={'delta': delta})
+            dtype=helper.input_dtype()
+        )
+        helper.append_op(
+            type='huber_loss',
+            inputs={'X': input, 'Y': label},
+            outputs={'Out': out, 'Residual': residual},
+            attrs={'delta': delta},
+        )
 
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "The value of 'reduction' in smooth_l1_loss should be 'sum', 'mean' or"
-            " 'none', but received %s, which is not allowed." % reduction)
+            " 'none', but received %s, which is not allowed." % reduction
+        )
     if reduction == 'none':
         return out
     elif reduction == 'mean':
@@ -1074,12 +1188,9 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None):
         return paddle.sum(out)
 
 
-def margin_ranking_loss(input,
-                        other,
-                        label,
-                        margin=0.0,
-                        reduction='mean',
-                        name=None):
+def margin_ranking_loss(
+    input, other, label, margin=0.0, reduction='mean', name=None
+):
     r"""
 
     Calcluate the margin rank loss between the input, other and label, use the math function as follows.
@@ -1107,7 +1218,7 @@ def margin_ranking_loss(input,
         reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
-    Returns: 
+    Returns:
         Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor.
 
     Examples:
@@ -1125,7 +1236,8 @@ def margin_ranking_loss(input,
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but "
-            "received %s, which is not allowed." % reduction)
+            "received %s, which is not allowed." % reduction
+        )
     if in_dygraph_mode():
         out = _C_ops.subtract(other, input)
         out = _C_ops.multiply(out, label)
@@ -1152,12 +1264,15 @@ def margin_ranking_loss(input,
         return out
 
     helper = LayerHelper("margin_ranking_loss", **locals())
-    check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                             'margin_rank_loss')
-    check_variable_and_dtype(other, 'other', ['float32', 'float64'],
-                             'margin_rank_loss')
-    check_variable_and_dtype(label, 'label', ['float32', 'float64'],
-                             'margin_rank_loss')
+    check_variable_and_dtype(
+        input, 'input', ['float32', 'float64'], 'margin_rank_loss'
+    )
+    check_variable_and_dtype(
+        other, 'other', ['float32', 'float64'], 'margin_rank_loss'
+    )
+    check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'margin_rank_loss'
+    )
 
     out = paddle.subtract(other, input)
     out = paddle.multiply(out, label)
@@ -1170,24 +1285,28 @@ def margin_ranking_loss(input,
     result_out = helper.create_variable_for_type_inference(input.dtype)
 
     if reduction == 'none':
-        helper.append_op(type="relu",
-                         inputs={"X": out},
-                         outputs={"Out": result_out})
+        helper.append_op(
+            type="relu", inputs={"X": out}, outputs={"Out": result_out}
+        )
         return result_out
     elif reduction == 'sum':
         out = paddle.nn.functional.relu(out)
         attrs = {"dim": [0], "keep_dim": False, "reduce_all": True}
-        helper.append_op(type="reduce_sum",
-                         inputs={"X": out},
-                         outputs={"Out": result_out},
-                         attrs=attrs)
+        helper.append_op(
+            type="reduce_sum",
+            inputs={"X": out},
+            outputs={"Out": result_out},
+            attrs=attrs,
+        )
         return result_out
     elif reduction == 'mean':
         out = paddle.nn.functional.relu(out)
-        helper.append_op(type="mean",
-                         inputs={"X": out},
-                         outputs={"Out": result_out},
-                         attrs={})
+        helper.append_op(
+            type="mean",
+            inputs={"X": out},
+            outputs={"Out": result_out},
+            attrs={},
+        )
         return result_out
 
 
@@ -1251,7 +1370,8 @@ def l1_loss(input, label, reduction='mean', name=None):
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
-            "received %s, which is not allowed." % reduction)
+            "received %s, which is not allowed." % reduction
+        )
 
     if in_dygraph_mode():
         unreduced = _C_ops.abs(_C_ops.subtract(input, label))
@@ -1263,25 +1383,24 @@ def l1_loss(input, label, reduction='mean', name=None):
         else:
             return unreduced
     elif _in_legacy_dygraph():
-        unreduced = _elementwise_op_in_dygraph(input,
-                                               label,
-                                               axis=-1,
-                                               act='abs',
-                                               op_name='elementwise_sub')
+        unreduced = _elementwise_op_in_dygraph(
+            input, label, axis=-1, act='abs', op_name='elementwise_sub'
+        )
         if reduction == 'mean':
             return _legacy_C_ops.mean(unreduced)
         elif reduction == 'sum':
-            return _legacy_C_ops.reduce_sum(unreduced, 'dim', [0], 'keep_dim',
-                                            False, 'reduce_all', True)
+            return _legacy_C_ops.reduce_sum(
+                unreduced, 'dim', [0], 'keep_dim', False, 'reduce_all', True
+            )
         else:
             return unreduced
 
-    check_variable_and_dtype(input, 'input',
-                             ['float32', 'float64', 'int32', 'int64'],
-                             'l1_loss')
-    check_variable_and_dtype(label, 'label',
-                             ['float32', 'float64', 'int32', 'int64'],
-                             'l1_loss')
+    check_variable_and_dtype(
+        input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss'
+    )
+    check_variable_and_dtype(
+        label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss'
+    )
 
     if reduction == 'sum':
         unreduced = paddle.fluid.layers.elementwise_sub(input, label, act='abs')
@@ -1290,18 +1409,14 @@ def l1_loss(input, label, reduction='mean', name=None):
         unreduced = paddle.fluid.layers.elementwise_sub(input, label, act='abs')
         return paddle.mean(unreduced, name=name)
     else:
-        return paddle.fluid.layers.elementwise_sub(input,
-                                                   label,
-                                                   act='abs',
-                                                   name=name)
-
-
-def nll_loss(input,
-             label,
-             weight=None,
-             ignore_index=-100,
-             reduction='mean',
-             name=None):
+        return paddle.fluid.layers.elementwise_sub(
+            input, label, act='abs', name=name
+        )
+
+
+def nll_loss(
+    input, label, weight=None, ignore_index=-100, reduction='mean', name=None
+):
     """
     This api returns negative log likelihood.
     See more detail in :ref:`api_nn_loss_NLLLoss` .
@@ -1350,13 +1465,15 @@ def nll_loss(input,
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "The value of 'reduction' in nll_loss should be 'sum', 'mean' or "
-            "'none', but received %s, which is not allowed." % reduction)
+            "'none', but received %s, which is not allowed." % reduction
+        )
 
     input_shape = list(input.shape)
     input_dims = len(input_shape)
     if input_dims < 2:
         raise ValueError(
-            'Expected 2 or more dimensions (got {})'.format(input_dims))
+            'Expected 2 or more dimensions (got {})'.format(input_dims)
+        )
     n = input_shape[0]
     c = input_shape[1]
     if in_dygraph_mode():
@@ -1364,21 +1481,29 @@ def nll_loss(input,
             input = _C_ops.reshape(input, [n, c, 1, -1])
             label = _C_ops.reshape(label, [n, 1, -1])
             out_shape = [n] + input_shape[2:]
-        out, total_weight = _C_ops.nll_loss(input, label, weight, ignore_index,
-                                            reduction)
+        out, total_weight = _C_ops.nll_loss(
+            input, label, weight, ignore_index, reduction
+        )
         if input_dims != 2 and input_dims != 4 and reduction == 'none':
             out = _C_ops.reshape(out, out_shape)
         return out
     elif _in_legacy_dygraph():
         if input_dims != 2 and input_dims != 4:
-            input, _ = _legacy_C_ops.reshape2(input, None, 'shape',
-                                              [n, c, 1, -1])
+            input, _ = _legacy_C_ops.reshape2(
+                input, None, 'shape', [n, c, 1, -1]
+            )
             label, _ = _legacy_C_ops.reshape2(label, None, 'shape', [n, 1, -1])
             out_shape = [n] + input_shape[2:]
 
-        out, total_weight = _legacy_C_ops.nll_loss(input, label, weight,
-                                                   'ignore_index', ignore_index,
-                                                   'reduction', reduction)
+        out, total_weight = _legacy_C_ops.nll_loss(
+            input,
+            label,
+            weight,
+            'ignore_index',
+            ignore_index,
+            'reduction',
+            reduction,
+        )
         if input_dims != 2 and input_dims != 4 and reduction == 'none':
             out, _ = _legacy_C_ops.reshape2(out, None, 'shape', out_shape)
         return out
@@ -1402,10 +1527,9 @@ def nll_loss(input,
     total_weight = helper.create_variable_for_type_inference(dtype=input.dtype)
     outputs = {'Out': out, 'Total_weight': total_weight}
 
-    helper.append_op(type='nll_loss',
-                     inputs=inputs,
-                     outputs=outputs,
-                     attrs=attrs)
+    helper.append_op(
+        type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs
+    )
     if input_dims != 2 and input_dims != 4 and reduction == 'none':
         out = reshape(out, shape=out_shape)
 
@@ -1488,13 +1612,15 @@ def kl_div(input, label, reduction='mean', name=None):
 
     """
     # ugly type promotion
-    if fluid.data_feeder.convert_dtype(
-            input.dtype) == 'float32' and fluid.data_feeder.convert_dtype(
-                label.dtype) == 'float64':
+    if (
+        fluid.data_feeder.convert_dtype(input.dtype) == 'float32'
+        and fluid.data_feeder.convert_dtype(label.dtype) == 'float64'
+    ):
         input = paddle.cast(input, 'float64')
-    elif fluid.data_feeder.convert_dtype(
-            input.dtype) == 'float64' and fluid.data_feeder.convert_dtype(
-                label.dtype) == 'float32':
+    elif (
+        fluid.data_feeder.convert_dtype(input.dtype) == 'float64'
+        and fluid.data_feeder.convert_dtype(label.dtype) == 'float32'
+    ):
         label = paddle.cast(label, 'float64')
 
     if in_dygraph_mode():
@@ -1527,13 +1653,12 @@ def kl_div(input, label, reduction='mean', name=None):
     fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div')
 
     loss = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(type='kldiv_loss',
-                     inputs={
-                         'X': input,
-                         'Target': label
-                     },
-                     outputs={'Loss': loss},
-                     attrs={'reduction': 'none'})
+    helper.append_op(
+        type='kldiv_loss',
+        inputs={'X': input, 'Target': label},
+        outputs={'Loss': loss},
+        attrs={'reduction': 'none'},
+    )
 
     if reduction == 'mean':
         loss = paddle.mean(loss)
@@ -1596,31 +1721,38 @@ def mse_loss(input, label, reduction='mean', name=None):
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "'reduction' in 'mse_loss' should be 'sum', 'mean' or 'none', "
-            "but received {}.".format(reduction))
+            "but received {}.".format(reduction)
+        )
 
     if not in_dynamic_mode():
-        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                 'mse_loss')
-        check_variable_and_dtype(label, 'label', ['float32', 'float64'],
-                                 'mse_loss')
+        check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'mse_loss'
+        )
+        check_variable_and_dtype(
+            label, 'label', ['float32', 'float64'], 'mse_loss'
+        )
 
     if reduction == 'none':
         return paddle.square(paddle.subtract(input, label), name=name)
     elif reduction == 'mean':
-        return paddle.mean(paddle.square(paddle.subtract(input, label)),
-                           name=name)
+        return paddle.mean(
+            paddle.square(paddle.subtract(input, label)), name=name
+        )
     else:
-        return paddle.sum(paddle.square(paddle.subtract(input, label)),
-                          name=name)
+        return paddle.sum(
+            paddle.square(paddle.subtract(input, label)), name=name
+        )
 
 
-def ctc_loss(log_probs,
-             labels,
-             input_lengths,
-             label_lengths,
-             blank=0,
-             reduction='mean',
-             norm_by_times=False):
+def ctc_loss(
+    log_probs,
+    labels,
+    input_lengths,
+    label_lengths,
+    blank=0,
+    reduction='mean',
+    norm_by_times=False,
+):
     """
 
     An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
@@ -1699,8 +1831,9 @@ def ctc_loss(log_probs,
 
     """
 
-    loss_out = fluid.layers.warpctc(log_probs, labels, blank, norm_by_times,
-                                    input_lengths, label_lengths)
+    loss_out = fluid.layers.warpctc(
+        log_probs, labels, blank, norm_by_times, input_lengths, label_lengths
+    )
 
     loss_out = paddle.squeeze(loss_out, [-1])
     assert reduction in ['mean', 'sum', 'none']
@@ -1711,15 +1844,17 @@ def ctc_loss(log_probs,
     return loss_out
 
 
-def margin_cross_entropy(logits,
-                         label,
-                         margin1=1.0,
-                         margin2=0.5,
-                         margin3=0.0,
-                         scale=64.0,
-                         group=None,
-                         return_softmax=False,
-                         reduction='mean'):
+def margin_cross_entropy(
+    logits,
+    label,
+    margin1=1.0,
+    margin2=0.5,
+    margin3=0.0,
+    scale=64.0,
+    group=None,
+    return_softmax=False,
+    reduction='mean',
+):
     r"""
     .. math::
 
@@ -1909,7 +2044,10 @@ def margin_cross_entropy(logits,
     if not (group == False or group is None or hasattr(group, 'is_member')):
         raise ValueError(
             'Expected group is False, None or instance of paddle.distributed.collective.Group \
-             (got group: {})'.format(group))
+             (got group: {})'.format(
+                group
+            )
+        )
         return
 
     if hasattr(group, 'is_member') and not group.is_member():
@@ -1923,8 +2061,11 @@ def margin_cross_entropy(logits,
         if core.is_compiled_with_dist():
             parallel_env = paddle.distributed.ParallelEnv()
             global_rank = parallel_env.rank
-            rank = global_rank if group is None else group.get_group_rank(
-                global_rank)
+            rank = (
+                global_rank
+                if group is None
+                else group.get_group_rank(global_rank)
+            )
             nranks = parallel_env.world_size if group is None else group.nranks
 
     input_dims = len(list(logits.shape))
@@ -1932,15 +2073,26 @@ def margin_cross_entropy(logits,
     if input_dims - 1 != label_dims and input_dims != label_dims:
         raise ValueError(
             'Expected input_dims - 1 = label_dims or input_dims == label_dims\
-             (got nput_dims{}, label_dims{})'.format(input_dims, label_dims))
+             (got nput_dims{}, label_dims{})'.format(
+                input_dims, label_dims
+            )
+        )
     if input_dims - 1 == label_dims:
         label = paddle.unsqueeze(label, axis=-1)
 
     if in_dygraph_mode():
-        softmax, loss = _C_ops.margin_cross_entropy(logits, label,
-                                                    return_softmax, ring_id,
-                                                    rank, nranks, margin1,
-                                                    margin2, margin3, scale)
+        softmax, loss = _C_ops.margin_cross_entropy(
+            logits,
+            label,
+            return_softmax,
+            ring_id,
+            rank,
+            nranks,
+            margin1,
+            margin2,
+            margin3,
+            scale,
+        )
         if reduction == 'mean':
             loss = paddle.mean(loss)
         elif reduction == 'sum':
@@ -1951,9 +2103,25 @@ def margin_cross_entropy(logits,
             return loss, softmax
     elif _in_legacy_dygraph():
         softmax, loss = _legacy_C_ops.margin_cross_entropy(
-            logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks,
-            'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale',
-            scale, 'return_softmax', return_softmax)
+            logits,
+            label,
+            'ring_id',
+            ring_id,
+            'rank',
+            rank,
+            'nranks',
+            nranks,
+            'margin1',
+            margin1,
+            'margin2',
+            margin2,
+            'margin3',
+            margin3,
+            'scale',
+            scale,
+            'return_softmax',
+            return_softmax,
+        )
         if reduction == 'mean':
             loss = paddle.mean(loss)
         elif reduction == 'sum':
@@ -1968,31 +2136,31 @@ def margin_cross_entropy(logits,
     softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
     loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
 
-    check_variable_and_dtype(logits, 'logits',
-                             ['float16', 'float32', 'float64'],
-                             'margin_cross_entropy')
-    check_variable_and_dtype(label, 'label', ['int32', 'int64'],
-                             'margin_cross_entropy')
-
-    helper.append_op(type=op_type,
-                     inputs={
-                         'Logits': logits,
-                         'Label': label
-                     },
-                     outputs={
-                         'Softmax': softmax,
-                         'Loss': loss
-                     },
-                     attrs={
-                         'return_softmax': return_softmax,
-                         'ring_id': ring_id,
-                         'rank': rank,
-                         'nranks': nranks,
-                         'margin1': margin1,
-                         'margin2': margin2,
-                         'margin3': margin3,
-                         'scale': scale,
-                     })
+    check_variable_and_dtype(
+        logits,
+        'logits',
+        ['float16', 'float32', 'float64'],
+        'margin_cross_entropy',
+    )
+    check_variable_and_dtype(
+        label, 'label', ['int32', 'int64'], 'margin_cross_entropy'
+    )
+
+    helper.append_op(
+        type=op_type,
+        inputs={'Logits': logits, 'Label': label},
+        outputs={'Softmax': softmax, 'Loss': loss},
+        attrs={
+            'return_softmax': return_softmax,
+            'ring_id': ring_id,
+            'rank': rank,
+            'nranks': nranks,
+            'margin1': margin1,
+            'margin2': margin2,
+            'margin3': margin3,
+            'scale': scale,
+        },
+    )
 
     if reduction == 'mean':
         loss = paddle.mean(loss)
@@ -2009,16 +2177,20 @@ def margin_cross_entropy(logits,
     since="2.0.0",
     update_to="paddle.nn.functional.cross_entropy",
     level=1,
-    reason=
-    ('Please notice that behavior of "paddle.nn.functional.softmax_with_cross_entropy" '
-     'and "paddle.nn.functional.cross_entropy" is different.'))
-def softmax_with_cross_entropy(logits,
-                               label,
-                               soft_label=False,
-                               ignore_index=-100,
-                               numeric_stable_mode=True,
-                               return_softmax=False,
-                               axis=-1):
+    reason=(
+        'Please notice that behavior of "paddle.nn.functional.softmax_with_cross_entropy" '
+        'and "paddle.nn.functional.cross_entropy" is different.'
+    ),
+)
+def softmax_with_cross_entropy(
+    logits,
+    label,
+    soft_label=False,
+    ignore_index=-100,
+    numeric_stable_mode=True,
+    return_softmax=False,
+    axis=-1,
+):
     r"""
     This operator implements the cross entropy loss function with softmax. This function 
     combines the calculation of the softmax operation and the cross entropy loss function 
@@ -2104,33 +2276,41 @@ def softmax_with_cross_entropy(logits,
             out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label)
             print(out)
     """
-    return fluid_softmax_with_cross_entropy(logits, label, soft_label,
-                                            ignore_index, numeric_stable_mode,
-                                            return_softmax, axis)
-
-
-def cross_entropy(input,
-                  label,
-                  weight=None,
-                  ignore_index=-100,
-                  reduction='mean',
-                  soft_label=False,
-                  axis=-1,
-                  use_softmax=True,
-                  name=None):
+    return fluid_softmax_with_cross_entropy(
+        logits,
+        label,
+        soft_label,
+        ignore_index,
+        numeric_stable_mode,
+        return_softmax,
+        axis,
+    )
+
+
+def cross_entropy(
+    input,
+    label,
+    weight=None,
+    ignore_index=-100,
+    reduction='mean',
+    soft_label=False,
+    axis=-1,
+    use_softmax=True,
+    name=None,
+):
     r"""
-    By default, this operator implements the cross entropy loss function with softmax. This function 
-    combines the calculation of the softmax operation and the cross entropy loss function 
-    to provide a more numerically stable computing. 
+    By default, this operator implements the cross entropy loss function with softmax. This function
+    combines the calculation of the softmax operation and the cross entropy loss function
+    to provide a more numerically stable computing.
 
     This operator will calculate the cross entropy loss function without softmax when use_softmax=False.
 
-    By default, this operator will calculate the mean of the result, and you can also affect 
-    the default behavior by using the reduction parameter. Please refer to the part of 
+    By default, this operator will calculate the mean of the result, and you can also affect
+    the default behavior by using the reduction parameter. Please refer to the part of
     parameters for details.
 
     This operator can be used to calculate the softmax cross entropy loss with soft and hard labels.
-    Where, the hard labels mean the actual label value, 0, 1, 2, etc.  And the soft labels 
+    Where, the hard labels mean the actual label value, 0, 1, 2, etc.  And the soft labels
     mean the probability of the actual label, 0.6, 0.8, 0.2, etc.
 
     The calculation of this operator includes the following two steps.
@@ -2185,7 +2365,7 @@ def cross_entropy(input,
             1.1. Hard labels (soft_label = False)
 
             .. math::
-                \\loss_j=loss_j*weight[label_j] 
+                \\loss_j=loss_j*weight[label_j]
 
 
             1.2. Soft labels (soft_label = True)
@@ -2195,21 +2375,21 @@ def cross_entropy(input,
 
         2. reduction
 
-            2.1 if the ``reduction`` parameter is ``none`` 
+            2.1 if the ``reduction`` parameter is ``none``
 
                 Return the previous result directly
 
-            2.2 if the ``reduction`` parameter is ``sum`` 
+            2.2 if the ``reduction`` parameter is ``sum``
 
                 Return the sum of the previous results
 
             .. math::
                \\loss=\sum_{j}loss_j
 
-            2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to 
-            the ``weight`` parameter as follows. 
+            2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to
+            the ``weight`` parameter as follows.
 
-            2.3.1. If the  ``weight``  parameter is ``None`` 
+            2.3.1. If the  ``weight``  parameter is ``None``
 
                    Return the average value of the previous results
 
@@ -2223,7 +2403,7 @@ def cross_entropy(input,
             1. Hard labels (soft_label = False)
 
             .. math::
-                \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 
+                \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j]
 
             2. Soft labels (soft_label = True)
 
@@ -2236,11 +2416,11 @@ def cross_entropy(input,
         - **input** (Tensor)
 
             Input tensor, the data type is float32, float64. Shape is
-        :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` . 
+        :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` .
 
-            Note: 
+            Note:
 
-                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the 
+                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
                 output of softmax operator, which will produce incorrect results.
 
                 2. when use_softmax=False, it expects the output of softmax operator.
@@ -2251,20 +2431,20 @@ def cross_entropy(input,
             :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
             the data type is int32, int64, float32, float64, where each value is [0, C-1].
 
-            2. If soft_label=True, the shape and data type should be same with ``input`` , 
+            2. If soft_label=True, the shape and data type should be same with ``input`` ,
             and the sum of the labels for each sample should be 1.
 
         - **weight** (Tensor, optional)
 
-            a manual rescaling weight given to each class. 
-            If given, has to be a Tensor of size C and the data type is float32, float64. 
+            a manual rescaling weight given to each class.
+            If given, has to be a Tensor of size C and the data type is float32, float64.
             Default is ``'None'`` .
 
         - **ignore_index** (int64, optional)
 
             Specifies a target value that is ignored
-            and does not contribute to the loss. A negative value means that no label 
-            value needs to be ignored. Only valid when soft_label = False.  
+            and does not contribute to the loss. A negative value means that no label
+            value needs to be ignored. Only valid when soft_label = False.
             Default is ``-100`` .
 
         - **reduction** (str, optional)
@@ -2278,14 +2458,14 @@ def cross_entropy(input,
 
         - **soft_label** (bool, optional)
 
-            Indicate whether label is soft. 
+            Indicate whether label is soft.
             Default is ``False``.
 
         - **axis** (int, optional)
 
-            The index of dimension to perform softmax calculations. 
-            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the 
-            number of dimensions of input :attr:`input`. 
+            The index of dimension to perform softmax calculations.
+            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
+            number of dimensions of input :attr:`input`.
             Default is ``-1`` .
 
         - **use_softmax** (bool, optional)
@@ -2307,9 +2487,9 @@ def cross_entropy(input,
 
         If :attr:`reduction` is ``'none'``:
 
-        1. If soft_label = False, the dimension of return value is the same with ``label`` . 
+        1. If soft_label = False, the dimension of return value is the same with ``label`` .
 
-        2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . 
+        2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .
 
 
     Examples:
@@ -2322,10 +2502,10 @@ def cross_entropy(input,
             N=100
             C=200
             reduction='mean'
-            input =  paddle.rand([N, C], dtype='float64')  
+            input =  paddle.rand([N, C], dtype='float64')
             label =  paddle.randint(0, C, shape=[N], dtype='int64')
-            weight = paddle.rand([C], dtype='float64') 
-            
+            weight = paddle.rand([C], dtype='float64')
+
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=weight, reduction=reduction)
             dy_ret = cross_entropy_loss(
@@ -2349,9 +2529,9 @@ def cross_entropy(input,
             labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
             labels /= paddle.sum(labels, axis=axis, keepdim=True)
             paddle_loss_mean = paddle.nn.functional.cross_entropy(
-                                                                  logits,  
-                                                                  labels, 
-                                                                  soft_label=True, 
+                                                                  logits,
+                                                                  labels,
+                                                                  soft_label=True,
                                                                   axis=axis,
                                                                   weight=weight,
                                                                   reduction=reduction)
@@ -2363,12 +2543,14 @@ def cross_entropy(input,
         raise ValueError(
             "The value of 'reduction' in softmax_cross_entropy"
             "should be 'sum', 'mean' or 'none', but received %s, which is not allowed."
-            % reduction)
+            % reduction
+        )
     if ignore_index > 0 and soft_label == True:
         raise ValueError(
             "When soft_label == True, the value of 'ignore_index' in softmax_cross_entropy"
-            "should be '-100', but received %s, which is not allowed." %
-            ignore_index)
+            "should be '-100', but received %s, which is not allowed."
+            % ignore_index
+        )
 
     input_dims = len(list(input.shape))
     if input_dims == 0:
@@ -2378,29 +2560,53 @@ def cross_entropy(input,
     if input_dims - 1 != label_dims and input_dims != label_dims:
         raise ValueError(
             'Expected nput_dims - 1 = label_dims or input_dims == label_dims\
-             (got nput_dims{}, label_dims{})'.format(input_dims, label_dims))
+             (got nput_dims{}, label_dims{})'.format(
+                input_dims, label_dims
+            )
+        )
     if input_dims - 1 == label_dims:
         label = paddle.unsqueeze(label, axis=axis)
 
     if in_dygraph_mode():
         if soft_label == False:
-            valid_label = paddle.cast(label != ignore_index,
-                                      dtype=label.dtype) * label
+            valid_label = (
+                paddle.cast(label != ignore_index, dtype=label.dtype) * label
+            )
         if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
             if soft_label == False:
                 _, _, out = _legacy_C_ops.softmax_with_cross_entropy(
-                    input, valid_label, 'soft_label', soft_label,
-                    'ignore_index', ignore_index, 'numeric_stable_mode', True,
-                    'axis', axis, 'use_softmax', use_softmax)
+                    input,
+                    valid_label,
+                    'soft_label',
+                    soft_label,
+                    'ignore_index',
+                    ignore_index,
+                    'numeric_stable_mode',
+                    True,
+                    'axis',
+                    axis,
+                    'use_softmax',
+                    use_softmax,
+                )
             else:
                 _, _, out = _legacy_C_ops.softmax_with_cross_entropy(
-                    input, label, 'soft_label', soft_label, 'ignore_index',
-                    ignore_index, 'numeric_stable_mode', True, 'axis', axis,
-                    'use_softmax', use_softmax)
+                    input,
+                    label,
+                    'soft_label',
+                    soft_label,
+                    'ignore_index',
+                    ignore_index,
+                    'numeric_stable_mode',
+                    True,
+                    'axis',
+                    axis,
+                    'use_softmax',
+                    use_softmax,
+                )
         else:
-            _, out = _C_ops.cross_entropy_with_softmax(input, label, soft_label,
-                                                       use_softmax, True,
-                                                       ignore_index, axis)
+            _, out = _C_ops.cross_entropy_with_softmax(
+                input, label, soft_label, use_softmax, True, ignore_index, axis
+            )
 
         if weight is not None:
 
@@ -2410,11 +2616,12 @@ def cross_entropy(input,
                 # weight's shape is C, where C is class num.
                 # for 1d case: label's shape is [N,C], weight_gather's shape is N.
                 # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
-                weight_gather = paddle.matmul(x=paddle.cast(
-                    label, weight.dtype),
-                                              y=weight,
-                                              transpose_x=False,
-                                              transpose_y=True)
+                weight_gather = paddle.matmul(
+                    x=paddle.cast(label, weight.dtype),
+                    y=weight,
+                    transpose_x=False,
+                    transpose_y=True,
+                )
                 out_shape = list(out.shape)
                 weight_gather_reshape = reshape(weight_gather, shape=out_shape)
                 out = paddle.cast(out, weight_gather_reshape.dtype)
@@ -2425,29 +2632,44 @@ def cross_entropy(input,
                     raise ValueError(
                         "input's class_dimension({}) must equal to "
                         "weight's class_dimension({}) "
-                        "when weight is provided" \
-                            .format(input.shape[axis], weight.shape[-1]))
-
-                ignore_weight_mask = paddle.cast((label != ignore_index),
-                                                 out.dtype)
-                if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[
-                        axis] == 1:
+                        "when weight is provided".format(
+                            input.shape[axis], weight.shape[-1]
+                        )
+                    )
+
+                ignore_weight_mask = paddle.cast(
+                    (label != ignore_index), out.dtype
+                )
+                if (
+                    ignore_weight_mask.ndim > 1
+                    and ignore_weight_mask.shape[axis] == 1
+                ):
                     # TODO: Temporarily use squeeze instead of squeeze_
-                    ignore_weight_mask = paddle.squeeze(ignore_weight_mask,
-                                                        axis)
+                    ignore_weight_mask = paddle.squeeze(
+                        ignore_weight_mask, axis
+                    )
                 if axis != -1 and axis != valid_label.ndim - 1:
-                    temp_perm = list(range(axis % valid_label.ndim)) \
-                                + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \
-                                + [axis % valid_label.ndim]
+                    temp_perm = (
+                        list(range(axis % valid_label.ndim))
+                        + list(
+                            range(
+                                (axis % valid_label.ndim + 1), valid_label.ndim
+                            )
+                        )
+                        + [axis % valid_label.ndim]
+                    )
                     weight_gather = _C_ops.gather_nd(
-                        weight, valid_label.transpose(temp_perm))
+                        weight, valid_label.transpose(temp_perm)
+                    )
                 else:
                     weight_gather = _C_ops.gather_nd(weight, valid_label)
-                weight_gather = _C_ops.multiply(weight_gather,
-                                                ignore_weight_mask)
+                weight_gather = _C_ops.multiply(
+                    weight_gather, ignore_weight_mask
+                )
                 input_shape = list(label.shape)
-                weight_gather_reshape = reshape(weight_gather,
-                                                shape=input_shape)
+                weight_gather_reshape = reshape(
+                    weight_gather, shape=input_shape
+                )
                 out = paddle.cast(out, weight_gather_reshape.dtype)
                 out = _C_ops.multiply(out, weight_gather_reshape)
 
@@ -2468,22 +2690,24 @@ def cross_entropy(input,
                 # for each label[i],set 1 or 0, according to ignore_index
                 # mask[i]=0, if label[i]==ignore_index
                 # mask[i]=1, otherwise
-                mask = (label != ignore_index)
+                mask = label != ignore_index
                 if weight is None:
                     mask = paddle.cast(mask, dtype=out_sum.dtype)
                     count = _C_ops.sum(mask, [], None, False)
                     ret = out_sum / (count + (count == 0.0))
                 else:
                     mask = paddle.cast(mask, weight_gather_reshape.dtype)
-                    weight_ignored = _C_ops.multiply(mask,
-                                                     weight_gather_reshape)
+                    weight_ignored = _C_ops.multiply(
+                        mask, weight_gather_reshape
+                    )
                     weight_sum = _C_ops.sum(weight_ignored, [], None, False)
                     ret = out_sum / (weight_sum + (weight_sum == 0.0))
                 return ret
             elif weight is not None:
                 out_sum = _C_ops.sum(out, [], None, False)
-                total_weight = _C_ops.sum(weight_gather_reshape, [], None,
-                                          False)
+                total_weight = _C_ops.sum(
+                    weight_gather_reshape, [], None, False
+                )
                 return out_sum / (total_weight + (total_weight == 0.0))
             else:
                 return _C_ops.mean_all(out)
@@ -2495,32 +2719,65 @@ def cross_entropy(input,
 
     elif _in_legacy_dygraph():
         if soft_label == False:
-            valid_label = paddle.cast(label != ignore_index,
-                                      dtype=label.dtype) * label
+            valid_label = (
+                paddle.cast(label != ignore_index, dtype=label.dtype) * label
+            )
             label_min = paddle.min(valid_label)
             label_max = paddle.max(valid_label)
             if label_min < 0:
-                raise ValueError("Target {} is out of lower bound.".format(
-                    label_min.item()))
+                raise ValueError(
+                    "Target {} is out of lower bound.".format(label_min.item())
+                )
             if label_max >= input.shape[axis]:
-                raise ValueError("Target {} is out of upper bound.".format(
-                    label_max.item()))
+                raise ValueError(
+                    "Target {} is out of upper bound.".format(label_max.item())
+                )
         if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
             if soft_label == False:
                 _, _, out = _legacy_C_ops.softmax_with_cross_entropy(
-                    input, valid_label, 'soft_label', soft_label,
-                    'ignore_index', ignore_index, 'numeric_stable_mode', True,
-                    'axis', axis, 'use_softmax', use_softmax)
+                    input,
+                    valid_label,
+                    'soft_label',
+                    soft_label,
+                    'ignore_index',
+                    ignore_index,
+                    'numeric_stable_mode',
+                    True,
+                    'axis',
+                    axis,
+                    'use_softmax',
+                    use_softmax,
+                )
             else:
                 _, _, out = _legacy_C_ops.softmax_with_cross_entropy(
-                    input, label, 'soft_label', soft_label, 'ignore_index',
-                    ignore_index, 'numeric_stable_mode', True, 'axis', axis,
-                    'use_softmax', use_softmax)
+                    input,
+                    label,
+                    'soft_label',
+                    soft_label,
+                    'ignore_index',
+                    ignore_index,
+                    'numeric_stable_mode',
+                    True,
+                    'axis',
+                    axis,
+                    'use_softmax',
+                    use_softmax,
+                )
         else:
             _, out = _legacy_C_ops.softmax_with_cross_entropy(
-                input, label, 'soft_label', soft_label, 'ignore_index',
-                ignore_index, 'numeric_stable_mode', True, 'axis', axis,
-                'use_softmax', use_softmax)
+                input,
+                label,
+                'soft_label',
+                soft_label,
+                'ignore_index',
+                ignore_index,
+                'numeric_stable_mode',
+                True,
+                'axis',
+                axis,
+                'use_softmax',
+                use_softmax,
+            )
 
         if weight is not None:
 
@@ -2530,11 +2787,12 @@ def cross_entropy(input,
                 # weight's shape is C, where C is class num.
                 # for 1d case: label's shape is [N,C], weight_gather's shape is N.
                 # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
-                weight_gather = paddle.matmul(x=paddle.cast(
-                    label, weight.dtype),
-                                              y=weight,
-                                              transpose_x=False,
-                                              transpose_y=True)
+                weight_gather = paddle.matmul(
+                    x=paddle.cast(label, weight.dtype),
+                    y=weight,
+                    transpose_x=False,
+                    transpose_y=True,
+                )
                 out_shape = list(out.shape)
                 weight_gather_reshape = reshape(weight_gather, shape=out_shape)
                 out = paddle.cast(out, weight_gather_reshape.dtype)
@@ -2546,29 +2804,44 @@ def cross_entropy(input,
                     raise ValueError(
                         "input's class_dimension({}) must equal to "
                         "weight's class_dimension({}) "
-                        "when weight is provided" \
-                            .format(input.shape[axis], weight.shape[-1]))
-
-                ignore_weight_mask = paddle.cast((label != ignore_index),
-                                                 out.dtype)
-                if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[
-                        axis] == 1:
+                        "when weight is provided".format(
+                            input.shape[axis], weight.shape[-1]
+                        )
+                    )
+
+                ignore_weight_mask = paddle.cast(
+                    (label != ignore_index), out.dtype
+                )
+                if (
+                    ignore_weight_mask.ndim > 1
+                    and ignore_weight_mask.shape[axis] == 1
+                ):
                     # TODO: Temporarily use squeeze instead of squeeze_
-                    ignore_weight_mask = paddle.squeeze(ignore_weight_mask,
-                                                        axis)
+                    ignore_weight_mask = paddle.squeeze(
+                        ignore_weight_mask, axis
+                    )
                 if axis != -1 and axis != valid_label.ndim - 1:
-                    temp_perm = list(range(axis % valid_label.ndim)) \
-                                + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \
-                                + [axis % valid_label.ndim]
+                    temp_perm = (
+                        list(range(axis % valid_label.ndim))
+                        + list(
+                            range(
+                                (axis % valid_label.ndim + 1), valid_label.ndim
+                            )
+                        )
+                        + [axis % valid_label.ndim]
+                    )
                     weight_gather = _legacy_C_ops.gather_nd(
-                        weight, valid_label.transpose(temp_perm))
+                        weight, valid_label.transpose(temp_perm)
+                    )
                 else:
                     weight_gather = _legacy_C_ops.gather_nd(weight, valid_label)
                 weight_gather = _legacy_C_ops.elementwise_mul(
-                    weight_gather, ignore_weight_mask)
+                    weight_gather, ignore_weight_mask
+                )
                 input_shape = list(label.shape)
-                weight_gather_reshape = reshape(weight_gather,
-                                                shape=input_shape)
+                weight_gather_reshape = reshape(
+                    weight_gather, shape=input_shape
+                )
                 out = paddle.cast(out, weight_gather_reshape.dtype)
                 out = _legacy_C_ops.elementwise_mul(out, weight_gather_reshape)
 
@@ -2589,7 +2862,7 @@ def cross_entropy(input,
                 # for each label[i],set 1 or 0, according to ignore_index
                 # mask[i]=0, if label[i]==ignore_index
                 # mask[i]=1, otherwise
-                mask = (label != ignore_index)
+                mask = label != ignore_index
                 if weight is None:
                     mask = paddle.cast(mask, dtype=out_sum.dtype)
                     count = _legacy_C_ops.reduce_sum(mask, 'reduce_all', True)
@@ -2597,15 +2870,18 @@ def cross_entropy(input,
                 else:
                     mask = paddle.cast(mask, weight_gather_reshape.dtype)
                     weight_ignored = _legacy_C_ops.elementwise_mul(
-                        mask, weight_gather_reshape)
+                        mask, weight_gather_reshape
+                    )
                     weight_sum = _legacy_C_ops.reduce_sum(
-                        weight_ignored, 'reduce_all', True)
+                        weight_ignored, 'reduce_all', True
+                    )
                     ret = out_sum / (weight_sum + (weight_sum == 0.0))
                 return ret
             elif weight is not None:
                 out_sum = _legacy_C_ops.reduce_sum(out, 'reduce_all', True)
-                total_weight = _legacy_C_ops.reduce_sum(weight_gather_reshape,
-                                                        'reduce_all', True)
+                total_weight = _legacy_C_ops.reduce_sum(
+                    weight_gather_reshape, 'reduce_all', True
+                )
                 return out_sum / (total_weight + (total_weight == 0.0))
             else:
                 return _legacy_C_ops.mean(out)
@@ -2614,18 +2890,24 @@ def cross_entropy(input,
                 out = paddle.squeeze(out, axis=axis)
             return out
 
-    check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
-                             'softmax_cross_entropy')
     check_variable_and_dtype(
-        label, 'label',
+        input,
+        'input',
+        ['float16', 'float32', 'float64'],
+        'softmax_cross_entropy',
+    )
+    check_variable_and_dtype(
+        label,
+        'label',
         ['uint8', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'],
-        'softmax_cross_entropy')
+        'softmax_cross_entropy',
+    )
     attrs = {
         'soft_label': soft_label,
         'ignore_index': ignore_index,
         'numeric_stable_mode': True,
         'axis': axis,
-        'use_softmax': use_softmax
+        'use_softmax': use_softmax,
     }
     helper = LayerHelper('softmax_with_cross_entropy', **locals())
     softmax = helper.create_variable_for_type_inference(dtype=input.dtype)
@@ -2635,17 +2917,17 @@ def cross_entropy(input,
     if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
         backprop = helper.create_variable_for_type_inference(dtype=input.dtype)
         outputs['Backprop'] = backprop
-    helper.append_op(type='softmax_with_cross_entropy',
-                     inputs={
-                         'Logits': input,
-                         'Label': label
-                     },
-                     outputs=outputs,
-                     attrs=attrs)
+    helper.append_op(
+        type='softmax_with_cross_entropy',
+        inputs={'Logits': input, 'Label': label},
+        outputs=outputs,
+        attrs=attrs,
+    )
 
     if weight is not None:
-        check_variable_and_dtype(weight, 'weight', ['float32', 'float64'],
-                                 'softmax_cross_entropy')
+        check_variable_and_dtype(
+            weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy'
+        )
         weight_name = name if reduction == 'none' else None
         if soft_label == True:
             # chajchaj:
@@ -2653,34 +2935,48 @@ def cross_entropy(input,
             # weight's shape is C, where C is class num.
             # for 1d case: label's shape is [N,C], weight_gather's shape is N.
             # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
-            weight_gather = paddle.matmul(x=paddle.cast(label, weight.dtype),
-                                          y=weight,
-                                          transpose_x=False,
-                                          transpose_y=True)
+            weight_gather = paddle.matmul(
+                x=paddle.cast(label, weight.dtype),
+                y=weight,
+                transpose_x=False,
+                transpose_y=True,
+            )
 
             out_shape = list(out.shape)
             weight_gather_reshape = reshape(weight_gather, shape=out_shape)
             out = paddle.cast(out, weight_gather_reshape.dtype)
         else:
             if input.shape[axis] != weight.shape[-1]:
-                raise ValueError("input's class_dimension({}) must equal to "
-                                 "weight's class_dimension({}) "
-                                 "when weight is provided" \
-                                 .format(input.shape[axis], weight.shape[-1]))
+                raise ValueError(
+                    "input's class_dimension({}) must equal to "
+                    "weight's class_dimension({}) "
+                    "when weight is provided".format(
+                        input.shape[axis], weight.shape[-1]
+                    )
+                )
 
             valid_label = paddle.multiply(
-                paddle.cast(label != ignore_index, dtype=label.dtype), label)
-            ignore_weight_mask = paddle.cast((label != ignore_index),
-                                             input.dtype)
-            if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[
-                    axis] == 1:
+                paddle.cast(label != ignore_index, dtype=label.dtype), label
+            )
+            ignore_weight_mask = paddle.cast(
+                (label != ignore_index), input.dtype
+            )
+            if (
+                ignore_weight_mask.ndim > 1
+                and ignore_weight_mask.shape[axis] == 1
+            ):
                 ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis)
             if axis != -1 and axis != valid_label.ndim - 1:
-                temp_perm = list(range(axis % valid_label.ndim)) \
-                            + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \
-                            + [axis % valid_label.ndim]
+                temp_perm = (
+                    list(range(axis % valid_label.ndim))
+                    + list(
+                        range((axis % valid_label.ndim + 1), valid_label.ndim)
+                    )
+                    + [axis % valid_label.ndim]
+                )
                 weight_gather = paddle.gather_nd(
-                    weight, paddle.transpose(valid_label, temp_perm))
+                    weight, paddle.transpose(valid_label, temp_perm)
+                )
             else:
                 weight_gather = paddle.gather_nd(weight, valid_label)
             weight_gather = paddle.multiply(weight_gather, ignore_weight_mask)
@@ -2697,8 +2993,8 @@ def cross_entropy(input,
             # for each label[i],set 1 or 0, according to ignore_index
             # mask[i]=0, if label[i]==ignore_index
             # mask[i]=1, otherwise
-            mask = (label != ignore_index)
-            if (weight is None):
+            mask = label != ignore_index
+            if weight is None:
                 mask = paddle.cast(mask, dtype=out_sum.dtype)
                 count = paddle.sum(mask, name=name)
                 ret = out_sum / (count + (count == 0.0))
@@ -2722,13 +3018,15 @@ def cross_entropy(input,
         return out
 
 
-def sigmoid_focal_loss(logit,
-                       label,
-                       normalizer=None,
-                       alpha=0.25,
-                       gamma=2.0,
-                       reduction='sum',
-                       name=None):
+def sigmoid_focal_loss(
+    logit,
+    label,
+    normalizer=None,
+    alpha=0.25,
+    gamma=2.0,
+    reduction='sum',
+    name=None,
+):
     r"""
     `Focal Loss <https://arxiv.org/abs/1708.02002>`_ is proposed to address the
     foreground-background class imbalance for classification tasks. It down-weights
@@ -2736,12 +3034,12 @@ def sigmoid_focal_loss(logit,
     it is used in one-stage object detection where the foreground-background class
     imbalance is extremely high.
 
-    This operator measures focal loss function as follows: 
+    This operator measures focal loss function as follows:
 
     .. math::
            Out = -Labels * alpha * {(1 - \sigma(Logit))}^{gamma}\log(\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\sigma(Logit)}^{gamma}\log(1 - \sigma(Logit))
 
-    We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`. 
+    We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`.
 
     Then, if :attr:`normalizer` is not None, this operator divides the
     normalizer tensor on the loss `Out`:
@@ -2768,7 +3066,7 @@ def sigmoid_focal_loss(logit,
             For object detection task, it is the number of positive samples.
             If set to None, the focal loss will not be normalized. Default is None.
         alpha(int|float, optional): Hyper-parameter to balance the positive and negative example,
-            it should be between 0 and 1.  Default value is set to 0.25. 
+            it should be between 0 and 1.  Default value is set to 0.25.
         gamma(int|float, optional): Hyper-parameter to modulate the easy and hard examples.
             Default value is set to 2.0.
         reduction (str, optional): Indicate how to average the loss by batch_size,
@@ -2802,37 +3100,49 @@ def sigmoid_focal_loss(logit,
         raise ValueError(
             "The value of 'reduction' in sigmoid_focal_loss "
             "should be 'sum', 'mean' or 'none', but received %s, which is not allowed."
-            % reduction)
+            % reduction
+        )
 
     if normalizer is not None:
-        check_variable_and_dtype(normalizer, 'normalizer',
-                                 ['float32', 'float64'], 'sigmoid_focal_loss')
+        check_variable_and_dtype(
+            normalizer,
+            'normalizer',
+            ['float32', 'float64'],
+            'sigmoid_focal_loss',
+        )
         normalizer_shape = list(normalizer.shape)
         normalizer_dims = len(normalizer_shape)
         if normalizer_dims > 1:
             raise ValueError(
-                "Expected one dimension of normalizer in sigmoid_focal_loss but got {}."
-                .format(normalizer_dims))
+                "Expected one dimension of normalizer in sigmoid_focal_loss but got {}.".format(
+                    normalizer_dims
+                )
+            )
 
     if in_dygraph_mode():
         place = _current_expected_place()
         one = _C_ops.full(logit.shape, float(1.0), logit.dtype, place)
 
-        loss = _C_ops.sigmoid_cross_entropy_with_logits(logit, label, False,
-                                                        -100)
+        loss = _C_ops.sigmoid_cross_entropy_with_logits(
+            logit, label, False, -100
+        )
 
         pred = _C_ops.sigmoid(logit)
 
         p_t = _C_ops.add(
             _C_ops.multiply(pred, label),
-            _C_ops.multiply(_C_ops.subtract(one, pred),
-                            _C_ops.subtract(one, label)))
+            _C_ops.multiply(
+                _C_ops.subtract(one, pred), _C_ops.subtract(one, label)
+            ),
+        )
 
         alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype)
         alpha_t = _C_ops.add(
             _C_ops.multiply(alpha, label),
-            _C_ops.multiply(_C_ops.subtract(one, alpha),
-                            _C_ops.subtract(one, label)))
+            _C_ops.multiply(
+                _C_ops.subtract(one, alpha), _C_ops.subtract(one, label)
+            ),
+        )
         loss = _C_ops.multiply(alpha_t, loss)
 
         gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype)
@@ -2851,9 +3161,19 @@ def sigmoid_focal_loss(logit,
 
     elif _in_legacy_dygraph():
         one = _varbase_creator(dtype=logit.dtype)
-        _legacy_C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu',
-                                    False, 'dtype', one.dtype, 'str_value',
-                                    '1.0', 'shape', logit.shape)
+        _legacy_C_ops.fill_constant(
+            one,
+            'value',
+            float(1.0),
+            'force_cpu',
+            False,
+            'dtype',
+            one.dtype,
+            'str_value',
+            '1.0',
+            'shape',
+            logit.shape,
+        )
         loss = _legacy_C_ops.sigmoid_cross_entropy_with_logits(logit, label)
 
         pred = _legacy_C_ops.sigmoid(logit)
@@ -2862,19 +3182,24 @@ def sigmoid_focal_loss(logit,
             _legacy_C_ops.elementwise_mul(pred, label),
             _legacy_C_ops.elementwise_mul(
                 _legacy_C_ops.elementwise_sub(one, pred),
-                _legacy_C_ops.elementwise_sub(one, label)))
+                _legacy_C_ops.elementwise_sub(one, label),
+            ),
+        )
 
         alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype)
         alpha_t = _legacy_C_ops.elementwise_add(
             _legacy_C_ops.elementwise_mul(alpha, label),
             _legacy_C_ops.elementwise_mul(
                 _legacy_C_ops.elementwise_sub(one, alpha),
-                _legacy_C_ops.elementwise_sub(one, label)))
+                _legacy_C_ops.elementwise_sub(one, label),
+            ),
+        )
         loss = _legacy_C_ops.elementwise_mul(alpha_t, loss)
 
         gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype)
         gamma_t = _legacy_C_ops.elementwise_pow(
-            _legacy_C_ops.elementwise_sub(one, p_t), gamma)
+            _legacy_C_ops.elementwise_sub(one, p_t), gamma
+        )
         loss = _legacy_C_ops.elementwise_mul(gamma_t, loss)
 
         if normalizer is not None:
@@ -2887,16 +3212,19 @@ def sigmoid_focal_loss(logit,
 
         return loss
 
-    check_variable_and_dtype(logit, 'logit', ['float32', 'float64'],
-                             'sigmoid_focal_loss')
-    check_variable_and_dtype(label, 'label', ['float32', 'float64'],
-                             'sigmoid_focal_loss')
+    check_variable_and_dtype(
+        logit, 'logit', ['float32', 'float64'], 'sigmoid_focal_loss'
+    )
+    check_variable_and_dtype(
+        label, 'label', ['float32', 'float64'], 'sigmoid_focal_loss'
+    )
 
     bce_name = None
     if reduction == 'none' and normalizer is None:
         bce_name = name
     loss = paddle.nn.functional.binary_cross_entropy_with_logits(
-        logit, label, reduction='none', name=bce_name)
+        logit, label, reduction='none', name=bce_name
+    )
 
     pred = paddle.nn.functional.sigmoid(logit)
     p_t = pred * label + (1 - pred) * (1 - label)
@@ -2919,74 +3247,89 @@ def sigmoid_focal_loss(logit,
     return loss
 
 
-def multi_label_soft_margin_loss(input,
-                                 label,
-                                 weight=None,
-                                 reduction="mean",
-                                 name=None):
+def multi_label_soft_margin_loss(
+    input, label, weight=None, reduction="mean", name=None
+):
     r"""
 
-        Parameters:
-            input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1.
-            label (Tensor): Label tensor, the data type is float32 or float64. The shape of label is the same as the shape of input.
-            weight (Tensor,optional): a manual rescaling weight given to each class.
-                    If given, has to be a Tensor of size C and the data type is float32, float64.
-                    Default is ``'None'`` .
-            reduction (str, optional): Indicate how to average the loss by batch_size,
-                    the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
-                    If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
-                    If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
-                    If :attr:`reduction` is ``'sum'``, the summed loss is returned.
-                    Default: ``'mean'``
-            name (str, optional): Name for the operation (optional, default is None).
-                    For more information, please refer to :ref:`api_guide_Name`.
-
-	Shape:
-            input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means number of classes, available dtype is float32, float64. The sum operationoperates over all the elements.
-            label: N-D Tensor, same shape as the input.
-            weight:N-D Tensor, the shape is [N,1]
-            output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input.
-
-	Returns:
-            Tensor, The tensor variable storing the multi_label_soft_margin_loss of input and label.
-
-	Examples:
-            .. code-block:: python
+    Parameters:
+        input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1.
+        label (Tensor): Label tensor, the data type is float32 or float64. The shape of label is the same as the shape of input.
+        weight (Tensor,optional): a manual rescaling weight given to each class.
+                If given, has to be a Tensor of size C and the data type is float32, float64.
+                Default is ``'None'`` .
+        reduction (str, optional): Indicate how to average the loss by batch_size,
+                the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
+                If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+                If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+                If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+                Default: ``'mean'``
+        name (str, optional): Name for the operation (optional, default is None).
+                For more information, please refer to :ref:`api_guide_Name`.
 
-                import paddle
-                import paddle.nn.functional as F
-                input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32)
-                # label elements in {1., -1.}
-                label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32)
-                loss = F.multi_label_soft_margin_loss(input, label, reduction='none')
-                print(loss)
-                # Tensor([3.49625897, 0.71111226, 0.43989015])
-                loss = F.multi_label_soft_margin_loss(input, label, reduction='mean')
-                print(loss)
-                # Tensor([1.54908717])
+    Shape:
+        input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means number of classes, available dtype is float32, float64. The sum operation operates over all the elements.
+        label: N-D Tensor, same shape as the input.
+        weight: N-D Tensor, the shape is [N,1]
+        output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input.
+
+    Returns:
+        Tensor, The tensor variable storing the multi_label_soft_margin_loss of input and label.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32)
+            # label elements in {1., -1.}
+            label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32)
+            loss = F.multi_label_soft_margin_loss(input, label, reduction='none')
+            print(loss)
+            # Tensor([3.49625897, 0.71111226, 0.43989015])
+            loss = F.multi_label_soft_margin_loss(input, label, reduction='mean')
+            print(loss)
+            # Tensor([1.54908717])
     """
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "'reduction' in 'multi_label_soft_margin_loss' should be 'sum', 'mean' or 'none', "
-            "but received {}.".format(reduction))
+            "but received {}.".format(reduction)
+        )
 
     if not (input.shape == label.shape):
-        raise ValueError("The input and label should have same dimension,"
-                         "but received {}!={}".format(input.shape, label.shape))
+        raise ValueError(
+            "The input and label should have same dimension,"
+            "but received {}!={}".format(input.shape, label.shape)
+        )
 
     if not _non_static_mode():
-        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                 'multilabel_soft_margin_loss')
-        check_variable_and_dtype(label, 'label', ['float32', 'float64'],
-                                 'multilabel_soft_margin_loss')
+        check_variable_and_dtype(
+            input,
+            'input',
+            ['float32', 'float64'],
+            'multilabel_soft_margin_loss',
+        )
+        check_variable_and_dtype(
+            label,
+            'label',
+            ['float32', 'float64'],
+            'multilabel_soft_margin_loss',
+        )
 
-    loss = -(label * paddle.nn.functional.log_sigmoid(input) +
-             (1 - label) * paddle.nn.functional.log_sigmoid(-input))
+    loss = -(
+        label * paddle.nn.functional.log_sigmoid(input)
+        + (1 - label) * paddle.nn.functional.log_sigmoid(-input)
+    )
 
     if weight is not None:
         if not _non_static_mode():
-            check_variable_and_dtype(weight, 'weight', ['float32', 'float64'],
-                                     'multilabel_soft_margin_loss')
+            check_variable_and_dtype(
+                weight,
+                'weight',
+                ['float32', 'float64'],
+                'multilabel_soft_margin_loss',
+            )
         loss = loss * weight
 
     loss = loss.mean(axis=-1)  # only return N loss values
@@ -3076,17 +3419,21 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None):
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "'reduction' in 'hinge_embedding_loss' should be 'sum', 'mean' or 'none', "
-            "but received {}.".format(reduction))
+            "but received {}.".format(reduction)
+        )
 
     if not _non_static_mode():
-        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                 'hinge_embedding_loss')
-        check_variable_and_dtype(label, 'label', ['float32', 'float64'],
-                                 'hinge_embedding_loss')
+        check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'hinge_embedding_loss'
+        )
+        check_variable_and_dtype(
+            label, 'label', ['float32', 'float64'], 'hinge_embedding_loss'
+        )
 
     zero_ = paddle.zeros([1], dtype=input.dtype)
-    loss = paddle.where(label == 1., input, zero_) + \
-           paddle.where(label == -1., paddle.nn.functional.relu(margin - input), zero_)
+    loss = paddle.where(label == 1.0, input, zero_) + paddle.where(
+        label == -1.0, paddle.nn.functional.relu(margin - input), zero_
+    )
 
     if reduction == 'mean':
         return paddle.mean(loss, name=name)
@@ -3096,12 +3443,9 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None):
         return loss
 
 
-def cosine_embedding_loss(input1,
-                          input2,
-                          label,
-                          margin=0,
-                          reduction='mean',
-                          name=None):
+def cosine_embedding_loss(
+    input1, input2, label, margin=0, reduction='mean', name=None
+):
     r"""
     This operator computes the cosine embedding loss of Tensor ``input1``, ``input2`` and ``label`` as follows.
 
@@ -3162,12 +3506,14 @@ def cosine_embedding_loss(input1,
     """
     if len(label.shape) != 1:
         raise ValueError(
-            "1D target tensor expected, multi-target not supported")
+            "1D target tensor expected, multi-target not supported"
+        )
 
     if input1.shape != input2.shape:
         raise ValueError(
             "the shape of input tensor 1 should be equal to input tensor 2, but found inputs with "
-            "different sizes")
+            "different sizes"
+        )
 
     if len(input1.shape) > 2:
         raise ValueError(
@@ -3176,9 +3522,13 @@ def cosine_embedding_loss(input1,
 
     if input1.dtype not in [paddle.float32, paddle.float64]:
         raise ValueError(
-            "The data type of input Variable must be 'float32' or 'float64'")
+            "The data type of input Variable must be 'float32' or 'float64'"
+        )
     if label.dtype not in [
-            paddle.int32, paddle.int64, paddle.float32, paddle.float64
+        paddle.int32,
+        paddle.int64,
+        paddle.float32,
+        paddle.float64,
     ]:
         raise ValueError(
             "The data type of label Variable must be 'int32', 'int64', 'float32', 'float64'"
@@ -3204,14 +3554,16 @@ def cosine_embedding_loss(input1,
         return paddle.sum(out, name=name)
 
 
-def triplet_margin_with_distance_loss(input,
-                                      positive,
-                                      negative,
-                                      distance_function=None,
-                                      margin=1.0,
-                                      swap=False,
-                                      reduction='mean',
-                                      name=None):
+def triplet_margin_with_distance_loss(
+    input,
+    positive,
+    negative,
+    distance_function=None,
+    margin=1.0,
+    swap=False,
+    reduction='mean',
+    name=None,
+):
     r"""
     Measures the triplet loss given an input
     tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
@@ -3231,7 +3583,7 @@ def triplet_margin_with_distance_loss(input,
     .. math::
         d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p
 
-    or user can defined their own distance functions. `margin` is a nonnegative margin representing the minimum difference 
+    or users can define their own distance functions. `margin` is a nonnegative margin representing the minimum difference
     between the positive and negative distances that is required for the loss to be 0. If `swap` is true, it will compare distance of (input, negative) with
     distance of (negative, positive) and change it to the smaller one. For more details see http://www.bmva.org/bmvc/2016/papers/paper119/paper119.pdf.
 
@@ -3247,10 +3599,10 @@ def triplet_margin_with_distance_loss(input,
             The shape of label is the same as the shape of input.
 
         distance_function (callable, optional): Quantifies the distance between two tensors. if not specified, 2 norm functions will be used.
-	
-	    margin (float, optional):Default: :math:`1`.A nonnegative margin representing the minimum difference
+
+        margin (float, optional): Default: :math:`1`. A nonnegative margin representing the minimum difference
             between the positive and negative distances required for the loss to be 0.
-	
+
         swap (bool, optional):The distance swap changes the negative distance to the swap distance (distance between positive samples
                 and negative samples) if swap distance smaller than negative distance. Default: ``False``.
 
@@ -3262,7 +3614,7 @@ def triplet_margin_with_distance_loss(input,
             Default: ``'mean'``
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-	    
+
     Returns:
         Output: Tensor. The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative.
 
@@ -3286,28 +3638,47 @@ def triplet_margin_with_distance_loss(input,
 
     """
     if reduction not in ['sum', 'mean', 'none']:
-        raise ValueError("'reduction' in 'triplet_margin_with_distance_loss' "
-                         "should be 'sum', 'mean' or 'none', "
-                         "but received {}.".format(reduction))
+        raise ValueError(
+            "'reduction' in 'triplet_margin_with_distance_loss' "
+            "should be 'sum', 'mean' or 'none', "
+            "but received {}.".format(reduction)
+        )
     if margin < 0:
         raise ValueError(
             "The margin between positive samples and negative samples should be greater than 0."
         )
     if not _non_static_mode():
-        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                 'triplet_margin_with_distance_loss')
-        check_variable_and_dtype(positive, 'positive', ['float32', 'float64'],
-                                 'triplet_margin_with_distance_loss')
-        check_variable_and_dtype(negative, 'negative', ['float32', 'float64'],
-                                 'triplet_margin_with_distance_loss')
+        check_variable_and_dtype(
+            input,
+            'input',
+            ['float32', 'float64'],
+            'triplet_margin_with_distance_loss',
+        )
+        check_variable_and_dtype(
+            positive,
+            'positive',
+            ['float32', 'float64'],
+            'triplet_margin_with_distance_loss',
+        )
+        check_variable_and_dtype(
+            negative,
+            'negative',
+            ['float32', 'float64'],
+            'triplet_margin_with_distance_loss',
+        )
 
     if not (input.shape == positive.shape == negative.shape):
-        raise ValueError("input's shape must equal to "
-                         "positive's shape and  "
-                         "negative's shape")
+        raise ValueError(
+            "input's shape must equal to "
+            "positive's shape and  "
+            "negative's shape"
+        )
 
-    distance_function = distance_function if distance_function is not None \
+    distance_function = (
+        distance_function
+        if distance_function is not None
         else paddle.nn.PairwiseDistance(2)
+    )
 
     positive_dist = distance_function(input, positive)
     negative_dist = distance_function(input, negative)
@@ -3319,7 +3690,8 @@ def triplet_margin_with_distance_loss(input,
     if not paddle.all(positive_dist > 0) or not paddle.all(negative_dist > 0):
         raise ValueError(
             "The positive distance or negative distance should be greater than 0, "
-            "The distance functions should be checked.")
+            "The distance functions should be checked."
+        )
 
     loss = paddle.clip(positive_dist - negative_dist + margin, min=0.0)
 
@@ -3331,15 +3703,17 @@ def triplet_margin_with_distance_loss(input,
         return loss
 
 
-def triplet_margin_loss(input,
-                        positive,
-                        negative,
-                        margin=1.0,
-                        p=2,
-                        epsilon=1e-6,
-                        swap=False,
-                        reduction='mean',
-                        name=None):
+def triplet_margin_loss(
+    input,
+    positive,
+    negative,
+    margin=1.0,
+    p=2,
+    epsilon=1e-6,
+    swap=False,
+    reduction='mean',
+    name=None,
+):
     r"""
         Measures the triplet loss given an input
         tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
@@ -3416,23 +3790,29 @@ def triplet_margin_loss(input,
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "'reduction' in 'triplet_margin_loss' should be 'sum', 'mean' or 'none', "
-            "but received {}.".format(reduction))
+            "but received {}.".format(reduction)
+        )
     if margin < 0:
         raise ValueError(
             "The margin between positive samples and negative samples should be greater than 0."
         )
     if not _non_static_mode():
-        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                 'triplet_margin_loss')
-        check_variable_and_dtype(positive, 'positive', ['float32', 'float64'],
-                                 'triplet_margin_loss')
-        check_variable_and_dtype(negative, 'negative', ['float32', 'float64'],
-                                 'triplet_margin_loss')
+        check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'triplet_margin_loss'
+        )
+        check_variable_and_dtype(
+            positive, 'positive', ['float32', 'float64'], 'triplet_margin_loss'
+        )
+        check_variable_and_dtype(
+            negative, 'negative', ['float32', 'float64'], 'triplet_margin_loss'
+        )
 
     if not (input.shape == positive.shape == negative.shape):
-        raise ValueError("input's shape must equal to "
-                         "positive's shape and  "
-                         "negative's shape")
+        raise ValueError(
+            "input's shape must equal to "
+            "positive's shape and  "
+            "negative's shape"
+        )
 
     distance_function = paddle.nn.PairwiseDistance(p, epsilon=epsilon)
     positive_dist = distance_function(input, positive)
@@ -3505,20 +3885,23 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "The value of 'reduction' in soft_margin_loss should be 'sum', "
-            "'mean' or 'none', but received %s, which is not allowed." %
-            reduction)
+            "'mean' or 'none', but received %s, which is not allowed."
+            % reduction
+        )
 
     if not _non_static_mode():
-        fluid.data_feeder.check_variable_and_dtype(input, 'input',
-                                                   ['float32', 'float64'],
-                                                   'soft_margin_loss')
         fluid.data_feeder.check_variable_and_dtype(
-            label, 'label', ['int32', 'int64', 'float32', 'float64'],
-            'soft_margin_loss')
+            input, 'input', ['float32', 'float64'], 'soft_margin_loss'
+        )
+        fluid.data_feeder.check_variable_and_dtype(
+            label,
+            'label',
+            ['int32', 'int64', 'float32', 'float64'],
+            'soft_margin_loss',
+        )
 
     if not (input.shape == label.shape):
-        raise ValueError("input's shape must equal to "
-                         "label's shape")
+        raise ValueError("input's shape must equal to " "label's shape")
 
     label = fluid.layers.cast(label, input.dtype)
     out = paddle.log(1 + paddle.exp(-label * input))
diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py
index 1f5d7436302..20541b0ba19 100644
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -24,7 +24,12 @@ from ...fluid import dygraph_utils
 import numbers
 from paddle import _C_ops, _legacy_C_ops
 from paddle import in_dynamic_mode
-from paddle.fluid.framework import core, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph
+from paddle.fluid.framework import (
+    core,
+    _non_static_mode,
+    in_dygraph_mode,
+    _in_legacy_dygraph,
+)
 
 __all__ = []
 
@@ -86,18 +91,30 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
 
     if _in_legacy_dygraph():
         eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype)
-        out = _legacy_C_ops.p_norm(x, 'axis', axis, 'porder', float(p),
-                                   'keepdim', True, 'epsilon', epsilon)
+        out = _legacy_C_ops.p_norm(
+            x,
+            'axis',
+            axis,
+            'porder',
+            float(p),
+            'keepdim',
+            True,
+            'epsilon',
+            epsilon,
+        )
         return x / _legacy_C_ops.elementwise_max(out, eps)
 
     check_type(p, 'p', (float, int), 'normalize')
     check_type(axis, 'axis', (int), 'normalize')
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'normalize')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'normalize'
+    )
     if len(x.shape) == 1 and axis != 0 and axis != -1:
         raise ValueError(
-            "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}"
-            .format(axis))
+            "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}".format(
+                axis
+            )
+        )
 
     attrs = {
         'axis': axis,
@@ -107,26 +124,27 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None):
     }
     helper = LayerHelper('p_norm', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type='p_norm',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs=attrs)
+    helper.append_op(
+        type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs
+    )
     eps = out.block.create_var(dtype=out.dtype)
     eps = paddle.full(shape=[1], fill_value=epsilon, dtype=out.dtype)
     return paddle.divide(x, paddle.maximum(out, eps), name=name)
 
 
-def batch_norm(x,
-               running_mean,
-               running_var,
-               weight,
-               bias,
-               training=False,
-               momentum=0.9,
-               epsilon=1e-05,
-               data_format="NCHW",
-               use_global_stats=None,
-               name=None):
+def batch_norm(
+    x,
+    running_mean,
+    running_var,
+    weight,
+    bias,
+    training=False,
+    momentum=0.9,
+    epsilon=1e-05,
+    data_format="NCHW",
+    use_global_stats=None,
+    name=None,
+):
     """
     Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
 
@@ -178,7 +196,8 @@ def batch_norm(x,
     if data_format not in true_data_format:
         raise ValueError(
             "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', "
-            "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format))
+            "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)
+        )
 
     data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC'
 
@@ -190,29 +209,64 @@ def batch_norm(x,
 
     if in_dygraph_mode():
         batch_norm_out, _, _, _, _, _ = _C_ops.batch_norm(
-            x, weight, bias, running_mean, running_var, momentum, epsilon,
-            data_format, not training, use_global_stats, trainable_statistics,
-            False)
-
-        return dygraph_utils._append_activation_in_dygraph(batch_norm_out,
-                                                           act=None)
+            x,
+            weight,
+            bias,
+            running_mean,
+            running_var,
+            momentum,
+            epsilon,
+            data_format,
+            not training,
+            use_global_stats,
+            trainable_statistics,
+            False,
+        )
+
+        return dygraph_utils._append_activation_in_dygraph(
+            batch_norm_out, act=None
+        )
 
     elif _in_legacy_dygraph():
         # for dygraph need tuple
-        attrs = ("momentum", momentum, "epsilon", epsilon, "is_test",
-                 not training, "data_layout", data_format, "use_mkldnn", False,
-                 "fuse_with_relu", False, "use_global_stats", use_global_stats,
-                 "trainable_statistics", trainable_statistics)
+        attrs = (
+            "momentum",
+            momentum,
+            "epsilon",
+            epsilon,
+            "is_test",
+            not training,
+            "data_layout",
+            data_format,
+            "use_mkldnn",
+            False,
+            "fuse_with_relu",
+            False,
+            "use_global_stats",
+            use_global_stats,
+            "trainable_statistics",
+            trainable_statistics,
+        )
 
         batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm(
-            x, weight, bias, running_mean, running_var, None, mean_out,
-            variance_out, *attrs)
-
-        return dygraph_utils._append_activation_in_dygraph(batch_norm_out,
-                                                           act=None)
-
-    check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
-                             'BatchNorm')
+            x,
+            weight,
+            bias,
+            running_mean,
+            running_var,
+            None,
+            mean_out,
+            variance_out,
+            *attrs
+        )
+
+        return dygraph_utils._append_activation_in_dygraph(
+            batch_norm_out, act=None
+        )
+
+    check_variable_and_dtype(
+        x, 'input', ['float16', 'float32', 'float64'], 'BatchNorm'
+    )
 
     # for static need dict
     attrs = {
@@ -231,16 +285,18 @@ def batch_norm(x,
         "Scale": [weight],
         "Bias": [bias],
         "Mean": [running_mean],
-        "Variance": [running_var]
+        "Variance": [running_var],
     }
 
     helper = LayerHelper('batch_norm', **locals())
 
     param_dtype = x.dtype if x.dtype != 'float16' else 'float32'
-    saved_mean = helper.create_variable_for_type_inference(dtype=param_dtype,
-                                                           stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=param_dtype, stop_gradient=True
+    )
     saved_variance = helper.create_variable_for_type_inference(
-        dtype=param_dtype, stop_gradient=True)
+        dtype=param_dtype, stop_gradient=True
+    )
     batch_norm_out = helper.create_variable_for_type_inference(x.dtype)
 
     outputs = {
@@ -248,29 +304,26 @@ def batch_norm(x,
         "MeanOut": [running_mean],
         "VarianceOut": [running_var],
         "SavedMean": [saved_mean],
-        "SavedVariance": [saved_variance]
+        "SavedVariance": [saved_variance],
     }
 
     if training or trainable_statistics:
         # reserve_space is only used for training.
         reserve_space = helper.create_variable_for_type_inference(
-            dtype=x.dtype, stop_gradient=True)
+            dtype=x.dtype, stop_gradient=True
+        )
         outputs["ReserveSpace"] = [reserve_space]
 
-    helper.append_op(type="batch_norm",
-                     inputs=inputs,
-                     outputs=outputs,
-                     attrs=attrs)
+    helper.append_op(
+        type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs
+    )
 
     return helper.append_activation(batch_norm_out)
 
 
-def layer_norm(x,
-               normalized_shape,
-               weight=None,
-               bias=None,
-               epsilon=1e-05,
-               name=None):
+def layer_norm(
+    x, normalized_shape, weight=None, bias=None, epsilon=1e-05, name=None
+):
     """
     see more detail in paddle.nn.LayerNorm
 
@@ -310,32 +363,49 @@ def layer_norm(x,
         normalized_shape = list(normalized_shape)
     elif not isinstance(normalized_shape, list):
         raise ValueError(
-            "`normalized_shape` should be int, list of ints or tuple of ints.")
+            "`normalized_shape` should be int, list of ints or tuple of ints."
+        )
 
     normalized_ndim = len(normalized_shape)
     begin_norm_axis = input_ndim - normalized_ndim
-    if input_ndim < normalized_ndim or input_shape[
-            begin_norm_axis:] != normalized_shape:
+    if (
+        input_ndim < normalized_ndim
+        or input_shape[begin_norm_axis:] != normalized_shape
+    ):
         str_normalized_shape = str(normalized_shape)
-        raise ValueError('Given normalized_shape is ' + str_normalized_shape +
-                         ', expected input with shape [*, ' +
-                         str_normalized_shape[1:] + ', but got input shape ' +
-                         str(input_shape))
+        raise ValueError(
+            'Given normalized_shape is '
+            + str_normalized_shape
+            + ', expected input with shape [*, '
+            + str_normalized_shape[1:]
+            + ', but got input shape '
+            + str(input_shape)
+        )
 
     if in_dygraph_mode():
-        pre_act, _, _, = _C_ops.layer_norm(x, weight, bias, epsilon,
-                                           begin_norm_axis, False)
+        (
+            pre_act,
+            _,
+            _,
+        ) = _C_ops.layer_norm(x, weight, bias, epsilon, begin_norm_axis, False)
 
         return dygraph_utils._append_activation_in_dygraph(pre_act, act=None)
 
     if _in_legacy_dygraph():
-        pre_act, _, _ = _legacy_C_ops.layer_norm(x, weight, bias, 'epsilon',
-                                                 epsilon, 'begin_norm_axis',
-                                                 begin_norm_axis)
+        pre_act, _, _ = _legacy_C_ops.layer_norm(
+            x,
+            weight,
+            bias,
+            'epsilon',
+            epsilon,
+            'begin_norm_axis',
+            begin_norm_axis,
+        )
         return dygraph_utils._append_activation_in_dygraph(pre_act, act=None)
 
-    check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
-                             'LayerNorm')
+    check_variable_and_dtype(
+        x, 'input', ['float16', 'float32', 'float64'], 'LayerNorm'
+    )
 
     inputs = dict()
     inputs['X'] = [x]
@@ -349,37 +419,40 @@ def layer_norm(x,
     helper = LayerHelper('layer_norm', **locals())
 
     dtype = x.dtype
-    mean_out = helper.create_variable_for_type_inference(dtype=dtype,
-                                                         stop_gradient=True)
-    variance_out = helper.create_variable_for_type_inference(dtype=dtype,
-                                                             stop_gradient=True)
+    mean_out = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True
+    )
+    variance_out = helper.create_variable_for_type_inference(
+        dtype=dtype, stop_gradient=True
+    )
     layer_norm_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(type="layer_norm",
-                     inputs=inputs,
-                     outputs={
-                         "Y": layer_norm_out,
-                         "Mean": mean_out,
-                         "Variance": variance_out,
-                     },
-                     attrs={
-                         "epsilon": epsilon,
-                         "begin_norm_axis": begin_norm_axis
-                     })
+    helper.append_op(
+        type="layer_norm",
+        inputs=inputs,
+        outputs={
+            "Y": layer_norm_out,
+            "Mean": mean_out,
+            "Variance": variance_out,
+        },
+        attrs={"epsilon": epsilon, "begin_norm_axis": begin_norm_axis},
+    )
 
     return helper.append_activation(layer_norm_out)
 
 
-def instance_norm(x,
-                  running_mean=None,
-                  running_var=None,
-                  weight=None,
-                  bias=None,
-                  use_input_stats=True,
-                  momentum=0.9,
-                  eps=1e-05,
-                  data_format="NCHW",
-                  name=None):
+def instance_norm(
+    x,
+    running_mean=None,
+    running_var=None,
+    weight=None,
+    bias=None,
+    use_input_stats=True,
+    momentum=0.9,
+    eps=1e-05,
+    data_format="NCHW",
+    name=None,
+):
     """
     See more detail in nn.layer.InstanceNorm2D.
 
@@ -417,9 +490,17 @@ def instance_norm(x,
         out = _C_ops.instance_norm(x, weight, bias, eps)
         return out
     if _in_legacy_dygraph():
-        out, _, _ = _legacy_C_ops.instance_norm(x, weight, bias, "epsilon", eps,
-                                                "momentum", momentum,
-                                                "data_format", data_format)
+        out, _, _ = _legacy_C_ops.instance_norm(
+            x,
+            weight,
+            bias,
+            "epsilon",
+            eps,
+            "momentum",
+            momentum,
+            "data_format",
+            data_format,
+        )
         return out
 
     check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm")
@@ -432,104 +513,106 @@ def instance_norm(x,
         inputs = {"X": [x]}
 
     helper = LayerHelper('instance_norm', **locals())
-    saved_mean = helper.create_variable_for_type_inference(dtype=x.dtype,
-                                                           stop_gradient=True)
+    saved_mean = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True
+    )
     saved_variance = helper.create_variable_for_type_inference(
-        dtype=x.dtype, stop_gradient=True)
+        dtype=x.dtype, stop_gradient=True
+    )
     instance_norm_out = helper.create_variable_for_type_inference(x.dtype)
 
     outputs = {
         "Y": [instance_norm_out],
         "SavedMean": [saved_mean],
-        "SavedVariance": [saved_variance]
+        "SavedVariance": [saved_variance],
     }
 
-    helper.append_op(type="instance_norm",
-                     inputs=inputs,
-                     outputs=outputs,
-                     attrs=attrs)
+    helper.append_op(
+        type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs
+    )
     return instance_norm_out
 
 
-def local_response_norm(x,
-                        size,
-                        alpha=1e-4,
-                        beta=0.75,
-                        k=1.,
-                        data_format="NCHW",
-                        name=None):
+def local_response_norm(
+    x, size, alpha=1e-4, beta=0.75, k=1.0, data_format="NCHW", name=None
+):
     r"""
-        Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions.
-        For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
-
-        The formula is as follows:
-
-        .. math::
-
-            Output(i, x, y) = Input(i, x, y) / \left(k + \alpha \sum\limits^{\min(C-1, i + size/2)}_{j = \max(0, i - size/2)}(Input(j, x, y))^2\right)^{\beta}
-
-        In the above equation:
+    Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions.
+    For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
 
-        - :math:`size` : The number of channels to sum over.
-        - :math:`k` : The offset (avoid being divided by 0).
-        - :math:`\\alpha` : The scaling parameter.
-        - :math:`\\beta` : The exponent parameter.
+    The formula is as follows:
 
+    .. math::
 
-        Args:
-            x (Tensor): The input 3-D/4-D/5-D tensor. The data type is float32.
-            size (int): The number of channels to sum over.
-            alpha (float, optional): The scaling parameter, positive. Default:1e-4
-            beta (float, optional): The exponent, positive. Default:0.75
-            k (float, optional): An offset, positive. Default: 1.0
-            data_format (str, optional): Specify the data format of the input, and the data format of the output
-                will be consistent with that of the input. An optional string from:
-                If x is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`,
-                the data is stored in the order of: `[batch_size, input_channels, feature_length]`.
-                If x is 4-D Tensor, the string could be  `"NCHW"`, `"NHWC"`. When it is `"NCHW"`,
-                the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`.
-                If x is 5-D Tensor, the string could be  `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`,
-                the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
-            name (str, optional): Name for the operation (optional, default is None). For more information,
-                please refer to :ref:`api_guide_Name`.
+        Output(i, x, y) = Input(i, x, y) / \left(k + \alpha \sum\limits^{\min(C-1, i + size/2)}_{j = \max(0, i - size/2)}(Input(j, x, y))^2\right)^{\beta}
+
+    In the above equation:
+
+    - :math:`size` : The number of channels to sum over.
+    - :math:`k` : The offset (avoid being divided by 0).
+    - :math:`\\alpha` : The scaling parameter.
+    - :math:`\\beta` : The exponent parameter.
+
+
+    Args:
+        x (Tensor): The input 3-D/4-D/5-D tensor. The data type is float32.
+        size (int): The number of channels to sum over.
+        alpha (float, optional): The scaling parameter, positive. Default:1e-4
+        beta (float, optional): The exponent, positive. Default:0.75
+        k (float, optional): An offset, positive. Default: 1.0
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from:
+            If x is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`,
+            the data is stored in the order of: `[batch_size, input_channels, feature_length]`.
+            If x is 4-D Tensor, the string could be  `"NCHW"`, `"NHWC"`. When it is `"NCHW"`,
+            the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`.
+            If x is 5-D Tensor, the string could be  `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`,
+            the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name (str, optional): Name for the operation (optional, default is None). For more information,
+            please refer to :ref:`api_guide_Name`.
 
-        Returns:
-            A tensor storing the transformation result with the same shape and data type as input.
+    Returns:
+        A tensor storing the transformation result with the same shape and data type as input.
 
 
-        Examples:
+    Examples:
 
-        .. code-block:: python
+    .. code-block:: python
 
-            import paddle
+        import paddle
 
-            x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32")
-            y = paddle.nn.functional.local_response_norm(x, size=5)
-            print(y.shape)  # [3, 3, 112, 112]
-        """
+        x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32")
+        y = paddle.nn.functional.local_response_norm(x, size=5)
+        print(y.shape)  # [3, 3, 112, 112]
+    """
     if not in_dynamic_mode():
         check_variable_and_dtype(x, 'x', ['float32'], 'local_response_norm')
     if data_format not in ['NCL', 'NLC', 'NCHW', 'NHWC', 'NCDHW', 'NDHWC']:
         raise ValueError(
-            "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " \
-            "but got {}".format(data_format))
+            "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], "
+            "but got {}".format(data_format)
+        )
 
     sizes = x.shape
     dim = len(sizes)
     if dim < 3:
         raise ValueError(
-            'Expected 3D or higher dimensionality input, but got {} dimensions'.
-            format(dim))
+            'Expected 3D or higher dimensionality input, but got {} dimensions'.format(
+                dim
+            )
+        )
 
     for i, sz in enumerate(sizes):
         if not sz > 0 and i > 0:
-            raise ValueError("Expected every dim's size to be larger than 0, "
-                             "but the size of the {}-th dim is {}".format(
-                                 i, sz))
+            raise ValueError(
+                "Expected every dim's size to be larger than 0, "
+                "but the size of the {}-th dim is {}".format(i, sz)
+            )
 
     channel_last = True if data_format[-1] == "C" else False
 
     from functools import reduce
+
     sum_sizes = reduce(lambda x, y: x * y, sizes[1:])
 
     div = paddle.unsqueeze(paddle.multiply(x, x), axis=1)
@@ -537,8 +620,11 @@ def local_response_norm(x,
         pad4d_shape = [0, 0, size // 2, (size - 1) // 2]
         pool2d_shape = (size, 1)
         reshape_shape = [
-            sizes[0], 1, sizes[1], sizes[2],
-            int(sum_sizes / (sizes[1] * sizes[2]))
+            sizes[0],
+            1,
+            sizes[1],
+            sizes[2],
+            int(sum_sizes / (sizes[1] * sizes[2])),
         ]
         pad5d_shape = [0, 0, 0, 0, size // 2, (size - 1) // 2]
         pool3d_shape = (size, 1, 1)
@@ -546,26 +632,29 @@ def local_response_norm(x,
         pad4d_shape = [size // 2, (size - 1) // 2, 0, 0]
         pool2d_shape = (1, size)
         reshape_shape = [
-            sizes[0], 1, sizes[1],
-            int(sum_sizes / (sizes[1] * sizes[-1])), sizes[-1]
+            sizes[0],
+            1,
+            sizes[1],
+            int(sum_sizes / (sizes[1] * sizes[-1])),
+            sizes[-1],
         ]
         pad5d_shape = [size // 2, (size - 1) // 2, 0, 0, 0, 0]
         pool3d_shape = (1, 1, size)
 
     if dim == 3:
         div = paddle.nn.functional.pad(div, pad=pad4d_shape)
-        div = paddle.nn.functional.avg_pool2d(div,
-                                              kernel_size=pool2d_shape,
-                                              stride=1)
+        div = paddle.nn.functional.avg_pool2d(
+            div, kernel_size=pool2d_shape, stride=1
+        )
         div = paddle.squeeze(div, axis=1)
     else:
         div = paddle.reshape(div, shape=reshape_shape)
-        div = paddle.nn.functional.pad(div,
-                                       pad=pad5d_shape,
-                                       data_format='NCDHW')
-        div = paddle.nn.functional.avg_pool3d(div,
-                                              kernel_size=pool3d_shape,
-                                              stride=1)
+        div = paddle.nn.functional.pad(
+            div, pad=pad5d_shape, data_format='NCDHW'
+        )
+        div = paddle.nn.functional.avg_pool3d(
+            div, kernel_size=pool3d_shape, stride=1
+        )
         div = paddle.reshape(paddle.squeeze(div, axis=1), sizes)
 
     div = paddle.scale(div, scale=alpha, bias=k)
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index 4fd6c75e4c6..7df6e8ae822 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -33,7 +33,9 @@ def _check_input(x, dimension):
     if len(x.shape) != dimension:
         raise ValueError(
             "Excepted Input X is {}-D tensor, but received {}-D {}".format(
-                dimension, len(x.shape), type(x)))
+                dimension, len(x.shape), type(x)
+            )
+        )
 
 
 def _check_instance(x, x_name, types=(int, float)):
@@ -41,16 +43,19 @@ def _check_instance(x, x_name, types=(int, float)):
     if not isinstance(x, types):
         raise ValueError(
             "Excepted {} type for {} but received type: {}. ".format(
-                types, x_name, type(x)))
+                types, x_name, type(x)
+            )
+        )
 
 
 def _check_value_limitation(x, x_name, min_limit=1e-3):
-
     def _check_value(x, x_name, min_limit=1e-3):
         if isinstance(x, int) and min_limit is not None and x < min_limit:
             raise ValueError(
-                "Excepted the input {} to be greater than {} but received x: {}. "
-                .format(x_name, min_limit, x))
+                "Excepted the input {} to be greater than {} but received x: {}. ".format(
+                    x_name, min_limit, x
+                )
+            )
 
     for ele in x:
         _check_value(ele, x_name)
@@ -74,21 +79,24 @@ def _channel_last(data_format, num_dims):
         if data_format not in ['NCL', 'NLC']:
             raise ValueError(
                 "Attr(data_format) should be 'NCL' or 'NLC'. Received "
-                "Attr(data_format): %s" % str(data_format))
+                "Attr(data_format): %s" % str(data_format)
+            )
         else:
             return True if data_format == "NLC" else False
     if num_dims == 2:
         if data_format not in ['NCHW', 'NHWC']:
             raise ValueError(
                 "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
-                "Attr(data_format): %s" % str(data_format))
+                "Attr(data_format): %s" % str(data_format)
+            )
         else:
             return True if data_format == "NHWC" else False
     if num_dims == 3:
         if data_format not in ['NCDHW', 'NDHWC']:
             raise ValueError(
                 "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
-                "Attr(data_format): %s" % str(data_format))
+                "Attr(data_format): %s" % str(data_format)
+            )
         else:
             return True if data_format == "NDHWC" else False
 
@@ -98,13 +106,16 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False):
         padding = padding.upper()
         if padding not in ["SAME", "VALID"]:
             raise ValueError(
-                "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".
-                format(padding))
+                "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".format(
+                    padding
+                )
+            )
         if padding == "VALID":
             if ceil_mode != False:
                 raise ValueError(
                     "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. "
-                    "Received ceil_mode: True.")
+                    "Received ceil_mode: True."
+                )
 
             padding_algorithm = "VALID"
             padding = [0] * num_dims
@@ -119,10 +130,12 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False):
             if not _zero_padding_in_batch_and_channel(padding, channel_last):
                 raise ValueError(
                     "Non-zero padding({}) in the batch or channel dimensions "
-                    "is not supported.".format(padding))
+                    "is not supported.".format(padding)
+                )
             padding_algorithm = "EXPLICIT"
             padding = _exclude_padding_in_batch_and_channel(
-                padding, channel_last)
+                padding, channel_last
+            )
             if utils._is_symmetric_padding(padding, num_dims):
                 padding = padding[0::2]
         # for padding like [pad_before, pad_after, pad_before, pad_after, ...]
@@ -145,25 +158,29 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False):
 
 
 def _expand_low_nd_padding(padding):
-    #1d to 2d fake input
+    # 1d to 2d fake input
     if len(padding) == 2:
         padding = [0] * 2 + padding
     elif len(padding) == 1:
         padding = [0] + padding
     else:
         raise ValueError(
-            "The size of padding's dimmention should be 1 or 2. But got padding={}"
-            .format(padding))
+            "The size of padding's dimmention should be 1 or 2. But got padding={}".format(
+                padding
+            )
+        )
     return padding
 
 
-def avg_pool1d(x,
-               kernel_size,
-               stride=None,
-               padding=0,
-               exclusive=True,
-               ceil_mode=False,
-               name=None):
+def avg_pool1d(
+    x,
+    kernel_size,
+    stride=None,
+    padding=0,
+    exclusive=True,
+    ceil_mode=False,
+    name=None,
+):
     """
     This API implements average pooling 1d operation,
     See more details in :ref:`api_nn_pooling_AvgPool1d` .
@@ -195,7 +212,7 @@ def avg_pool1d(x,
 
     Examples:
         .. code-block:: python
-          
+
             import paddle
             import paddle.nn as nn
 
@@ -222,28 +239,56 @@ def avg_pool1d(x,
     _check_value_limitation(stride, "stride", min_limit=1e-3)
 
     channel_last = _channel_last("NCL", 1)
-    padding, padding_algorithm = _update_padding_nd(padding,
-                                                    1,
-                                                    channel_last=channel_last,
-                                                    ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 1, channel_last=channel_last, ceil_mode=ceil_mode
+    )
 
     # use 2d to implenment 1d should expand padding in advance.
     padding = _expand_low_nd_padding(padding)
 
     if in_dygraph_mode():
-        output = _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode,
-                               exclusive, data_format, 'avg', False, False,
-                               padding_algorithm, True)
+        output = _C_ops.pool2d(
+            x,
+            kernel_size,
+            stride,
+            padding,
+            ceil_mode,
+            exclusive,
+            data_format,
+            'avg',
+            False,
+            False,
+            padding_algorithm,
+            True,
+        )
         return squeeze(output, [2])
 
     if _in_legacy_dygraph():
-        output = _legacy_C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize',
-                                      kernel_size, 'global_pooling', False,
-                                      'strides', stride, 'paddings', padding,
-                                      'padding_algorithm', padding_algorithm,
-                                      'use_cudnn', True, 'ceil_mode', ceil_mode,
-                                      'use_mkldnn', False, 'exclusive',
-                                      exclusive, 'data_format', data_format)
+        output = _legacy_C_ops.pool2d(
+            x,
+            'pooling_type',
+            'avg',
+            'ksize',
+            kernel_size,
+            'global_pooling',
+            False,
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            'padding_algorithm',
+            padding_algorithm,
+            'use_cudnn',
+            True,
+            'ceil_mode',
+            ceil_mode,
+            'use_mkldnn',
+            False,
+            'exclusive',
+            exclusive,
+            'data_format',
+            data_format,
+        )
         return squeeze(output, [2])
 
     op_type = 'pool2d'
@@ -251,35 +296,39 @@ def avg_pool1d(x,
     dtype = helper.input_dtype(input_param_name='x')
     pool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(type=op_type,
-                     inputs={"X": x},
-                     outputs={"Out": pool_out},
-                     attrs={
-                         "pooling_type": 'avg',
-                         "ksize": kernel_size,
-                         "global_pooling": False,
-                         "strides": stride,
-                         "paddings": padding,
-                         "padding_algorithm": padding_algorithm,
-                         "use_cudnn": True,
-                         "ceil_mode": ceil_mode,
-                         "use_mkldnn": False,
-                         "exclusive": exclusive,
-                         "data_format": data_format,
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": 'avg',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": exclusive,
+            "data_format": data_format,
+        },
+    )
 
     return squeeze(pool_out, [2])
 
 
-def avg_pool2d(x,
-               kernel_size,
-               stride=None,
-               padding=0,
-               ceil_mode=False,
-               exclusive=True,
-               divisor_override=None,
-               data_format="NCHW",
-               name=None):
+def avg_pool2d(
+    x,
+    kernel_size,
+    stride=None,
+    padding=0,
+    ceil_mode=False,
+    exclusive=True,
+    divisor_override=None,
+    data_format="NCHW",
+    name=None,
+):
     """
     This API implements average pooling 2d operation.
     See more details in :ref:`api_nn_pooling_AvgPool2d` .
@@ -314,16 +363,16 @@ def avg_pool2d(x,
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-    
+
     Returns:
         Tensor: The output tensor of pooling result. The data type is same as input tensor.
-    
+
     Examples:
         .. code-block:: python
-          
+
             import paddle
             import paddle.nn.functional as F
-            
+
             # avg pool2d
             x = paddle.uniform([1, 3, 32, 32], paddle.float32)
             out = F.avg_pool2d(x,
@@ -341,23 +390,52 @@ def avg_pool2d(x,
     _check_value_limitation(stride, "stride", min_limit=1e-3)
 
     channel_last = _channel_last(data_format, 2)
-    padding, padding_algorithm = _update_padding_nd(padding,
-                                                    2,
-                                                    channel_last,
-                                                    ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 2, channel_last, ceil_mode=ceil_mode
+    )
 
     if _non_static_mode():
         if in_dygraph_mode():
-            output = _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode,
-                                   exclusive, data_format, 'avg', False, False,
-                                   padding_algorithm, True)
+            output = _C_ops.pool2d(
+                x,
+                kernel_size,
+                stride,
+                padding,
+                ceil_mode,
+                exclusive,
+                data_format,
+                'avg',
+                False,
+                False,
+                padding_algorithm,
+                True,
+            )
         else:
             output = _legacy_C_ops.pool2d(
-                x, 'pooling_type', 'avg', 'ksize', kernel_size,
-                'global_pooling', False, 'padding_algorithm', padding_algorithm,
-                'strides', stride, 'paddings', padding, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive',
-                exclusive, 'data_format', data_format)
+                x,
+                'pooling_type',
+                'avg',
+                'ksize',
+                kernel_size,
+                'global_pooling',
+                False,
+                'padding_algorithm',
+                padding_algorithm,
+                'strides',
+                stride,
+                'paddings',
+                padding,
+                'use_cudnn',
+                True,
+                'ceil_mode',
+                ceil_mode,
+                'use_mkldnn',
+                False,
+                'exclusive',
+                exclusive,
+                'data_format',
+                data_format,
+            )
         if divisor_override is None:
             return output
         else:
@@ -370,22 +448,24 @@ def avg_pool2d(x,
     dtype = helper.input_dtype(input_param_name='x')
     pool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(type=op_type,
-                     inputs={"X": x},
-                     outputs={"Out": pool_out},
-                     attrs={
-                         "pooling_type": "avg",
-                         "ksize": kernel_size,
-                         "global_pooling": False,
-                         "strides": stride,
-                         "paddings": padding,
-                         "padding_algorithm": padding_algorithm,
-                         "use_cudnn": True,
-                         "ceil_mode": ceil_mode,
-                         "use_mkldnn": False,
-                         "exclusive": exclusive,
-                         "data_format": data_format,
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": "avg",
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": exclusive,
+            "data_format": data_format,
+        },
+    )
 
     if divisor_override is None:
         return pool_out
@@ -394,15 +474,17 @@ def avg_pool2d(x,
         return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override
 
 
-def avg_pool3d(x,
-               kernel_size,
-               stride=None,
-               padding=0,
-               ceil_mode=False,
-               exclusive=True,
-               divisor_override=None,
-               data_format="NCDHW",
-               name=None):
+def avg_pool3d(
+    x,
+    kernel_size,
+    stride=None,
+    padding=0,
+    ceil_mode=False,
+    exclusive=True,
+    divisor_override=None,
+    data_format="NCDHW",
+    name=None,
+):
     """
     This API implements average pooling 3d operation.
     See more details in :ref:`api_nn_pooling_AvgPool3d` .
@@ -435,13 +517,13 @@ def avg_pool3d(x,
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-    
+
     Returns:
         Tensor: The output tensor of pooling result. The data type is same as input tensor.
 
     Examples:
         .. code-block:: python
-          
+
           import paddle
 
           x = paddle.uniform([1, 3, 32, 32, 32], paddle.float32)
@@ -460,25 +542,54 @@ def avg_pool3d(x,
         stride = utils.convert_to_list(stride, 3, 'pool_stride')
 
     channel_last = _channel_last(data_format, 3)
-    padding, padding_algorithm = _update_padding_nd(padding,
-                                                    3,
-                                                    channel_last=channel_last,
-                                                    ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 3, channel_last=channel_last, ceil_mode=ceil_mode
+    )
 
     _check_value_limitation(kernel_size, "kernel_size", min_limit=1e-3)
     _check_value_limitation(stride, "stride", min_limit=1e-3)
 
     if in_dygraph_mode():
-        pool_out = _C_ops.pool3d(x, kernel_size, stride, padding, ceil_mode,
-                                 exclusive, data_format, 'avg', False, False,
-                                 padding_algorithm, True)
+        pool_out = _C_ops.pool3d(
+            x,
+            kernel_size,
+            stride,
+            padding,
+            ceil_mode,
+            exclusive,
+            data_format,
+            'avg',
+            False,
+            False,
+            padding_algorithm,
+            True,
+        )
     elif _in_legacy_dygraph():
         pool_out = _legacy_C_ops.pool3d(
-            x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride,
-            'paddings', padding, 'global_pooling', False, 'padding_algorithm',
-            padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
-            'use_mkldnn', False, 'exclusive', exclusive, 'data_format',
-            data_format)
+            x,
+            'pooling_type',
+            'avg',
+            'ksize',
+            kernel_size,
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            'global_pooling',
+            False,
+            'padding_algorithm',
+            padding_algorithm,
+            'use_cudnn',
+            True,
+            'ceil_mode',
+            ceil_mode,
+            'use_mkldnn',
+            False,
+            'exclusive',
+            exclusive,
+            'data_format',
+            data_format,
+        )
     else:
         op_type = "pool3d"
         helper = LayerHelper(op_type, **locals())
@@ -487,38 +598,45 @@ def avg_pool3d(x,
         pool_out = helper.create_variable_for_type_inference(dtype)
         outputs = {"Out": pool_out}
 
-        helper.append_op(type=op_type,
-                         inputs={"X": x},
-                         outputs=outputs,
-                         attrs={
-                             "pooling_type": 'avg',
-                             "ksize": kernel_size,
-                             "global_pooling": False,
-                             "strides": stride,
-                             "paddings": padding,
-                             "padding_algorithm": padding_algorithm,
-                             "use_cudnn": True,
-                             "ceil_mode": ceil_mode,
-                             "use_mkldnn": False,
-                             "exclusive": exclusive,
-                             "data_format": data_format,
-                         })
+        helper.append_op(
+            type=op_type,
+            inputs={"X": x},
+            outputs=outputs,
+            attrs={
+                "pooling_type": 'avg',
+                "ksize": kernel_size,
+                "global_pooling": False,
+                "strides": stride,
+                "paddings": padding,
+                "padding_algorithm": padding_algorithm,
+                "use_cudnn": True,
+                "ceil_mode": ceil_mode,
+                "use_mkldnn": False,
+                "exclusive": exclusive,
+                "data_format": data_format,
+            },
+        )
 
     if divisor_override is None:
         return pool_out
     else:
         _check_instance(divisor_override, "divisor_override")
-        return pool_out * (kernel_size[0] * kernel_size[1] *
-                           kernel_size[2]) / divisor_override
+        return (
+            pool_out
+            * (kernel_size[0] * kernel_size[1] * kernel_size[2])
+            / divisor_override
+        )
 
 
-def max_pool1d(x,
-               kernel_size,
-               stride=None,
-               padding=0,
-               return_mask=False,
-               ceil_mode=False,
-               name=None):
+def max_pool1d(
+    x,
+    kernel_size,
+    stride=None,
+    padding=0,
+    return_mask=False,
+    ceil_mode=False,
+    name=None,
+):
     """
     This API implements max pooling 1d opereation.
     See more details in :ref:`api_nn_pooling_MaxPool1d` .
@@ -577,44 +695,96 @@ def max_pool1d(x,
     else:
         stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride')
 
-    padding, padding_algorithm = _update_padding_nd(padding,
-                                                    1,
-                                                    ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 1, ceil_mode=ceil_mode
+    )
 
     # use 2d to implenment 1d should expand padding in advance.
     padding = _expand_low_nd_padding(padding)
 
     if in_dygraph_mode():
         if return_mask:
-            pool_out = _C_ops.max_pool2d_with_index(x, kernel_size, stride,
-                                                    padding, False, False)
-            return (squeeze(pool_out[0], [2]),
-                    squeeze(pool_out[1], [2])) if return_mask else squeeze(
-                        pool_out[0], [2])
+            pool_out = _C_ops.max_pool2d_with_index(
+                x, kernel_size, stride, padding, False, False
+            )
+            return (
+                (squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2]))
+                if return_mask
+                else squeeze(pool_out[0], [2])
+            )
         else:
-            pool_out = _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode,
-                                     True, data_format, 'max', False, False,
-                                     padding_algorithm, True)
+            pool_out = _C_ops.pool2d(
+                x,
+                kernel_size,
+                stride,
+                padding,
+                ceil_mode,
+                True,
+                data_format,
+                'max',
+                False,
+                False,
+                padding_algorithm,
+                True,
+            )
             return squeeze(pool_out, [2])
 
     if _in_legacy_dygraph():
         if return_mask:
             pool_out = _legacy_C_ops.max_pool2d_with_index(
-                x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
-                stride, 'paddings', padding, 'padding_algorithm',
-                padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
-                'use_mkldnn', False, 'exclusive', True, 'data_format',
-                data_format)
-            return (squeeze(pool_out[0], [2]),
-                    squeeze(pool_out[1], [2])) if return_mask else squeeze(
-                        pool_out[0], [2])
+                x,
+                'ksize',
+                kernel_size,
+                'global_pooling',
+                False,
+                'strides',
+                stride,
+                'paddings',
+                padding,
+                'padding_algorithm',
+                padding_algorithm,
+                'use_cudnn',
+                True,
+                'ceil_mode',
+                ceil_mode,
+                'use_mkldnn',
+                False,
+                'exclusive',
+                True,
+                'data_format',
+                data_format,
+            )
+            return (
+                (squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2]))
+                if return_mask
+                else squeeze(pool_out[0], [2])
+            )
         else:
             pool_out = _legacy_C_ops.pool2d(
-                x, 'pooling_type', 'max', 'ksize', kernel_size,
-                'global_pooling', False, 'padding_algorithm', padding_algorithm,
-                'strides', stride, 'paddings', padding, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
-                'data_format', data_format)
+                x,
+                'pooling_type',
+                'max',
+                'ksize',
+                kernel_size,
+                'global_pooling',
+                False,
+                'padding_algorithm',
+                padding_algorithm,
+                'strides',
+                stride,
+                'paddings',
+                padding,
+                'use_cudnn',
+                True,
+                'ceil_mode',
+                ceil_mode,
+                'use_mkldnn',
+                False,
+                'exclusive',
+                True,
+                'data_format',
+                data_format,
+            )
             return squeeze(pool_out, [2])
 
     op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
@@ -624,36 +794,44 @@ def max_pool1d(x,
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(type=op_type,
-                     inputs={"X": x},
-                     outputs=outputs,
-                     attrs={
-                         "pooling_type": 'max',
-                         "ksize": kernel_size,
-                         "global_pooling": False,
-                         "strides": stride,
-                         "paddings": padding,
-                         "padding_algorithm": padding_algorithm,
-                         "use_cudnn": True,
-                         "ceil_mode": ceil_mode,
-                         "use_mkldnn": False,
-                         "exclusive": True,
-                         "data_format": data_format,
-                     })
-
-    return (squeeze(pool_out, [2]),
-            squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2])
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": True,
+            "data_format": data_format,
+        },
+    )
+
+    return (
+        (squeeze(pool_out, [2]), squeeze(mask, [2]))
+        if return_mask
+        else squeeze(pool_out, [2])
+    )
 
 
 def _unpool_output_size(x, kernel_size, stride, padding, output_size):
-    assert output_size is None or isinstance(
-        output_size, (list, tuple)
-    ), "Required output_size is None|list|tuple, but received %s" % output_size
+    assert output_size is None or isinstance(output_size, (list, tuple)), (
+        "Required output_size is None|list|tuple, but received %s" % output_size
+    )
     input_size = x.shape
     default_size = []
     for d in range(len(kernel_size)):
-        default_size.append((input_size[-len(kernel_size) + d] - 1) *
-                            stride[d] + kernel_size[d] - 2 * padding[d])
+        default_size.append(
+            (input_size[-len(kernel_size) + d] - 1) * stride[d]
+            + kernel_size[d]
+            - 2 * padding[d]
+        )
 
     has_static_var = False
     if output_size is None:
@@ -673,37 +851,42 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size):
         raise ValueError(
             "output_size should be a sequence containing "
             "{} or {} elements, but it has a length of '{}'".format(
-                len(kernel_size),
-                len(kernel_size) + 2, len(output_size)))
+                len(kernel_size), len(kernel_size) + 2, len(output_size)
+            )
+        )
     if not has_static_var:
         for d in range(len(kernel_size)):
             min_size = default_size[d] - stride[d]
             max_size = default_size[d] + stride[d]
             if not (min_size < output_size[d] < max_size):
                 raise ValueError(
-                    'invalid output_size "{}" (dim {} must be between {} and {})'
-                    .format(output_size, d, min_size, max_size))
+                    'invalid output_size "{}" (dim {} must be between {} and {})'.format(
+                        output_size, d, min_size, max_size
+                    )
+                )
 
     return output_size
 
 
-def max_unpool1d(x,
-                 indices,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 data_format="NCL",
-                 output_size=None,
-                 name=None):
+def max_unpool1d(
+    x,
+    indices,
+    kernel_size,
+    stride=None,
+    padding=0,
+    data_format="NCL",
+    output_size=None,
+    name=None,
+):
     r"""
     This API implements max unpooling 1d opereation.
-    `max_unpool1d` accepts the output of `max_pool1d` as input, 
-    including the indices of the maximum value and calculate the partial inverse. 
+    `max_unpool1d` accepts the output of `max_pool1d` as input,
+    including the indices of the maximum value and calculates the partial inverse.
     All non-maximum values ​​are set to zero.
 
     - Input: :math:`(N, C, L_{in})`
     - Output: :math:`(N, C, L_{out})`, where
-    
+
     .. math::
         L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size
 
@@ -712,11 +895,11 @@ def max_unpool1d(x,
 
     Args:
         x (Tensor): The input tensor of unpooling operator which is a 3-D tensor with
-                          shape [N, C, L]. The format of input tensor is `"NCL"`, 
+                          shape [N, C, L]. The format of input tensor is `"NCL"`,
                           where `N` is batch size, `C` is the number of channels, `L` is
                           the length of the feature. The data type is float32 or float64.
         indices (Tensor): The indices given out by maxpooling1d which is a 3-D tensor with
-                          shape [N, C, L]. The format of input tensor is `"NCL"` , 
+                          shape [N, C, L]. The format of input tensor is `"NCL"` ,
                           where `N` is batch size, `C` is the number of channels, `L` is
                           the length of the featuree. The data type is float32 or float64.
         kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list,
@@ -724,7 +907,7 @@ def max_unpool1d(x,
         stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list,
             it must contain an integer.
         padding (int | tuple): Padding that was added to the input.
-        output_size(list|tuple, optional): The target output size. If output_size is not specified, 
+        output_size(list|tuple, optional): The target output size. If output_size is not specified,
                            the actual output shape will be automatically calculated by (input_shape,
                            kernel_size, stride, padding).
         data_format (string): The data format of the input and output data.
@@ -735,11 +918,11 @@ def max_unpool1d(x,
                              None by default.
 
     Returns:
-        Tensor: The output tensor of unpooling result. 
+        Tensor: The output tensor of unpooling result.
 
     Examples:
         .. code-block:: python
-        
+
             import paddle
             import paddle.nn.functional as F
 
@@ -752,8 +935,10 @@ def max_unpool1d(x,
     """
     """NCL to NCHW"""
     if data_format not in ["NCL"]:
-        raise ValueError("Attr(data_format) should be 'NCL'. Received "
-                         "Attr(data_format): %s." % str(data_format))
+        raise ValueError(
+            "Attr(data_format) should be 'NCL'. Received "
+            "Attr(data_format): %s." % str(data_format)
+        )
     data_format = "NCHW"
     x = unsqueeze(x, [2])
     indices = unsqueeze(indices, [2])
@@ -766,18 +951,32 @@ def max_unpool1d(x,
     # use 2d to implenment 1d should expand padding in advance.
     padding = _expand_low_nd_padding(padding)
 
-    output_size = _unpool_output_size(x, kernel_size, stride, padding,
-                                      output_size)
+    output_size = _unpool_output_size(
+        x, kernel_size, stride, padding, output_size
+    )
 
     if in_dygraph_mode():
-        output = _C_ops.unpool(x, indices, kernel_size, stride, padding,
-                               output_size, data_format)
+        output = _C_ops.unpool(
+            x, indices, kernel_size, stride, padding, output_size, data_format
+        )
         return squeeze(output, [2])
     elif in_dynamic_mode():
-        output = _legacy_C_ops.unpool(x, indices, 'unpooling_type', 'max',
-                                      'ksize', kernel_size, 'strides', stride,
-                                      'paddings', padding, "output_size",
-                                      output_size, "data_format", data_format)
+        output = _legacy_C_ops.unpool(
+            x,
+            indices,
+            'unpooling_type',
+            'max',
+            'ksize',
+            kernel_size,
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            "output_size",
+            output_size,
+            "data_format",
+            data_format,
+        )
         return squeeze(output, [2])
 
     op_type = "unpool"
@@ -785,30 +984,31 @@ def max_unpool1d(x,
     dtype = helper.input_dtype(input_param_name="x")
     unpool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(type=op_type,
-                     inputs={
-                         "X": x,
-                         "Indices": indices
-                     },
-                     outputs={"Out": unpool_out},
-                     attrs={
-                         "unpooling_type": "max",
-                         "ksize": kernel_size,
-                         "strides": stride,
-                         "paddings": padding,
-                         "output_size": output_size
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x, "Indices": indices},
+        outputs={"Out": unpool_out},
+        attrs={
+            "unpooling_type": "max",
+            "ksize": kernel_size,
+            "strides": stride,
+            "paddings": padding,
+            "output_size": output_size,
+        },
+    )
     return squeeze(unpool_out, [2])
 
 
-def max_unpool2d(x,
-                 indices,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 data_format="NCHW",
-                 output_size=None,
-                 name=None):
+def max_unpool2d(
+    x,
+    indices,
+    kernel_size,
+    stride=None,
+    padding=0,
+    data_format="NCHW",
+    output_size=None,
+    name=None,
+):
     r"""
     This API implements max unpooling 2d opereation.
     See more details in :ref:`api_nn_pooling_MaxUnPool2D` .
@@ -816,12 +1016,12 @@ def max_unpool2d(x,
 
     Args:
         x (Tensor): The input tensor of unpooling operator which is a 4-D tensor with
-                          shape [N, C, H, W]. The format of input tensor is `"NCHW"`, 
+                          shape [N, C, H, W]. The format of input tensor is `"NCHW"`,
                           where `N` is batch size, `C` is the number of channels,
                           `H` is the height of the feature, and `W` is the width of the
                           feature. The data type if float32 or float64.
         indices (Tensor): The indices given out by maxpooling2d which is a 4-D tensor with
-                          shape [N, C, H, W]. The format of input tensor is `"NCHW"` , 
+                          shape [N, C, H, W]. The format of input tensor is `"NCHW"` ,
                           where `N` is batch size, `C` is the number of channels,
                           `H` is the height of the feature, and `W` is the width of the
                           feature. The data type if float32 or float64.
@@ -830,7 +1030,7 @@ def max_unpool2d(x,
         stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list,
             it must contain an integer.
         padding (int | tuple): Padding that was added to the input.
-        output_size(list|tuple, optional): The target output size. If output_size is not specified, 
+        output_size(list|tuple, optional): The target output size. If output_size is not specified,
                            the actual output shape will be automatically calculated by (input_shape,
                            kernel_size, padding).
         name(str, optional): For detailed information, please refer
@@ -850,16 +1050,16 @@ def max_unpool2d(x,
           or as given by :attr:`output_size` in the call operator
 
         Returns:
-            Tensor: The output tensor of unpooling result. 
+            Tensor: The output tensor of unpooling result.
 
         Raises:
             ValueError: If the input is not a 4-D tensor.
             ValueError: If indeces shape is not equal input shape.
-            
+
 
         Examples:
             .. code-block:: python
-          
+
             import paddle
             import paddle.nn.functional as F
 
@@ -869,9 +1069,9 @@ def max_unpool2d(x,
             unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0)
             # unpool_out shape: [1, 1, 6, 6]
 
-            # specify a different output size than input size 
+            # specify a different output size than input size
             unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0, output_size=[7,7])
-            # unpool_out shape: [1, 1, 7, 7] 
+            # unpool_out shape: [1, 1, 7, 7]
 
     """
     kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size')
@@ -882,21 +1082,37 @@ def max_unpool2d(x,
     padding = utils.convert_to_list(padding, 2, 'padding')
 
     if data_format not in ["NCHW"]:
-        raise ValueError("Attr(data_format) should be 'NCHW'. Received "
-                         "Attr(data_format): %s." % str(data_format))
+        raise ValueError(
+            "Attr(data_format) should be 'NCHW'. Received "
+            "Attr(data_format): %s." % str(data_format)
+        )
 
-    output_size = _unpool_output_size(x, kernel_size, stride, padding,
-                                      output_size)
+    output_size = _unpool_output_size(
+        x, kernel_size, stride, padding, output_size
+    )
 
     if in_dygraph_mode():
-        output = _C_ops.unpool(x, indices, kernel_size, stride, padding,
-                               output_size, data_format)
+        output = _C_ops.unpool(
+            x, indices, kernel_size, stride, padding, output_size, data_format
+        )
         return output
     elif in_dynamic_mode():
-        output = _legacy_C_ops.unpool(x, indices, 'unpooling_type', 'max',
-                                      'ksize', kernel_size, 'strides', stride,
-                                      'paddings', padding, "output_size",
-                                      output_size, "data_format", data_format)
+        output = _legacy_C_ops.unpool(
+            x,
+            indices,
+            'unpooling_type',
+            'max',
+            'ksize',
+            kernel_size,
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            "output_size",
+            output_size,
+            "data_format",
+            data_format,
+        )
         return output
 
     op_type = "unpool"
@@ -904,39 +1120,40 @@ def max_unpool2d(x,
     dtype = helper.input_dtype(input_param_name="x")
     unpool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(type=op_type,
-                     inputs={
-                         "X": x,
-                         "Indices": indices
-                     },
-                     outputs={"Out": unpool_out},
-                     attrs={
-                         "unpooling_type": "max",
-                         "ksize": kernel_size,
-                         "strides": stride,
-                         "paddings": padding,
-                         "output_size": output_size
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x, "Indices": indices},
+        outputs={"Out": unpool_out},
+        attrs={
+            "unpooling_type": "max",
+            "ksize": kernel_size,
+            "strides": stride,
+            "paddings": padding,
+            "output_size": output_size,
+        },
+    )
     return unpool_out
 
 
-def max_unpool3d(x,
-                 indices,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 data_format="NCDHW",
-                 output_size=None,
-                 name=None):
+def max_unpool3d(
+    x,
+    indices,
+    kernel_size,
+    stride=None,
+    padding=0,
+    data_format="NCDHW",
+    output_size=None,
+    name=None,
+):
     r"""
     This API implements max unpooling 3d opereation.
-    `max_unpool3d` accepts the output of `max_pool3d` as input, 
-    including the indices of the maximum value and calculate the partial inverse. 
+    `max_unpool3d` accepts the output of `max_pool3d` as input,
+    including the indices of the maximum value and calculates the partial inverse.
     All non-maximum values ​​are set to zero.
 
     - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})`
     - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where
-    
+
     .. math::
         D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0]
 
@@ -951,21 +1168,21 @@ def max_unpool3d(x,
 
     Args:
         x (Tensor): The input tensor of unpooling operator which is a 5-D tensor with
-                          shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"`, 
+                          shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"`,
                           where `N` is batch size, `C` is the number of channels, `D` is
-                          the depth of the feature, `H` is the height of the feature, 
+                          the depth of the feature, `H` is the height of the feature,
                           and `W` is the width of the feature. The data type is float32 or float64.
         indices (Tensor): The indices given out by maxpooling3d which is a 5-D tensor with
-                          shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` , 
+                          shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"`,
                           where `N` is batch size, `C` is the number of channels, `D` is
-                          the depth of the feature, `H` is the height of the feature, 
+                          the depth of the feature, `H` is the height of the feature,
                           and `W` is the width of the feature. The data type is float32 or float64.
         kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list,
             it must contain an integer.
         stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list,
             it must contain an integer.
         padding (int | tuple): Padding that was added to the input.
-        output_size(list|tuple, optional): The target output size. If output_size is not specified, 
+        output_size(list|tuple, optional): The target output size. If output_size is not specified,
                            the actual output shape will be automatically calculated by (input_shape,
                            kernel_size, stride, padding).
         data_format (string): The data format of the input and output data.
@@ -976,11 +1193,11 @@ def max_unpool3d(x,
                              None by default.
 
     Returns:
-        Tensor: The output tensor of unpooling result. 
+        Tensor: The output tensor of unpooling result.
 
     Examples:
         .. code-block:: python
-        
+
             import paddle
             import paddle.nn.functional as F
 
@@ -999,21 +1216,37 @@ def max_unpool3d(x,
     padding = utils.convert_to_list(padding, 3, 'padding')
 
     if data_format not in ["NCDHW"]:
-        raise ValueError("Attr(data_format) should be 'NCDHW'. Received "
-                         "Attr(data_format): %s." % str(data_format))
+        raise ValueError(
+            "Attr(data_format) should be 'NCDHW'. Received "
+            "Attr(data_format): %s." % str(data_format)
+        )
 
-    output_size = _unpool_output_size(x, kernel_size, stride, padding,
-                                      output_size)
+    output_size = _unpool_output_size(
+        x, kernel_size, stride, padding, output_size
+    )
 
     if in_dygraph_mode():
-        output = _C_ops.unpool3d(x, indices, kernel_size, stride, padding,
-                                 output_size, data_format)
+        output = _C_ops.unpool3d(
+            x, indices, kernel_size, stride, padding, output_size, data_format
+        )
         return output
     elif in_dynamic_mode():
-        output = _legacy_C_ops.unpool3d(x, indices, 'unpooling_type', 'max',
-                                        'ksize', kernel_size, 'strides', stride,
-                                        'paddings', padding, "output_size",
-                                        output_size, "data_format", data_format)
+        output = _legacy_C_ops.unpool3d(
+            x,
+            indices,
+            'unpooling_type',
+            'max',
+            'ksize',
+            kernel_size,
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            "output_size",
+            output_size,
+            "data_format",
+            data_format,
+        )
         return output
 
     op_type = "unpool3d"
@@ -1021,30 +1254,31 @@ def max_unpool3d(x,
     dtype = helper.input_dtype(input_param_name="x")
     unpool_out = helper.create_variable_for_type_inference(dtype)
 
-    helper.append_op(type=op_type,
-                     inputs={
-                         "X": x,
-                         "Indices": indices
-                     },
-                     outputs={"Out": unpool_out},
-                     attrs={
-                         "unpooling_type": "max",
-                         "ksize": kernel_size,
-                         "strides": stride,
-                         "paddings": padding,
-                         "output_size": output_size
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x, "Indices": indices},
+        outputs={"Out": unpool_out},
+        attrs={
+            "unpooling_type": "max",
+            "ksize": kernel_size,
+            "strides": stride,
+            "paddings": padding,
+            "output_size": output_size,
+        },
+    )
     return unpool_out
 
 
-def max_pool2d(x,
-               kernel_size,
-               stride=None,
-               padding=0,
-               return_mask=False,
-               ceil_mode=False,
-               data_format="NCHW",
-               name=None):
+def max_pool2d(
+    x,
+    kernel_size,
+    stride=None,
+    padding=0,
+    return_mask=False,
+    ceil_mode=False,
+    data_format="NCHW",
+    name=None,
+):
     """
     This API implements max pooling 2d operation.
     See more details in :ref:`api_nn_pooling_MaxPool2d` .
@@ -1108,14 +1342,14 @@ def max_pool2d(x,
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
-            "Attr(data_format): %s." % str(data_format))
+            "Attr(data_format): %s." % str(data_format)
+        )
 
     channel_last = True if data_format == "NHWC" else False
 
-    padding, padding_algorithm = _update_padding_nd(padding,
-                                                    num_dims=2,
-                                                    channel_last=channel_last,
-                                                    ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode
+    )
 
     if data_format == "NHWC" and return_mask:
         raise ValueError(
@@ -1124,69 +1358,122 @@ def max_pool2d(x,
 
     if in_dygraph_mode():
         if return_mask:
-            output = _C_ops.max_pool2d_with_index(x, kernel_size, stride,
-                                                  padding, False, False)
+            output = _C_ops.max_pool2d_with_index(
+                x, kernel_size, stride, padding, False, False
+            )
             return output if return_mask else output[0]
         else:
-            return _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode,
-                                 True, data_format, 'max', False, False,
-                                 padding_algorithm, True)
+            return _C_ops.pool2d(
+                x,
+                kernel_size,
+                stride,
+                padding,
+                ceil_mode,
+                True,
+                data_format,
+                'max',
+                False,
+                False,
+                padding_algorithm,
+                True,
+            )
 
     if _in_legacy_dygraph():
         if return_mask:
             output = _legacy_C_ops.max_pool2d_with_index(
-                x, 'ksize', kernel_size, 'global_pooling', False, 'strides',
-                stride, 'paddings', padding, 'padding_algorithm',
-                padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode,
-                'use_mkldnn', False, 'exclusive', True, 'data_format',
-                data_format)
+                x,
+                'ksize',
+                kernel_size,
+                'global_pooling',
+                False,
+                'strides',
+                stride,
+                'paddings',
+                padding,
+                'padding_algorithm',
+                padding_algorithm,
+                'use_cudnn',
+                True,
+                'ceil_mode',
+                ceil_mode,
+                'use_mkldnn',
+                False,
+                'exclusive',
+                True,
+                'data_format',
+                data_format,
+            )
             return output if return_mask else output[0]
         else:
             output = _legacy_C_ops.pool2d(
-                x, 'pooling_type', 'max', 'ksize', kernel_size,
-                'global_pooling', False, 'padding_algorithm', padding_algorithm,
-                'strides', stride, 'paddings', padding, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
-                'data_format', data_format)
+                x,
+                'pooling_type',
+                'max',
+                'ksize',
+                kernel_size,
+                'global_pooling',
+                False,
+                'padding_algorithm',
+                padding_algorithm,
+                'strides',
+                stride,
+                'paddings',
+                padding,
+                'use_cudnn',
+                True,
+                'ceil_mode',
+                ceil_mode,
+                'use_mkldnn',
+                False,
+                'exclusive',
+                True,
+                'data_format',
+                data_format,
+            )
             return output
 
     op_type = 'max_pool2d_with_index' if return_mask else "pool2d"
     helper = LayerHelper(op_type, **locals())
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                             'max_pool2d')
+    check_variable_and_dtype(
+        x, 'x', ['float16', 'float32', 'float64'], 'max_pool2d'
+    )
     dtype = helper.input_dtype(input_param_name='x')
     pool_out = helper.create_variable_for_type_inference(dtype)
     mask = helper.create_variable_for_type_inference("int32")
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(type=op_type,
-                     inputs={"X": x},
-                     outputs=outputs,
-                     attrs={
-                         "pooling_type": 'max',
-                         "ksize": kernel_size,
-                         "global_pooling": False,
-                         "strides": stride,
-                         "paddings": padding,
-                         "padding_algorithm": padding_algorithm,
-                         "use_cudnn": True,
-                         "ceil_mode": ceil_mode,
-                         "use_mkldnn": False,
-                         "exclusive": True,
-                         "data_format": data_format,
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": True,
+            "data_format": data_format,
+        },
+    )
 
     return (pool_out, mask) if return_mask else pool_out
 
 
-def max_pool3d(x,
-               kernel_size,
-               stride=None,
-               padding=0,
-               return_mask=False,
-               ceil_mode=False,
-               data_format="NCDHW",
-               name=None):
+def max_pool3d(
+    x,
+    kernel_size,
+    stride=None,
+    padding=0,
+    return_mask=False,
+    ceil_mode=False,
+    data_format="NCDHW",
+    name=None,
+):
     """
     This API implements max pooling 2d operation.
     See more details in :ref:`api_nn_pooling_MaxPool3d` .
@@ -1216,7 +1503,7 @@ def max_pool3d(x,
         name(str, optional): For detailed information, please refer
                              to :ref:`api_guide_Name`. Usually name is no need to set and
                              None by default.
-    
+
     Returns:
         Tensor: The output tensor of pooling result. The data type is same as input tensor.
 
@@ -1256,10 +1543,9 @@ def max_pool3d(x,
 
     channel_last = _channel_last(data_format, 3)
 
-    padding, padding_algorithm = _update_padding_nd(padding,
-                                                    3,
-                                                    channel_last=channel_last,
-                                                    ceil_mode=ceil_mode)
+    padding, padding_algorithm = _update_padding_nd(
+        padding, 3, channel_last=channel_last, ceil_mode=ceil_mode
+    )
 
     if data_format == "NDHWC" and return_mask:
         raise ValueError(
@@ -1268,30 +1554,80 @@ def max_pool3d(x,
 
     if in_dygraph_mode():
         if return_mask:
-            output = _C_ops.max_pool3d_with_index(x, kernel_size, stride,
-                                                  padding, False, False)
+            output = _C_ops.max_pool3d_with_index(
+                x, kernel_size, stride, padding, False, False
+            )
             return output if return_mask else output[0]
         else:
-            return _C_ops.pool3d(x, kernel_size, stride, padding, ceil_mode,
-                                 True, data_format, 'max', False, False,
-                                 padding_algorithm, True)
+            return _C_ops.pool3d(
+                x,
+                kernel_size,
+                stride,
+                padding,
+                ceil_mode,
+                True,
+                data_format,
+                'max',
+                False,
+                False,
+                padding_algorithm,
+                True,
+            )
 
     if _in_legacy_dygraph():
         if return_mask:
             output = _legacy_C_ops.max_pool3d_with_index(
-                x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides',
-                stride, 'paddings', padding, 'global_pooling', False,
-                'padding_algorithm', padding_algorithm, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
-                'data_format', data_format)
+                x,
+                'pooling_type',
+                'max',
+                'ksize',
+                kernel_size,
+                'strides',
+                stride,
+                'paddings',
+                padding,
+                'global_pooling',
+                False,
+                'padding_algorithm',
+                padding_algorithm,
+                'use_cudnn',
+                True,
+                'ceil_mode',
+                ceil_mode,
+                'use_mkldnn',
+                False,
+                'exclusive',
+                True,
+                'data_format',
+                data_format,
+            )
             return output if return_mask else output[0]
         else:
             output = _legacy_C_ops.pool3d(
-                x, 'pooling_type', 'max', 'ksize', kernel_size,
-                'global_pooling', False, 'padding_algorithm', padding_algorithm,
-                'strides', stride, 'paddings', padding, 'use_cudnn', True,
-                'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True,
-                'data_format', data_format)
+                x,
+                'pooling_type',
+                'max',
+                'ksize',
+                kernel_size,
+                'global_pooling',
+                False,
+                'padding_algorithm',
+                padding_algorithm,
+                'strides',
+                stride,
+                'paddings',
+                padding,
+                'use_cudnn',
+                True,
+                'ceil_mode',
+                ceil_mode,
+                'use_mkldnn',
+                False,
+                'exclusive',
+                True,
+                'data_format',
+                data_format,
+            )
             return output
 
     op_type = "max_pool3d_with_index" if return_mask else "pool3d"
@@ -1302,30 +1638,32 @@ def max_pool3d(x,
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(type=op_type,
-                     inputs={"X": x},
-                     outputs=outputs,
-                     attrs={
-                         "pooling_type": 'max',
-                         "ksize": kernel_size,
-                         "global_pooling": False,
-                         "strides": stride,
-                         "paddings": padding,
-                         "padding_algorithm": padding_algorithm,
-                         "use_cudnn": True,
-                         "ceil_mode": ceil_mode,
-                         "use_mkldnn": False,
-                         "exclusive": False,
-                         "data_format": data_format,
-                     })
+    helper.append_op(
+        type=op_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": kernel_size,
+            "global_pooling": False,
+            "strides": stride,
+            "paddings": padding,
+            "padding_algorithm": padding_algorithm,
+            "use_cudnn": True,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": False,
+            "exclusive": False,
+            "data_format": data_format,
+        },
+    )
 
     return (pool_out, mask) if return_mask else pool_out
 
 
 def adaptive_avg_pool1d(x, output_size, name=None):
     """
-    Adaptive average pooling 1d operation on :attr:`x` according to :attr:`output_size`. 
-    
+    Adaptive average pooling 1d operation on :attr:`x` according to :attr:`output_size`.
+
     Notes:
         See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` .
 
@@ -1333,10 +1671,10 @@ def adaptive_avg_pool1d(x, output_size, name=None):
         x (Tensor): The input Tensor of pooling, which is a 3-D tensor with shape :math:`[N, C, L]`, where :math:`N` is batch size, :math:`C` is the number of channels and :math:`L` is the length of the feature. The data type is float32 or float64.
         output_size (int): The target output size. Its data type must be int.
         name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
-    
+
     Returns:
         Tensor: The result of 1D adaptive average pooling. Its data type is same as input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -1361,21 +1699,34 @@ def adaptive_avg_pool1d(x, output_size, name=None):
     """
     pool_type = 'avg'
     if not in_dynamic_mode():
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 'adaptive_pool2d')
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], 'adaptive_pool2d'
+        )
         check_type(output_size, 'pool_size', (int), 'adaptive_pool1d')
     _check_input(x, 3)
     pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size')
 
     x = unsqueeze(x, [2])
     if in_dygraph_mode():
-        pool_out = _C_ops.pool2d(x, pool_size, [1, 1], [0, 0], False, True,
-                                 "NCHW", pool_type, False, True, "EXPLICIT",
-                                 False)
+        pool_out = _C_ops.pool2d(
+            x,
+            pool_size,
+            [1, 1],
+            [0, 0],
+            False,
+            True,
+            "NCHW",
+            pool_type,
+            False,
+            True,
+            "EXPLICIT",
+            False,
+        )
         return squeeze(pool_out, [2])
     if _in_legacy_dygraph():
-        pool_out = _legacy_C_ops.pool2d(x, 'pooling_type', pool_type, 'ksize',
-                                        pool_size, 'adaptive', True)
+        pool_out = _legacy_C_ops.pool2d(
+            x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True
+        )
         return squeeze(pool_out, [2])
 
     l_type = "pool2d"
@@ -1385,14 +1736,16 @@ def adaptive_avg_pool1d(x, output_size, name=None):
     pool_out = helper.create_variable_for_type_inference(dtype)
 
     outputs = {"Out": pool_out}
-    helper.append_op(type=l_type,
-                     inputs={"X": x},
-                     outputs=outputs,
-                     attrs={
-                         "pooling_type": pool_type,
-                         "ksize": pool_size,
-                         "adaptive": True,
-                     })
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
+        },
+    )
 
     return squeeze(pool_out, [2])
 
@@ -1401,7 +1754,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
     """
     Applies 2D adaptive avg pooling on input tensor. The h and w dimensions
     of the output tensor are determined by the parameter output_size.
-    
+
     For avg adaptive pool2d:
     ..  math::
         hstart &= floor(i * H_{in} / H_{out})
@@ -1455,14 +1808,16 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
             # out.shape is [2, 3, 3, 3]
     """
     if not in_dynamic_mode():
-        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
-                                 'adaptive_avg_pool2d')
+        check_variable_and_dtype(
+            x, 'x', ['float16', 'float32', 'float64'], 'adaptive_avg_pool2d'
+        )
         check_type(data_format, 'data_format', str, 'adaptive_avg_pool2d')
 
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCHW' or 'NHWC'. Received "
-            "Attr(data_format): %s." % str(data_format))
+            "Attr(data_format): %s." % str(data_format)
+        )
 
     if data_format == "NCHW":
         in_h, in_w = x.shape[2:4]
@@ -1488,14 +1843,35 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
         output_size = utils._convert_to_tensor_list(output_size)
 
     if in_dygraph_mode():
-        return _C_ops.pool2d(x, output_size, [1, 1], [0, 0], False, True,
-                             data_format, 'avg', False, True, "EXPLICIT", False)
+        return _C_ops.pool2d(
+            x,
+            output_size,
+            [1, 1],
+            [0, 0],
+            False,
+            True,
+            data_format,
+            'avg',
+            False,
+            True,
+            "EXPLICIT",
+            False,
+        )
 
     if _in_legacy_dygraph():
-        return _legacy_C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize',
-                                    output_size, 'global_pooling', False,
-                                    'adaptive', True, 'data_format',
-                                    data_format)
+        return _legacy_C_ops.pool2d(
+            x,
+            'pooling_type',
+            'avg',
+            'ksize',
+            output_size,
+            'global_pooling',
+            False,
+            'adaptive',
+            True,
+            'data_format',
+            data_format,
+        )
 
     l_type = 'pool2d'
 
@@ -1505,15 +1881,17 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
 
     outputs = {"Out": pool_out}
 
-    helper.append_op(type=l_type,
-                     inputs={"X": x},
-                     outputs=outputs,
-                     attrs={
-                         "pooling_type": "avg",
-                         "ksize": output_size,
-                         "adaptive": True,
-                         "data_format": data_format,
-                     })
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": "avg",
+            "ksize": output_size,
+            "adaptive": True,
+            "data_format": data_format,
+        },
+    )
 
     return pool_out
 
@@ -1522,7 +1900,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
     """
     This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions
     of the output tensor are determined by the parameter output_size.
-    
+
     For avg adaptive pool3d:
     ..  math::
         dstart &= floor(i * D_{in} / D_{out})
@@ -1579,14 +1957,16 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
             # out.shape is [2, 3, 3, 3, 3]
     """
     if not in_dynamic_mode():
-        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
-                                 'adaptive_avg_pool3d')
+        check_variable_and_dtype(
+            x, 'x', ['float32', 'float64'], 'adaptive_avg_pool3d'
+        )
         check_type(data_format, 'data_format', str, 'adaptive_avg_pool3d')
 
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
-            "Attr(data_format): %s." % str(data_format))
+            "Attr(data_format): %s." % str(data_format)
+        )
 
     if data_format == "NCDHW":
         in_l, in_h, in_w = x.shape[2:5]
@@ -1605,13 +1985,34 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
             output_size[2] = in_w
 
     if in_dygraph_mode():
-        return _C_ops.pool3d(x, output_size, [1, 1, 1], [0, 0, 0], False, True,
-                             data_format, 'avg', False, True, "EXPLICIT", False)
+        return _C_ops.pool3d(
+            x,
+            output_size,
+            [1, 1, 1],
+            [0, 0, 0],
+            False,
+            True,
+            data_format,
+            'avg',
+            False,
+            True,
+            "EXPLICIT",
+            False,
+        )
     elif _in_legacy_dygraph():
-        return _legacy_C_ops.pool3d(x, 'pooling_type', 'avg', 'ksize',
-                                    output_size, 'global_pooling', False,
-                                    'adaptive', True, 'data_format',
-                                    data_format)
+        return _legacy_C_ops.pool3d(
+            x,
+            'pooling_type',
+            'avg',
+            'ksize',
+            output_size,
+            'global_pooling',
+            False,
+            'adaptive',
+            True,
+            'data_format',
+            data_format,
+        )
 
     l_type = 'pool3d'
 
@@ -1620,15 +2021,17 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
     pool_out = helper.create_variable_for_type_inference(dtype)
     outputs = {"Out": pool_out}
 
-    helper.append_op(type=l_type,
-                     inputs={"X": x},
-                     outputs=outputs,
-                     attrs={
-                         "pooling_type": "avg",
-                         "ksize": output_size,
-                         "adaptive": True,
-                         "data_format": data_format,
-                     })
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": "avg",
+            "ksize": output_size,
+            "adaptive": True,
+            "data_format": data_format,
+        },
+    )
 
     return pool_out
 
@@ -1680,8 +2083,9 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None):
     """
     pool_type = 'max'
     if not in_dynamic_mode():
-        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
-                                 'adaptive_max_pool1d')
+        check_variable_and_dtype(
+            x, 'x', ['float32', 'float64'], 'adaptive_max_pool1d'
+        )
         check_type(output_size, 'pool_size', int, 'adaptive_max_pool1d')
         check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool1d')
     _check_input(x, 3)
@@ -1690,17 +2094,23 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None):
 
     x = unsqueeze(x, [2])
     if in_dygraph_mode():
-        pool_out = _C_ops.max_pool2d_with_index(x, pool_size, [1, 1], [0, 0],
-                                                False, True)
-        return (squeeze(pool_out[0], [2]), squeeze(
-            pool_out[1], [2])) if return_mask else squeeze(pool_out[0], [2])
+        pool_out = _C_ops.max_pool2d_with_index(
+            x, pool_size, [1, 1], [0, 0], False, True
+        )
+        return (
+            (squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2]))
+            if return_mask
+            else squeeze(pool_out[0], [2])
+        )
     if _in_legacy_dygraph():
-        pool_out = _legacy_C_ops.max_pool2d_with_index(x, 'pooling_type',
-                                                       pool_type, 'ksize',
-                                                       pool_size, 'adaptive',
-                                                       True)
-        return (squeeze(pool_out[0], [2]), squeeze(
-            pool_out[1], [2])) if return_mask else squeeze(pool_out[0], [2])
+        pool_out = _legacy_C_ops.max_pool2d_with_index(
+            x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True
+        )
+        return (
+            (squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2]))
+            if return_mask
+            else squeeze(pool_out[0], [2])
+        )
 
     l_type = 'max_pool2d_with_index'
 
@@ -1711,64 +2121,70 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None):
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(type=l_type,
-                     inputs={"X": x},
-                     outputs=outputs,
-                     attrs={
-                         "pooling_type": pool_type,
-                         "ksize": pool_size,
-                         "adaptive": True,
-                     })
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "adaptive": True,
+        },
+    )
 
-    return (squeeze(pool_out, [2]),
-            squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2])
+    return (
+        (squeeze(pool_out, [2]), squeeze(mask, [2]))
+        if return_mask
+        else squeeze(pool_out, [2])
+    )
 
 
 def adaptive_max_pool2d(x, output_size, return_mask=False, name=None):
     """
-        This operation applies a 2D adaptive max pooling on input tensor.
-        See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` .
+    This operation applies a 2D adaptive max pooling on input tensor.
+    See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` .
 
-        Args:
-            x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64.
-            output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input.
-            return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
-            name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.
+    Args:
+        x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can each be either an int or None; None means the size will be the same as that of the input.
+        return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
+        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually there is no need to set name; it is None by default.
 
-        Returns:
-            Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor.
+    Returns:
+        Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor.
 
-        Examples:
-            .. code-block:: python
+    Examples:
+        .. code-block:: python
 
-              # max adaptive pool2d
-              # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n]
-              # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
-              # of input data into m*n grids averagely and performs poolings in each
-              # grid to get output.
-              # adaptive max pool performs calculations as follow:
-              #
-              #     for i in range(m):
-              #         for j in range(n):
-              #             hstart = floor(i * H / m)
-              #             hend = ceil((i + 1) * H / m)
-              #             wstart = floor(i * W / n)
-              #             wend = ceil((i + 1) * W / n)
-              #             output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend])
-              #
-              import paddle
+          # max adaptive pool2d
+          # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n]
+          # output shape is [N, C, m, n], adaptive pool divide H and W dimensions
+          # of input data into m*n grids averagely and performs poolings in each
+          # grid to get output.
+          # adaptive max pool performs calculations as follow:
+          #
+          #     for i in range(m):
+          #         for j in range(n):
+          #             hstart = floor(i * H / m)
+          #             hend = ceil((i + 1) * H / m)
+          #             wstart = floor(j * W / n)
+          #             wend = ceil((j + 1) * W / n)
+          #             output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend])
+          #
+          import paddle
 
-              input_data = paddle.randn(shape=(2, 3, 32, 32))
-              out = paddle.nn.functional.adaptive_max_pool2d(
-                            x = input_data,
-                            output_size=[3, 3])
-              # out.shape is [2, 3, 3, 3]
+          input_data = paddle.randn(shape=(2, 3, 32, 32))
+          out = paddle.nn.functional.adaptive_max_pool2d(
+                        x = input_data,
+                        output_size=[3, 3])
+          # out.shape is [2, 3, 3, 3]
     """
     if not in_dynamic_mode():
-        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
-                                 'adaptive_max_pool2d')
+        check_variable_and_dtype(
+            x, 'x', ['float32', 'float64'], 'adaptive_max_pool2d'
+        )
         check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool2d')
-        #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool2d')
+        # check_type(output_size, 'pool_size', (int), 'adaptive_max_pool2d')
     _check_input(x, 4)
 
     in_h, in_w = x.shape[2:4]
@@ -1781,13 +2197,14 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None):
         if output_size[1] == None:
             output_size[1] = in_w
     if in_dygraph_mode():
-        pool_out = _C_ops.max_pool2d_with_index(x, output_size, [1, 1], [0, 0],
-                                                False, True)
+        pool_out = _C_ops.max_pool2d_with_index(
+            x, output_size, [1, 1], [0, 0], False, True
+        )
         return pool_out if return_mask else pool_out[0]
     if _in_legacy_dygraph():
-        pool_out = _legacy_C_ops.max_pool2d_with_index(x, 'pooling_type', 'max',
-                                                       'ksize', output_size,
-                                                       'adaptive', True)
+        pool_out = _legacy_C_ops.max_pool2d_with_index(
+            x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True
+        )
         return pool_out if return_mask else pool_out[0]
 
     l_type = 'max_pool2d_with_index'
@@ -1799,67 +2216,70 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None):
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(type=l_type,
-                     inputs={"X": x},
-                     outputs=outputs,
-                     attrs={
-                         "pooling_type": 'max',
-                         "ksize": output_size,
-                         "adaptive": True,
-                     })
-    #return (pool_out, mask) if return_mask else pool_out
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": output_size,
+            "adaptive": True,
+        },
+    )
+    # return (pool_out, mask) if return_mask else pool_out
     return pool_out
 
 
 def adaptive_max_pool3d(x, output_size, return_mask=False, name=None):
     """
-        This operation applies a 3D adaptive max pooling on input tensor.
-        See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` .
+    This operation applies a 3D adaptive max pooling on input tensor.
+    See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` .
 
-        Args:
-            x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64.
-            output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input.
-            return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
-            name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.
+    Args:
+        x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64.
+        output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can each be either an int or None; None means the size will be the same as that of the input.
+        return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False.
+        name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually there is no need to set name; it is None by default.
 
-        Returns:
-            Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor.
+    Returns:
+        Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor.
 
-        Examples:
-            .. code-block:: python
+    Examples:
+        .. code-block:: python
 
-              # adaptive max pool3d
-              # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n]
-              # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
-              # of input data into m*n grids averagely and performs poolings in each
-              # grid to get output.
-              # adaptive max pool performs calculations as follow:
-              #
-              #     for i in range(l):
-              #         for j in range(m):
-              #             for k in range(n):
-              #                 dstart = floor(i * D / l)
-              #                 dend = ceil((i + 1) * D / l)
-              #                 hstart = floor(i * H / m)
-              #                 hend = ceil((i + 1) * H / m)
-              #                 wstart = floor(i * W / n)
-              #                 wend = ceil((i + 1) * W / n)
-              #             output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend])
-              #
-              import paddle
+          # adaptive max pool3d
+          # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n]
+          # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions
+          # of input data into l*m*n grids averagely and performs poolings in each
+          # grid to get output.
+          # adaptive max pool performs calculations as follow:
+          #
+          #     for i in range(l):
+          #         for j in range(m):
+          #             for k in range(n):
+          #                 dstart = floor(i * D / l)
+          #                 dend = ceil((i + 1) * D / l)
+          #                 hstart = floor(j * H / m)
+          #                 hend = ceil((j + 1) * H / m)
+          #                 wstart = floor(k * W / n)
+          #                 wend = ceil((k + 1) * W / n)
+          #                 output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend])
+          #
+          import paddle
 
-              input_data = paddle.randn(shape=(2, 3, 8, 32, 32))
-              out = paddle.nn.functional.adaptive_max_pool3d(
-                            x = input_data,
-                            output_size=[3, 3, 3])
-              # out.shape is [2, 3, 3, 3, 3]
+          input_data = paddle.randn(shape=(2, 3, 8, 32, 32))
+          out = paddle.nn.functional.adaptive_max_pool3d(
+                        x = input_data,
+                        output_size=[3, 3, 3])
+          # out.shape is [2, 3, 3, 3, 3]
     """
 
     if not in_dynamic_mode():
-        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
-                                 'adaptive_max_pool3d')
+        check_variable_and_dtype(
+            x, 'x', ['float32', 'float64'], 'adaptive_max_pool3d'
+        )
         check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool3d')
-        #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d')
+        # check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d')
     _check_input(x, 5)
 
     in_l, in_h, in_w = x.shape[2:5]
@@ -1877,12 +2297,13 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None):
     if in_dynamic_mode():
         if in_dygraph_mode():
             # By default, strides is [1,1,1] and paddings is [0, 0, 0]
-            pool_out = _C_ops.max_pool3d_with_index(x, output_size, [1, 1, 1],
-                                                    [0, 0, 0], False, True)
+            pool_out = _C_ops.max_pool3d_with_index(
+                x, output_size, [1, 1, 1], [0, 0, 0], False, True
+            )
         elif _in_legacy_dygraph():
             pool_out = _legacy_C_ops.max_pool3d_with_index(
-                x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive',
-                True)
+                x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True
+            )
         return pool_out if return_mask else pool_out[0]
 
     l_type = 'max_pool3d_with_index'
@@ -1894,13 +2315,15 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None):
     mask = helper.create_variable_for_type_inference('int32')
     outputs = {"Out": pool_out, "Mask": mask}
 
-    helper.append_op(type=l_type,
-                     inputs={"X": x},
-                     outputs=outputs,
-                     attrs={
-                         "pooling_type": 'max',
-                         "ksize": output_size,
-                         "adaptive": True,
-                     })
+    helper.append_op(
+        type=l_type,
+        inputs={"X": x},
+        outputs=outputs,
+        attrs={
+            "pooling_type": 'max',
+            "ksize": output_size,
+            "adaptive": True,
+        },
+    )
 
     return (pool_out, mask) if return_mask else pool_out
diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py
index 77327bae520..841ccb4313f 100644
--- a/python/paddle/nn/functional/sparse_attention.py
+++ b/python/paddle/nn/functional/sparse_attention.py
@@ -20,62 +20,64 @@ from paddle import _C_ops, _legacy_C_ops
 from paddle import in_dynamic_mode
 
 
-def sparse_attention(query,
-                     key,
-                     value,
-                     sparse_csr_offset,
-                     sparse_csr_columns,
-                     key_padding_mask=None,
-                     attn_mask=None,
-                     name=None):
+def sparse_attention(
+    query,
+    key,
+    value,
+    sparse_csr_offset,
+    sparse_csr_columns,
+    key_padding_mask=None,
+    attn_mask=None,
+    name=None,
+):
     r"""
     This operator sparsify the Attention matrix in Transformer module
-    to achieve the effect of reducing memory consumption and computation. 
-    The sparse layout is expressed in CSR format and contains two parameters, 
-    ``offset`` and ``columns``. The equation is: 
+    to achieve the effect of reducing memory consumption and computation.
+    The sparse layout is expressed in CSR format and contains two parameters,
+    ``offset`` and ``columns``. The equation is:
 
     .. math::
 
         result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V
 
-    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. 
-    The dimensions of the three parameters are the same. 
+    where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module.
+    The dimensions of the three parameters are the same.
     ``d`` represents the size of the last dimension of the three parameters.
 
-    Warning:    
+    Warning:
         This API is only used in ``CUDA 11.3`` and above versions.
 
     Args:
-        query(Tensor): The query tensor in the Attention module. 
-                        4-D tensor with shape: 
-                        [batch_size, num_heads, seq_len, head_dim]. 
+        query(Tensor): The query tensor in the Attention module.
+                        4-D tensor with shape:
+                        [batch_size, num_heads, seq_len, head_dim].
                         The dtype can be float32 and float64.
-        key(Tensor): The key tensor in the Attention module. 
-                        4-D tensor with shape: 
-                        [batch_size, num_heads, seq_len, head_dim]. 
+        key(Tensor): The key tensor in the Attention module.
+                        4-D tensor with shape:
+                        [batch_size, num_heads, seq_len, head_dim].
                         The dtype can be float32 and float64.
-        value(Tensor): The value tensor in the Attention module. 
-                        4-D tensor with shape:  
-                        [batch_size, num_heads, seq_len, head_dim]. 
+        value(Tensor): The value tensor in the Attention module.
+                        4-D tensor with shape:
+                        [batch_size, num_heads, seq_len, head_dim].
                         The dtype can be float32 and float64.
-        sparse_csr_offset(Tensor): The sparsity feature in the Attention module 
-                        is expressed in the CSR format, and the offset represents 
+        sparse_csr_offset(Tensor): The sparsity feature in the Attention module
+                        is expressed in the CSR format, and the offset represents
                         the number of non-zero elements in each row of the matrix.
-                        3-D tensor with shape:   
-                        [batch_size, num_heads, seq_len + 1]. 
+                        3-D tensor with shape:
+                        [batch_size, num_heads, seq_len + 1].
                         The dtype should be int32.
-        sparse_csr_columns(Tensor): The sparsity feature in the Attention module 
-                        is expressed in the CSR format, and the columns represent 
+        sparse_csr_columns(Tensor): The sparsity feature in the Attention module
+                        is expressed in the CSR format, and the columns represent
                         the column index values of non-zero elements in the matrix.
-                        3-D tensor with shape:  
-                        [batch_size, num_heads, sparse_nnz]. 
+                        3-D tensor with shape:
+                        [batch_size, num_heads, sparse_nnz].
                         The dtype should be int32.
-        key_padding_mask(Tensor, optional):The key padding mask tensor in the Attention module. 
-                        2-D tensor with shape: [batch_size, seq_len]. 
+        key_padding_mask(Tensor, optional): The key padding mask tensor in the Attention module.
+                        2-D tensor with shape: [batch_size, seq_len].
                         The dtype can be float32 and float64.
                         A value of 0 means that the position is masked.
-        attn_mask(Tensor, optional):The attention mask tensor in the Attention module. 
-                        2-D tensor with shape: [seq_len, seq_len]. 
+        attn_mask(Tensor, optional): The attention mask tensor in the Attention module.
+                        2-D tensor with shape: [seq_len, seq_len].
                         The dtype can be float32 and float64.
                         A value of 0 means that the position is masked.
         name(str, optional): The default value is None. Normally there is no need for user
@@ -84,7 +86,7 @@ def sparse_attention(query,
 
     Returns:
         4-D tensor with shape:
-        [batch_size, num_heads, seq_len, head_dim]. 
+        [batch_size, num_heads, seq_len, head_dim].
         The dtype can be float32 or float64.
 
     Examples:
@@ -113,40 +115,50 @@ def sparse_attention(query,
             print(sparse_csr_columns_data.shape)
             # (1, 1, 8)
             paddle.disable_static()
-            query = paddle.to_tensor(query_data, stop_gradient=False, 
+            query = paddle.to_tensor(query_data, stop_gradient=False,
                             place=paddle.CUDAPlace(0))
-            key = paddle.to_tensor(key_data, stop_gradient=False, 
+            key = paddle.to_tensor(key_data, stop_gradient=False,
                             place=paddle.CUDAPlace(0))
-            value = paddle.to_tensor(value_data, stop_gradient=False, 
+            value = paddle.to_tensor(value_data, stop_gradient=False,
                             place=paddle.CUDAPlace(0))
-            offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False, 
+            offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False,
                             place=paddle.CUDAPlace(0))
-            columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False, 
+            columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False,
                             place=paddle.CUDAPlace(0))
-            key_padding_mask = paddle.to_tensor(key_padding_mask_data, stop_gradient=False, 
+            key_padding_mask = paddle.to_tensor(key_padding_mask_data, stop_gradient=False,
                             place=paddle.CUDAPlace(0))
-            attention_mask = paddle.to_tensor(attention_mask_data, stop_gradient=False, 
+            attention_mask = paddle.to_tensor(attention_mask_data, stop_gradient=False,
                             place=paddle.CUDAPlace(0))
-            output_mask = paddle.nn.functional.sparse_attention(query, key, 
-                            value, offset, columns, 
+            output_mask = paddle.nn.functional.sparse_attention(query, key,
+                            value, offset, columns,
                             key_padding_mask=key_padding_mask, attn_mask=attention_mask)
             print(output_mask)
             # [[[[0.        , 1.        ],
             #    [1.99830270, 2.99830270],
             #    [0.        , 1.        ],
             #    [0.        , 1.        ]]]]
-            output = paddle.nn.functional.sparse_attention(query, key, 
+            output = paddle.nn.functional.sparse_attention(query, key,
                             value, offset, columns)
-            print(output) 
+            print(output)
             # [[[[1.60885942, 2.60885954],
             #       [1.99830270, 2.99830270],
             #       [1.60885942, 2.60885954],
             #       [1.99830270, 2.99830270]]]]
     """
     if in_dynamic_mode():
-        result_attention, result_sdd, result_softmax = _legacy_C_ops.sparse_attention(
-            query, key, value, sparse_csr_offset, sparse_csr_columns,
-            key_padding_mask, attn_mask)
+        (
+            result_attention,
+            result_sdd,
+            result_softmax,
+        ) = _legacy_C_ops.sparse_attention(
+            query,
+            key,
+            value,
+            sparse_csr_offset,
+            sparse_csr_columns,
+            key_padding_mask,
+            attn_mask,
+        )
         return result_attention
 
     helper = LayerHelper('sparse_attention', **locals())
@@ -166,7 +178,7 @@ def sparse_attention(query,
     outputs = {
         'Out': out,
         'SparseDotSdd': result_sdd,
-        'Softmax': result_softmax
+        'Softmax': result_softmax,
     }
     helper.append_op(type='sparse_attention', inputs=inputs, outputs=outputs)
     return out
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index dc80743de51..761fdc33e72 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -28,7 +28,7 @@ class CELU(Layer):
     CELU Activation.
 
     .. math::
-    
+
         CELU(x) = max(0, x) + min(0, \alpha * (e^{x/\alpha}-1))
 
     Parameters:
@@ -44,7 +44,7 @@ class CELU(Layer):
         .. code-block:: python
 
             import paddle
-            
+
             x = paddle.to_tensor([[-1. ,6.], [1., 15.6]])
             m = paddle.nn.CELU(0.2)
             out = m(x)
@@ -140,7 +140,7 @@ class GELU(Layer):
 
     Examples:
         .. code-block:: python
-        
+
             import paddle
 
             x = paddle.to_tensor([[-1, 0.5],[1, 1.5]])
@@ -404,12 +404,14 @@ class PReLU(Layer):
             #    [ 6.  ,  7.  ,  8.  ,  9.  ]]]]
     """
 
-    def __init__(self,
-                 num_parameters=1,
-                 init=0.25,
-                 weight_attr=None,
-                 data_format="NCHW",
-                 name=None):
+    def __init__(
+        self,
+        num_parameters=1,
+        init=0.25,
+        weight_attr=None,
+        data_format="NCHW",
+        name=None,
+    ):
         super(PReLU, self).__init__()
         self._num_parameters = num_parameters
         self._init = init
@@ -417,12 +419,13 @@ class PReLU(Layer):
         self._name = name
         self._data_format = data_format
 
-        self._weight = self.create_parameter(attr=self._weight_attr,
-                                             shape=[self._num_parameters],
-                                             dtype=get_default_dtype(),
-                                             is_bias=False,
-                                             default_initializer=Constant(
-                                                 self._init))
+        self._weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=[self._num_parameters],
+            dtype=get_default_dtype(),
+            is_bias=False,
+            default_initializer=Constant(self._init),
+        )
 
     def forward(self, x):
         return F.prelu(x, self._weight, data_format=self._data_format)
@@ -430,8 +433,12 @@ class PReLU(Layer):
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
         return 'num_parameters={}, data_format={}, init={}, dtype={}{}'.format(
-            self._num_parameters, self._data_format, self._init, self._dtype,
-            name_str)
+            self._num_parameters,
+            self._data_format,
+            self._init,
+            self._dtype,
+            name_str,
+        )
 
 
 class RReLU(Layer):
@@ -505,22 +512,22 @@ class RReLU(Layer):
             #   [ 6.          7.          8.          9.        ]]]]
     """
 
-    def __init__(self, lower=1. / 8., upper=1. / 3., name=None):
+    def __init__(self, lower=1.0 / 8.0, upper=1.0 / 3.0, name=None):
         super(RReLU, self).__init__()
         self._lower = lower
         self._upper = upper
         self._name = name
 
     def forward(self, x):
-        return F.rrelu(x,
-                       lower=self._lower,
-                       upper=self._upper,
-                       training=self.training)
+        return F.rrelu(
+            x, lower=self._lower, upper=self._upper, training=self.training
+        )
 
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
         return 'lower={}, upper={}, training={}, dtype={}{}'.format(
-            self._lower, self._upper, self.training, self._dtype, name_str)
+            self._lower, self._upper, self.training, self._dtype, name_str
+        )
 
 
 class ReLU(Layer):
@@ -639,10 +646,12 @@ class SELU(Layer):
             # [[0, 1.050701],[2.101402, 3.152103]]
     """
 
-    def __init__(self,
-                 scale=1.0507009873554804934193349852946,
-                 alpha=1.6732632423543772848170429916717,
-                 name=None):
+    def __init__(
+        self,
+        scale=1.0507009873554804934193349852946,
+        alpha=1.6732632423543772848170429916717,
+        name=None,
+    ):
         super(SELU, self).__init__()
         self._scale = scale
         self._alpha = alpha
@@ -653,8 +662,9 @@ class SELU(Layer):
 
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
-        return 'scale={:.16f}, alpha={:.16f}{}'.format(self._scale, self._alpha,
-                                                       name_str)
+        return 'scale={:.16f}, alpha={:.16f}{}'.format(
+            self._scale, self._alpha, name_str
+        )
 
 
 class LeakyReLU(Layer):
@@ -839,8 +849,9 @@ class Softplus(Layer):
 
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
-        return 'beta={}, threshold={}{}'.format(self._beta, self._threshold,
-                                                name_str)
+        return 'beta={}, threshold={}{}'.format(
+            self._beta, self._threshold, name_str
+        )
 
 
 class Softshrink(Layer):
@@ -1474,8 +1485,11 @@ class Softmax2D(Layer):
         self._name = name
 
     def forward(self, x):
-        assert x.ndim == 3 or x.ndim == 4, "Softmax2D requires a 3D or 4D tensor as input. Received: {}D.".format(
-            x.ndim)
+        assert (
+            x.ndim == 3 or x.ndim == 4
+        ), "Softmax2D requires a 3D or 4D tensor as input. Received: {}D.".format(
+            x.ndim
+        )
         return F.softmax(x, axis=-3, dtype=self._dtype, name=self._name)
 
     def extra_repr(self):
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index ee5641f5d12..bbe694e4834 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -145,37 +145,43 @@ class Linear(Layer):
           #     [2.1077576  2.1077576  2.1077576  2.1077576 ]]
     """
 
-    def __init__(self,
-                 in_features,
-                 out_features,
-                 weight_attr=None,
-                 bias_attr=None,
-                 name=None):
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        weight_attr=None,
+        bias_attr=None,
+        name=None,
+    ):
         super(Linear, self).__init__()
         self._dtype = self._helper.get_default_dtype()
         self._weight_attr = weight_attr
         self._bias_attr = bias_attr
-        self.weight = self.create_parameter(shape=[in_features, out_features],
-                                            attr=self._weight_attr,
-                                            dtype=self._dtype,
-                                            is_bias=False)
-        self.bias = self.create_parameter(shape=[out_features],
-                                          attr=self._bias_attr,
-                                          dtype=self._dtype,
-                                          is_bias=True)
+        self.weight = self.create_parameter(
+            shape=[in_features, out_features],
+            attr=self._weight_attr,
+            dtype=self._dtype,
+            is_bias=False,
+        )
+        self.bias = self.create_parameter(
+            shape=[out_features],
+            attr=self._bias_attr,
+            dtype=self._dtype,
+            is_bias=True,
+        )
         self.name = name
 
     def forward(self, input):
-        out = F.linear(x=input,
-                       weight=self.weight,
-                       bias=self.bias,
-                       name=self.name)
+        out = F.linear(
+            x=input, weight=self.weight, bias=self.bias, name=self.name
+        )
         return out
 
     def extra_repr(self):
         name_str = ', name={}'.format(self.name) if self.name else ''
         return 'in_features={}, out_features={}, dtype={}{}'.format(
-            self.weight.shape[0], self.weight.shape[1], self._dtype, name_str)
+            self.weight.shape[0], self.weight.shape[1], self._dtype, name_str
+        )
 
 
 class Upsample(Layer):
@@ -325,8 +331,8 @@ class Upsample(Layer):
         x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8,
                           its data format is specified by :attr:`data_format`.
         size (list|tuple|Tensor|None): Output shape of image resize
-             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) 
-             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
+             layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w)
+             when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor.
              Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1].
              If a Tensor , its dimensions size should be a 1.
         scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At
@@ -373,7 +379,7 @@ class Upsample(Layer):
 
     Examples:
         .. code-block:: python
-            
+
             import paddle
             import paddle.nn as nn
             import numpy as np
@@ -388,14 +394,16 @@ class Upsample(Layer):
 
     """
 
-    def __init__(self,
-                 size=None,
-                 scale_factor=None,
-                 mode='nearest',
-                 align_corners=False,
-                 align_mode=0,
-                 data_format='NCHW',
-                 name=None):
+    def __init__(
+        self,
+        size=None,
+        scale_factor=None,
+        mode='nearest',
+        align_corners=False,
+        align_mode=0,
+        data_format='NCHW',
+        name=None,
+    ):
         super(Upsample, self).__init__()
         self.size = size
         self.scale_factor = scale_factor
@@ -406,14 +414,16 @@ class Upsample(Layer):
         self.name = name
 
     def forward(self, x):
-        out = F.interpolate(x,
-                            size=self.size,
-                            scale_factor=self.scale_factor,
-                            mode=self.mode,
-                            align_corners=self.align_corners,
-                            align_mode=self.align_mode,
-                            data_format=self.data_format,
-                            name=self.name)
+        out = F.interpolate(
+            x,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners,
+            align_mode=self.align_mode,
+            data_format=self.data_format,
+            name=self.name,
+        )
 
         return out
 
@@ -424,8 +434,13 @@ class Upsample(Layer):
             main_str = 'size={}'.format(self.size)
         name_str = ', name={}'.format(self.name) if self.name else ''
         return '{}, mode={}, align_corners={}, align_mode={}, data_format={}{}'.format(
-            main_str, self.mode, self.align_corners, self.align_mode,
-            self.data_format, name_str)
+            main_str,
+            self.mode,
+            self.align_corners,
+            self.align_mode,
+            self.data_format,
+            name_str,
+        )
 
 
 class UpsamplingNearest2D(Layer):
@@ -479,11 +494,9 @@ class UpsamplingNearest2D(Layer):
             # [2L, 3L, 12L, 12L]
     """
 
-    def __init__(self,
-                 size=None,
-                 scale_factor=None,
-                 data_format='NCHW',
-                 name=None):
+    def __init__(
+        self, size=None, scale_factor=None, data_format='NCHW', name=None
+    ):
         super(UpsamplingNearest2D, self).__init__()
         self.size = size
         self.scale_factor = scale_factor
@@ -491,14 +504,16 @@ class UpsamplingNearest2D(Layer):
         self.name = name
 
     def forward(self, x):
-        out = F.interpolate(x,
-                            size=self.size,
-                            scale_factor=self.scale_factor,
-                            mode='nearest',
-                            align_corners=False,
-                            align_mode=0,
-                            data_format=self.data_format,
-                            name=self.name)
+        out = F.interpolate(
+            x,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode='nearest',
+            align_corners=False,
+            align_mode=0,
+            data_format=self.data_format,
+            name=self.name,
+        )
 
         return out
 
@@ -508,8 +523,9 @@ class UpsamplingNearest2D(Layer):
         else:
             main_str = 'size={}'.format(self.size)
         name_str = ', name={}'.format(self.name) if self.name else ''
-        return '{}, data_format={}{}'.format(main_str, self.data_format,
-                                             name_str)
+        return '{}, data_format={}{}'.format(
+            main_str, self.data_format, name_str
+        )
 
 
 class UpsamplingBilinear2D(Layer):
@@ -564,11 +580,9 @@ class UpsamplingBilinear2D(Layer):
             # [2L, 3L, 12L, 12L]
     """
 
-    def __init__(self,
-                 size=None,
-                 scale_factor=None,
-                 data_format='NCHW',
-                 name=None):
+    def __init__(
+        self, size=None, scale_factor=None, data_format='NCHW', name=None
+    ):
         super(UpsamplingBilinear2D, self).__init__()
         self.size = size
         self.scale_factor = scale_factor
@@ -576,14 +590,16 @@ class UpsamplingBilinear2D(Layer):
         self.name = name
 
     def forward(self, x):
-        out = F.interpolate(x,
-                            size=self.size,
-                            scale_factor=self.scale_factor,
-                            mode='bilinear',
-                            align_corners=True,
-                            align_mode=0,
-                            data_format=self.data_format,
-                            name=self.name)
+        out = F.interpolate(
+            x,
+            size=self.size,
+            scale_factor=self.scale_factor,
+            mode='bilinear',
+            align_corners=True,
+            align_mode=0,
+            data_format=self.data_format,
+            name=self.name,
+        )
 
         return out
 
@@ -593,8 +609,9 @@ class UpsamplingBilinear2D(Layer):
         else:
             main_str = 'size={}'.format(self.size)
         name_str = ', name={}'.format(self.name) if self.name else ''
-        return '{}, data_format={}{}'.format(main_str, self.data_format,
-                                             name_str)
+        return '{}, data_format={}{}'.format(
+            main_str, self.data_format, name_str
+        )
 
 
 class Bilinear(Layer):
@@ -651,13 +668,15 @@ class Bilinear(Layer):
 
     """
 
-    def __init__(self,
-                 in1_features,
-                 in2_features,
-                 out_features,
-                 weight_attr=None,
-                 bias_attr=None,
-                 name=None):
+    def __init__(
+        self,
+        in1_features,
+        in2_features,
+        out_features,
+        weight_attr=None,
+        bias_attr=None,
+        name=None,
+    ):
         super(Bilinear, self).__init__()
         self._weight_attr = weight_attr
         self._bias_attr = bias_attr
@@ -668,17 +687,23 @@ class Bilinear(Layer):
         self._dtype = self._helper.get_default_dtype()
 
         weight_shape = [
-            self._out_features, self._in1_features, self._in2_features
+            self._out_features,
+            self._in1_features,
+            self._in2_features,
         ]
-        self.weight = self.create_parameter(attr=self._weight_attr,
-                                            shape=weight_shape,
-                                            dtype=self._dtype,
-                                            is_bias=False)
+        self.weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=weight_shape,
+            dtype=self._dtype,
+            is_bias=False,
+        )
         bias_shape = [1, self._out_features]
-        self.bias = self.create_parameter(attr=self._bias_attr,
-                                          shape=bias_shape,
-                                          dtype=self._dtype,
-                                          is_bias=True)
+        self.bias = self.create_parameter(
+            attr=self._bias_attr,
+            shape=bias_shape,
+            dtype=self._dtype,
+            is_bias=True,
+        )
 
     def forward(self, x1, x2):
         return F.bilinear(x1, x2, self.weight, self.bias, self._name)
@@ -686,8 +711,12 @@ class Bilinear(Layer):
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
         return 'in1_features={}, in2_features={}, out_features={}, dtype={}{}'.format(
-            self._in1_features, self._in2_features, self._out_features,
-            self._dtype, name_str)
+            self._in1_features,
+            self._in2_features,
+            self._out_features,
+            self._dtype,
+            name_str,
+        )
 
 
 class Dropout(Layer):
@@ -738,7 +767,7 @@ class Dropout(Layer):
             print(x)
             print(y_train)
             print(y_test)
-   """
+    """
 
     def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None):
         super(Dropout, self).__init__()
@@ -749,18 +778,21 @@ class Dropout(Layer):
         self.name = name
 
     def forward(self, input):
-        out = F.dropout(input,
-                        p=self.p,
-                        axis=self.axis,
-                        training=self.training,
-                        mode=self.mode,
-                        name=self.name)
+        out = F.dropout(
+            input,
+            p=self.p,
+            axis=self.axis,
+            training=self.training,
+            mode=self.mode,
+            name=self.name,
+        )
         return out
 
     def extra_repr(self):
         name_str = ', name={}'.format(self.name) if self.name else ''
-        return 'p={}, axis={}, mode={}{}'.format(self.p, self.axis, self.mode,
-                                                 name_str)
+        return 'p={}, axis={}, mode={}{}'.format(
+            self.p, self.axis, self.mode, name_str
+        )
 
 
 class Dropout2D(Layer):
@@ -800,7 +832,7 @@ class Dropout2D(Layer):
             print(x)
             print(y_train)
             print(y_test)
-   """
+    """
 
     def __init__(self, p=0.5, data_format='NCHW', name=None):
         super(Dropout2D, self).__init__()
@@ -810,17 +842,20 @@ class Dropout2D(Layer):
         self.name = name
 
     def forward(self, input):
-        out = F.dropout2d(input,
-                          p=self.p,
-                          training=self.training,
-                          data_format=self.data_format,
-                          name=self.name)
+        out = F.dropout2d(
+            input,
+            p=self.p,
+            training=self.training,
+            data_format=self.data_format,
+            name=self.name,
+        )
         return out
 
     def extra_repr(self):
         name_str = ', name={}'.format(self.name) if self.name else ''
-        return 'p={}, data_format={}{}'.format(self.p, self.data_format,
-                                               name_str)
+        return 'p={}, data_format={}{}'.format(
+            self.p, self.data_format, name_str
+        )
 
 
 class Dropout3D(Layer):
@@ -860,7 +895,7 @@ class Dropout3D(Layer):
             print(x)
             print(y_train)
             print(y_test)
-   """
+    """
 
     def __init__(self, p=0.5, data_format='NCDHW', name=None):
         super(Dropout3D, self).__init__()
@@ -870,17 +905,20 @@ class Dropout3D(Layer):
         self.name = name
 
     def forward(self, input):
-        out = F.dropout3d(input,
-                          p=self.p,
-                          training=self.training,
-                          data_format=self.data_format,
-                          name=self.name)
+        out = F.dropout3d(
+            input,
+            p=self.p,
+            training=self.training,
+            data_format=self.data_format,
+            name=self.name,
+        )
         return out
 
     def extra_repr(self):
         name_str = ', name={}'.format(self.name) if self.name else ''
-        return 'p={}, data_format={}{}'.format(self.p, self.data_format,
-                                               name_str)
+        return 'p={}, data_format={}{}'.format(
+            self.p, self.data_format, name_str
+        )
 
 
 class AlphaDropout(Layer):
@@ -919,7 +957,7 @@ class AlphaDropout(Layer):
             print(y_train)
             # [[-0.10721093, 1.6655989 ], [-0.7791938, -0.7791938]] (randomly)
             print(y_test)
-   """
+    """
 
     def __init__(self, p=0.5, name=None):
         super(AlphaDropout, self).__init__()
@@ -927,10 +965,9 @@ class AlphaDropout(Layer):
         self.name = name
 
     def forward(self, input):
-        out = F.alpha_dropout(input,
-                              p=self.p,
-                              training=self.training,
-                              name=self.name)
+        out = F.alpha_dropout(
+            input, p=self.p, training=self.training, name=self.name
+        )
         return out
 
     def extra_repr(self):
@@ -980,12 +1017,9 @@ class Pad1D(Layer):
             #   [0. 4. 5. 6. 0. 0.]]]
     """
 
-    def __init__(self,
-                 padding,
-                 mode='constant',
-                 value=0.0,
-                 data_format="NCL",
-                 name=None):
+    def __init__(
+        self, padding, mode='constant', value=0.0, data_format="NCL", name=None
+    ):
         super(Pad1D, self).__init__()
         self._pad = _npairs(padding, 1)
         self._mode = mode
@@ -994,17 +1028,20 @@ class Pad1D(Layer):
         self._name = name
 
     def forward(self, x):
-        return F.pad(x,
-                     pad=self._pad,
-                     mode=self._mode,
-                     value=self._value,
-                     data_format=self._data_format,
-                     name=self._name)
+        return F.pad(
+            x,
+            pad=self._pad,
+            mode=self._mode,
+            value=self._value,
+            data_format=self._data_format,
+            name=self._name,
+        )
 
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
         return 'padding={}, mode={}, value={}, data_format={}{}'.format(
-            self._pad, self._mode, self._value, self._data_format, name_str)
+            self._pad, self._mode, self._value, self._data_format, name_str
+        )
 
 
 class Pad2D(Layer):
@@ -1016,8 +1053,8 @@ class Pad2D(Layer):
 
     Parameters:
         padding (Tensor|list[int]|int): The padding size with data type int. If is int, use the
-            same padding in all dimensions. Else [len(padding)/2] dimensions of input will be padded. 
-            The pad has the form (pad_left, pad_right, pad_top, pad_bottom). 
+            same padding in all dimensions. Else [len(padding)/2] dimensions of input will be padded.
+            The pad has the form (pad_left, pad_right, pad_top, pad_bottom).
         mode (str, optional): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. Default is 'constant'.
 
            - 'constant' mode, uses a constant value to pad the input tensor.
@@ -1053,12 +1090,9 @@ class Pad2D(Layer):
             #    [0. 0. 0. 0.]]]]
     """
 
-    def __init__(self,
-                 padding,
-                 mode='constant',
-                 value=0.0,
-                 data_format="NCHW",
-                 name=None):
+    def __init__(
+        self, padding, mode='constant', value=0.0, data_format="NCHW", name=None
+    ):
         super(Pad2D, self).__init__()
         self._pad = _npairs(padding, 2)
         self._mode = mode
@@ -1067,17 +1101,20 @@ class Pad2D(Layer):
         self._name = name
 
     def forward(self, x):
-        return F.pad(x,
-                     pad=self._pad,
-                     mode=self._mode,
-                     value=self._value,
-                     data_format=self._data_format,
-                     name=self._name)
+        return F.pad(
+            x,
+            pad=self._pad,
+            mode=self._mode,
+            value=self._value,
+            data_format=self._data_format,
+            name=self._name,
+        )
 
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
         return 'padding={}, mode={}, value={}, data_format={}{}'.format(
-            self._pad, self._mode, self._value, self._data_format, name_str)
+            self._pad, self._mode, self._value, self._data_format, name_str
+        )
 
 
 class ZeroPad2D(Layer):
@@ -1128,23 +1165,25 @@ class ZeroPad2D(Layer):
         super(ZeroPad2D, self).__init__()
         self._pad = _npairs(padding, 2)
         self._mode = 'constant'
-        self._value = 0.
+        self._value = 0.0
         self._data_format = data_format
         self._name = name
 
     def forward(self, x):
-        return F.pad(x,
-                     pad=self._pad,
-                     mode=self._mode,
-                     value=self._value,
-                     data_format=self._data_format,
-                     name=self._name)
+        return F.pad(
+            x,
+            pad=self._pad,
+            mode=self._mode,
+            value=self._value,
+            data_format=self._data_format,
+            name=self._name,
+        )
 
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
-        return 'padding={}, data_format={}{}'.format(self._pad,
-                                                     self._data_format,
-                                                     name_str)
+        return 'padding={}, data_format={}{}'.format(
+            self._pad, self._data_format, name_str
+        )
 
 
 class Pad3D(Layer):
@@ -1193,12 +1232,14 @@ class Pad3D(Layer):
             #     [0. 0. 0. 0.]]]]]
     """
 
-    def __init__(self,
-                 padding,
-                 mode='constant',
-                 value=0.0,
-                 data_format="NCDHW",
-                 name=None):
+    def __init__(
+        self,
+        padding,
+        mode='constant',
+        value=0.0,
+        data_format="NCDHW",
+        name=None,
+    ):
         super(Pad3D, self).__init__()
         self._pad = _npairs(padding, 3)
         self._mode = mode
@@ -1207,17 +1248,20 @@ class Pad3D(Layer):
         self._name = name
 
     def forward(self, x):
-        return F.pad(x,
-                     pad=self._pad,
-                     mode=self._mode,
-                     value=self._value,
-                     data_format=self._data_format,
-                     name=self._name)
+        return F.pad(
+            x,
+            pad=self._pad,
+            mode=self._mode,
+            value=self._value,
+            data_format=self._data_format,
+            name=self._name,
+        )
 
     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
         return 'padding={}, mode={}, value={}, data_format={}{}'.format(
-            self._pad, self._mode, self._value, self._data_format, name_str)
+            self._pad, self._mode, self._value, self._data_format, name_str
+        )
 
 
 class CosineSimilarity(Layer):
@@ -1279,7 +1323,7 @@ class CosineSimilarity(Layer):
 
 class Embedding(Layer):
     r"""
-    
+
     Embedding Layer, used to construct a callable object of the ``Embedding`` class.
     For specific usage, refer to code examples. It implements the function of the Embedding Layer.
     This layer is used to lookup embeddings vector of ids provided by :attr:`x` .
@@ -1378,13 +1422,15 @@ class Embedding(Layer):
 
     """
 
-    def __init__(self,
-                 num_embeddings,
-                 embedding_dim,
-                 padding_idx=None,
-                 sparse=False,
-                 weight_attr=None,
-                 name=None):
+    def __init__(
+        self,
+        num_embeddings,
+        embedding_dim,
+        padding_idx=None,
+        sparse=False,
+        weight_attr=None,
+        name=None,
+    ):
         super(Embedding, self).__init__()
         self._num_embeddings = num_embeddings
         self._embedding_dim = embedding_dim
@@ -1398,12 +1444,20 @@ class Embedding(Layer):
         if self._embedding_dim <= 0:
             raise ValueError("embedding_dim must be gather than 0")
 
-        padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
-            num_embeddings + padding_idx)
+        padding_idx = (
+            -1
+            if padding_idx is None
+            else padding_idx
+            if padding_idx >= 0
+            else (num_embeddings + padding_idx)
+        )
 
         if padding_idx >= num_embeddings or padding_idx < -num_embeddings:
-            raise ValueError("padding_idx must be within [-{}, {})".format(
-                num_embeddings, num_embeddings))
+            raise ValueError(
+                "padding_idx must be within [-{}, {})".format(
+                    num_embeddings, num_embeddings
+                )
+            )
 
         self._dtype = self._helper.get_default_dtype()
         self._size = [self._num_embeddings, self._embedding_dim]
@@ -1411,21 +1465,25 @@ class Embedding(Layer):
         self._weight_attr = weight_attr
         self._remote_prefetch = False
         self._name = name
-        self.weight = self.create_parameter(attr=self._weight_attr,
-                                            shape=self._size,
-                                            dtype=self._dtype,
-                                            is_bias=False)
+        self.weight = self.create_parameter(
+            attr=self._weight_attr,
+            shape=self._size,
+            dtype=self._dtype,
+            is_bias=False,
+        )
 
         if in_dynamic_mode() and padding_idx != -1:
             with paddle.no_grad():
                 self.weight[padding_idx] = 0.0
 
     def forward(self, x):
-        return F.embedding(x,
-                           weight=self.weight,
-                           padding_idx=self._padding_idx,
-                           sparse=self._sparse,
-                           name=self._name)
+        return F.embedding(
+            x,
+            weight=self.weight,
+            padding_idx=self._padding_idx,
+            sparse=self._sparse,
+            name=self._name,
+        )
 
     def extra_repr(self):
         main_str = '{_num_embeddings}, {_embedding_dim}'
@@ -1449,7 +1507,7 @@ class Unfold(Layer):
 
     See ``paddle.nn.functional.unfold`` for more details.
 
-    
+
     Parameters:
         kernel_sizes(int|list):   The size of convolution kernel, should be [k_h, k_w]
                                   or an integer k treated as [k, k].
@@ -1483,12 +1541,9 @@ class Unfold(Layer):
             print(result)
     """
 
-    def __init__(self,
-                 kernel_sizes,
-                 dilations=1,
-                 paddings=0,
-                 strides=1,
-                 name=None):
+    def __init__(
+        self, kernel_sizes, dilations=1, paddings=0, strides=1, name=None
+    ):
         super(Unfold, self).__init__()
 
         self.kernel_sizes = kernel_sizes
@@ -1498,17 +1553,24 @@ class Unfold(Layer):
         self.name = name
 
     def forward(self, input):
-        return F.unfold(input,
-                        kernel_sizes=self.kernel_sizes,
-                        strides=self.strides,
-                        paddings=self.paddings,
-                        dilations=self.dilations,
-                        name=self.name)
+        return F.unfold(
+            input,
+            kernel_sizes=self.kernel_sizes,
+            strides=self.strides,
+            paddings=self.paddings,
+            dilations=self.dilations,
+            name=self.name,
+        )
 
     def extra_repr(self):
         name_str = ', name={}'.format(self.name) if self.name else ''
-        return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.\
-                format(self.kernel_sizes, self.dilations, self.paddings, self.strides, name_str)
+        return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.format(
+            self.kernel_sizes,
+            self.dilations,
+            self.paddings,
+            self.strides,
+            name_str,
+        )
 
 
 class Fold(Layer):
@@ -1568,13 +1630,15 @@ class Fold(Layer):
             # y.shape = [2,3,4,5]
    """
 
-    def __init__(self,
-                 output_sizes,
-                 kernel_sizes,
-                 dilations=1,
-                 paddings=0,
-                 strides=1,
-                 name=None):
+    def __init__(
+        self,
+        output_sizes,
+        kernel_sizes,
+        dilations=1,
+        paddings=0,
+        strides=1,
+        name=None,
+    ):
         super(Fold, self).__init__()
 
         self.output_sizes = output_sizes
@@ -1585,15 +1649,22 @@ class Fold(Layer):
         self.name = name
 
     def forward(self, input):
-        return F.fold(input,
-                      output_sizes=self.output_sizes,
-                      kernel_sizes=self.kernel_sizes,
-                      strides=self.strides,
-                      paddings=self.paddings,
-                      dilations=self.dilations,
-                      name=self.name)
+        return F.fold(
+            input,
+            output_sizes=self.output_sizes,
+            kernel_sizes=self.kernel_sizes,
+            strides=self.strides,
+            paddings=self.paddings,
+            dilations=self.dilations,
+            name=self.name,
+        )
 
     def extra_repr(self):
         name_str = ', name={}'.format(self.name) if self.name else ''
-        return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.\
-                format(self.kernel_sizes, self.dilations, self.paddings, self.strides, name_str)
+        return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.format(
+            self.kernel_sizes,
+            self.dilations,
+            self.paddings,
+            self.strides,
+            name_str,
+        )
diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index 4ef987eccf2..61267446dcf 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -31,7 +31,7 @@ __all__ = []
 
 def _get_default_param_initializer(num_channels, filter_size):
     filter_elem_num = num_channels * np.prod(filter_size)
-    std = (2.0 / filter_elem_num)**0.5
+    std = (2.0 / filter_elem_num) ** 0.5
     return Normal(0.0, std)
 
 
@@ -44,24 +44,27 @@ def _reverse_repeat_list(t, n):
 
 
 class _ConvNd(Layer):
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 transposed,
-                 dims,
-                 stride=1,
-                 padding=0,
-                 padding_mode='zeros',
-                 output_padding=0,
-                 dilation=1,
-                 groups=1,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format="NCHW"):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        transposed,
+        dims,
+        stride=1,
+        padding=0,
+        padding_mode='zeros',
+        output_padding=0,
+        dilation=1,
+        groups=1,
+        weight_attr=None,
+        bias_attr=None,
+        data_format="NCHW",
+    ):
         super(_ConvNd, self).__init__()
-        assert weight_attr is not False, "weight_attr should not be False in Conv."
+        assert (
+            weight_attr is not False
+        ), "weight_attr should not be False in Conv."
         self._param_attr = weight_attr
         self._bias_attr = bias_attr
         self._groups = groups
@@ -72,11 +75,16 @@ class _ConvNd(Layer):
         valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'}
         if padding_mode not in valid_padding_modes:
             raise ValueError(
-                "padding_mode must be one of {}, but got padding_mode='{}'".
-                format(valid_padding_modes, padding_mode))
+                "padding_mode must be one of {}, but got padding_mode='{}'".format(
+                    valid_padding_modes, padding_mode
+                )
+            )
 
-        if padding_mode in {'reflect', 'replicate', 'circular'
-                            } and not isinstance(padding, int):
+        if padding_mode in {
+            'reflect',
+            'replicate',
+            'circular',
+        } and not isinstance(padding, int):
             raise TypeError(
                 "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int"
             )
@@ -84,12 +92,16 @@ class _ConvNd(Layer):
         valid_format = {'NHWC', 'NCHW', 'NDHWC', 'NCDHW', 'NLC', 'NCL'}
         if data_format not in valid_format:
             raise ValueError(
-                "data_format must be one of {}, but got data_format='{}'".
-                format(valid_format, data_format))
+                "data_format must be one of {}, but got data_format='{}'".format(
+                    valid_format, data_format
+                )
+            )
 
-        channel_last = (data_format == "NHWC") or (data_format
-                                                   == "NDHWC") or (data_format
-                                                                   == "NLC")
+        channel_last = (
+            (data_format == "NHWC")
+            or (data_format == "NDHWC")
+            or (data_format == "NLC")
+        )
         if channel_last:
             self._channel_dim = len(data_format) - 1
         else:
@@ -97,66 +109,86 @@ class _ConvNd(Layer):
 
         self._stride = utils.convert_to_list(stride, dims, 'stride')
         self._dilation = utils.convert_to_list(dilation, dims, 'dilation')
-        self._kernel_size = utils.convert_to_list(kernel_size, dims,
-                                                  'kernel_size')
+        self._kernel_size = utils.convert_to_list(
+            kernel_size, dims, 'kernel_size'
+        )
         self._padding = padding
         self._padding_mode = padding_mode
         self.output_padding = output_padding
         if dims != 1:
             self._updated_padding, self._padding_algorithm = _update_padding_nd(
-                padding, channel_last, dims)
+                padding, channel_last, dims
+            )
 
         if transposed:
-            filter_shape = [self._in_channels, out_channels // groups
-                            ] + self._kernel_size
+            filter_shape = [
+                self._in_channels,
+                out_channels // groups,
+            ] + self._kernel_size
         else:
             if in_channels % groups != 0:
                 raise ValueError("in_channels must be divisible by groups.")
 
             if padding_mode in {'reflect', 'replicate', 'circular'}:
-                _paired_padding = utils.convert_to_list(padding, dims,
-                                                        'padding')
+                _paired_padding = utils.convert_to_list(
+                    padding, dims, 'padding'
+                )
                 self._reversed_padding_repeated_twice = _reverse_repeat_list(
-                    _paired_padding, 2)
+                    _paired_padding, 2
+                )
 
-                self._updated_padding, self._padding_algorithm = _update_padding_nd(
-                    0, channel_last, dims)
+                (
+                    self._updated_padding,
+                    self._padding_algorithm,
+                ) = _update_padding_nd(0, channel_last, dims)
 
-            filter_shape = [out_channels, in_channels // groups
-                            ] + self._kernel_size
+            filter_shape = [
+                out_channels,
+                in_channels // groups,
+            ] + self._kernel_size
 
         def _get_default_param_initializer():
             if transposed:
                 return None
             filter_elem_num = np.prod(self._kernel_size) * self._in_channels
-            std = (2.0 / filter_elem_num)**0.5
+            std = (2.0 / filter_elem_num) ** 0.5
             return Normal(0.0, std)
 
         self.weight = self.create_parameter(
             shape=filter_shape,
             attr=self._param_attr,
-            default_initializer=_get_default_param_initializer())
-        self.bias = self.create_parameter(attr=self._bias_attr,
-                                          shape=[self._out_channels],
-                                          is_bias=True)
+            default_initializer=_get_default_param_initializer(),
+        )
+        self.bias = self.create_parameter(
+            attr=self._bias_attr, shape=[self._out_channels], is_bias=True
+        )
 
         cudnn_version = get_cudnn_version()
 
-        self._use_cudnn = True if (is_compiled_with_cuda()
-                                   and cudnn_version is not None) else False
+        self._use_cudnn = (
+            True
+            if (is_compiled_with_cuda() and cudnn_version is not None)
+            else False
+        )
 
         self._op_type = "conv" + str(dims) + 'd'
-        if self._op_type == 'conv2d' and (in_channels == groups
-                                          and in_channels != 1
-                                          and out_channels % in_channels == 0):
+        if self._op_type == 'conv2d' and (
+            in_channels == groups
+            and in_channels != 1
+            and out_channels % in_channels == 0
+        ):
             self._op_type = 'depthwise_conv2d'
             if is_compiled_with_rocm():
                 self._use_cudnn = True
             else:
                 self._use_cudnn = False
 
-        if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn")
-            ["FLAGS_conv2d_disable_cudnn"]):
+        if (
+            is_compiled_with_cuda()
+            and get_flags("FLAGS_conv2d_disable_cudnn")[
+                "FLAGS_conv2d_disable_cudnn"
+            ]
+        ):
             self._use_cudnn = False
 
     def extra_repr(self):
@@ -302,50 +334,58 @@ class Conv1D(_ConvNd):
           #   [160. 211.]]]
     """
 
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 padding_mode='zeros',
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format="NCL"):
-        super(Conv1D, self).__init__(in_channels,
-                                     out_channels,
-                                     kernel_size,
-                                     False,
-                                     1,
-                                     stride=stride,
-                                     padding=padding,
-                                     padding_mode=padding_mode,
-                                     dilation=dilation,
-                                     groups=groups,
-                                     weight_attr=weight_attr,
-                                     bias_attr=bias_attr,
-                                     data_format=data_format)
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        padding_mode='zeros',
+        weight_attr=None,
+        bias_attr=None,
+        data_format="NCL",
+    ):
+        super(Conv1D, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            False,
+            1,
+            stride=stride,
+            padding=padding,
+            padding_mode=padding_mode,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format,
+        )
 
     def forward(self, x):
         padding = 0
         if self._padding_mode != "zeros":
-            x = F.pad(x,
-                      self._reversed_padding_repeated_twice,
-                      mode=self._padding_mode,
-                      data_format=self._data_format)
+            x = F.pad(
+                x,
+                self._reversed_padding_repeated_twice,
+                mode=self._padding_mode,
+                data_format=self._data_format,
+            )
         else:
             padding = self._padding
 
-        out = F.conv1d(x,
-                       self.weight,
-                       bias=self.bias,
-                       padding=padding,
-                       stride=self._stride,
-                       dilation=self._dilation,
-                       groups=self._groups,
-                       data_format=self._data_format)
+        out = F.conv1d(
+            x,
+            self.weight,
+            bias=self.bias,
+            padding=padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            data_format=self._data_format,
+        )
         return out
 
 
@@ -477,43 +517,49 @@ class Conv1DTranspose(_ConvNd):
           # [[[60. 16. 99. 75.  4.]]]
     """
 
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 output_padding=0,
-                 groups=1,
-                 dilation=1,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format="NCL"):
-        super(Conv1DTranspose, self).__init__(in_channels,
-                                              out_channels,
-                                              kernel_size,
-                                              True,
-                                              1,
-                                              stride=stride,
-                                              padding=padding,
-                                              dilation=dilation,
-                                              output_padding=output_padding,
-                                              groups=groups,
-                                              weight_attr=weight_attr,
-                                              bias_attr=bias_attr,
-                                              data_format=data_format)
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        groups=1,
+        dilation=1,
+        weight_attr=None,
+        bias_attr=None,
+        data_format="NCL",
+    ):
+        super(Conv1DTranspose, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            True,
+            1,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            output_padding=output_padding,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format,
+        )
 
     def forward(self, x, output_size=None):
-        out = F.conv1d_transpose(x,
-                                 self.weight,
-                                 bias=self.bias,
-                                 output_size=output_size,
-                                 output_padding=self.output_padding,
-                                 padding=self._padding,
-                                 stride=self._stride,
-                                 dilation=self._dilation,
-                                 groups=self._groups,
-                                 data_format=self._data_format)
+        out = F.conv1d_transpose(
+            x,
+            self.weight,
+            bias=self.bias,
+            output_size=output_size,
+            output_padding=self.output_padding,
+            padding=self._padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            data_format=self._data_format,
+        )
         return out
 
 
@@ -549,7 +595,7 @@ class Conv2D(_ConvNd):
     * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-    
+
     Parameters:
         in_channels(int): The number of input channels in the input image.
         out_channels(int): The number of output channels produced by the convolution.
@@ -559,7 +605,7 @@ class Conv2D(_ConvNd):
             stride_H = stride_W = stride. The default value is 1.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
-            2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 
+            2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding`
             3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...].
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
@@ -616,11 +662,11 @@ class Conv2D(_ConvNd):
 
           import paddle
           import paddle.nn as nn
-          
+
           paddle.disable_static()
-          
+
           x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
-          
+
           conv = nn.Conv2D(4, 6, (3, 3))
           y_var = conv(x_var)
           y_np = y_var.numpy()
@@ -628,51 +674,59 @@ class Conv2D(_ConvNd):
           # (2, 6, 6, 6)
     """
 
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 padding_mode='zeros',
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format="NCHW"):
-        super(Conv2D, self).__init__(in_channels,
-                                     out_channels,
-                                     kernel_size,
-                                     False,
-                                     2,
-                                     stride=stride,
-                                     padding=padding,
-                                     padding_mode=padding_mode,
-                                     dilation=dilation,
-                                     groups=groups,
-                                     weight_attr=weight_attr,
-                                     bias_attr=bias_attr,
-                                     data_format=data_format)
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        padding_mode='zeros',
+        weight_attr=None,
+        bias_attr=None,
+        data_format="NCHW",
+    ):
+        super(Conv2D, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            False,
+            2,
+            stride=stride,
+            padding=padding,
+            padding_mode=padding_mode,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format,
+        )
 
     def forward(self, x):
         if self._padding_mode != 'zeros':
-            x = F.pad(x,
-                      self._reversed_padding_repeated_twice,
-                      mode=self._padding_mode,
-                      data_format=self._data_format)
-
-        out = F.conv._conv_nd(x,
-                              self.weight,
-                              bias=self.bias,
-                              stride=self._stride,
-                              padding=self._updated_padding,
-                              padding_algorithm=self._padding_algorithm,
-                              dilation=self._dilation,
-                              groups=self._groups,
-                              data_format=self._data_format,
-                              channel_dim=self._channel_dim,
-                              op_type=self._op_type,
-                              use_cudnn=self._use_cudnn)
+            x = F.pad(
+                x,
+                self._reversed_padding_repeated_twice,
+                mode=self._padding_mode,
+                data_format=self._data_format,
+            )
+
+        out = F.conv._conv_nd(
+            x,
+            self.weight,
+            bias=self.bias,
+            stride=self._stride,
+            padding=self._updated_padding,
+            padding_algorithm=self._padding_algorithm,
+            dilation=self._dilation,
+            groups=self._groups,
+            data_format=self._data_format,
+            channel_dim=self._channel_dim,
+            op_type=self._op_type,
+            use_cudnn=self._use_cudnn,
+        )
         return out
 
 
@@ -707,7 +761,7 @@ class Conv2DTranspose(_ConvNd):
     * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M].
     * :math:`\\sigma`: Activation function.
     * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-    
+
     Parameters:
         in_channels(int): The number of channels in the input image.
         out_channels(int): The number of channels produced by the convolution.
@@ -719,7 +773,7 @@ class Conv2DTranspose(_ConvNd):
             stride_H = stride_W = stride. Default: 1.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
-            2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides 
+            2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides
             3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...].
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
@@ -781,7 +835,7 @@ class Conv2DTranspose(_ConvNd):
 
           import paddle
           import paddle.nn as nn
-          
+
           paddle.disable_static()
 
           x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
@@ -793,31 +847,35 @@ class Conv2DTranspose(_ConvNd):
           # (2, 6, 10, 10)
     """
 
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 output_padding=0,
-                 dilation=1,
-                 groups=1,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format="NCHW"):
-        super(Conv2DTranspose, self).__init__(in_channels,
-                                              out_channels,
-                                              kernel_size,
-                                              True,
-                                              2,
-                                              stride=stride,
-                                              padding=padding,
-                                              dilation=dilation,
-                                              output_padding=output_padding,
-                                              groups=groups,
-                                              weight_attr=weight_attr,
-                                              bias_attr=bias_attr,
-                                              data_format=data_format)
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        dilation=1,
+        groups=1,
+        weight_attr=None,
+        bias_attr=None,
+        data_format="NCHW",
+    ):
+        super(Conv2DTranspose, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            True,
+            2,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            output_padding=output_padding,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format,
+        )
 
     def forward(self, x, output_size=None):
         if output_size is None:
@@ -825,16 +883,18 @@ class Conv2DTranspose(_ConvNd):
         else:
             output_padding = 0
 
-        out = F.conv2d_transpose(x,
-                                 self.weight,
-                                 bias=self.bias,
-                                 padding=self._padding,
-                                 output_padding=output_padding,
-                                 stride=self._stride,
-                                 dilation=self._dilation,
-                                 groups=self._groups,
-                                 output_size=output_size,
-                                 data_format=self._data_format)
+        out = F.conv2d_transpose(
+            x,
+            self.weight,
+            bias=self.bias,
+            padding=self._padding,
+            output_padding=output_padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            output_size=output_size,
+            data_format=self._data_format,
+        )
         return out
 
 
@@ -843,7 +903,7 @@ class Conv3D(_ConvNd):
     **Convlution3d Layer**
     The convolution3d layer calculates the output based on the input, filter
     and strides, paddings, dilations, groups parameters. Input(Input) and
-    Output(Output) are multidimensional tensors with a shape of 
+    Output(Output) are multidimensional tensors with a shape of
     :math:`[N, C, D, H, W]` . Where N is batch size, C is the number of
     channels, D is the depth of the feature, H is the height of the feature,
     and W is the width of the feature. Convlution3D is similar with Convlution2D
@@ -874,7 +934,7 @@ class Conv3D(_ConvNd):
             stride_D = stride_H = stride_W = stride. The default value is 1.
         padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms.
             1. a string in ['valid', 'same'].
-            2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 
+            2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding`
             3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...].
             4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form  [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions.
             5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0).
@@ -937,11 +997,11 @@ class Conv3D(_ConvNd):
 
           import paddle
           import paddle.nn as nn
-          
+
           paddle.disable_static()
 
           x_var = paddle.uniform((2, 4, 8, 8, 8), dtype='float32', min=-1., max=1.)
-          
+
           conv = nn.Conv3D(4, 6, (3, 3, 3))
           y_var = conv(x_var)
           y_np = y_var.numpy()
@@ -949,51 +1009,59 @@ class Conv3D(_ConvNd):
           # (2, 6, 6, 6, 6)
     """
 
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=1,
-                 padding_mode='zeros',
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format="NCDHW"):
-        super(Conv3D, self).__init__(in_channels,
-                                     out_channels,
-                                     kernel_size,
-                                     False,
-                                     3,
-                                     stride=stride,
-                                     padding=padding,
-                                     padding_mode=padding_mode,
-                                     dilation=dilation,
-                                     groups=groups,
-                                     weight_attr=weight_attr,
-                                     bias_attr=bias_attr,
-                                     data_format=data_format)
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        padding_mode='zeros',
+        weight_attr=None,
+        bias_attr=None,
+        data_format="NCDHW",
+    ):
+        super(Conv3D, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            False,
+            3,
+            stride=stride,
+            padding=padding,
+            padding_mode=padding_mode,
+            dilation=dilation,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format,
+        )
 
     def forward(self, x):
         if self._padding_mode != 'zeros':
-            x = F.pad(x,
-                      self._reversed_padding_repeated_twice,
-                      mode=self._padding_mode,
-                      data_format=self._data_format)
-
-        out = F.conv._conv_nd(x,
-                              self.weight,
-                              bias=self.bias,
-                              stride=self._stride,
-                              padding=self._updated_padding,
-                              padding_algorithm=self._padding_algorithm,
-                              dilation=self._dilation,
-                              groups=self._groups,
-                              data_format=self._data_format,
-                              channel_dim=self._channel_dim,
-                              op_type=self._op_type,
-                              use_cudnn=self._use_cudnn)
+            x = F.pad(
+                x,
+                self._reversed_padding_repeated_twice,
+                mode=self._padding_mode,
+                data_format=self._data_format,
+            )
+
+        out = F.conv._conv_nd(
+            x,
+            self.weight,
+            bias=self.bias,
+            stride=self._stride,
+            padding=self._updated_padding,
+            padding_algorithm=self._padding_algorithm,
+            dilation=self._dilation,
+            groups=self._groups,
+            data_format=self._data_format,
+            channel_dim=self._channel_dim,
+            op_type=self._op_type,
+            use_cudnn=self._use_cudnn,
+        )
         return out
 
 
@@ -1126,31 +1194,35 @@ class Conv3DTranspose(_ConvNd):
           # (2, 6, 10, 10, 10)
     """
 
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 output_padding=0,
-                 dilation=1,
-                 groups=1,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format="NCDHW"):
-        super(Conv3DTranspose, self).__init__(in_channels,
-                                              out_channels,
-                                              kernel_size,
-                                              True,
-                                              3,
-                                              stride=stride,
-                                              padding=padding,
-                                              dilation=dilation,
-                                              output_padding=output_padding,
-                                              groups=groups,
-                                              weight_attr=weight_attr,
-                                              bias_attr=bias_attr,
-                                              data_format=data_format)
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        dilation=1,
+        groups=1,
+        weight_attr=None,
+        bias_attr=None,
+        data_format="NCDHW",
+    ):
+        super(Conv3DTranspose, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            True,
+            3,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            output_padding=output_padding,
+            groups=groups,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format,
+        )
 
     def forward(self, x, output_size=None):
         if output_size is None:
@@ -1158,14 +1230,16 @@ class Conv3DTranspose(_ConvNd):
         else:
             output_padding = 0
 
-        out = F.conv3d_transpose(x,
-                                 self.weight,
-                                 bias=self.bias,
-                                 padding=self._padding,
-                                 output_padding=output_padding,
-                                 stride=self._stride,
-                                 dilation=self._dilation,
-                                 groups=self._groups,
-                                 output_size=output_size,
-                                 data_format=self._data_format)
+        out = F.conv3d_transpose(
+            x,
+            self.weight,
+            bias=self.bias,
+            padding=self._padding,
+            output_padding=output_padding,
+            stride=self._stride,
+            dilation=self._dilation,
+            groups=self._groups,
+            output_size=output_size,
+            data_format=self._data_format,
+        )
         return out
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 1ff37afa141..5fe6e24c547 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -18,7 +18,11 @@ import numpy as np
 import paddle.fluid as fluid
 import paddle
 from .. import functional as F
-from paddle.fluid.framework import _varbase_creator, in_dygraph_mode, _in_legacy_dygraph
+from paddle.fluid.framework import (
+    _varbase_creator,
+    in_dygraph_mode,
+    _in_legacy_dygraph,
+)
 from .. import Layer
 from paddle import in_dynamic_mode
 
@@ -106,15 +110,14 @@ class BCEWithLogitsLoss(Layer):
 
     """
 
-    def __init__(self,
-                 weight=None,
-                 reduction='mean',
-                 pos_weight=None,
-                 name=None):
+    def __init__(
+        self, weight=None, reduction='mean', pos_weight=None, name=None
+    ):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in BCEWithLogitsLoss should be 'sum', 'mean' or 'none', but "
-                "received %s, which is not allowed." % reduction)
+                "received %s, which is not allowed." % reduction
+            )
 
         super(BCEWithLogitsLoss, self).__init__()
         self.weight = weight
@@ -124,30 +127,35 @@ class BCEWithLogitsLoss(Layer):
 
     def forward(self, logit, label):
         out = paddle.nn.functional.binary_cross_entropy_with_logits(
-            logit, label, self.weight, self.reduction, self.pos_weight,
-            self.name)
+            logit,
+            label,
+            self.weight,
+            self.reduction,
+            self.pos_weight,
+            self.name,
+        )
         return out
 
 
 class CrossEntropyLoss(Layer):
     r"""
-    By default, this operator implements the cross entropy loss function with softmax. This function 
-    combines the calculation of the softmax operation and the cross entropy loss function 
+    By default, this operator implements the cross entropy loss function with softmax. This function
+    combines the calculation of the softmax operation and the cross entropy loss function
     to provide a more numerically stable computing.
 
     This operator will calculate the cross entropy loss function without softmax when use_softmax=False.
 
-    By default, this operator will calculate the mean of the result, and you can also affect 
-    the default behavior by using the reduction parameter. Please refer to the part of 
+    By default, this operator will calculate the mean of the result, and you can also affect
+    the default behavior by using the reduction parameter. Please refer to the part of
     parameters for details.
 
     This operator can be used to calculate the softmax cross entropy loss with soft and hard labels.
-    Where, the hard labels mean the actual label value, 0, 1, 2, etc.  And the soft labels 
+    Where, the hard labels mean the actual label value, 0, 1, 2, etc.  And the soft labels
     mean the probability of the actual label, 0.6, 0.8, 0.2, etc.
 
     The calculation of this operator includes the following two steps.
 
-    -  **I.softmax cross entropy** 
+    -  **I.softmax cross entropy**
 
         1. Hard label (each sample can only be assigned into one category)
 
@@ -184,7 +192,7 @@ class CrossEntropyLoss(Layer):
 
 
 
-    -  **II.Weight and reduction processing** 
+    -  **II.Weight and reduction processing**
 
         1. Weight
 
@@ -196,7 +204,7 @@ class CrossEntropyLoss(Layer):
             1.1. Hard labels (soft_label = False)
 
             .. math::
-                \\loss_j=loss_j*weight[label_j] 
+                \\loss_j=loss_j*weight[label_j]
 
 
             1.2. Soft labels (soft_label = True)
@@ -206,21 +214,21 @@ class CrossEntropyLoss(Layer):
 
         2. reduction
 
-            2.1 if the ``reduction`` parameter is ``none`` 
+            2.1 if the ``reduction`` parameter is ``none``
 
             Return the previous result directly
 
-            2.2 if the ``reduction`` parameter is ``sum`` 
+            2.2 if the ``reduction`` parameter is ``sum``
 
             Return the sum of the previous results
 
             .. math::
                \\loss=\sum_{j}loss_j
 
-            2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to 
-            the ``weight`` parameter as follows. 
+            2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to
+            the ``weight`` parameter as follows.
 
-            2.3.1. If the  ``weight``  parameter is ``None`` 
+            2.3.1. If the  ``weight``  parameter is ``None``
 
             Return the average value of the previous results
 
@@ -234,27 +242,27 @@ class CrossEntropyLoss(Layer):
             1. Hard labels (soft_label = False)
 
              .. math::
-                \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 
+                \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j]
 
             2. Soft labels (soft_label = True)
 
              .. math::
                 \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right)
- 
- 
+
+
     Parameters:
 
         - **weight** (Tensor, optional)
 
-            a manual rescaling weight given to each class. 
-            If given, has to be a Tensor of size C and the data type is float32, float64. 
+            a manual rescaling weight given to each class.
+            If given, has to be a Tensor of size C and the data type is float32, float64.
             Default is ``'None'`` .
 
         - **ignore_index** (int64, optional)
 
             Specifies a target value that is ignored
-            and does not contribute to the loss. A negative value means that no label 
-            value needs to be ignored. Only valid when soft_label = False.  
+            and does not contribute to the loss. A negative value means that no label
+            value needs to be ignored. Only valid when soft_label = False.
             Default is ``-100`` .
 
         - **reduction** (str, optional)
@@ -268,15 +276,15 @@ class CrossEntropyLoss(Layer):
 
         - **soft_label** (bool, optional)
 
-            Indicate whether label is soft. 
+            Indicate whether label is soft.
             If soft_label=False, the label is hard.  If soft_label=True, the label is soft.
             Default is ``False``.
 
         - **axis** (int, optional)
 
-            The index of dimension to perform softmax calculations. 
-            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number 
-            of dimensions of input :attr:`input`. 
+            The index of dimension to perform softmax calculations.
+            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number
+            of dimensions of input :attr:`input`.
             Default is ``-1`` .
 
         - **use_softmax** (bool, optional)
@@ -295,11 +303,11 @@ class CrossEntropyLoss(Layer):
         - **input** (Tensor)
 
             Input tensor, the data type is float32, float64. Shape is
-        :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` . 
+        :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` .
 
-            Note: 
+            Note:
 
-                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the 
+                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
                 output of softmax operator, which will produce incorrect results.
 
                 2. when use_softmax=False, it expects the output of softmax operator.
@@ -307,11 +315,11 @@ class CrossEntropyLoss(Layer):
 
         - **label** (Tensor)
 
-            1. If soft_label=False, the shape is 
+            1. If soft_label=False, the shape is
             :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
             the data type is int32, int64, float32, float64, where each value is [0, C-1].
 
-            2. If soft_label=True, the shape and data type should be same with ``input`` , 
+            2. If soft_label=True, the shape and data type should be the same as ``input`` ,
             and the sum of the labels for each sample should be 1.
 
         - **output** (Tensor)
@@ -324,24 +332,24 @@ class CrossEntropyLoss(Layer):
 
             If :attr:`reduction` is ``'none'``:
 
-            1. If soft_label = False, the dimension of return value is the same with ``label`` . 
+            1. If soft_label = False, the dimension of return value is the same as ``label`` .
 
-            2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . 
+            2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .
 
     Examples:
 
         .. code-block:: python
-            
+
             # hard labels
             import paddle
             paddle.seed(99999)
             N=100
             C=200
             reduction='mean'
-            input =  paddle.rand([N, C], dtype='float64')  
+            input =  paddle.rand([N, C], dtype='float64')
             label =  paddle.randint(0, C, shape=[N], dtype='int64')
-            weight = paddle.rand([C], dtype='float64') 
-            
+            weight = paddle.rand([C], dtype='float64')
+
             cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                 weight=weight, reduction=reduction)
             dy_ret = cross_entropy_loss(
@@ -365,9 +373,9 @@ class CrossEntropyLoss(Layer):
             labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
             labels /= paddle.sum(labels, axis=axis, keepdim=True)
             paddle_loss_mean = paddle.nn.functional.cross_entropy(
-                                                                  logits,  
-                                                                  labels, 
-                                                                  soft_label=True, 
+                                                                  logits,
+                                                                  labels,
+                                                                  soft_label=True,
                                                                   axis=axis,
                                                                   weight=weight,
                                                                   reduction=reduction)
@@ -375,14 +383,16 @@ class CrossEntropyLoss(Layer):
 
     """
 
-    def __init__(self,
-                 weight=None,
-                 ignore_index=-100,
-                 reduction='mean',
-                 soft_label=False,
-                 axis=-1,
-                 use_softmax=True,
-                 name=None):
+    def __init__(
+        self,
+        weight=None,
+        ignore_index=-100,
+        reduction='mean',
+        soft_label=False,
+        axis=-1,
+        use_softmax=True,
+        name=None,
+    ):
         super(CrossEntropyLoss, self).__init__()
         self.weight = weight
         self.reduction = reduction
@@ -393,15 +403,17 @@ class CrossEntropyLoss(Layer):
         self.name = name
 
     def forward(self, input, label):
-        ret = paddle.nn.functional.cross_entropy(input,
-                                                 label,
-                                                 weight=self.weight,
-                                                 ignore_index=self.ignore_index,
-                                                 reduction=self.reduction,
-                                                 soft_label=self.soft_label,
-                                                 axis=self.axis,
-                                                 use_softmax=self.use_softmax,
-                                                 name=self.name)
+        ret = paddle.nn.functional.cross_entropy(
+            input,
+            label,
+            weight=self.weight,
+            ignore_index=self.ignore_index,
+            reduction=self.reduction,
+            soft_label=self.soft_label,
+            axis=self.axis,
+            use_softmax=self.use_softmax,
+            name=self.name,
+        )
 
         return ret
 
@@ -409,7 +421,7 @@ class CrossEntropyLoss(Layer):
 class HSigmoidLoss(Layer):
     """
     Hierarchical Sigmoid Layer.
-    
+
     The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
     and speed up the model training, especially the training of language model.
     Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier.
@@ -444,7 +456,7 @@ class HSigmoidLoss(Layer):
             is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr,
             hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not
             set, the bias is initialized zero. Default is None.
-        is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and 
+        is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and
             `path_code` should be passed to its forward method, otherwise `path_table` and `path_code`
             should not be passed to its forward method. Default is False.
         is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True,
@@ -477,18 +489,21 @@ class HSigmoidLoss(Layer):
             #  [2.34564662]]
     """
 
-    def __init__(self,
-                 feature_size,
-                 num_classes,
-                 weight_attr=None,
-                 bias_attr=None,
-                 is_custom=False,
-                 is_sparse=False,
-                 name=None):
+    def __init__(
+        self,
+        feature_size,
+        num_classes,
+        weight_attr=None,
+        bias_attr=None,
+        is_custom=False,
+        is_sparse=False,
+        name=None,
+    ):
         super(HSigmoidLoss, self).__init__()
         if (num_classes < 2) and (not is_custom):
             raise ValueError(
-                "num_classes must not be less than 2 with default tree")
+                "num_classes must not be less than 2 with default tree"
+            )
 
         if (not is_custom) and (is_sparse):
             print("Sparse mode should not be used without custom tree")
@@ -506,29 +521,34 @@ class HSigmoidLoss(Layer):
         self._dtype = paddle.get_default_dtype()
 
         remote_prefetch = is_sparse
-        print("With sparse mode, if your models has only"
-              " small parameter prefetch may cause speed down")
+        print(
+            "With sparse mode, if your models has only"
+            " small parameter prefetch may cause speed down"
+        )
 
         C = self._num_classes if is_custom else self._num_classes - 1
-        self.weight = self.create_parameter([C, self._feature_size],
-                                            attr=self._weight_attr,
-                                            is_bias=False,
-                                            dtype=self._dtype)
-        self.bias = self.create_parameter([C, 1],
-                                          attr=self._bias_attr,
-                                          is_bias=True,
-                                          dtype=self._dtype)
+        self.weight = self.create_parameter(
+            [C, self._feature_size],
+            attr=self._weight_attr,
+            is_bias=False,
+            dtype=self._dtype,
+        )
+        self.bias = self.create_parameter(
+            [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype
+        )
 
     def forward(self, input, label, path_table=None, path_code=None):
-        out = F.hsigmoid_loss(input,
-                              label,
-                              self._num_classes,
-                              self.weight,
-                              self.bias,
-                              path_table=path_table,
-                              path_code=path_code,
-                              is_sparse=self._is_sparse,
-                              name=self._name)
+        out = F.hsigmoid_loss(
+            input,
+            label,
+            self._num_classes,
+            self.weight,
+            self.bias,
+            path_table=path_table,
+            path_code=path_code,
+            is_sparse=self._is_sparse,
+            name=self._name,
+        )
         return out
 
 
@@ -589,17 +609,18 @@ class MSELoss(Layer):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "'reduction' in 'MSELoss' should be 'sum', 'mean' or 'none', "
-                "but received {}.".format(reduction))
+                "but received {}.".format(reduction)
+            )
         self.reduction = reduction
 
     def forward(self, input, label):
         if not in_dynamic_mode():
-            fluid.data_feeder.check_variable_and_dtype(input, 'input',
-                                                       ['float32', 'float64'],
-                                                       'MSELoss')
-            fluid.data_feeder.check_variable_and_dtype(label, 'label',
-                                                       ['float32', 'float64'],
-                                                       'MSELoss')
+            fluid.data_feeder.check_variable_and_dtype(
+                input, 'input', ['float32', 'float64'], 'MSELoss'
+            )
+            fluid.data_feeder.check_variable_and_dtype(
+                label, 'label', ['float32', 'float64'], 'MSELoss'
+            )
 
         if in_dygraph_mode():
             square_out = paddle._C_ops.square(paddle.subtract(input, label))
@@ -654,7 +675,7 @@ class L1Loss(Layer):
 
     Examples:
         .. code-block:: python
-            
+
             import paddle
             import numpy as np
 
@@ -684,16 +705,16 @@ class L1Loss(Layer):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but "
-                "received %s, which is not allowed." % reduction)
+                "received %s, which is not allowed." % reduction
+            )
         super(L1Loss, self).__init__()
         self.reduction = reduction
         self.name = name
 
     def forward(self, input, label):
-        return paddle.nn.functional.l1_loss(input,
-                                            label,
-                                            self.reduction,
-                                            name=self.name)
+        return paddle.nn.functional.l1_loss(
+            input, label, self.reduction, name=self.name
+        )
 
 
 class BCELoss(Layer):
@@ -773,7 +794,8 @@ class BCELoss(Layer):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in bce_loss should be 'sum', 'mean' or 'none', but "
-                "received %s, which is not allowed." % reduction)
+                "received %s, which is not allowed." % reduction
+            )
 
         super(BCELoss, self).__init__()
         self.weight = weight
@@ -781,10 +803,9 @@ class BCELoss(Layer):
         self.name = name
 
     def forward(self, input, label):
-        out = paddle.nn.functional.binary_cross_entropy(input, label,
-                                                        self.weight,
-                                                        self.reduction,
-                                                        self.name)
+        out = paddle.nn.functional.binary_cross_entropy(
+            input, label, self.weight, self.reduction, self.name
+        )
         return out
 
 
@@ -874,15 +895,14 @@ class NLLLoss(Layer):
 
     """
 
-    def __init__(self,
-                 weight=None,
-                 ignore_index=-100,
-                 reduction='mean',
-                 name=None):
+    def __init__(
+        self, weight=None, ignore_index=-100, reduction='mean', name=None
+    ):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in nll_loss should be 'sum', 'mean' or "
-                "'none', but received %s, which is not allowed." % reduction)
+                "'none', but received %s, which is not allowed." % reduction
+            )
         super(NLLLoss, self).__init__()
         self._weight = weight
         self._ignore_index = ignore_index
@@ -890,12 +910,14 @@ class NLLLoss(Layer):
         self._name = name
 
     def forward(self, input, label):
-        return F.nll_loss(input,
-                          label,
-                          weight=self._weight,
-                          ignore_index=self._ignore_index,
-                          reduction=self._reduction,
-                          name=self._name)
+        return F.nll_loss(
+            input,
+            label,
+            weight=self._weight,
+            ignore_index=self._ignore_index,
+            reduction=self._reduction,
+            name=self._name,
+        )
 
 
 class KLDivLoss(Layer):
@@ -999,7 +1021,7 @@ class MarginRankingLoss(Layer):
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Shape:
-    
+
         input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64.
 
         other: N-D Tensor, `other` have the same shape and dtype as `input`.
@@ -1031,17 +1053,17 @@ class MarginRankingLoss(Layer):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but "
-                "received %s, which is not allowed." % reduction)
+                "received %s, which is not allowed." % reduction
+            )
         super(MarginRankingLoss, self).__init__()
         self.margin = margin
         self.reduction = reduction
         self.name = name
 
     def forward(self, input, other, label):
-        out = paddle.nn.functional.margin_ranking_loss(input, other, label,
-                                                       self.margin,
-                                                       self.reduction,
-                                                       self.name)
+        out = paddle.nn.functional.margin_ranking_loss(
+            input, other, label, self.margin, self.reduction, self.name
+        )
         return out
 
 
@@ -1125,19 +1147,23 @@ class CTCLoss(Layer):
         self.blank = blank
         self.reduction = reduction
 
-    def forward(self,
-                log_probs,
-                labels,
-                input_lengths,
-                label_lengths,
-                norm_by_times=False):
-        return paddle.nn.functional.ctc_loss(log_probs,
-                                             labels,
-                                             input_lengths,
-                                             label_lengths,
-                                             self.blank,
-                                             self.reduction,
-                                             norm_by_times=norm_by_times)
+    def forward(
+        self,
+        log_probs,
+        labels,
+        input_lengths,
+        label_lengths,
+        norm_by_times=False,
+    ):
+        return paddle.nn.functional.ctc_loss(
+            log_probs,
+            labels,
+            input_lengths,
+            label_lengths,
+            self.blank,
+            self.reduction,
+            norm_by_times=norm_by_times,
+        )
 
 
 class SmoothL1Loss(Layer):
@@ -1207,11 +1233,13 @@ class SmoothL1Loss(Layer):
         self.name = name
 
     def forward(self, input, label):
-        return F.smooth_l1_loss(input,
-                                label,
-                                reduction=self.reduction,
-                                delta=self.delta,
-                                name=self.name)
+        return F.smooth_l1_loss(
+            input,
+            label,
+            reduction=self.reduction,
+            delta=self.delta,
+            name=self.name,
+        )
 
 
 class MultiLabelSoftMarginLoss(Layer):
@@ -1279,17 +1307,20 @@ class MultiLabelSoftMarginLoss(Layer):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "'reduction' in 'MultiLabelSoftMarginloss' should be 'sum', 'mean' or 'none', "
-                "but received {}.".format(reduction))
+                "but received {}.".format(reduction)
+            )
         self.weight = weight
         self.reduction = reduction
         self.name = name
 
     def forward(self, input, label):
-        return F.multi_label_soft_margin_loss(input,
-                                              label,
-                                              weight=self.weight,
-                                              reduction=self.reduction,
-                                              name=self.name)
+        return F.multi_label_soft_margin_loss(
+            input,
+            label,
+            weight=self.weight,
+            reduction=self.reduction,
+            name=self.name,
+        )
 
 
 class HingeEmbeddingLoss(Layer):
@@ -1379,11 +1410,13 @@ class HingeEmbeddingLoss(Layer):
         self.name = name
 
     def forward(self, input, label):
-        return F.hinge_embedding_loss(input,
-                                      label,
-                                      reduction=self.reduction,
-                                      margin=self.margin,
-                                      name=self.name)
+        return F.hinge_embedding_loss(
+            input,
+            label,
+            reduction=self.reduction,
+            margin=self.margin,
+            name=self.name,
+        )
 
 
 class CosineEmbeddingLoss(Layer):
@@ -1457,23 +1490,27 @@ class CosineEmbeddingLoss(Layer):
         if margin > 1 or margin < -1:
             raise ValueError(
                 "The value of 'margin' should be in the interval of [-1, 1], but received %f, which is not allowed."
-                % margin)
+                % margin
+            )
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' should be 'sum', 'mean' or "
-                "'none', but received %s, which is not allowed." % reduction)
+                "'none', but received %s, which is not allowed." % reduction
+            )
         super(CosineEmbeddingLoss, self).__init__()
         self.margin = margin
         self.reduction = reduction
         self.name = name
 
     def forward(self, input1, input2, label):
-        return F.cosine_embedding_loss(input1,
-                                       input2,
-                                       label,
-                                       margin=self.margin,
-                                       reduction=self.reduction,
-                                       name=self.name)
+        return F.cosine_embedding_loss(
+            input1,
+            input2,
+            label,
+            margin=self.margin,
+            reduction=self.reduction,
+            name=self.name,
+        )
 
 
 class TripletMarginWithDistanceLoss(Layer):
@@ -1491,22 +1528,22 @@ class TripletMarginWithDistanceLoss(Layer):
         L(input, pos, neg) = \max \{d(input_i, pos_i) - d(input_i, neg_i) + {\rm margin}, 0\}
 
     where the default `distance_function`
-    
+
     .. math::
-    	d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_2
-    
-    or user can define their own distance function. `margin` is a nonnegative margin representing the minimum difference 
+        d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_2
+
+    or users can define their own distance function. `margin` is a nonnegative margin representing the minimum difference
     between the positive and negative distances that is required for the loss to be 0. If `swap` is true, it will compare distance of (input, negative) with
     distance of (negative, positive) and change it to the smaller one. For more details see http://www.bmva.org/bmvc/2016/papers/paper119/paper119.pdf.
 
     Parameters:
         distance_function (Callable, Optional): Quantifies the distance between two tensors. if not specified, 2 norm functions will be used.
-	
+
         margin (float, Optional):Default: :math:`1`.A nonnegative margin representing the minimum difference
                 between the positive and negative distances required for the loss to be 0. Larger
                 margins penalize cases where the negative examples are not distant enough from the
                 anchors, relative to the positives.
-		
+
         swap (bool, Optional):The distance swap changes the negative distance to the swap distance (distance between positive samples
                 and negative samples) if swap distance smaller than negative distance. Default: ``False``.
 
@@ -1518,18 +1555,18 @@ class TripletMarginWithDistanceLoss(Layer):
                 Default: ``'mean'``
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-	    
+
     Shapes:
         input (Tensor):Input tensor, the data type is float32 or float64.
-	the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64.
+        the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64.
 
         positive (Tensor):Positive tensor, the data type is float32 or float64.
-	The shape of label is the same as the shape of input.
+        The shape of positive is the same as the shape of input.
 
         negative (Tensor):Negative tensor, the data type is float32 or float64.
-	The shape of label is the same as the shape of input.
-	
-	    output(Tensor): The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative.
+        The shape of negative is the same as the shape of input.
+
+        output (Tensor): The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative.
 
     Return：
         A callable object of TripletMarginWithDistanceLoss
@@ -1555,18 +1592,21 @@ class TripletMarginWithDistanceLoss(Layer):
 
     """
 
-    def __init__(self,
-                 distance_function=None,
-                 margin=1.0,
-                 swap=False,
-                 reduction: str = 'mean',
-                 name=None):
+    def __init__(
+        self,
+        distance_function=None,
+        margin=1.0,
+        swap=False,
+        reduction: str = 'mean',
+        name=None,
+    ):
         super(TripletMarginWithDistanceLoss, self).__init__()
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in TripletMarginWithDistanceLoss "
                 "should be 'sum', 'mean' or 'none', but "
-                "received %s, which is not allowed." % reduction)
+                "received %s, which is not allowed." % reduction
+            )
         self.margin = margin
         self.swap = swap
         self.reduction = reduction
@@ -1574,13 +1614,15 @@ class TripletMarginWithDistanceLoss(Layer):
         self.name = name
 
     def forward(self, input, positive, negative):
-        return F.triplet_margin_with_distance_loss(input,
-                                                   positive,
-                                                   negative,
-                                                   margin=self.margin,
-                                                   swap=self.swap,
-                                                   reduction=self.reduction,
-                                                   name=self.name)
+        return F.triplet_margin_with_distance_loss(
+            input,
+            positive,
+            negative,
+            margin=self.margin,
+            swap=self.swap,
+            reduction=self.reduction,
+            name=self.name,
+        )
 
 
 class TripletMarginLoss(Layer):
@@ -1650,7 +1692,7 @@ class TripletMarginLoss(Layer):
             loss = triplet_margin_loss(input, positive, negative)
             print(loss)
             # Tensor([0.        , 0.57496738, 0.        ])
-	    
+
             triplet_margin_loss = paddle.nn.TripletMarginLoss(margin=1.0, swap=True, reduction='mean', )
             loss = triplet_margin_loss(input, positive, negative,)
             print(loss)
@@ -1658,18 +1700,21 @@ class TripletMarginLoss(Layer):
 
     """
 
-    def __init__(self,
-                 margin=1.0,
-                 p=2.,
-                 epsilon=1e-6,
-                 swap=False,
-                 reduction='mean',
-                 name=None):
+    def __init__(
+        self,
+        margin=1.0,
+        p=2.0,
+        epsilon=1e-6,
+        swap=False,
+        reduction='mean',
+        name=None,
+    ):
         super(TripletMarginLoss, self).__init__()
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in TripletMarginLoss should be 'sum', 'mean' or 'none', but "
-                "received %s, which is not allowed." % reduction)
+                "received %s, which is not allowed." % reduction
+            )
         self.margin = margin
         self.p = p
         self.epsilon = epsilon
@@ -1678,15 +1723,17 @@ class TripletMarginLoss(Layer):
         self.name = name
 
     def forward(self, input, positive, negative):
-        return F.triplet_margin_loss(input,
-                                     positive,
-                                     negative,
-                                     margin=self.margin,
-                                     p=self.p,
-                                     epsilon=self.epsilon,
-                                     swap=self.swap,
-                                     reduction=self.reduction,
-                                     name=self.name)
+        return F.triplet_margin_loss(
+            input,
+            positive,
+            negative,
+            margin=self.margin,
+            p=self.p,
+            epsilon=self.epsilon,
+            swap=self.swap,
+            reduction=self.reduction,
+            name=self.name,
+        )
 
 
 class SoftMarginLoss(Layer):
@@ -1748,13 +1795,15 @@ class SoftMarginLoss(Layer):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "The value of 'reduction' in SoftMarginLoss should be 'sum', 'mean' or 'none', but "
-                "received %s, which is not allowed." % reduction)
+                "received %s, which is not allowed." % reduction
+            )
 
         super(SoftMarginLoss, self).__init__()
         self.reduction = reduction
         self.name = name
 
     def forward(self, input, label):
-        out = paddle.nn.functional.soft_margin_loss(input, label,
-                                                    self.reduction, self.name)
+        out = paddle.nn.functional.soft_margin_loss(
+            input, label, self.reduction, self.name
+        )
         return out
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index 0a259b58125..1d40e66f580 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -56,23 +56,27 @@ __all__ = []
 
 class _InstanceNormBase(Layer):
     """
-    This class is based class for InstanceNorm1D, 2d, 3d. 
+    This class is the base class for InstanceNorm1D, InstanceNorm2D and InstanceNorm3D.
 
     See InstaceNorm1D, InstanceNorm2D or InstanceNorm3D for more details.
     """
 
-    def __init__(self,
-                 num_features,
-                 epsilon=1e-5,
-                 momentum=0.9,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format="NCHW",
-                 name=None):
+    def __init__(
+        self,
+        num_features,
+        epsilon=1e-5,
+        momentum=0.9,
+        weight_attr=None,
+        bias_attr=None,
+        data_format="NCHW",
+        name=None,
+    ):
         super(_InstanceNormBase, self).__init__()
 
         if weight_attr == False or bias_attr == False:
-            assert weight_attr == bias_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm"
+            assert (
+                weight_attr == bias_attr
+            ), "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm"
         self._epsilon = epsilon
         self._weight_attr = weight_attr
         self._bias_attr = bias_attr
@@ -83,11 +87,14 @@ class _InstanceNormBase(Layer):
                 attr=self._weight_attr,
                 shape=[num_features],
                 default_initializer=Constant(1.0),
-                is_bias=False)
-            self.bias = self.create_parameter(attr=self._bias_attr,
-                                              shape=[num_features],
-                                              default_initializer=Constant(0.0),
-                                              is_bias=True)
+                is_bias=False,
+            )
+            self.bias = self.create_parameter(
+                attr=self._bias_attr,
+                shape=[num_features],
+                default_initializer=Constant(0.0),
+                is_bias=True,
+            )
         else:
             self.scale = None
             self.bias = None
@@ -98,14 +105,14 @@ class _InstanceNormBase(Layer):
     def forward(self, input):
         self._check_input_dim(input)
 
-        return instance_norm(input,
-                             weight=self.scale,
-                             bias=self.bias,
-                             eps=self._epsilon)
+        return instance_norm(
+            input, weight=self.scale, bias=self.bias, eps=self._epsilon
+        )
 
     def extra_repr(self):
-        return 'num_features={}, epsilon={}'.format(self._num_features,
-                                                    self._epsilon)
+        return 'num_features={}, epsilon={}'.format(
+            self._num_features, self._epsilon
+        )
 
 
 class InstanceNorm1D(_InstanceNormBase):
@@ -175,8 +182,11 @@ class InstanceNorm1D(_InstanceNormBase):
 
     def _check_input_dim(self, input):
         if len(input.shape) != 2 and len(input.shape) != 3:
-            raise ValueError('expected 2D or 3D input (got {}D input)'.format(
-                len(input.shape)))
+            raise ValueError(
+                'expected 2D or 3D input (got {}D input)'.format(
+                    len(input.shape)
+                )
+            )
 
 
 class InstanceNorm2D(_InstanceNormBase):
@@ -245,8 +255,9 @@ class InstanceNorm2D(_InstanceNormBase):
 
     def _check_input_dim(self, input):
         if len(input.shape) != 4:
-            raise ValueError('expected 4D input (got {}D input)'.format(
-                len(input.shape)))
+            raise ValueError(
+                'expected 4D input (got {}D input)'.format(len(input.shape))
+            )
 
 
 class InstanceNorm3D(_InstanceNormBase):
@@ -315,8 +326,9 @@ class InstanceNorm3D(_InstanceNormBase):
 
     def _check_input_dim(self, input):
         if len(input.shape) != 5:
-            raise ValueError('expected 5D input (got {}D input)'.format(
-                len(input.shape)))
+            raise ValueError(
+                'expected 5D input (got {}D input)'.format(len(input.shape))
+            )
 
 
 class GroupNorm(Layer):
@@ -356,21 +368,23 @@ class GroupNorm(Layer):
           paddle.disable_static()
           np.random.seed(123)
           x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32')
-          x = paddle.to_tensor(x_data) 
+          x = paddle.to_tensor(x_data)
           group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6)
           group_norm_out = group_norm(x)
 
           print(group_norm_out.numpy())
     """
 
-    def __init__(self,
-                 num_groups,
-                 num_channels,
-                 epsilon=1e-05,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format='NCHW',
-                 name=None):
+    def __init__(
+        self,
+        num_groups,
+        num_channels,
+        epsilon=1e-05,
+        weight_attr=None,
+        bias_attr=None,
+        data_format='NCHW',
+        name=None,
+    ):
         super(GroupNorm, self).__init__()
         self._weight_attr = weight_attr
         self._bias_attr = bias_attr
@@ -384,39 +398,57 @@ class GroupNorm(Layer):
 
         if weight_attr == False:
             self.weight = self.create_parameter(
-                attr=None, shape=param_shape, default_initializer=Constant(1.0))
+                attr=None, shape=param_shape, default_initializer=Constant(1.0)
+            )
             self.weight.stop_gradient = True
         else:
             self.weight = self.create_parameter(
                 attr=self._weight_attr,
                 shape=param_shape,
-                default_initializer=Constant(1.0))
-            self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
+                default_initializer=Constant(1.0),
+            )
+            self.weight.stop_gradient = (
+                self._weight_attr != None
+                and self._weight_attr.learning_rate == 0.0
+            )
 
         if bias_attr == False:
-            self.bias = self.create_parameter(attr=None,
-                                              shape=param_shape,
-                                              default_initializer=Constant(0.0),
-                                              is_bias=True)
+            self.bias = self.create_parameter(
+                attr=None,
+                shape=param_shape,
+                default_initializer=Constant(0.0),
+                is_bias=True,
+            )
             self.bias.stop_gradient = True
         else:
-            self.bias = self.create_parameter(attr=self._bias_attr,
-                                              shape=param_shape,
-                                              is_bias=True)
-            self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0.
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True
+            )
+            self.bias.stop_gradient = (
+                self._bias_attr != None and self._bias_attr.learning_rate == 0.0
+            )
 
     def forward(self, input):
         mean_out = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype, stop_gradient=True)
+            dtype=input.dtype, stop_gradient=True
+        )
         variance_out = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype, stop_gradient=True)
+            dtype=input.dtype, stop_gradient=True
+        )
 
         if in_dygraph_mode():
-            pre_act = _C_ops.group_norm(input, self.weight, self.bias,
-                                        self._epsilon, self._num_groups, "NCHW")
+            pre_act = _C_ops.group_norm(
+                input,
+                self.weight,
+                self.bias,
+                self._epsilon,
+                self._num_groups,
+                "NCHW",
+            )
 
-            return dygraph_utils._append_activation_in_dygraph(pre_act,
-                                                               act=None)
+            return dygraph_utils._append_activation_in_dygraph(
+                pre_act, act=None
+            )
 
         elif _in_legacy_dygraph():
             pre_act, _, _ = _legacy_C_ops.group_norm(
@@ -430,8 +462,9 @@ class GroupNorm(Layer):
                 'groups',
                 self._num_groups,
             )
-            return dygraph_utils._append_activation_in_dygraph(pre_act,
-                                                               act=None)
+            return dygraph_utils._append_activation_in_dygraph(
+                pre_act, act=None
+            )
 
         inputs = {'X': input}
         if self.bias is not None:
@@ -441,32 +474,33 @@ class GroupNorm(Layer):
 
         # create output
         group_norm_out = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype)
-
-        self._helper.append_op(type="group_norm",
-                               inputs=inputs,
-                               outputs={
-                                   "Y": group_norm_out,
-                                   "Mean": mean_out,
-                                   "Variance": variance_out,
-                               },
-                               attrs={
-                                   "epsilon": self._epsilon,
-                                   "groups": self._num_groups
-                               })
+            dtype=input.dtype
+        )
+
+        self._helper.append_op(
+            type="group_norm",
+            inputs=inputs,
+            outputs={
+                "Y": group_norm_out,
+                "Mean": mean_out,
+                "Variance": variance_out,
+            },
+            attrs={"epsilon": self._epsilon, "groups": self._num_groups},
+        )
 
         return self._helper.append_activation(group_norm_out, None)
 
     def extra_repr(self):
         return 'num_groups={}, num_channels={}, epsilon={}'.format(
-            self._num_groups, self._num_channels, self._epsilon)
+            self._num_groups, self._num_channels, self._epsilon
+        )
 
 
 class LayerNorm(Layer):
     r"""
     :alias_main: paddle.nn.LayerNorm
-	:alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
-	:old_api: paddle.fluid.dygraph.LayerNorm
+    :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm
+    :old_api: paddle.fluid.dygraph.LayerNorm
 
     This interface is used to construct a callable object of the ``LayerNorm`` class.
     For more details, refer to code examples.
@@ -520,19 +554,21 @@ class LayerNorm(Layer):
 
           np.random.seed(123)
           x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
-          x = paddle.to_tensor(x_data) 
+          x = paddle.to_tensor(x_data)
           layer_norm = paddle.nn.LayerNorm(x_data.shape[1:])
           layer_norm_out = layer_norm(x)
 
           print(layer_norm_out)
     """
 
-    def __init__(self,
-                 normalized_shape,
-                 epsilon=1e-05,
-                 weight_attr=None,
-                 bias_attr=None,
-                 name=None):
+    def __init__(
+        self,
+        normalized_shape,
+        epsilon=1e-05,
+        weight_attr=None,
+        bias_attr=None,
+        name=None,
+    ):
         super(LayerNorm, self).__init__()
         if isinstance(normalized_shape, numbers.Integral):
             normalized_shape = [normalized_shape]
@@ -549,25 +585,29 @@ class LayerNorm(Layer):
             self.weight = self.create_parameter(
                 attr=self._weight_attr,
                 shape=param_shape,
-                default_initializer=Constant(1.0))
+                default_initializer=Constant(1.0),
+            )
 
         if bias_attr is False:
             self.bias = None
         else:
-            self.bias = self.create_parameter(attr=self._bias_attr,
-                                              shape=param_shape,
-                                              is_bias=True)
+            self.bias = self.create_parameter(
+                attr=self._bias_attr, shape=param_shape, is_bias=True
+            )
 
     def forward(self, input):
-        return layer_norm(input,
-                          normalized_shape=self._normalized_shape,
-                          weight=self.weight,
-                          bias=self.bias,
-                          epsilon=self._epsilon)
+        return layer_norm(
+            input,
+            normalized_shape=self._normalized_shape,
+            weight=self.weight,
+            bias=self.bias,
+            epsilon=self._epsilon,
+        )
 
     def extra_repr(self):
-        return 'normalized_shape={}, epsilon={}'.format(self._normalized_shape,
-                                                        self._epsilon)
+        return 'normalized_shape={}, epsilon={}'.format(
+            self._normalized_shape, self._epsilon
+        )
 
 
 class _BatchNormBase(Layer):
@@ -575,15 +615,17 @@ class _BatchNormBase(Layer):
     BatchNorm base .
     """
 
-    def __init__(self,
-                 num_features,
-                 momentum=0.9,
-                 epsilon=1e-05,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format='NCHW',
-                 use_global_stats=None,
-                 name=None):
+    def __init__(
+        self,
+        num_features,
+        momentum=0.9,
+        epsilon=1e-05,
+        weight_attr=None,
+        bias_attr=None,
+        data_format='NCHW',
+        use_global_stats=None,
+        name=None,
+    ):
         super(_BatchNormBase, self).__init__()
         self._num_features = num_features
         self._weight_attr = weight_attr
@@ -603,29 +645,40 @@ class _BatchNormBase(Layer):
                 attr=None,
                 shape=param_shape,
                 dtype=self._dtype,
-                default_initializer=Constant(1.0))
+                default_initializer=Constant(1.0),
+            )
             self.weight.stop_gradient = True
         else:
             self.weight = self.create_parameter(
                 attr=self._weight_attr,
                 shape=param_shape,
                 dtype=self._dtype,
-                default_initializer=Constant(1.0))
-            self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0.
+                default_initializer=Constant(1.0),
+            )
+            self.weight.stop_gradient = (
+                self._weight_attr != None
+                and self._weight_attr.learning_rate == 0.0
+            )
 
         if bias_attr == False:
-            self.bias = self.create_parameter(attr=None,
-                                              shape=param_shape,
-                                              dtype=self._dtype,
-                                              default_initializer=Constant(0.0),
-                                              is_bias=True)
+            self.bias = self.create_parameter(
+                attr=None,
+                shape=param_shape,
+                dtype=self._dtype,
+                default_initializer=Constant(0.0),
+                is_bias=True,
+            )
             self.bias.stop_gradient = True
         else:
-            self.bias = self.create_parameter(attr=self._bias_attr,
-                                              shape=param_shape,
-                                              dtype=self._dtype,
-                                              is_bias=True)
-            self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0.
+            self.bias = self.create_parameter(
+                attr=self._bias_attr,
+                shape=param_shape,
+                dtype=self._dtype,
+                is_bias=True,
+            )
+            self.bias.stop_gradient = (
+                self._bias_attr != None and self._bias_attr.learning_rate == 0.0
+            )
 
         moving_mean_name = None
         moving_variance_name = None
@@ -634,22 +687,28 @@ class _BatchNormBase(Layer):
             moving_mean_name = name + "_mean"
             moving_variance_name = name + "_variance"
 
-        self._mean = self.create_parameter(dtype=self._dtype,
-                                           attr=ParamAttr(
-                                               name=moving_mean_name,
-                                               initializer=Constant(0.0),
-                                               trainable=False,
-                                               do_model_average=True),
-                                           shape=param_shape)
+        self._mean = self.create_parameter(
+            dtype=self._dtype,
+            attr=ParamAttr(
+                name=moving_mean_name,
+                initializer=Constant(0.0),
+                trainable=False,
+                do_model_average=True,
+            ),
+            shape=param_shape,
+        )
         self._mean.stop_gradient = True
 
-        self._variance = self.create_parameter(dtype=self._dtype,
-                                               attr=ParamAttr(
-                                                   name=moving_variance_name,
-                                                   initializer=Constant(1.0),
-                                                   trainable=False,
-                                                   do_model_average=True),
-                                               shape=param_shape)
+        self._variance = self.create_parameter(
+            dtype=self._dtype,
+            attr=ParamAttr(
+                name=moving_variance_name,
+                initializer=Constant(1.0),
+                trainable=False,
+                do_model_average=True,
+            ),
+            shape=param_shape,
+        )
         self._variance.stop_gradient = True
 
         self._data_format = data_format
@@ -673,22 +732,26 @@ class _BatchNormBase(Layer):
 
         if self.training:
             warnings.warn(
-                "When training, we now always track global mean and variance.")
-
-        return batch_norm(input,
-                          self._mean,
-                          self._variance,
-                          weight=self.weight,
-                          bias=self.bias,
-                          training=self.training,
-                          momentum=self._momentum,
-                          epsilon=self._epsilon,
-                          data_format=self._data_format,
-                          use_global_stats=self._use_global_stats)
+                "When training, we now always track global mean and variance."
+            )
+
+        return batch_norm(
+            input,
+            self._mean,
+            self._variance,
+            weight=self.weight,
+            bias=self.bias,
+            training=self.training,
+            momentum=self._momentum,
+            epsilon=self._epsilon,
+            data_format=self._data_format,
+            use_global_stats=self._use_global_stats,
+        )
 
     def extra_repr(self):
         main_str = 'num_features={}, momentum={}, epsilon={}'.format(
-            self._num_features, self._momentum, self._epsilon)
+            self._num_features, self._momentum, self._epsilon
+        )
         if self._data_format != 'NCHW':
             main_str += ', data_format={}'.format(self._data_format)
         if self._name is not None:
@@ -771,18 +834,27 @@ class BatchNorm1D(_BatchNormBase):
           print(batch_norm_out)
     """
 
-    def __init__(self,
-                 num_features,
-                 momentum=0.9,
-                 epsilon=1e-05,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format='NCL',
-                 use_global_stats=None,
-                 name=None):
-        super(BatchNorm1D,
-              self).__init__(num_features, momentum, epsilon, weight_attr,
-                             bias_attr, data_format, use_global_stats, name)
+    def __init__(
+        self,
+        num_features,
+        momentum=0.9,
+        epsilon=1e-05,
+        weight_attr=None,
+        bias_attr=None,
+        data_format='NCL',
+        use_global_stats=None,
+        name=None,
+    ):
+        super(BatchNorm1D, self).__init__(
+            num_features,
+            momentum,
+            epsilon,
+            weight_attr,
+            bias_attr,
+            data_format,
+            use_global_stats,
+            name,
+        )
 
     def _check_data_format(self, input):
         if input == 'NCHW' or input == 'NC' or input == 'NCL':
@@ -791,12 +863,16 @@ class BatchNorm1D(_BatchNormBase):
             self._data_format = "NHWC"
         else:
             raise ValueError(
-                'expected NC , NCL, NLC or None for data_format input')
+                'expected NC , NCL, NLC or None for data_format input'
+            )
 
     def _check_input_dim(self, input):
         if len(input.shape) != 2 and len(input.shape) != 3:
-            raise ValueError('expected 2D or 3D input (got {}D input)'.format(
-                len(input.shape)))
+            raise ValueError(
+                'expected 2D or 3D input (got {}D input)'.format(
+                    len(input.shape)
+                )
+            )
 
 
 class BatchNorm2D(_BatchNormBase):
@@ -883,8 +959,9 @@ class BatchNorm2D(_BatchNormBase):
 
     def _check_input_dim(self, input):
         if len(input.shape) != 4:
-            raise ValueError('expected 4D input (got {}D input)'.format(
-                len(input.shape)))
+            raise ValueError(
+                'expected 4D input (got {}D input)'.format(len(input.shape))
+            )
 
 
 class BatchNorm3D(_BatchNormBase):
@@ -961,18 +1038,27 @@ class BatchNorm3D(_BatchNormBase):
           print(batch_norm_out)
     """
 
-    def __init__(self,
-                 num_features,
-                 momentum=0.9,
-                 epsilon=1e-05,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format='NCDHW',
-                 use_global_stats=None,
-                 name=None):
-        super(BatchNorm3D,
-              self).__init__(num_features, momentum, epsilon, weight_attr,
-                             bias_attr, data_format, use_global_stats, name)
+    def __init__(
+        self,
+        num_features,
+        momentum=0.9,
+        epsilon=1e-05,
+        weight_attr=None,
+        bias_attr=None,
+        data_format='NCDHW',
+        use_global_stats=None,
+        name=None,
+    ):
+        super(BatchNorm3D, self).__init__(
+            num_features,
+            momentum,
+            epsilon,
+            weight_attr,
+            bias_attr,
+            data_format,
+            use_global_stats,
+            name,
+        )
 
     def _check_data_format(self, input):
         if input == 'NCHW' or input == 'NCDHW':
@@ -981,12 +1067,14 @@ class BatchNorm3D(_BatchNormBase):
             self._data_format = 'NHWC'
         else:
             raise ValueError(
-                'expected NCDHW, NDHWC or None for data_format input')
+                'expected NCDHW, NDHWC or None for data_format input'
+            )
 
     def _check_input_dim(self, input):
         if len(input.shape) != 5:
-            raise ValueError('expected 5D input (got {}D input)'.format(
-                len(input.shape)))
+            raise ValueError(
+                'expected 5D input (got {}D input)'.format(len(input.shape))
+            )
 
 
 class SyncBatchNorm(_BatchNormBase):
@@ -1076,17 +1164,26 @@ class SyncBatchNorm(_BatchNormBase):
               # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
     """
 
-    def __init__(self,
-                 num_features,
-                 momentum=0.9,
-                 epsilon=1e-05,
-                 weight_attr=None,
-                 bias_attr=None,
-                 data_format='NCHW',
-                 name=None):
-        super(SyncBatchNorm,
-              self).__init__(num_features, momentum, epsilon, weight_attr,
-                             bias_attr, data_format, None, name)
+    def __init__(
+        self,
+        num_features,
+        momentum=0.9,
+        epsilon=1e-05,
+        weight_attr=None,
+        bias_attr=None,
+        data_format='NCHW',
+        name=None,
+    ):
+        super(SyncBatchNorm, self).__init__(
+            num_features,
+            momentum,
+            epsilon,
+            weight_attr,
+            bias_attr,
+            data_format,
+            None,
+            name,
+        )
 
     def _check_data_format(self):
         if self._data_format in ['NCHW', 'NCDHW', 'NC', 'NCL']:
@@ -1110,24 +1207,55 @@ class SyncBatchNorm(_BatchNormBase):
         ### use_global_stats only support False in sync_batch_norm
         if in_dygraph_mode():
             sync_batch_norm_out, _, _, _, _, _ = _C_ops.sync_batch_norm_(
-                x, self.weight, self.bias, self._mean, self._variance,
-                self._momentum, self._epsilon, self._data_format,
-                not self.training, False, False, False)
+                x,
+                self.weight,
+                self.bias,
+                self._mean,
+                self._variance,
+                self._momentum,
+                self._epsilon,
+                self._data_format,
+                not self.training,
+                False,
+                False,
+                False,
+            )
             return sync_batch_norm_out
 
         elif in_dynamic_mode():
-            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
-                     "is_test", not self.training, "data_layout",
-                     self._data_format, "use_mkldnn", False, "fuse_with_relu",
-                     False, "use_global_stats", False, 'trainable_statistics',
-                     False)
+            attrs = (
+                "momentum",
+                self._momentum,
+                "epsilon",
+                self._epsilon,
+                "is_test",
+                not self.training,
+                "data_layout",
+                self._data_format,
+                "use_mkldnn",
+                False,
+                "fuse_with_relu",
+                False,
+                "use_global_stats",
+                False,
+                'trainable_statistics',
+                False,
+            )
             sync_batch_norm_out, _, _, _, _, _ = _legacy_C_ops.sync_batch_norm(
-                x, self.weight, self.bias, self._mean, self._variance, mean_out,
-                variance_out, *attrs)
+                x,
+                self.weight,
+                self.bias,
+                self._mean,
+                self._variance,
+                mean_out,
+                variance_out,
+                *attrs
+            )
             return sync_batch_norm_out
 
-        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
-                                 'SyncBatchNorm')
+        check_variable_and_dtype(
+            x, 'input', ['float16', 'float32', 'float64'], 'SyncBatchNorm'
+        )
 
         attrs = {
             "momentum": self._momentum,
@@ -1145,28 +1273,30 @@ class SyncBatchNorm(_BatchNormBase):
             "Scale": [self.weight],
             "Bias": [self.bias],
             "Mean": [self._mean],
-            "Variance": [self._variance]
+            "Variance": [self._variance],
         }
 
         saved_mean = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
+            dtype=self._dtype, stop_gradient=True
+        )
         saved_variance = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype, stop_gradient=True)
+            dtype=self._dtype, stop_gradient=True
+        )
         sync_batch_norm_out = self._helper.create_variable_for_type_inference(
-            self._dtype)
+            self._dtype
+        )
 
         outputs = {
             "Y": [sync_batch_norm_out],
             "MeanOut": [mean_out],
             "VarianceOut": [variance_out],
             "SavedMean": [saved_mean],
-            "SavedVariance": [saved_variance]
+            "SavedVariance": [saved_variance],
         }
 
-        self._helper.append_op(type="sync_batch_norm",
-                               inputs=inputs,
-                               outputs=outputs,
-                               attrs=attrs)
+        self._helper.append_op(
+            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs
+        )
         return sync_batch_norm_out
 
     @classmethod
@@ -1192,18 +1322,28 @@ class SyncBatchNorm(_BatchNormBase):
         """
         layer_output = layer
         if isinstance(layer, _BatchNormBase):
-            if layer._weight_attr != None and not isinstance(
-                    layer._weight_attr,
-                    bool) and layer._weight_attr.name != None:
+            if (
+                layer._weight_attr != None
+                and not isinstance(layer._weight_attr, bool)
+                and layer._weight_attr.name != None
+            ):
                 layer._weight_attr.name = layer._weight_attr.name + '_sync'
-            if layer._bias_attr != None and not isinstance(
-                    layer._bias_attr, bool) and layer._bias_attr.name != None:
+            if (
+                layer._bias_attr != None
+                and not isinstance(layer._bias_attr, bool)
+                and layer._bias_attr.name != None
+            ):
                 layer._bias_attr.name = layer._bias_attr.name + '_sync'
 
-            layer_output = SyncBatchNorm(layer._num_features, layer._momentum,
-                                         layer._epsilon, layer._weight_attr,
-                                         layer._bias_attr, layer._data_format,
-                                         layer._name)
+            layer_output = SyncBatchNorm(
+                layer._num_features,
+                layer._momentum,
+                layer._epsilon,
+                layer._weight_attr,
+                layer._bias_attr,
+                layer._data_format,
+                layer._name,
+            )
 
             if layer._weight_attr != False and layer._bias_attr != False:
                 with no_grad():
@@ -1213,58 +1353,61 @@ class SyncBatchNorm(_BatchNormBase):
             layer_output._variance = layer._variance
 
         for name, sublayer in layer.named_children():
-            layer_output.add_sublayer(name,
-                                      cls.convert_sync_batchnorm(sublayer))
+            layer_output.add_sublayer(
+                name, cls.convert_sync_batchnorm(sublayer)
+            )
         del layer
         return layer_output
 
 
 class LocalResponseNorm(Layer):
     """
-        Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions.
-        For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
+    Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions.
+    For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
 
-        See more details in :ref:`api_paddle_nn_functional_local_response_norm` .
+    See more details in :ref:`api_paddle_nn_functional_local_response_norm` .
 
-        Parameters:
-            size (int): The number of channels to sum over.
-            alpha (float, optional): The scaling parameter, positive. Default:1e-4
-            beta (float, optional): The exponent, positive. Default:0.75
-            k (float, optional): An offset, positive. Default: 1.0
-            data_format (str, optional): Specify the data format of the input, and the data format of the output
-                will be consistent with that of the input. An optional string from:
-                If input is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`,
-                the data is stored in the order of: `[batch_size, input_channels, feature_length]`.
-                If input is 4-D Tensor, the string could be  `"NCHW"`, `"NHWC"`. When it is `"NCHW"`,
-                the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`.
-                If input is 5-D Tensor, the string could be  `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`,
-                the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
-            name (str, optional): Name for the operation (optional, default is None). For more information,
-                please refer to :ref:`api_guide_Name`.
-
-        Shape:
-            - input: 3-D/4-D/5-D tensor.
-            - output: 3-D/4-D/5-D tensor, the same shape as input.
+    Parameters:
+        size (int): The number of channels to sum over.
+        alpha (float, optional): The scaling parameter, positive. Default: 1e-4
+        beta (float, optional): The exponent, positive. Default: 0.75
+        k (float, optional): An offset, positive. Default: 1.0
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. An optional string from:
+            If input is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`,
+            the data is stored in the order of: `[batch_size, input_channels, feature_length]`.
+            If input is 4-D Tensor, the string could be  `"NCHW"`, `"NHWC"`. When it is `"NCHW"`,
+            the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`.
+            If input is 5-D Tensor, the string could be  `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`,
+            the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`.
+        name (str, optional): Name for the operation (optional, default is None). For more information,
+            please refer to :ref:`api_guide_Name`.
 
-        Examples:
+    Shape:
+        - input: 3-D/4-D/5-D tensor.
+        - output: 3-D/4-D/5-D tensor, the same shape as input.
 
-        .. code-block:: python
+    Examples:
 
-            import paddle
+    .. code-block:: python
 
-            x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32")
-            m = paddle.nn.LocalResponseNorm(size=5)
-            y = m(x)
-            print(y.shape)  # [3, 3, 112, 112]
-        """
+        import paddle
+
+        x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32")
+        m = paddle.nn.LocalResponseNorm(size=5)
+        y = m(x)
+        print(y.shape)  # [3, 3, 112, 112]
+    """
 
-    def __init__(self,
-                 size,
-                 alpha=0.0001,
-                 beta=0.75,
-                 k=1.0,
-                 data_format="NCHW",
-                 name=None):
+    def __init__(
+        self,
+        size,
+        alpha=0.0001,
+        beta=0.75,
+        k=1.0,
+        data_format="NCHW",
+        name=None,
+    ):
         super(LocalResponseNorm, self).__init__()
         self.size = size
         self.alpha = alpha
@@ -1274,13 +1417,21 @@ class LocalResponseNorm(Layer):
         self.name = name
 
     def forward(self, input):
-        out = F.local_response_norm(input, self.size, self.alpha, self.beta,
-                                    self.k, self.data_format, self.name)
+        out = F.local_response_norm(
+            input,
+            self.size,
+            self.alpha,
+            self.beta,
+            self.k,
+            self.data_format,
+            self.name,
+        )
         return out
 
     def extra_repr(self):
         main_str = 'size={}, alpha={}, beta={}, k={}'.format(
-            self.size, self.alpha, self.beta, self.k)
+            self.size, self.alpha, self.beta, self.k
+        )
         if self.data_format != 'NCHW':
             main_str += ', data_format={}'.format(self.data_format)
         if self.name is not None:
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index ccba13316a1..c3f6d317176 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -61,7 +61,7 @@ class AvgPool1D(Layer):
 
     Returns:
         A callable object of AvgPool1D.
-        
+
     Examples:
 
         .. code-block:: python
@@ -77,13 +77,15 @@ class AvgPool1D(Layer):
 
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 exclusive=True,
-                 ceil_mode=False,
-                 name=None):
+    def __init__(
+        self,
+        kernel_size,
+        stride=None,
+        padding=0,
+        exclusive=True,
+        ceil_mode=False,
+        name=None,
+    ):
         super(AvgPool1D, self).__init__()
         self.kernel_size = kernel_size
         self.stride = stride
@@ -93,13 +95,21 @@ class AvgPool1D(Layer):
         self.name = name
 
     def forward(self, x):
-        out = F.avg_pool1d(x, self.kernel_size, self.stride, self.padding,
-                           self.exclusive, self.ceil_mode, self.name)
+        out = F.avg_pool1d(
+            x,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.exclusive,
+            self.ceil_mode,
+            self.name,
+        )
         return out
 
     def extra_repr(self):
         return 'kernel_size={kernel_size}, stride={stride}, padding={padding}'.format(
-            **self.__dict__)
+            **self.__dict__
+        )
 
 
 class AvgPool2D(Layer):
@@ -174,15 +184,17 @@ class AvgPool2D(Layer):
 
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 ceil_mode=False,
-                 exclusive=True,
-                 divisor_override=None,
-                 data_format="NCHW",
-                 name=None):
+    def __init__(
+        self,
+        kernel_size,
+        stride=None,
+        padding=0,
+        ceil_mode=False,
+        exclusive=True,
+        divisor_override=None,
+        data_format="NCHW",
+        name=None,
+    ):
         super(AvgPool2D, self).__init__()
         self.ksize = kernel_size
         self.stride = stride
@@ -194,19 +206,22 @@ class AvgPool2D(Layer):
         self.name = name
 
     def forward(self, x):
-        return F.avg_pool2d(x,
-                            kernel_size=self.ksize,
-                            stride=self.stride,
-                            padding=self.padding,
-                            ceil_mode=self.ceil_mode,
-                            exclusive=self.exclusive,
-                            divisor_override=self.divisor,
-                            data_format=self.data_format,
-                            name=self.name)
+        return F.avg_pool2d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            exclusive=self.exclusive,
+            divisor_override=self.divisor,
+            data_format=self.data_format,
+            name=self.name,
+        )
 
     def extra_repr(self):
         return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format(
-            **self.__dict__)
+            **self.__dict__
+        )
 
 
 class AvgPool3D(Layer):
@@ -267,15 +282,17 @@ class AvgPool3D(Layer):
 
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 ceil_mode=False,
-                 exclusive=True,
-                 divisor_override=None,
-                 data_format="NCDHW",
-                 name=None):
+    def __init__(
+        self,
+        kernel_size,
+        stride=None,
+        padding=0,
+        ceil_mode=False,
+        exclusive=True,
+        divisor_override=None,
+        data_format="NCDHW",
+        name=None,
+    ):
         super(AvgPool3D, self).__init__()
         self.ksize = kernel_size
         self.stride = stride
@@ -287,19 +304,22 @@ class AvgPool3D(Layer):
         self.name = name
 
     def forward(self, x):
-        return F.avg_pool3d(x,
-                            kernel_size=self.ksize,
-                            stride=self.stride,
-                            padding=self.padding,
-                            ceil_mode=self.ceil_mode,
-                            exclusive=self.exclusive,
-                            divisor_override=self.divisor,
-                            data_format=self.data_format,
-                            name=self.name)
+        return F.avg_pool3d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            ceil_mode=self.ceil_mode,
+            exclusive=self.exclusive,
+            divisor_override=self.divisor,
+            data_format=self.data_format,
+            name=self.name,
+        )
 
     def extra_repr(self):
         return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format(
-            **self.__dict__)
+            **self.__dict__
+        )
 
 
 class MaxPool1D(Layer):
@@ -371,13 +391,15 @@ class MaxPool1D(Layer):
 
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 return_mask=False,
-                 ceil_mode=False,
-                 name=None):
+    def __init__(
+        self,
+        kernel_size,
+        stride=None,
+        padding=0,
+        return_mask=False,
+        ceil_mode=False,
+        name=None,
+    ):
         super(MaxPool1D, self).__init__()
         self.kernel_size = kernel_size
         self.stride = stride
@@ -387,13 +409,21 @@ class MaxPool1D(Layer):
         self.name = name
 
     def forward(self, input):
-        out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding,
-                           self.return_mask, self.ceil_mode, self.name)
+        out = F.max_pool1d(
+            input,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.return_mask,
+            self.ceil_mode,
+            self.name,
+        )
         return out
 
     def extra_repr(self):
         return 'kernel_size={kernel_size}, stride={stride}, padding={padding}'.format(
-            **self.__dict__)
+            **self.__dict__
+        )
 
 
 class MaxPool2D(Layer):
@@ -473,14 +503,16 @@ class MaxPool2D(Layer):
             # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16],
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 return_mask=False,
-                 ceil_mode=False,
-                 data_format="NCHW",
-                 name=None):
+    def __init__(
+        self,
+        kernel_size,
+        stride=None,
+        padding=0,
+        return_mask=False,
+        ceil_mode=False,
+        data_format="NCHW",
+        name=None,
+    ):
         super(MaxPool2D, self).__init__()
         self.ksize = kernel_size
         self.stride = stride
@@ -491,18 +523,21 @@ class MaxPool2D(Layer):
         self.name = name
 
     def forward(self, x):
-        return F.max_pool2d(x,
-                            kernel_size=self.ksize,
-                            stride=self.stride,
-                            padding=self.padding,
-                            return_mask=self.return_mask,
-                            ceil_mode=self.ceil_mode,
-                            data_format=self.data_format,
-                            name=self.name)
+        return F.max_pool2d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            return_mask=self.return_mask,
+            ceil_mode=self.ceil_mode,
+            data_format=self.data_format,
+            name=self.name,
+        )
 
     def extra_repr(self):
         return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format(
-            **self.__dict__)
+            **self.__dict__
+        )
 
 
 class MaxPool3D(Layer):
@@ -570,14 +605,16 @@ class MaxPool3D(Layer):
             # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16],
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 return_mask=False,
-                 ceil_mode=False,
-                 data_format="NCDHW",
-                 name=None):
+    def __init__(
+        self,
+        kernel_size,
+        stride=None,
+        padding=0,
+        return_mask=False,
+        ceil_mode=False,
+        data_format="NCDHW",
+        name=None,
+    ):
         super(MaxPool3D, self).__init__()
         self.ksize = kernel_size
         self.stride = stride
@@ -588,18 +625,21 @@ class MaxPool3D(Layer):
         self.name = name
 
     def forward(self, x):
-        return F.max_pool3d(x,
-                            kernel_size=self.ksize,
-                            stride=self.stride,
-                            padding=self.padding,
-                            return_mask=self.return_mask,
-                            ceil_mode=self.ceil_mode,
-                            data_format=self.data_format,
-                            name=self.name)
+        return F.max_pool3d(
+            x,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            return_mask=self.return_mask,
+            ceil_mode=self.ceil_mode,
+            data_format=self.data_format,
+            name=self.name,
+        )
 
     def extra_repr(self):
         return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format(
-            **self.__dict__)
+            **self.__dict__
+        )
 
 
 class AdaptiveAvgPool1D(Layer):
@@ -741,10 +781,12 @@ class AdaptiveAvgPool2D(Layer):
         self._name = name
 
     def forward(self, x):
-        return F.adaptive_avg_pool2d(x,
-                                     output_size=self._output_size,
-                                     data_format=self._data_format,
-                                     name=self._name)
+        return F.adaptive_avg_pool2d(
+            x,
+            output_size=self._output_size,
+            data_format=self._data_format,
+            name=self._name,
+        )
 
     def extra_repr(self):
         return 'output_size={}'.format(self._output_size)
@@ -833,10 +875,12 @@ class AdaptiveAvgPool3D(Layer):
         self._name = name
 
     def forward(self, x):
-        return F.adaptive_avg_pool3d(x,
-                                     output_size=self._output_size,
-                                     data_format=self._data_format,
-                                     name=self._name)
+        return F.adaptive_avg_pool3d(
+            x,
+            output_size=self._output_size,
+            data_format=self._data_format,
+            name=self._name,
+        )
 
     def extra_repr(self):
         return 'output_size={}'.format(self._output_size)
@@ -918,12 +962,14 @@ class AdaptiveMaxPool1D(Layer):
         self.name = name
 
     def forward(self, input):
-        return F.adaptive_max_pool1d(input, self.output_size, self.return_mask,
-                                     self.name)
+        return F.adaptive_max_pool1d(
+            input, self.output_size, self.return_mask, self.name
+        )
 
     def extra_repr(self):
-        return 'output_size={}, return_mask={}'.format(self.output_size,
-                                                       self.return_mask)
+        return 'output_size={}, return_mask={}'.format(
+            self.output_size, self.return_mask
+        )
 
 
 class AdaptiveMaxPool2D(Layer):
@@ -996,14 +1042,17 @@ class AdaptiveMaxPool2D(Layer):
         self._name = name
 
     def forward(self, x):
-        return F.adaptive_max_pool2d(x,
-                                     output_size=self._output_size,
-                                     return_mask=self._return_mask,
-                                     name=self._name)
+        return F.adaptive_max_pool2d(
+            x,
+            output_size=self._output_size,
+            return_mask=self._return_mask,
+            name=self._name,
+        )
 
     def extra_repr(self):
-        return 'output_size={}, return_mask={}'.format(self._output_size,
-                                                       self._return_mask)
+        return 'output_size={}, return_mask={}'.format(
+            self._output_size, self._return_mask
+        )
 
 
 class AdaptiveMaxPool3D(Layer):
@@ -1088,39 +1137,42 @@ class AdaptiveMaxPool3D(Layer):
         self._name = name
 
     def forward(self, x):
-        return F.adaptive_max_pool3d(x,
-                                     output_size=self._output_size,
-                                     return_mask=self._return_mask,
-                                     name=self._name)
+        return F.adaptive_max_pool3d(
+            x,
+            output_size=self._output_size,
+            return_mask=self._return_mask,
+            name=self._name,
+        )
 
     def extra_repr(self):
-        return 'output_size={}, return_mask={}'.format(self._output_size,
-                                                       self._return_mask)
+        return 'output_size={}, return_mask={}'.format(
+            self._output_size, self._return_mask
+        )
 
 
 class MaxUnPool1D(Layer):
     r"""
     This API implements max unpooling 1d opereation.
 
-    `max_unpool1d` accepts the output of `max_pool1d` as input, 
-    including the indices of the maximum value and calculate the partial inverse. 
+    `max_unpool1d` accepts the output of `max_pool1d` as input,
+    including the indices of the maximum value and calculates the partial inverse.
     All non-maximum values ​​are set to zero.
 
     - Input: :math:`(N, C, L_{in})`
     - Output: :math:`(N, C, L_{out})`, where
-    
+
     .. math::
         L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size
 
     or as given by :attr:`output_size` in the call operator.
-    
+
     Parameters:
         kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list,
             it must contain an integer.
         stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list,
             it must contain an integer.
         padding (int | tuple): Padding that was added to the input.
-        output_size(list|tuple, optional): The target output size. If output_size is not specified, 
+        output_size(list|tuple, optional): The target output size. If output_size is not specified,
                            the actual output shape will be automatically calculated by (input_shape,
                            kernel_size, stride, padding).
         data_format (string): The data format of the input and output data.
@@ -1136,7 +1188,7 @@ class MaxUnPool1D(Layer):
 
     Examples:
         .. code-block:: python
-        
+
             import paddle
             import paddle.nn.functional as F
             import numpy as np
@@ -1150,13 +1202,15 @@ class MaxUnPool1D(Layer):
 
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 data_format="NCL",
-                 output_size=None,
-                 name=None):
+    def __init__(
+        self,
+        kernel_size,
+        stride=None,
+        padding=0,
+        data_format="NCL",
+        output_size=None,
+        name=None,
+    ):
         super(MaxUnPool1D, self).__init__()
         self.ksize = kernel_size
         self.stride = stride
@@ -1166,14 +1220,16 @@ class MaxUnPool1D(Layer):
         self.name = name
 
     def forward(self, x, indices):
-        return F.max_unpool1d(x,
-                              indices,
-                              kernel_size=self.ksize,
-                              stride=self.stride,
-                              padding=self.padding,
-                              data_format=self.data_format,
-                              output_size=self.output_size,
-                              name=self.name)
+        return F.max_unpool1d(
+            x,
+            indices,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            data_format=self.data_format,
+            output_size=self.output_size,
+            name=self.name,
+        )
 
     def extra_repr(self):
         return 'output_size={}'.format(self.output_size)
@@ -1186,7 +1242,7 @@ class MaxUnPool2D(Layer):
     'max_unpool2d' accepts the output of 'max_unpool2d' as input
     Including the indices of the maximum value and calculating the partial inverse
     All non-maximum values ​​are set to zero.
-    
+
 
     Parameters:
         kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list,
@@ -1195,7 +1251,7 @@ class MaxUnPool2D(Layer):
             it must contain an integer.
         kernel_size (int|tuple): Size of the max unpooling window.
         padding (int | tuple): Padding that was added to the input.
-        output_size(list|tuple, optional): The target output size. If output_size is not specified, 
+        output_size(list|tuple, optional): The target output size. If output_size is not specified,
                            the actual output shape will be automatically calculated by (input_shape,
                            kernel_size, padding).
         name(str, optional): For detailed information, please refer
@@ -1217,11 +1273,11 @@ class MaxUnPool2D(Layer):
     Returns:
         A callable object of MaxUnPool2D.
 
-            
+
 
     Examples:
         .. code-block:: python
-        
+
         import paddle
         import paddle.nn.functional as F
 
@@ -1234,13 +1290,15 @@ class MaxUnPool2D(Layer):
 
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 data_format="NCHW",
-                 output_size=None,
-                 name=None):
+    def __init__(
+        self,
+        kernel_size,
+        stride=None,
+        padding=0,
+        data_format="NCHW",
+        output_size=None,
+        name=None,
+    ):
         super(MaxUnPool2D, self).__init__()
         self.ksize = kernel_size
         self.stride = stride
@@ -1250,14 +1308,16 @@ class MaxUnPool2D(Layer):
         self.name = name
 
     def forward(self, x, indices):
-        return F.max_unpool2d(x,
-                              indices,
-                              kernel_size=self.ksize,
-                              stride=self.stride,
-                              padding=self.padding,
-                              data_format=self.data_format,
-                              output_size=self.output_size,
-                              name=self.name)
+        return F.max_unpool2d(
+            x,
+            indices,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            data_format=self.data_format,
+            output_size=self.output_size,
+            name=self.name,
+        )
 
     def extra_repr(self):
         return 'output_size={}'.format(self.output_size)
@@ -1267,13 +1327,13 @@ class MaxUnPool3D(Layer):
     r"""
     This API implements max unpooling 3d opereation.
 
-    `max_unpool3d` accepts the output of `max_pool3d` as input, 
-    including the indices of the maximum value and calculate the partial inverse. 
+    `max_unpool3d` accepts the output of `max_pool3d` as input,
+    including the indices of the maximum value and calculates the partial inverse.
     All non-maximum values ​​are set to zero.
 
     - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})`
     - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where
-    
+
     .. math::
         D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0]
 
@@ -1285,14 +1345,14 @@ class MaxUnPool3D(Layer):
 
     or as given by :attr:`output_size` in the call operator
 
-    
+
     Parameters:
         kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list,
             it must contain an integer.
         stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list,
             it must contain an integer.
         padding (int | tuple): Padding that was added to the input.
-        output_size(list|tuple, optional): The target output size. If output_size is not specified, 
+        output_size(list|tuple, optional): The target output size. If output_size is not specified,
                            the actual output shape will be automatically calculated by (input_shape,
                            kernel_size, stride, padding).
         data_format (string): The data format of the input and output data.
@@ -1308,7 +1368,7 @@ class MaxUnPool3D(Layer):
 
     Examples:
         .. code-block:: python
-        
+
             import paddle
             import paddle.nn.functional as F
             import numpy as np
@@ -1322,13 +1382,15 @@ class MaxUnPool3D(Layer):
 
     """
 
-    def __init__(self,
-                 kernel_size,
-                 stride=None,
-                 padding=0,
-                 data_format="NCDHW",
-                 output_size=None,
-                 name=None):
+    def __init__(
+        self,
+        kernel_size,
+        stride=None,
+        padding=0,
+        data_format="NCDHW",
+        output_size=None,
+        name=None,
+    ):
         super(MaxUnPool3D, self).__init__()
         self.ksize = kernel_size
         self.stride = stride
@@ -1338,14 +1400,16 @@ class MaxUnPool3D(Layer):
         self.name = name
 
     def forward(self, x, indices):
-        return F.max_unpool3d(x,
-                              indices,
-                              kernel_size=self.ksize,
-                              stride=self.stride,
-                              padding=self.padding,
-                              data_format=self.data_format,
-                              output_size=self.output_size,
-                              name=self.name)
+        return F.max_unpool3d(
+            x,
+            indices,
+            kernel_size=self.ksize,
+            stride=self.stride,
+            padding=self.padding,
+            data_format=self.data_format,
+            output_size=self.output_size,
+            name=self.name,
+        )
 
     def extra_repr(self):
         return 'output_size={}'.format(self.output_size)
diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py
index 375fe9013b8..eaf1817efc1 100644
--- a/python/paddle/nn/utils/spectral_norm_hook.py
+++ b/python/paddle/nn/utils/spectral_norm_hook.py
@@ -23,21 +23,21 @@ from .. import functional as F
 __all__ = []
 
 
-def normal_(x, mean=0., std=1.):
+def normal_(x, mean=0.0, std=1.0):
     temp_value = paddle.normal(mean, std, shape=x.shape)
     x.set_value(temp_value)
     return x
 
 
 class SpectralNorm(object):
-
     def __init__(self, name='weight', n_power_iterations=1, dim=0, eps=1e-12):
         self.name = name
         self.dim = dim
         if n_power_iterations <= 0:
             raise ValueError(
                 'Expected n_power_iterations to be positive, but '
-                'got n_power_iterations={}'.format(n_power_iterations))
+                'got n_power_iterations={}'.format(n_power_iterations)
+            )
         self.n_power_iterations = n_power_iterations
         self.eps = eps
 
@@ -46,8 +46,9 @@ class SpectralNorm(object):
         if self.dim != 0:
             # transpose dim to front
             weight_mat = weight_mat.transpose(
-                [self.dim] +
-                [d for d in range(weight_mat.dim()) if d != self.dim])
+                [self.dim]
+                + [d for d in range(weight_mat.dim()) if d != self.dim]
+            )
 
         height = weight_mat.shape[0]
 
@@ -64,20 +65,24 @@ class SpectralNorm(object):
                 for _ in range(self.n_power_iterations):
                     v.set_value(
                         F.normalize(
-                            paddle.matmul(weight_mat,
-                                          u,
-                                          transpose_x=True,
-                                          transpose_y=False),
+                            paddle.matmul(
+                                weight_mat,
+                                u,
+                                transpose_x=True,
+                                transpose_y=False,
+                            ),
                             axis=0,
                             epsilon=self.eps,
-                        ))
+                        )
+                    )
 
                     u.set_value(
                         F.normalize(
                             paddle.matmul(weight_mat, v),
                             axis=0,
                             epsilon=self.eps,
-                        ))
+                        )
+                    )
                 if self.n_power_iterations > 0:
                     u = u.clone()
                     v = v.clone()
@@ -87,15 +92,20 @@ class SpectralNorm(object):
         return weight
 
     def __call__(self, layer, inputs):
-        setattr(layer, self.name,
-                self.compute_weight(layer, do_power_iteration=layer.training))
+        setattr(
+            layer,
+            self.name,
+            self.compute_weight(layer, do_power_iteration=layer.training),
+        )
 
     @staticmethod
     def apply(layer, name, n_power_iterations, dim, eps):
         for k, hook in layer._forward_pre_hooks.items():
             if isinstance(hook, SpectralNorm) and hook.name == name:
-                raise RuntimeError("Cannot register two spectral_norm hooks on "
-                                   "the same parameter {}".format(name))
+                raise RuntimeError(
+                    "Cannot register two spectral_norm hooks on "
+                    "the same parameter {}".format(name)
+                )
 
         fn = SpectralNorm(name, n_power_iterations, dim, eps)
         weight = layer._parameters[name]
@@ -106,9 +116,9 @@ class SpectralNorm(object):
 
             # randomly initialize u and v
             u = layer.create_parameter([h])
-            u = normal_(u, 0., 1.)
+            u = normal_(u, 0.0, 1.0)
             v = layer.create_parameter([w])
-            v = normal_(v, 0., 1.)
+            v = normal_(v, 0.0, 1.0)
             u = F.normalize(u, axis=0, epsilon=fn.eps)
             v = F.normalize(v, axis=0, epsilon=fn.eps)
 
@@ -127,13 +137,11 @@ class SpectralNorm(object):
         return fn
 
 
-def spectral_norm(layer,
-                  name='weight',
-                  n_power_iterations=1,
-                  eps=1e-12,
-                  dim=None):
+def spectral_norm(
+    layer, name='weight', n_power_iterations=1, eps=1e-12, dim=None
+):
     r"""
-    This spectral_norm layer applies spectral normalization to a parameter according to the 
+    This spectral_norm layer applies spectral normalization to a parameter according to the
     following Calculation:
 
     Step 1:
@@ -169,7 +177,7 @@ def spectral_norm(layer,
         n_power_iterations(int, optional): The number of power iterations to calculate spectral norm. Default: 1.
         eps(float, optional): The epsilon for numerical stability in calculating norms. Default: 1e-12.
         dim(int, optional): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer. Default: None.
-        
+
     Returns:
         The original layer with the spectral norm hook
 
@@ -188,11 +196,11 @@ def spectral_norm(layer,
             #        [[[[-0.21090528,  0.18563725, -0.14127982],
             #           [-0.02310637,  0.03197737,  0.34353802],
             #           [-0.17117859,  0.33152047, -0.28408015]],
-            # 
+            #
             #          [[-0.13336606, -0.01862637,  0.06959272],
             #           [-0.02236020, -0.27091628, -0.24532901],
             #           [ 0.27254242,  0.15516677,  0.09036587]],
-            # 
+            #
             #          [[ 0.30169338, -0.28146112, -0.11768346],
             #           [-0.45765871, -0.12504843, -0.17482486],
             #           [-0.36866254, -0.19969313,  0.08783543]]]])
@@ -201,8 +209,8 @@ def spectral_norm(layer,
 
     if dim is None:
         if isinstance(
-                layer,
-            (Conv1DTranspose, Conv2DTranspose, Conv3DTranspose, Linear)):
+            layer, (Conv1DTranspose, Conv2DTranspose, Conv3DTranspose, Linear)
+        ):
             dim = 1
         else:
             dim = 0
diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py
index 40c1021848c..8ef9f75c5cb 100755
--- a/python/paddle/nn/utils/weight_norm_hook.py
+++ b/python/paddle/nn/utils/weight_norm_hook.py
@@ -37,16 +37,15 @@ def l2_norm(x, axis, epsilon=1e-12, name=None):
     helper = LayerHelper("l2_normalize", **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     norm = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type="norm",
-                     inputs={"X": x},
-                     outputs={
-                         "Out": out,
-                         "Norm": norm
-                     },
-                     attrs={
-                         "axis": 1 if axis is None else axis,
-                         "epsilon": epsilon,
-                     })
+    helper.append_op(
+        type="norm",
+        inputs={"X": x},
+        outputs={"Out": out, "Norm": norm},
+        attrs={
+            "axis": 1 if axis is None else axis,
+            "epsilon": epsilon,
+        },
+    )
     return paddle.squeeze(norm, axis=[axis])
 
 
@@ -93,14 +92,13 @@ def _weight_norm(v, g, dim):
         v_normalized = F.l2_normalize(p_matrix, axis=1)
         v_normalized = paddle.reshape(v_normalized, transposed_shape)
         v_normalized = paddle.transpose(v_normalized, perm)
-    weight = F.elementwise_mul(v_normalized,
-                               g,
-                               axis=dim if dim is not None else -1)
+    weight = F.elementwise_mul(
+        v_normalized, g, axis=dim if dim is not None else -1
+    )
     return weight
 
 
 class WeightNorm(object):
-
     def __init__(self, name, dim):
         if dim is None:
             dim = -1
@@ -116,8 +114,10 @@ class WeightNorm(object):
     def apply(layer, name, dim):
         for k, hook in layer._forward_pre_hooks.items():
             if isinstance(hook, WeightNorm) and hook.name == name:
-                raise RuntimeError("Cannot register two weight_norm hooks on "
-                                   "the same parameter {}".format(name))
+                raise RuntimeError(
+                    "Cannot register two weight_norm hooks on "
+                    "the same parameter {}".format(name)
+                )
 
         if dim is None:
             dim = -1
@@ -164,29 +164,29 @@ class WeightNorm(object):
 
 def weight_norm(layer, name='weight', dim=0):
     r"""
-    This weight_norm layer applies weight normalization to a parameter according to the 
+    This weight_norm layer applies weight normalization to a parameter according to the
     following formula:
 
     .. math::
 
         \mathbf{w} = g \dfrac{v}{\|v\|}
 
-    Weight normalization is a reparameterization of the weight vectors in a neural network that 
-    decouples the magnitude of those weight vectors from their direction. Weight normalization 
-    replaces the parameter specified by `name`(eg: 'weight') with two parameters: one parameter 
-    specifying the magnitude (eg: 'weight_g') and one parameter specifying the direction 
-    (eg: 'weight_v'). Weight normalization has been implemented as discussed in this paper: 
+    Weight normalization is a reparameterization of the weight vectors in a neural network that
+    decouples the magnitude of those weight vectors from their direction. Weight normalization
+    replaces the parameter specified by `name`(eg: 'weight') with two parameters: one parameter
+    specifying the magnitude (eg: 'weight_g') and one parameter specifying the direction
+    (eg: 'weight_v'). Weight normalization has been implemented as discussed in this paper:
     `Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks
     <https://arxiv.org/pdf/1602.07868.pdf>`_.
 
     Parameters:
         layer(Layer): Layer of paddle, which has weight.
         name(str, optional): Name of the weight parameter. Default: 'weight'.
-        dim(int, optional): Dimension over which to compute the norm. Dim is a non-negative number 
-              which is less than the rank of weight Tensor. For Example, dim can be chosen from 0, 
-              1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4. 
+        dim(int, optional): Dimension over which to compute the norm. Dim is a non-negative number
+              which is less than the rank of weight Tensor. For Example, dim can be chosen from 0,
+              1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4.
               If dim is set to None, meaning that all elements will be normalized. Default: 0.
-    
+
     Returns:
         Origin layer with weight norm hook.
 
@@ -222,7 +222,7 @@ def remove_weight_norm(layer, name='weight'):
 
     Examples:
         .. code-block:: python
-          
+
             import paddle
             from paddle.nn import Conv2D
             from paddle.nn.utils import weight_norm, remove_weight_norm
diff --git a/python/paddle/onnx/export.py b/python/paddle/onnx/export.py
index 666cd7c0862..7123f485bf8 100644
--- a/python/paddle/onnx/export.py
+++ b/python/paddle/onnx/export.py
@@ -26,20 +26,20 @@ def export(layer, path, input_spec=None, opset_version=9, **configs):
     Args:
         layer (Layer): The Layer to be exported.
         path (str): The path prefix to export model. The format is ``dirname/file_prefix`` or ``file_prefix`` ,
-            and the exported ONNX file suffix is ``.onnx`` . 
-        input_spec (list[InputSpec|Tensor], optional): Describes the input of the exported model's forward 
-            method, which can be described by InputSpec or example Tensor. If None, all input variables of 
+            and the exported ONNX file suffix is ``.onnx`` .
+        input_spec (list[InputSpec|Tensor], optional): Describes the input of the exported model's forward
+            method, which can be described by InputSpec or example Tensor. If None, all input variables of
             the original Layer's forward method would be the inputs of the exported ``ONNX`` model. Default: None.
         opset_version(int, optional): Opset version of exported ONNX model.
             Now, stable supported opset version include 9, 10, 11. Default: 9.
-        **configs (dict, optional): Other export configuration options for compatibility. We do not 
-            recommend using these configurations, they may be removed in the future. If not necessary, 
+        **configs (dict, optional): Other export configuration options for compatibility. We do not
+            recommend using these configurations, they may be removed in the future. If not necessary,
             DO NOT use them. Default None.
             The following options are currently supported:
             (1) output_spec (list[Tensor]): Selects the output targets of the exported model.
-            By default, all return variables of original Layer's forward method are kept as the 
-            output of the exported model. If the provided ``output_spec`` list is not all output variables, 
-            the exported model will be pruned according to the given ``output_spec`` list. 
+            By default, all return variables of original Layer's forward method are kept as the
+            output of the exported model. If the provided ``output_spec`` list is not all output variables,
+            the exported model will be pruned according to the given ``output_spec`` list.
     Returns:
         None
     Examples:
@@ -94,11 +94,14 @@ def export(layer, path, input_spec=None, opset_version=9, **configs):
         raise ValueError(
             "The input path MUST be format of dirname/file_prefix "
             "[dirname\\file_prefix in Windows system], but "
-            "the file_prefix is empty in received path: {}".format(path))
+            "the file_prefix is empty in received path: {}".format(path)
+        )
     save_file = path + '.onnx'
 
-    p2o.dygraph2onnx(layer,
-                     save_file,
-                     input_spec=input_spec,
-                     opset_version=opset_version,
-                     **configs)
+    p2o.dygraph2onnx(
+        layer,
+        save_file,
+        input_spec=input_spec,
+        opset_version=opset_version,
+        **configs
+    )
diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py
index f3c15ce479d..c75e4b08dc1 100644
--- a/python/paddle/optimizer/adadelta.py
+++ b/python/paddle/optimizer/adadelta.py
@@ -109,25 +109,29 @@ class Adadelta(Optimizer):
     _avg_squared_grad_acc_str = "_avg_squared_grad"
     _avg_squared_update_acc_str = "_avg_squared_update"
 
-    def __init__(self,
-                 learning_rate=0.001,
-                 epsilon=1.0e-6,
-                 rho=0.95,
-                 parameters=None,
-                 weight_decay=None,
-                 grad_clip=None,
-                 name=None):
+    def __init__(
+        self,
+        learning_rate=0.001,
+        epsilon=1.0e-6,
+        rho=0.95,
+        parameters=None,
+        weight_decay=None,
+        grad_clip=None,
+        name=None,
+    ):
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
         if epsilon is None:
             raise ValueError("epsilon is not set.")
         if rho is None:
             raise ValueError("rho is not set.")
-        super(Adadelta, self).__init__(learning_rate=learning_rate,
-                                       parameters=parameters,
-                                       weight_decay=weight_decay,
-                                       grad_clip=grad_clip,
-                                       name=name)
+        super(Adadelta, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name,
+        )
         self.type = "adadelta"
         self._epsilon = epsilon
         self._rho = rho
@@ -151,43 +155,44 @@ class Adadelta(Optimizer):
             param_and_grad = self._update_param_group(param_and_grad)
 
         avg_squared_grad_acc = self._get_accumulator(
-            self._avg_squared_grad_acc_str, param_and_grad[0])
+            self._avg_squared_grad_acc_str, param_and_grad[0]
+        )
         avg_squared_update_acc = self._get_accumulator(
-            self._avg_squared_update_acc_str, param_and_grad[0])
+            self._avg_squared_update_acc_str, param_and_grad[0]
+        )
 
         if in_dygraph_mode():
             with no_grad():
-                _C_ops.adadelta_(param_and_grad[0], param_and_grad[1],
-                                 avg_squared_grad_acc, avg_squared_update_acc,
-                                 self._rho, self._epsilon)
+                _C_ops.adadelta_(
+                    param_and_grad[0],
+                    param_and_grad[1],
+                    avg_squared_grad_acc,
+                    avg_squared_update_acc,
+                    self._rho,
+                    self._epsilon,
+                )
             return None
 
         if not isinstance(block, framework.Block):
             raise TypeError("block is not instance of framework.Block.")
 
         # Create the adadelta optimizer op
-        adadelta_op = block.append_op(type=self.type,
-                                      inputs={
-                                          "Param": param_and_grad[0],
-                                          "Grad": param_and_grad[1],
-                                          "AvgSquaredGrad":
-                                          avg_squared_grad_acc,
-                                          "AvgSquaredUpdate":
-                                          avg_squared_update_acc
-                                      },
-                                      outputs={
-                                          "ParamOut":
-                                          param_and_grad[0],
-                                          "AvgSquaredGradOut":
-                                          avg_squared_grad_acc,
-                                          "AvgSquaredUpdateOut":
-                                          avg_squared_update_acc
-                                      },
-                                      attrs={
-                                          "epsilon": self._epsilon,
-                                          "rho": self._rho
-                                      },
-                                      stop_gradient=True)
+        adadelta_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "AvgSquaredGrad": avg_squared_grad_acc,
+                "AvgSquaredUpdate": avg_squared_update_acc,
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "AvgSquaredGradOut": avg_squared_grad_acc,
+                "AvgSquaredUpdateOut": avg_squared_update_acc,
+            },
+            attrs={"epsilon": self._epsilon, "rho": self._rho},
+            stop_gradient=True,
+        )
 
         return adadelta_op
 
diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py
index f5cd7bdaa83..777e516ec37 100644
--- a/python/paddle/optimizer/adagrad.py
+++ b/python/paddle/optimizer/adagrad.py
@@ -108,21 +108,25 @@ class Adagrad(Optimizer):
     """
     _moment_acc_str = "moment"
 
-    def __init__(self,
-                 learning_rate,
-                 epsilon=1.0e-6,
-                 parameters=None,
-                 weight_decay=None,
-                 grad_clip=None,
-                 name=None,
-                 initial_accumulator_value=0.0):
+    def __init__(
+        self,
+        learning_rate,
+        epsilon=1.0e-6,
+        parameters=None,
+        weight_decay=None,
+        grad_clip=None,
+        name=None,
+        initial_accumulator_value=0.0,
+    ):
         assert learning_rate is not None
         assert epsilon is not None
-        super(Adagrad, self).__init__(learning_rate=learning_rate,
-                                      parameters=parameters,
-                                      weight_decay=weight_decay,
-                                      grad_clip=grad_clip,
-                                      name=name)
+        super(Adagrad, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name,
+        )
         self.type = "adagrad"
         self._epsilon = epsilon
         self.initial_accumulator_value = initial_accumulator_value
@@ -138,9 +142,11 @@ class Adagrad(Optimizer):
             parameters = self._update_param_group(parameters)
 
         for p in parameters:
-            self._add_accumulator(self._moment_acc_str,
-                                  p,
-                                  fill_value=self.initial_accumulator_value)
+            self._add_accumulator(
+                self._moment_acc_str,
+                p,
+                fill_value=self.initial_accumulator_value,
+            )
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -148,26 +154,22 @@ class Adagrad(Optimizer):
         if isinstance(param_and_grad, dict):
             param_and_grad = self._update_param_group(param_and_grad)
 
-        moment_acc = self._get_accumulator(self._moment_acc_str,
-                                           param_and_grad[0])
+        moment_acc = self._get_accumulator(
+            self._moment_acc_str, param_and_grad[0]
+        )
         # Create the adagrad optimizer op
-        adagrad_op = block.append_op(type=self.type,
-                                     inputs={
-                                         "Param":
-                                         param_and_grad[0],
-                                         "Grad":
-                                         param_and_grad[1],
-                                         "Moment":
-                                         moment_acc,
-                                         "LearningRate":
-                                         self._create_param_lr(param_and_grad)
-                                     },
-                                     outputs={
-                                         "ParamOut": param_and_grad[0],
-                                         "MomentOut": moment_acc
-                                     },
-                                     attrs={"epsilon": self._epsilon},
-                                     stop_gradient=True)
+        adagrad_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": moment_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={"ParamOut": param_and_grad[0], "MomentOut": moment_acc},
+            attrs={"epsilon": self._epsilon},
+            stop_gradient=True,
+        )
 
         return adagrad_op
 
@@ -175,6 +177,7 @@ class Adagrad(Optimizer):
         self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
         self.initial_accumulator_value = parameters.get(
             'initial_accumulator_value',
-            self._default_dict['initial_accumulator_value'])
+            self._default_dict['initial_accumulator_value'],
+        )
         parameters = parameters.get('params')
         return parameters
diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py
index cb07fdb7f56..46ae3623936 100644
--- a/python/paddle/optimizer/adamax.py
+++ b/python/paddle/optimizer/adamax.py
@@ -133,15 +133,17 @@ class Adamax(Optimizer):
     _inf_norm_acc_str = "inf_norm"
     _beta1_pow_acc_str = "beta1_pow_acc"
 
-    def __init__(self,
-                 learning_rate=0.001,
-                 beta1=0.9,
-                 beta2=0.999,
-                 epsilon=1e-8,
-                 parameters=None,
-                 weight_decay=None,
-                 grad_clip=None,
-                 name=None):
+    def __init__(
+        self,
+        learning_rate=0.001,
+        beta1=0.9,
+        beta2=0.999,
+        epsilon=1e-8,
+        parameters=None,
+        weight_decay=None,
+        grad_clip=None,
+        name=None,
+    ):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -152,11 +154,13 @@ class Adamax(Optimizer):
             raise ValueError("Invaild value of beta2, expect beta2 in [0,1).")
         if not 0 <= epsilon:
             raise ValueError("Invaild value of epsilon, expect epsilon >= 0.")
-        super(Adamax, self).__init__(learning_rate=learning_rate,
-                                     parameters=parameters,
-                                     weight_decay=weight_decay,
-                                     grad_clip=grad_clip,
-                                     name=name)
+        super(Adamax, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name,
+        )
         self.type = "adamax"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -164,7 +168,7 @@ class Adamax(Optimizer):
         self._default_dict = {
             'beta1': beta1,
             'beta2': beta2,
-            'epsilon': epsilon
+            'epsilon': epsilon,
         }
 
     def _create_accumulators(self, block, parameters):
@@ -175,10 +179,12 @@ class Adamax(Optimizer):
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
             self._add_accumulator(self._inf_norm_acc_str, p)
-            self._add_accumulator(name=self._beta1_pow_acc_str,
-                                  param=p,
-                                  fill_value=self._beta1,
-                                  shape=[1])
+            self._add_accumulator(
+                name=self._beta1_pow_acc_str,
+                param=p,
+                fill_value=self._beta1,
+                shape=[1],
+            )
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -186,22 +192,43 @@ class Adamax(Optimizer):
             param_and_grad = self._update_param_group(param_and_grad)
 
         moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
-        inf_norm = self._get_accumulator(self._inf_norm_acc_str,
-                                         param_and_grad[0])
-        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                              param_and_grad[0])
+        inf_norm = self._get_accumulator(
+            self._inf_norm_acc_str, param_and_grad[0]
+        )
+        beta1_pow_acc = self._get_accumulator(
+            self._beta1_pow_acc_str, param_and_grad[0]
+        )
 
         if framework.in_dygraph_mode():
-            _C_ops.adamax_(param_and_grad[0], param_and_grad[1],
-                           self._create_param_lr(param_and_grad), moment,
-                           inf_norm, beta1_pow_acc, self._beta1, self._beta2,
-                           self._epsilon)
+            _C_ops.adamax_(
+                param_and_grad[0],
+                param_and_grad[1],
+                self._create_param_lr(param_and_grad),
+                moment,
+                inf_norm,
+                beta1_pow_acc,
+                self._beta1,
+                self._beta2,
+                self._epsilon,
+            )
         elif framework._in_legacy_dygraph():
-            _legacy_C_ops.adamax(param_and_grad[0], param_and_grad[1],
-                                 self._create_param_lr(param_and_grad), moment,
-                                 inf_norm, beta1_pow_acc, param_and_grad[0],
-                                 moment, inf_norm, "beta1", self._beta1,
-                                 "beta2", self._beta2, "epsilon", self._epsilon)
+            _legacy_C_ops.adamax(
+                param_and_grad[0],
+                param_and_grad[1],
+                self._create_param_lr(param_and_grad),
+                moment,
+                inf_norm,
+                beta1_pow_acc,
+                param_and_grad[0],
+                moment,
+                inf_norm,
+                "beta1",
+                self._beta1,
+                "beta2",
+                self._beta2,
+                "epsilon",
+                self._epsilon,
+            )
         else:
             # create the adamax optimize op
             adamax_op = block.append_op(
@@ -212,25 +239,25 @@ class Adamax(Optimizer):
                     "LearningRate": self._create_param_lr(param_and_grad),
                     "Moment": moment,
                     "InfNorm": inf_norm,
-                    "Beta1Pow": beta1_pow_acc
+                    "Beta1Pow": beta1_pow_acc,
                 },
                 outputs={
                     "ParamOut": param_and_grad[0],
                     "MomentOut": moment,
-                    "InfNormOut": inf_norm
+                    "InfNormOut": inf_norm,
                 },
                 attrs={
                     "beta1": self._beta1,
                     "beta2": self._beta2,
-                    "epsilon": self._epsilon
+                    "epsilon": self._epsilon,
                 },
-                stop_gradient=True)
+                stop_gradient=True,
+            )
 
             return adamax_op
 
     def _finish_update(self, block, parameters_and_grads):
-        """Update Beta1 Power accumulator
-        """
+        """Update Beta1 Power accumulator"""
         assert isinstance(block, framework.Block)
         if isinstance(parameters_and_grads, list):
             for param, grad in parameters_and_grads:
@@ -238,47 +265,61 @@ class Adamax(Optimizer):
                     continue
                 if framework.in_dygraph_mode():
                     beta1_pow_acc = self._get_accumulator(
-                        self._beta1_pow_acc_str, param)
+                        self._beta1_pow_acc_str, param
+                    )
                     with no_grad():
-                        tmp = _C_ops.scale(beta1_pow_acc, self._beta1, 0.0,
-                                           True)
+                        tmp = _C_ops.scale(
+                            beta1_pow_acc, self._beta1, 0.0, True
+                        )
                         beta1_pow_acc.copy_(tmp, False)
                     continue
                 with param.block.program._optimized_guard(
-                    [param, grad]), name_scope('adamax'):
+                    [param, grad]
+                ), name_scope('adamax'):
                     beta1_pow_acc = self._get_accumulator(
-                        self._beta1_pow_acc_str, param)
-                    block.append_op(type="scale",
-                                    inputs={"X": beta1_pow_acc},
-                                    outputs={"Out": beta1_pow_acc},
-                                    attrs={"scale": self._beta1},
-                                    stop_gradient=True)
+                        self._beta1_pow_acc_str, param
+                    )
+                    block.append_op(
+                        type="scale",
+                        inputs={"X": beta1_pow_acc},
+                        outputs={"Out": beta1_pow_acc},
+                        attrs={"scale": self._beta1},
+                        stop_gradient=True,
+                    )
         else:
             for param, grad in parameters_and_grads['params']:
                 if grad is None or param.stop_gradient is True:
                     continue
                 if framework.in_dygraph_mode():
                     beta1_pow_acc = self._get_accumulator(
-                        self._beta1_pow_acc_str, param)
+                        self._beta1_pow_acc_str, param
+                    )
                     self._beta1 = parameters_and_grads.get(
-                        'beta1', self._default_dict['beta1'])
+                        'beta1', self._default_dict['beta1']
+                    )
                     with no_grad():
-                        tmp = _C_ops.scale(beta1_pow_acc, self._beta1, 0.0,
-                                           True)
+                        tmp = _C_ops.scale(
+                            beta1_pow_acc, self._beta1, 0.0, True
+                        )
                         beta1_pow_acc.copy_(tmp, False)
                     continue
 
                 with param.block.program._optimized_guard(
-                    [param, grad]), name_scope('adamax'):
+                    [param, grad]
+                ), name_scope('adamax'):
                     beta1_pow_acc = self._get_accumulator(
-                        self._beta1_pow_acc_str, param)
+                        self._beta1_pow_acc_str, param
+                    )
                     self._beta1 = parameters_and_grads.get(
-                        'beta1', self._default_dict['beta1'])
-                    block.append_op(type="scale",
-                                    inputs={"X": beta1_pow_acc},
-                                    outputs={"Out": beta1_pow_acc},
-                                    attrs={"scale": self._beta1},
-                                    stop_gradient=True)
+                        'beta1', self._default_dict['beta1']
+                    )
+                    block.append_op(
+                        type="scale",
+                        inputs={"X": beta1_pow_acc},
+                        outputs={"Out": beta1_pow_acc},
+                        attrs={"scale": self._beta1},
+                        stop_gradient=True,
+                    )
 
     def _update_param_group(self, parameters):
         self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py
index 4d7d128e05e..f4c9505936f 100644
--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
@@ -59,7 +59,7 @@ class LRScheduler(object):
         instance to schedule learning rate.
 
     Examples:
-        Here is an example of a simple ``StepDecay`` implementation. 
+        Here is an example of a simple ``StepDecay`` implementation.
 
         .. code-block:: python
 
@@ -93,8 +93,10 @@ class LRScheduler(object):
     def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False):
         if not isinstance(learning_rate, (float, int)):
             raise TypeError(
-                "The type of learning rate must be float, but received {}".
-                format(type(learning_rate)))
+                "The type of learning rate must be float, but received {}".format(
+                    type(learning_rate)
+                )
+            )
         self.base_lr = float(learning_rate)
         self.last_lr = float(learning_rate)
         self.last_epoch = last_epoch
@@ -133,8 +135,11 @@ class LRScheduler(object):
                 self.last_lr = self.get_lr()
 
         if self.verbose:
-            print('Epoch {}: {} set learning rate to {}.'.format(
-                self.last_epoch, self.__class__.__name__, self.last_lr))
+            print(
+                'Epoch {}: {} set learning rate to {}.'.format(
+                    self.last_epoch, self.__class__.__name__, self.last_lr
+                )
+            )
 
     def state_dict(self):
         """
@@ -153,7 +158,8 @@ class LRScheduler(object):
                 assert value.shape == [
                     1
                 ], "shape of Tensor in state_dict must be [1] {}".format(
-                    value.shape)
+                    value.shape
+                )
                 value = value.numpy()[0]
             state_dict[key] = value
 
@@ -184,8 +190,10 @@ class LRScheduler(object):
                 self.__dict__[key] = state_dict[key]
             else:
                 raise RuntimeError(
-                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict"
-                    .format(key))
+                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".format(
+                        key
+                    )
+                )
         if len(state_dict) > len(self.keys):
             warnings.warn(
                 "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
@@ -279,12 +287,14 @@ class NoamDecay(LRScheduler):
 
     """
 
-    def __init__(self,
-                 d_model,
-                 warmup_steps,
-                 learning_rate=1.0,
-                 last_epoch=-1,
-                 verbose=False):
+    def __init__(
+        self,
+        d_model,
+        warmup_steps,
+        learning_rate=1.0,
+        last_epoch=-1,
+        verbose=False,
+    ):
         self.d_model = d_model
         self.warmup_steps = warmup_steps
         super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose)
@@ -379,8 +389,9 @@ class PiecewiseDecay(LRScheduler):
     def __init__(self, boundaries, values, last_epoch=-1, verbose=False):
         self.boundaries = boundaries
         self.values = values
-        super(PiecewiseDecay, self).__init__(last_epoch=last_epoch,
-                                             verbose=verbose)
+        super(PiecewiseDecay, self).__init__(
+            last_epoch=last_epoch, verbose=verbose
+        )
 
     def get_lr(self):
         for i in range(len(self.boundaries)):
@@ -460,10 +471,13 @@ class NaturalExpDecay(LRScheduler):
     """
 
     def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
-        assert gamma > 0.0, " 'gamma' must be a positive number so that the learning rate will decay."
+        assert (
+            gamma > 0.0
+        ), " 'gamma' must be a positive number so that the learning rate will decay."
         self.gamma = gamma
-        super(NaturalExpDecay, self).__init__(learning_rate, last_epoch,
-                                              verbose)
+        super(NaturalExpDecay, self).__init__(
+            learning_rate, last_epoch, verbose
+        )
 
     def get_lr(self):
         return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch)
@@ -543,8 +557,9 @@ class InverseTimeDecay(LRScheduler):
 
     def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
         self.gamma = gamma
-        super(InverseTimeDecay, self).__init__(learning_rate, last_epoch,
-                                               verbose)
+        super(InverseTimeDecay, self).__init__(
+            learning_rate, last_epoch, verbose
+        )
 
     def get_lr(self):
         return self.base_lr / (1 + self.gamma * self.last_epoch)
@@ -637,30 +652,37 @@ class PolynomialDecay(LRScheduler):
               # scheduler.step()        # If you update learning rate each epoch
     """
 
-    def __init__(self,
-                 learning_rate,
-                 decay_steps,
-                 end_lr=0.0001,
-                 power=1.0,
-                 cycle=False,
-                 last_epoch=-1,
-                 verbose=False):
+    def __init__(
+        self,
+        learning_rate,
+        decay_steps,
+        end_lr=0.0001,
+        power=1.0,
+        cycle=False,
+        last_epoch=-1,
+        verbose=False,
+    ):
         assert decay_steps > 0 and isinstance(
-            decay_steps, int), " 'decay_steps' must be a positive integer."
+            decay_steps, int
+        ), " 'decay_steps' must be a positive integer."
         self.decay_steps = decay_steps
         self.end_lr = end_lr
-        assert power > 0.0, " 'power' must be greater than 0.0 so that the learning rate will decay."
+        assert (
+            power > 0.0
+        ), " 'power' must be greater than 0.0 so that the learning rate will decay."
         self.power = power
         self.cycle = cycle
-        super(PolynomialDecay, self).__init__(learning_rate, last_epoch,
-                                              verbose)
+        super(PolynomialDecay, self).__init__(
+            learning_rate, last_epoch, verbose
+        )
 
     def get_lr(self):
         tmp_epoch_num = self.last_epoch
         tmp_decay_steps = self.decay_steps
         if self.cycle:
             div_res = math.ceil(
-                float(self.last_epoch) / float(self.decay_steps))
+                float(self.last_epoch) / float(self.decay_steps)
+            )
 
             if self.last_epoch == 0:
                 div_res = 1
@@ -669,8 +691,8 @@ class PolynomialDecay(LRScheduler):
             tmp_epoch_num = min(self.last_epoch, self.decay_steps)
 
         return (self.base_lr - self.end_lr) * (
-            (1 - float(tmp_epoch_num) / float(tmp_decay_steps))**
-            self.power) + self.end_lr
+            (1 - float(tmp_epoch_num) / float(tmp_decay_steps)) ** self.power
+        ) + self.end_lr
 
 
 class LinearWarmup(LRScheduler):
@@ -758,27 +780,36 @@ class LinearWarmup(LRScheduler):
               # scheduler.step()        # If you update learning rate each epoch
     """
 
-    def __init__(self,
-                 learning_rate,
-                 warmup_steps,
-                 start_lr,
-                 end_lr,
-                 last_epoch=-1,
-                 verbose=False):
-        type_check = isinstance(learning_rate, float) or isinstance(
-            learning_rate, int) or isinstance(learning_rate, LRScheduler)
+    def __init__(
+        self,
+        learning_rate,
+        warmup_steps,
+        start_lr,
+        end_lr,
+        last_epoch=-1,
+        verbose=False,
+    ):
+        type_check = (
+            isinstance(learning_rate, float)
+            or isinstance(learning_rate, int)
+            or isinstance(learning_rate, LRScheduler)
+        )
         if not type_check:
             raise TypeError(
-                "the type of learning_rate should be [int, float or LRScheduler], the current type is {}"
-                .format(learning_rate))
+                "the type of learning_rate should be [int, float or LRScheduler], the current type is {}".format(
+                    learning_rate
+                )
+            )
         self.learning_rate = learning_rate
         assert warmup_steps > 0 and isinstance(
-            warmup_steps, int), " 'warmup_steps' must be a positive integer."
+            warmup_steps, int
+        ), " 'warmup_steps' must be a positive integer."
         self.warmup_steps = warmup_steps
         self.start_lr = start_lr
         self.end_lr = end_lr
-        assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
-            end_lr, start_lr)
+        assert (
+            end_lr > start_lr
+        ), "end_lr {} must be greater than start_lr {}".format(end_lr, start_lr)
         super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose)
 
     def state_dict(self):
@@ -803,7 +834,8 @@ class LinearWarmup(LRScheduler):
     def get_lr(self):
         if self.last_epoch < self.warmup_steps:
             return (self.end_lr - self.start_lr) * float(
-                self.last_epoch) / float(self.warmup_steps) + self.start_lr
+                self.last_epoch
+            ) / float(self.warmup_steps) + self.start_lr
         else:
             if isinstance(self.learning_rate, LRScheduler):
                 self.learning_rate.step(self.last_epoch - self.warmup_steps)
@@ -884,10 +916,13 @@ class ExponentialDecay(LRScheduler):
     """
 
     def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False):
-        assert gamma > 0.0 and gamma < 1.0, " 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay."
+        assert (
+            gamma > 0.0 and gamma < 1.0
+        ), " 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay."
         self.gamma = gamma
-        super(ExponentialDecay, self).__init__(learning_rate, last_epoch,
-                                               verbose)
+        super(ExponentialDecay, self).__init__(
+            learning_rate, last_epoch, verbose
+        )
 
     def get_lr(self):
         return self.base_lr * (self.gamma**self.last_epoch)
@@ -973,21 +1008,21 @@ class MultiStepDecay(LRScheduler):
               # scheduler.step()        # If you update learning rate each epoch
     """
 
-    def __init__(self,
-                 learning_rate,
-                 milestones,
-                 gamma=0.1,
-                 last_epoch=-1,
-                 verbose=False):
+    def __init__(
+        self, learning_rate, milestones, gamma=0.1, last_epoch=-1, verbose=False
+    ):
         if not isinstance(milestones, (tuple, list)):
             raise TypeError(
                 "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
-                % type(milestones))
+                % type(milestones)
+            )
 
-        if not all([
+        if not all(
+            [
                 milestones[i] < milestones[i + 1]
                 for i in range(len(milestones) - 1)
-        ]):
+            ]
+        ):
             raise ValueError('The elements of milestones must be incremented')
         if gamma >= 1.0:
             raise ValueError('gamma should be < 1.0.')
@@ -1000,7 +1035,7 @@ class MultiStepDecay(LRScheduler):
         for i in range(len(self.milestones)):
             if self.last_epoch < self.milestones[i]:
                 return self.base_lr * (self.gamma**i)
-        return self.base_lr * (self.gamma**len(self.milestones))
+        return self.base_lr * (self.gamma ** len(self.milestones))
 
 
 class StepDecay(LRScheduler):
@@ -1082,21 +1117,20 @@ class StepDecay(LRScheduler):
               # scheduler.step()        # If you update learning rate each epoch
     """
 
-    def __init__(self,
-                 learning_rate,
-                 step_size,
-                 gamma=0.1,
-                 last_epoch=-1,
-                 verbose=False):
+    def __init__(
+        self, learning_rate, step_size, gamma=0.1, last_epoch=-1, verbose=False
+    ):
         if not isinstance(step_size, int):
             raise TypeError(
-                "The type of 'step_size' must be 'int', but received %s." %
-                type(step_size))
+                "The type of 'step_size' must be 'int', but received %s."
+                % type(step_size)
+            )
         if gamma >= 1.0:
             raise ValueError('gamma should be < 1.0.')
 
         assert step_size > 0 and isinstance(
-            step_size, int), " 'step_size' must be a positive integer."
+            step_size, int
+        ), " 'step_size' must be a positive integer."
         self.step_size = step_size
         self.gamma = gamma
         super(StepDecay, self).__init__(learning_rate, last_epoch, verbose)
@@ -1185,7 +1219,8 @@ class LambdaDecay(LRScheduler):
         if not callable(lr_lambda):
             raise TypeError(
                 "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
-                % type(lr_lambda))
+                % type(lr_lambda)
+            )
 
         self.lr_lambda = lr_lambda
         super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose)
@@ -1281,17 +1316,19 @@ class ReduceOnPlateau(LRScheduler):
 
     """
 
-    def __init__(self,
-                 learning_rate,
-                 mode='min',
-                 factor=0.1,
-                 patience=10,
-                 threshold=1e-4,
-                 threshold_mode='rel',
-                 cooldown=0,
-                 min_lr=0,
-                 epsilon=1e-8,
-                 verbose=False):
+    def __init__(
+        self,
+        learning_rate,
+        mode='min',
+        factor=0.1,
+        patience=10,
+        threshold=1e-4,
+        threshold_mode='rel',
+        cooldown=0,
+        min_lr=0,
+        epsilon=1e-8,
+        verbose=False,
+    ):
         mode = mode.lower()
         if mode not in ['min', 'max']:
             raise ValueError('mode: ' + mode + ' is unknown!')
@@ -1299,18 +1336,21 @@ class ReduceOnPlateau(LRScheduler):
 
         if factor >= 1.0:
             raise ValueError(
-                'new_lr = origin_lr * gamma and gamma should be < 1.0.')
+                'new_lr = origin_lr * gamma and gamma should be < 1.0.'
+            )
         self.factor = factor
 
         threshold_mode = threshold_mode.lower()
         if threshold_mode not in ['rel', 'abs']:
-            raise ValueError('threshold mode: ' + threshold_mode +
-                             ' is unknown!')
+            raise ValueError(
+                'threshold mode: ' + threshold_mode + ' is unknown!'
+            )
         self.threshold_mode = threshold_mode
         if not isinstance(learning_rate, (float, int)):
             raise TypeError(
                 "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s."
-                % type(learning_rate))
+                % type(learning_rate)
+            )
 
         self.patience = patience
         self.threshold = threshold
@@ -1333,8 +1373,11 @@ class ReduceOnPlateau(LRScheduler):
     # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored.
     def state_keys(self):
         self.keys = [
-            'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch',
-            'last_lr'
+            'cooldown_counter',
+            'best',
+            'num_bad_epochs',
+            'last_epoch',
+            'last_lr',
         ]
 
     def step(self, metrics, epoch=None):
@@ -1364,18 +1407,25 @@ class ReduceOnPlateau(LRScheduler):
         else:
             # need to declarate explicitly
             from paddle.framework import VarBase as Tensor
+
             tmp = Tensor
         # loss must be float, numpy.ndarray or 1-D Tensor with shape [1]
         if isinstance(metrics, (tmp, numpy.ndarray)):
-            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \
-                                                                      "should be (1L,), but the current metrics.shape is {}. Maybe that " \
-                                                                      "you should call paddle.mean to process it first.".format(
-                metrics.shape)
-        elif not isinstance(metrics,
-                            (int, float, numpy.float32, numpy.float64)):
+            assert len(metrics.shape) == 1 and metrics.shape[0] == 1, (
+                "the metrics.shape "
+                "should be (1L,), but the current metrics.shape is {}. Maybe that "
+                "you should call paddle.mean to process it first.".format(
+                    metrics.shape
+                )
+            )
+        elif not isinstance(
+            metrics, (int, float, numpy.float32, numpy.float64)
+        ):
             raise TypeError(
-                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}"
-                .format(type(metrics)))
+                "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}".format(
+                    type(metrics)
+                )
+            )
 
         if self.cooldown_counter > 0:
             self.cooldown_counter -= 1
@@ -1393,9 +1443,13 @@ class ReduceOnPlateau(LRScheduler):
                 if self.last_lr - new_lr > self.epsilon:
                     self.last_lr = new_lr
                     if self.verbose:
-                        print('Epoch {}: {} set learning rate to {}.'.format(
-                            self.last_epoch, self.__class__.__name__,
-                            self.last_lr))
+                        print(
+                            'Epoch {}: {} set learning rate to {}.'.format(
+                                self.last_epoch,
+                                self.__class__.__name__,
+                                self.last_lr,
+                            )
+                        )
 
     def _is_better(self, current, best):
         if self.mode == 'min' and self.threshold_mode == 'rel':
@@ -1493,41 +1547,50 @@ class CosineAnnealingDecay(LRScheduler):
               # scheduler.step()        # If you update learning rate each epoch
     """
 
-    def __init__(self,
-                 learning_rate,
-                 T_max,
-                 eta_min=0,
-                 last_epoch=-1,
-                 verbose=False):
+    def __init__(
+        self, learning_rate, T_max, eta_min=0, last_epoch=-1, verbose=False
+    ):
         if not isinstance(T_max, int):
             raise TypeError(
                 "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s."
-                % type(T_max))
+                % type(T_max)
+            )
         if not isinstance(eta_min, (float, int)):
             raise TypeError(
                 "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s."
-                % type(eta_min))
+                % type(eta_min)
+            )
         assert T_max > 0 and isinstance(
-            T_max, int), " 'T_max' must be a positive integer."
+            T_max, int
+        ), " 'T_max' must be a positive integer."
         self.T_max = T_max
         self.eta_min = float(eta_min)
-        super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch,
-                                                   verbose)
+        super(CosineAnnealingDecay, self).__init__(
+            learning_rate, last_epoch, verbose
+        )
 
     def get_lr(self):
         if self.last_epoch == 0:
             return self.base_lr
         elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0:
-            return self.last_lr + (self.base_lr - self.eta_min) * (
-                1 - math.cos(math.pi / self.T_max)) / 2
+            return (
+                self.last_lr
+                + (self.base_lr - self.eta_min)
+                * (1 - math.cos(math.pi / self.T_max))
+                / 2
+            )
 
         return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / (
-            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * (
-                self.last_lr - self.eta_min) + self.eta_min
+            1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)
+        ) * (self.last_lr - self.eta_min) + self.eta_min
 
     def _get_closed_form_lr(self):
-        return self.eta_min + (self.base_lr - self.eta_min) * (
-            1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2
+        return (
+            self.eta_min
+            + (self.base_lr - self.eta_min)
+            * (1 + math.cos(math.pi * self.last_epoch / self.T_max))
+            / 2
+        )
 
 
 class MultiplicativeDecay(LRScheduler):
@@ -1582,11 +1645,13 @@ class MultiplicativeDecay(LRScheduler):
         if not callable(lr_lambda):
             raise TypeError(
                 "The type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s."
-                % type(lr_lambda))
+                % type(lr_lambda)
+            )
 
         self.lr_lambda = lr_lambda
-        super(MultiplicativeDecay, self).__init__(learning_rate, last_epoch,
-                                                  verbose)
+        super(MultiplicativeDecay, self).__init__(
+            learning_rate, last_epoch, verbose
+        )
 
     def get_lr(self):
         cur_lr = self.base_lr
@@ -1679,29 +1744,35 @@ class OneCycleLR(LRScheduler):
                     scheduler.step()    # You should update learning rate each step
     """
 
-    def __init__(self,
-                 max_learning_rate,
-                 total_steps,
-                 divide_factor=25.,
-                 end_learning_rate=0.0001,
-                 phase_pct=0.3,
-                 anneal_strategy='cos',
-                 three_phase=False,
-                 last_epoch=-1,
-                 verbose=False):
+    def __init__(
+        self,
+        max_learning_rate,
+        total_steps,
+        divide_factor=25.0,
+        end_learning_rate=0.0001,
+        phase_pct=0.3,
+        anneal_strategy='cos',
+        three_phase=False,
+        last_epoch=-1,
+        verbose=False,
+    ):
         # Check type and value of max_learning_rate
         if not isinstance(max_learning_rate, (float, int)):
             raise TypeError(
-                "'max_learning_rate' must be 'float' or 'int', but received {}".
-                format(type(max_learning_rate)))
+                "'max_learning_rate' must be 'float' or 'int', but received {}".format(
+                    type(max_learning_rate)
+                )
+            )
         if max_learning_rate < 0:
             raise ValueError("'max_learning_rate' must be a positive integer.")
 
         # Check type and value of end_learning_rate
         if not isinstance(end_learning_rate, (float, int)):
             raise TypeError(
-                "'end_learning_rate' must be 'float' or 'int', but received {}".
-                format(type(end_learning_rate)))
+                "'end_learning_rate' must be 'float' or 'int', but received {}".format(
+                    type(end_learning_rate)
+                )
+            )
         if end_learning_rate < 0:
             raise ValueError("'end_learning_rate' must be a positive integer.")
 
@@ -1709,7 +1780,9 @@ class OneCycleLR(LRScheduler):
         if not isinstance(total_steps, int):
             raise TypeError(
                 "'total_step' must be 'int', but received {}".format(
-                    type(total_steps)))
+                    type(total_steps)
+                )
+            )
         if total_steps <= 0:
             raise ValueError("'total_step' must be a positive integer.")
         self.total_steps = total_steps
@@ -1718,17 +1791,23 @@ class OneCycleLR(LRScheduler):
         if not isinstance(phase_pct, float):
             raise TypeError(
                 "'phase_pct' must be 'float', but received {}".format(
-                    type(phase_pct)))
+                    type(phase_pct)
+                )
+            )
         if phase_pct < 0 or phase_pct > 1:
             raise ValueError(
                 "'phase_pct' must be between 0 and 1, but received {}".format(
-                    phase_pct))
+                    phase_pct
+                )
+            )
 
         # Check type and value of divide_factor
         if not isinstance(divide_factor, (float, int)):
             raise TypeError(
-                "'divide_factor' must be 'float' or 'int', but received {}".
-                format(type(divide_factor)))
+                "'divide_factor' must be 'float' or 'int', but received {}".format(
+                    type(divide_factor)
+                )
+            )
 
         initial_lr = max_learning_rate / float(divide_factor)
         min_lr = float(end_learning_rate)
@@ -1751,17 +1830,22 @@ class OneCycleLR(LRScheduler):
                 self._step_config[1] - self._step_config[0],
                 self._step_config[2] - self._step_config[1],
                 self._step_config[3] - self._step_config[2],
-                self._step_config[3] -
-                self._step_config[2],  # for the last step.
+                self._step_config[3]
+                - self._step_config[2],  # for the last step.
             ]
             # start lr and end lr of each phase.
             self._lr_config = [
-                initial_lr, max_learning_rate, initial_lr, min_lr
+                initial_lr,
+                max_learning_rate,
+                initial_lr,
+                min_lr,
             ]
         else:
             self._step_config = [
-                0, phase_pct * self.total_steps - 1, self.total_steps - 1,
-                self.total_steps - 1
+                0,
+                phase_pct * self.total_steps - 1,
+                self.total_steps - 1,
+                self.total_steps - 1,
             ]
             self._steps_size = [
                 self._step_config[1] - self._step_config[0],
@@ -1777,8 +1861,10 @@ class OneCycleLR(LRScheduler):
             self.anneal_func = self._linear_annealing
         else:
             raise ValueError(
-                "'anneal_strategy' must by one of 'cos' or 'linear', but received {}"
-                .format(anneal_strategy))
+                "'anneal_strategy' must by one of 'cos' or 'linear', but received {}".format(
+                    anneal_strategy
+                )
+            )
         super(OneCycleLR, self).__init__(initial_lr, last_epoch, verbose)
 
     def _cos_annealing(self, start_lr, end_lr, pct):
@@ -1793,17 +1879,21 @@ class OneCycleLR(LRScheduler):
 
         if current_step > self.total_steps:
             raise ValueError(
-                "Tried to step {} times. However the number of total steps is {}"
-                .format(current_step, self.total_steps))
+                "Tried to step {} times. However the number of total steps is {}".format(
+                    current_step, self.total_steps
+                )
+            )
 
         for (i, (end_step, step_size)) in enumerate(
-                zip(self._step_config[1:], self._steps_size)):
+            zip(self._step_config[1:], self._steps_size)
+        ):
             # i == len(self._lr_config) - 2 catch the last step, otherwise it will return None.
             if current_step <= end_step or i == len(self._lr_config) - 2:
                 # self._step_config[i] means start step of a phase.
                 percentage = (current_step - self._step_config[i]) / step_size
-                return self.anneal_func(self._lr_config[i],
-                                        self._lr_config[i + 1], percentage)
+                return self.anneal_func(
+                    self._lr_config[i], self._lr_config[i + 1], percentage
+                )
 
 
 class CyclicLR(LRScheduler):
@@ -1897,71 +1987,93 @@ class CyclicLR(LRScheduler):
                     scheduler.step()    # You should update learning rate each step
     """
 
-    def __init__(self,
-                 base_learning_rate,
-                 max_learning_rate,
-                 step_size_up,
-                 step_size_down=None,
-                 mode='triangular',
-                 exp_gamma=1.,
-                 scale_fn=None,
-                 scale_mode='cycle',
-                 last_epoch=-1,
-                 verbose=False):
+    def __init__(
+        self,
+        base_learning_rate,
+        max_learning_rate,
+        step_size_up,
+        step_size_down=None,
+        mode='triangular',
+        exp_gamma=1.0,
+        scale_fn=None,
+        scale_mode='cycle',
+        last_epoch=-1,
+        verbose=False,
+    ):
         # check type and value of max_learning_rate
         if not isinstance(max_learning_rate, (float, int)):
             raise TypeError(
-                "'max_learning_rate' must be 'float' or 'int', but received {}".
-                format(type(max_learning_rate)))
+                "'max_learning_rate' must be 'float' or 'int', but received {}".format(
+                    type(max_learning_rate)
+                )
+            )
         if max_learning_rate < 0:
             raise ValueError(
-                "'max_learning_rate' must be a positive integer, but received {}"
-                .format(max_learning_rate))
+                "'max_learning_rate' must be a positive integer, but received {}".format(
+                    max_learning_rate
+                )
+            )
 
         # check type and value of step_size_up
         if not isinstance(step_size_up, int):
             raise TypeError(
-                "The type of 'step_size_up' must be int, but received {}".
-                format(type(step_size_up)))
+                "The type of 'step_size_up' must be int, but received {}".format(
+                    type(step_size_up)
+                )
+            )
         if step_size_up <= 0:
             raise ValueError(
-                "'step_size_up' must be a positive integer, but received {}".
-                format(step_size_up))
+                "'step_size_up' must be a positive integer, but received {}".format(
+                    step_size_up
+                )
+            )
 
         # check type and value of step_size_down
         if step_size_down is not None:
             if not isinstance(step_size_down, int):
                 raise TypeError(
-                    "The type of 'step_size_down' must be int, but received {}".
-                    format(type(step_size_down)))
+                    "The type of 'step_size_down' must be int, but received {}".format(
+                        type(step_size_down)
+                    )
+                )
             if step_size_down <= 0:
                 raise ValueError(
-                    "'step_size_down' must be a positive integer, but received {}"
-                    .format(step_size_down))
+                    "'step_size_down' must be a positive integer, but received {}".format(
+                        step_size_down
+                    )
+                )
 
         # check type of exp_gamma
         if not isinstance(exp_gamma, float):
             raise TypeError(
                 "The type of 'exp_gamma' must be float, but received {}".format(
-                    type(exp_gamma)))
+                    type(exp_gamma)
+                )
+            )
 
         step_size_up = float(step_size_up)
-        step_size_down = float(
-            step_size_down) if step_size_down is not None else step_size_up
+        step_size_down = (
+            float(step_size_down)
+            if step_size_down is not None
+            else step_size_up
+        )
 
         self.cycle_size = step_size_up + step_size_down
         self.step_up_pct = step_size_up / self.cycle_size
         self.max_lr = float(max_learning_rate)
         self.amplitude = self.max_lr - base_learning_rate
 
-        if mode not in ['triangular', 'triangular2', 'exp_range'
-                        ] and scale_fn is None:
+        if (
+            mode not in ['triangular', 'triangular2', 'exp_range']
+            and scale_fn is None
+        ):
             raise ValueError(
                 "'mode' is invalid and 'scale_fn' is not specified, make sure one of 'mode' or 'scale_fn' is valid"
             )
         if scale_mode not in ['cycle', 'iterations']:
             raise ValueError(
-                "'scale_mode' must be one of 'cycle' or 'iterations")
+                "'scale_mode' must be one of 'cycle' or 'iterations"
+            )
 
         self.mode = mode
         self.gamma = exp_gamma  # only for exp_range mode
@@ -1982,10 +2094,10 @@ class CyclicLR(LRScheduler):
         super().__init__(base_learning_rate, last_epoch, verbose)
 
     def _triangular_scale_fn(self, x):
-        return 1.
+        return 1.0
 
     def _triangular2_scale_fn(self, x):
-        return 1 / (2.**(x - 1))
+        return 1 / (2.0 ** (x - 1))
 
     def _exp_range_scale_fn(self, x):
         return self.gamma**x
@@ -1994,7 +2106,7 @@ class CyclicLR(LRScheduler):
         iterations = self.last_epoch
 
         cycle = 1 + iterations // self.cycle_size
-        pct_per_cycle = 1. + iterations / self.cycle_size - cycle
+        pct_per_cycle = 1.0 + iterations / self.cycle_size - cycle
 
         if pct_per_cycle <= self.step_up_pct:
             scale_factor = pct_per_cycle / self.step_up_pct
diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py
index 7205a434d38..6a79d762684 100644
--- a/python/paddle/optimizer/rmsprop.py
+++ b/python/paddle/optimizer/rmsprop.py
@@ -146,16 +146,18 @@ class RMSProp(Optimizer):
     _mean_square_acc_str = "mean_square"
     _mean_grad_acc_str = "mean_grad"
 
-    def __init__(self,
-                 learning_rate,
-                 rho=0.95,
-                 epsilon=1.0e-6,
-                 momentum=0.0,
-                 centered=False,
-                 parameters=None,
-                 weight_decay=None,
-                 grad_clip=None,
-                 name=None):
+    def __init__(
+        self,
+        learning_rate,
+        rho=0.95,
+        epsilon=1.0e-6,
+        momentum=0.0,
+        centered=False,
+        parameters=None,
+        weight_decay=None,
+        grad_clip=None,
+        name=None,
+    ):
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
         if rho is None:
@@ -171,11 +173,13 @@ class RMSProp(Optimizer):
         if not 0.0 <= rho:
             raise ValueError("Invalid value of rho, expect rho >= 0.")
 
-        super(RMSProp, self).__init__(learning_rate=learning_rate,
-                                      parameters=parameters,
-                                      weight_decay=weight_decay,
-                                      grad_clip=grad_clip,
-                                      name=name)
+        super(RMSProp, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            grad_clip=grad_clip,
+            name=name,
+        )
 
         self.type = "rmsprop"
         self._rho = rho
@@ -208,49 +212,50 @@ class RMSProp(Optimizer):
         if isinstance(param_and_grad, dict):
             param_and_grad = self._update_param_group(param_and_grad)
 
-        momentum_acc = self._get_accumulator(self._momentum_acc_str,
-                                             param_and_grad[0])
-        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
-                                                param_and_grad[0])
-        mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
-                                              param_and_grad[0])
-        rmsprop_op = block.append_op(type=self.type,
-                                     inputs={
-                                         "Param":
-                                         param_and_grad[0],
-                                         "Grad":
-                                         param_and_grad[1],
-                                         "Moment":
-                                         momentum_acc,
-                                         "MeanSquare":
-                                         mean_square_acc,
-                                         "MeanGrad":
-                                         mean_grad_acc,
-                                         "LearningRate":
-                                         self._create_param_lr(param_and_grad),
-                                     },
-                                     outputs={
-                                         "ParamOut": param_and_grad[0],
-                                         "MomentOut": momentum_acc,
-                                         "MeanSquareOut": mean_square_acc,
-                                         "MeanGradOut": mean_grad_acc
-                                     },
-                                     attrs={
-                                         "epsilon": self._epsilon,
-                                         "decay": self._rho,
-                                         "momentum": self._momentum,
-                                         "centered": self._centered
-                                     },
-                                     stop_gradient=True)
+        momentum_acc = self._get_accumulator(
+            self._momentum_acc_str, param_and_grad[0]
+        )
+        mean_square_acc = self._get_accumulator(
+            self._mean_square_acc_str, param_and_grad[0]
+        )
+        mean_grad_acc = self._get_accumulator(
+            self._mean_grad_acc_str, param_and_grad[0]
+        )
+        rmsprop_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": momentum_acc,
+                "MeanSquare": mean_square_acc,
+                "MeanGrad": mean_grad_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": momentum_acc,
+                "MeanSquareOut": mean_square_acc,
+                "MeanGradOut": mean_grad_acc,
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "decay": self._rho,
+                "momentum": self._momentum,
+                "centered": self._centered,
+            },
+            stop_gradient=True,
+        )
 
         return rmsprop_op
 
     def _update_param_group(self, parameters):
         self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
         self._rho = parameters.get('rho', self._default_dict['rho'])
-        self._momentum = parameters.get('momentum',
-                                        self._default_dict['momentum'])
-        self._centered = parameters.get('centered',
-                                        self._default_dict['centered'])
+        self._momentum = parameters.get(
+            'momentum', self._default_dict['momentum']
+        )
+        self._centered = parameters.get(
+            'centered', self._default_dict['centered']
+        )
         parameters = parameters.get('params')
         return parameters
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index 981f6e9253c..9e11a6c0b5e 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -51,31 +51,31 @@ else:
 
 def cache(reader):
     """
-    Cache the reader data into memory. 
+    Cache the reader data into memory.
 
-    Be careful that this method may take long time to process, 
-    and consume lots of memory. :code:`reader()` would only 
-    call once. 
+    Be careful that this method may take a long time to process
+    and consume lots of memory. :code:`reader()` would only
+    be called once.
 
     Args:
-        reader (generator): a reader object which yields 
+        reader (generator): a reader object which yields
             data each time.
 
     Returns:
         generator: a decorated reader object which yields data from cached memory.
-    
+
     Examples:
         .. code-block:: python
 
             import paddle
-            
+
             def reader():
                 for i in range(3):
                     yield i
-            
+
             # All data is cached into memory
             cached_reader = paddle.io.cache(reader)
-            
+
             # Output: 0 1 2
             for i in cached_reader():
                 print(i)
@@ -100,10 +100,10 @@ def map_readers(func, *readers):
 
 
     Args:
-        func: a function to read data and compute result, the output of this function 
+        func: a function to read data and compute result, the output of this function
               will be set as the output of the resulted data reader.
         readers (Reader|list of Reader): list of readers whose outputs will be used as arguments of func.
- 
+
     Returns:
         the resulted data reader (Reader)
 
@@ -138,9 +138,9 @@ def shuffle(reader, buf_size):
 
     This API creates a decorated reader that outputs the shuffled data.
 
-    The output data from the origin reader will be saved into a buffer, 
+    The output data from the origin reader will be saved into a buffer,
     and then shuffle the data. The size of buffer is determined by argument buf_size.
- 
+
     Args:
         reader(callable): the original reader whose data will be shuffled.
         buf_size(int): the size of shuffled buffer.
@@ -255,18 +255,18 @@ def compose(*readers, **kwargs):
     (1, 2, 3, 4, 5)
 
     Args:
-        readers (Reader|list of Reader): readers that will be composed together. 
+        readers (Reader|list of Reader): readers that will be composed together.
         check_alignment(bool, optional): Indicates whether the input readers are checked for
                               alignment. If True, whether input readers are aligned
                               correctly will be checked, else alignment will not be checkout and trailing outputs
                               will be discarded. Defaults to True.
 
-    Returns: 
+    Returns:
         the new data reader (Reader).
 
     Raises:
         ComposeNotAligned: outputs of readers are not aligned. This will not raise if check_alignment is set to False.
-  
+
     Examples:
         .. code-block:: python
 
@@ -284,7 +284,7 @@ def compose(*readers, **kwargs):
         if isinstance(x, tuple):
             return x
         else:
-            return (x, )
+            return (x,)
 
     def reader():
         rs = []
@@ -299,7 +299,8 @@ def compose(*readers, **kwargs):
                     if o is None:
                         # None will be not be present if compose is aligned
                         raise ComposeNotAligned(
-                            "outputs of readers are not aligned.")
+                            "outputs of readers are not aligned."
+                        )
                 yield sum(list(map(make_tuple, outputs)), ())
 
     return reader
@@ -319,25 +320,25 @@ def buffered(reader, size):
 
     Returns:
         generator: the buffered data reader.
-    
+
     Examples:
         .. code-block:: python
 
             import paddle
-            
+
             def reader():
                 for i in range(3):
                     yield i
-            
+
             # Create a buffered reader, and the buffer size is 2.
             buffered_reader = paddle.io.buffered(reader, 2)
-            
+
             # Output: 0 1 2
             for i in buffered_reader():
                 print(i)
     """
 
-    class EndSignal():
+    class EndSignal:
         pass
 
     end = EndSignal()
@@ -350,10 +351,13 @@ def buffered(reader, size):
     def data_reader():
         r = reader()
         q = Queue(maxsize=size)
-        t = Thread(target=read_worker, args=(
-            r,
-            q,
-        ))
+        t = Thread(
+            target=read_worker,
+            args=(
+                r,
+                q,
+            ),
+        )
         t.daemon = True
         t.start()
         e = q.get()
@@ -368,8 +372,8 @@ def firstn(reader, n):
     """
     paddle.fluid.io.firstn ( :ref:`api_fluid_io_firstn` ) is recommended to use,
     and paddle.reader.firstn is an alias.
-    
-    This API creates a decorated reader, and limits the max number of 
+
+    This API creates a decorated reader, and limits the max number of
     samples that reader could return.
 
     Args:
@@ -390,7 +394,7 @@ def firstn(reader, n):
             firstn_reader = fluid.io.firstn(reader, 5)
             for e in firstn_reader():
                 print(e)
-            # the outputs are: 0 1 2 3 4  
+            # the outputs are: 0 1 2 3 4
     """
 
     # TODO(yuyang18): Check if just drop the reader, could clean the opened
@@ -405,7 +409,7 @@ def firstn(reader, n):
     return firstn_reader
 
 
-class XmapEndSignal():
+class XmapEndSignal:
     pass
 
 
@@ -415,14 +419,14 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
 
     Args:
         mapper (callable): a function to map the data from reader.
-        reader (callable): a data reader which yields the data. 
+        reader (callable): a data reader which yields the data.
         process_num (int): thread number to handle original sample.
-        buffer_size (int): size of the queue to read data in. 
-        order (bool): whether to keep the data order from original reader. 
+        buffer_size (int): size of the queue to read data in.
+        order (bool): whether to keep the data order from original reader.
             Default False.
 
     Returns:
-        callable: a decorated reader with data mapping. 
+        callable: a decorated reader with data mapping.
     """
     end = XmapEndSignal()
 
@@ -477,8 +481,11 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
         t.start()
         # start several handle_workers
         target = order_handle_worker if order else handle_worker
-        args = (in_queue, out_queue, mapper,
-                out_order) if order else (in_queue, out_queue, mapper)
+        args = (
+            (in_queue, out_queue, mapper, out_order)
+            if order
+            else (in_queue, out_queue, mapper)
+        )
         workers = []
         for i in range(process_num):
             worker = Thread(target=target, args=args)
@@ -505,17 +512,17 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
 def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
     """
     This API use python ``multiprocessing`` to read data from ``readers`` parallelly,
-    and then ``multiprocess.Queue`` or ``multiprocess.Pipe`` is used to merge 
-    these data. A separate process will be created for each reader in the 
-    ``readers`` list, please guarantee every reader can work independently 
+    and then ``multiprocess.Queue`` or ``multiprocess.Pipe`` is used to merge
+    these data. A separate process will be created for each reader in the
+    ``readers`` list; please guarantee every reader can work independently
     to avoid conflicts in parallel environment.
-    
 
-    ``Multiprocess.Queue`` require the rw access right to /dev/shm, and it's not supported 
+
+    ``Multiprocess.Queue`` requires the rw access right to /dev/shm, and it's not supported
     in some platforms.
 
     Parameters:
-       readers (list( ``generator`` ) | tuple( ``generator`` )): a python ``generator`` list 
+       readers (list( ``generator`` ) | tuple( ``generator`` )): a python ``generator`` list
            used to read input data
        use_pipe (bool, optional): control the inner API used to implement the multi-processing,
            default True - use ``multiprocess.Pipe`` which is recommended
@@ -534,16 +541,16 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
         import paddle.fluid as fluid
         from paddle.fluid.io import multiprocess_reader
         import numpy as np
-        
+
         sample_files = ['sample_file_1', 'sample_file_2']
-        
+
         def fake_input_files():
             with open(sample_files[0], 'w') as f:
                np.savez(f, a=np.array([1, 2]), b=np.array([3, 4]), c=np.array([5, 6]), d=np.array([7, 8]))
             with open(sample_files[1], 'w') as f:
                np.savez(f, a=np.array([9, 10]), b=np.array([11, 12]), c=np.array([13, 14]))
-        
-        
+
+
         def generate_reader(file_name):
             # load data file
             def _impl():
@@ -551,28 +558,28 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
                 for item in sorted(data.files):
                     yield data[item],
             return _impl
-        
+
         if __name__ == '__main__':
             # generate sample input files
             fake_input_files()
-            
+
             with fluid.program_guard(fluid.Program(), fluid.Program()):
                 place = fluid.CPUPlace()
                 # the 1st 2 is batch size
-                image = fluid.data(name='image', dtype='int64', shape=[2, 1, 2]) 
+                image = fluid.data(name='image', dtype='int64', shape=[2, 1, 2])
                 fluid.layers.Print(image)
                 # print detailed tensor info of image variable
-            
+
                 reader = fluid.io.PyReader(feed_list=[image], capacity=2)
-            
+
                 decorated_reader = multiprocess_reader(
                     [generate_reader(sample_files[0]), generate_reader(sample_files[1])], False)
-            
+
                 reader.decorate_sample_generator(decorated_reader, batch_size=2, places=[place])
-            
+
                 exe = fluid.Executor(place)
                 exe.run(fluid.default_startup_program())
-            
+
                 for data in reader():
                     res = exe.run(feed=data, fetch_list=[image])
                     print(res[0])
@@ -586,7 +593,8 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
 
     if sys.platform == 'win32':
         raise NotImplementedError(
-            "The multiprocess_reader method is not supported on windows.")
+            "The multiprocess_reader method is not supported on windows."
+        )
 
     # ujson is ultra fast json encoder and decoder written in pure C with bindings for Python 3.6+.
     try:
@@ -594,11 +602,13 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
     except Exception as e:
         warnings.warn(
             "The `ujson` module is not found, use the `json` module, `ujson` encodes and decodes faster, "
-            "you can install `ujson` through `pip install ujson`.")
+            "you can install `ujson` through `pip install ujson`."
+        )
         import json
 
-    assert isinstance(readers, (list, tuple)) and len(readers) > 0, (
-        "`readers` must be list or tuple.")
+    assert (
+        isinstance(readers, (list, tuple)) and len(readers) > 0
+    ), "`readers` must be list or tuple."
 
     def _read_into_queue(reader, queue):
         try:
@@ -614,8 +624,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
     def queue_reader():
         queue = fork_context.Queue(queue_size)
         for reader in readers:
-            p = fork_context.Process(target=_read_into_queue,
-                                     args=(reader, queue))
+            p = fork_context.Process(
+                target=_read_into_queue, args=(reader, queue)
+            )
             p.start()
 
         reader_num = len(readers)
@@ -656,8 +667,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
         for reader in readers:
             parent_conn, child_conn = fork_context.Pipe()
             conns.append(parent_conn)
-            p = fork_context.Process(target=_read_into_pipe,
-                                     args=(reader, child_conn))
+            p = fork_context.Process(
+                target=_read_into_pipe, args=(reader, child_conn)
+            )
             p.start()
 
         reader_num = len(readers)
diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py
index 586ae0f988c..89444a1357d 100644
--- a/python/paddle/regularizer.py
+++ b/python/paddle/regularizer.py
@@ -20,23 +20,23 @@ import paddle.fluid as fluid
 class L1Decay(fluid.regularizer.L1Decay):
     r"""
     Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse.
-    
-    It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). 
-    When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in 
-    ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has 
-    higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined 
+
+    It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ).
+    When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in
+    ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has
+    higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined
     in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the  regularizer
     in Optimizer will be used.
-    
+
     In the implementation, the loss function of L1 Weight Decay Regularization is as follows:
-	
+
     .. math::
 
         loss = coeff * reduce\_sum(abs(x))
 
     Args:
         coeff(float, optional): regularization coeff. Default:0.0.
-	
+
     Examples:
         .. code-block:: python
 
@@ -82,14 +82,14 @@ class L1Decay(fluid.regularizer.L1Decay):
 class L2Decay(fluid.regularizer.L2Decay):
     r"""
     Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting.
-    
-    It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). 
-    When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in 
-    ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has 
-    higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined 
+
+    It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ).
+    When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in
+    ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has
+    higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined
     in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the  regularizer
     in Optimizer will be used.
-    
+
     In the implementation, the loss function of L2 Weight Decay Regularization is as follows:
 
     .. math::
@@ -98,7 +98,7 @@ class L2Decay(fluid.regularizer.L2Decay):
 
     Args:
         regularization_coeff(float, optional): regularization coeff. Default:0.0
-	
+
     Examples:
         .. code-block:: python
 
diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py
index 4098ae5dbf3..64b2db9f115 100644
--- a/python/paddle/static/input.py
+++ b/python/paddle/static/input.py
@@ -107,7 +107,8 @@ def data(name, shape, dtype=None, lod_level=0):
             stop_gradient=True,
             lod_level=lod_level,
             is_data=True,
-            need_check_feed=True)
+            need_check_feed=True,
+        )
     else:
         return helper.create_global_variable(
             name=name,
@@ -117,7 +118,8 @@ def data(name, shape, dtype=None, lod_level=0):
             stop_gradient=True,
             lod_level=lod_level,
             is_data=True,
-            need_check_feed=True)
+            need_check_feed=True,
+        )
 
 
 class InputSpec(object):
@@ -166,7 +168,8 @@ class InputSpec(object):
 
     def __repr__(self):
         return '{}(shape={}, dtype={}, name={})'.format(
-            type(self).__name__, self.shape, self.dtype, self.name)
+            type(self).__name__, self.shape, self.dtype, self.name
+        )
 
     @classmethod
     def from_tensor(cls, tensor, name=None):
@@ -198,7 +201,9 @@ class InputSpec(object):
         else:
             raise ValueError(
                 "Input `tensor` should be a Tensor, but received {}.".format(
-                    type(tensor).__name__))
+                    type(tensor).__name__
+                )
+            )
 
     @classmethod
     def from_numpy(cls, ndarray, name=None):
@@ -247,13 +252,17 @@ class InputSpec(object):
         if isinstance(batch_size, (list, tuple)):
             if len(batch_size) != 1:
                 raise ValueError(
-                    "Length of batch_size: {} shall be 1, but received {}.".
-                    format(batch_size, len(batch_size)))
+                    "Length of batch_size: {} shall be 1, but received {}.".format(
+                        batch_size, len(batch_size)
+                    )
+                )
             batch_size = batch_size[1]
         elif not isinstance(batch_size, six.integer_types):
             raise TypeError(
                 "type(batch_size) shall be `int`, but received {}.".format(
-                    type(batch_size).__name__))
+                    type(batch_size).__name__
+                )
+            )
 
         new_shape = [batch_size] + list(self.shape)
         self.shape = tuple(new_shape)
@@ -279,7 +288,8 @@ class InputSpec(object):
         """
         if len(self.shape) == 0:
             raise ValueError(
-                "Not support to unbatch a InputSpec when len(shape) == 0.")
+                "Not support to unbatch a InputSpec when len(shape) == 0."
+            )
 
         self.shape = self._verify(self.shape[1:])
         return self
@@ -290,20 +300,25 @@ class InputSpec(object):
         """
         if not isinstance(shape, (list, tuple)):
             raise TypeError(
-                "Type of `shape` in InputSpec should be one of (tuple, list), but received {}."
-                .format(type(shape).__name__))
+                "Type of `shape` in InputSpec should be one of (tuple, list), but received {}.".format(
+                    type(shape).__name__
+                )
+            )
         if len(shape) == 0:
             raise ValueError(
-                "`shape` in InputSpec should contain at least 1 element, but received {}."
-                .format(shape))
+                "`shape` in InputSpec should contain at least 1 element, but received {}.".format(
+                    shape
+                )
+            )
 
         for i, ele in enumerate(shape):
             if ele is not None:
                 if not isinstance(ele, six.integer_types):
                     raise ValueError(
-                        "shape[{}] should be an `int`, but received `{}`:{}.".
-                        format(i,
-                               type(ele).__name__, ele))
+                        "shape[{}] should be an `int`, but received `{}`:{}.".format(
+                            i, type(ele).__name__, ele
+                        )
+                    )
             if ele is None or ele < -1:
                 shape[i] = -1
 
@@ -328,8 +343,9 @@ class InputSpec(object):
 
     def __eq__(self, other):
         slots = ['shape', 'dtype', 'name']
-        return (type(self) is type(other) and all(
-            getattr(self, attr) == getattr(other, attr) for attr in slots))
+        return type(self) is type(other) and all(
+            getattr(self, attr) == getattr(other, attr) for attr in slots
+        )
 
     def __ne__(self, other):
         return not self == other
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py
index 30cb7a85236..53acfdb727b 100644
--- a/python/paddle/static/io.py
+++ b/python/paddle/static/io.py
@@ -40,9 +40,9 @@ from paddle.fluid.log_helper import get_logger
 
 __all__ = []
 
-_logger = get_logger(__name__,
-                     logging.INFO,
-                     fmt='%(asctime)s-%(levelname)s: %(message)s')
+_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
+)
 
 
 def _check_args(caller, args, supported_args=None, deprecated_args=None):
@@ -51,12 +51,16 @@ def _check_args(caller, args, supported_args=None, deprecated_args=None):
     for arg in args:
         if arg in deprecated_args:
             raise ValueError(
-                "argument '{}' in function '{}' is deprecated, only {} are supported."
-                .format(arg, caller, supported_args))
+                "argument '{}' in function '{}' is deprecated, only {} are supported.".format(
+                    arg, caller, supported_args
+                )
+            )
         elif arg not in supported_args:
             raise ValueError(
-                "function '{}' doesn't support argument '{}',\n only {} are supported."
-                .format(caller, arg, supported_args))
+                "function '{}' doesn't support argument '{}',\n only {} are supported.".format(
+                    caller, arg, supported_args
+                )
+            )
 
 
 def _check_vars(name, var_list):
@@ -64,7 +68,8 @@ def _check_vars(name, var_list):
         var_list = [var_list]
     if not var_list or not all([isinstance(var, Variable) for var in var_list]):
         raise ValueError(
-            "'{}' should be a Variable or a list of Variable.".format(name))
+            "'{}' should be a Variable or a list of Variable.".format(name)
+        )
 
 
 def _normalize_path_prefix(path_prefix):
@@ -93,29 +98,35 @@ def _get_valid_program(program=None):
                 "The type of input program is invalid, expected tyep is Program, but received None"
             )
         warnings.warn(
-            "The input is a CompiledProgram, this is not recommended.")
+            "The input is a CompiledProgram, this is not recommended."
+        )
     if not isinstance(program, Program):
         raise TypeError(
             "The type of input program is invalid, expected type is fluid.Program, but received %s"
-            % type(program))
+            % type(program)
+        )
     return program
 
 
 def _clone_var_in_block(block, var):
     assert isinstance(var, Variable)
     if var.desc.type() == core.VarDesc.VarType.LOD_TENSOR:
-        return block.create_var(name=var.name,
-                                shape=var.shape,
-                                dtype=var.dtype,
-                                type=var.type,
-                                lod_level=var.lod_level,
-                                persistable=True)
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=True,
+        )
     else:
-        return block.create_var(name=var.name,
-                                shape=var.shape,
-                                dtype=var.dtype,
-                                type=var.type,
-                                persistable=True)
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            persistable=True,
+        )
 
 
 def normalize_program(program, feed_vars, fetch_vars):
@@ -163,18 +174,21 @@ def normalize_program(program, feed_vars, fetch_vars):
     """
     if not isinstance(program, Program):
         raise TypeError(
-            "program type must be `fluid.Program`, but received `%s`" %
-            type(program))
+            "program type must be `fluid.Program`, but received `%s`"
+            % type(program)
+        )
     if not isinstance(feed_vars, list):
         feed_vars = [feed_vars]
     if not all(isinstance(v, Variable) for v in feed_vars):
         raise TypeError(
-            "feed_vars type must be a Variable or a list of Variable.")
+            "feed_vars type must be a Variable or a list of Variable."
+        )
     if not isinstance(fetch_vars, list):
         fetch_vars = [fetch_vars]
     if not all(isinstance(v, Variable) for v in fetch_vars):
         raise TypeError(
-            "fetch_vars type must be a Variable or a list of Variable.")
+            "fetch_vars type must be a Variable or a list of Variable."
+        )
 
     # remind users to set auc_states to 0 if auc op were found.
     for op in program.global_block().ops:
@@ -182,8 +196,10 @@ def normalize_program(program, feed_vars, fetch_vars):
         device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
         op._set_attr(device_attr_name, "")
         if op.type == 'auc':
-            warnings.warn("Be sure that you have set auc states to 0 "
-                          "before saving inference model.")
+            warnings.warn(
+                "Be sure that you have set auc states to 0 "
+                "before saving inference model."
+            )
             break
 
     # fix the bug that the activation op's output as target will be pruned.
@@ -193,9 +209,9 @@ def normalize_program(program, feed_vars, fetch_vars):
         uniq_fetch_vars = []
         for i, var in enumerate(fetch_vars):
             if var.dtype != paddle.bool:
-                var = layers.scale(var,
-                                   1.,
-                                   name="save_infer_model/scale_{}".format(i))
+                var = layers.scale(
+                    var, 1.0, name="save_infer_model/scale_{}".format(i)
+                )
             uniq_fetch_vars.append(var)
         fetch_vars = uniq_fetch_vars
 
@@ -213,7 +229,8 @@ def normalize_program(program, feed_vars, fetch_vars):
 
     feed_var_names = [var.name for var in feed_vars]
     copy_program = copy_program._prune_with_input(
-        feeded_var_names=feed_var_names, targets=fetch_vars)
+        feeded_var_names=feed_var_names, targets=fetch_vars
+    )
     copy_program = copy_program._inference_optimize(prune_read_op=True)
     fetch_var_names = [var.name for var in fetch_vars]
     prepend_feed_ops(copy_program, feed_var_names)
@@ -243,9 +260,11 @@ def is_persistable(var):
             param = fluid.default_main_program().global_block().var('fc.b')
             res = fluid.io.is_persistable(param)
     """
-    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                    var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                    var.desc.type() == core.VarDesc.VarType.READER:
+    if (
+        var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH
+        or var.desc.type() == core.VarDesc.VarType.FETCH_LIST
+        or var.desc.type() == core.VarDesc.VarType.READER
+    ):
         return False
     return var.persistable
 
@@ -377,8 +396,10 @@ def _serialize_persistables(program, executor):
     vars_ = list(filter(is_persistable, program.list_vars()))
     # warn if no variable found in model
     if len(vars_) == 0:
-        warnings.warn("no variable in your model, please ensure there are any "
-                      "variables in your model to save")
+        warnings.warn(
+            "no variable in your model, please ensure there are any "
+            "variables in your model to save"
+        )
         return None
     # create a new program and clone persitable vars to it
     save_program = Program()
@@ -395,16 +416,16 @@ def _serialize_persistables(program, executor):
         in_vars.append(save_var_map[name])
 
     out_var_name = unique_name.generate("out_var")
-    out_var = save_block.create_var(type=core.VarDesc.VarType.RAW,
-                                    name=out_var_name)
+    out_var = save_block.create_var(
+        type=core.VarDesc.VarType.RAW, name=out_var_name
+    )
     out_var.desc.set_persistable(True)
-    save_block.append_op(type='save_combine',
-                         inputs={'X': in_vars},
-                         outputs={'Y': out_var},
-                         attrs={
-                             'file_path': '',
-                             'save_to_memory': True
-                         })
+    save_block.append_op(
+        type='save_combine',
+        inputs={'X': in_vars},
+        outputs={'Y': out_var},
+        attrs={'file_path': '', 'save_to_memory': True},
+    )
     # run save_program to save vars
     # NOTE(zhiqiu): save op will add variable kLookupTablePath to save_program.desc,
     # which leads to diff between save_program and its desc. Call _sync_with_cpp
@@ -451,8 +472,9 @@ def save_to_file(path, content):
 
 
 @static_only
-def save_inference_model(path_prefix, feed_vars, fetch_vars, executor,
-                         **kwargs):
+def save_inference_model(
+    path_prefix, feed_vars, fetch_vars, executor, **kwargs
+):
     """
     Save current model and its parameters to given path. i.e.
     Given path_prefix = "/path/to/modelname", after invoking
@@ -536,7 +558,8 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor,
     program = normalize_program(program, feed_vars, fetch_vars)
     # serialize and save program
     program_bytes = _serialize_program(
-        program._remove_training_info(clip_extra=clip_extra))
+        program._remove_training_info(clip_extra=clip_extra)
+    )
     save_to_file(model_path, program_bytes)
     # serialize and save params
     params_bytes = _serialize_persistables(program, executor)
@@ -586,8 +609,9 @@ def deserialize_program(data):
     """
     program = Program.parse_from_string(data)
     if not core._is_program_version_supported(program._version()):
-        raise ValueError("Unsupported program version: %d\n" %
-                         program._version())
+        raise ValueError(
+            "Unsupported program version: %d\n" % program._version()
+        )
     return program
 
 
@@ -636,8 +660,9 @@ def deserialize_persistables(program, data, executor):
     """
     if not isinstance(program, Program):
         raise TypeError(
-            "program type must be `fluid.Program`, but received `%s`" %
-            type(program))
+            "program type must be `fluid.Program`, but received `%s`"
+            % type(program)
+        )
     # load params to a tmp program
     load_program = Program()
     load_block = load_program.global_block()
@@ -661,9 +686,9 @@ def deserialize_persistables(program, data, executor):
         load_var_map[var_copy.name] = var_copy
 
     if data is None:
-        assert len(
-            origin_shape_map
-        ) == 0, "Required 'data' shall be not None if program contains parameter, but received 'data' is None."
+        assert (
+            len(origin_shape_map) == 0
+        ), "Required 'data' shall be not None if program contains parameter, but received 'data' is None."
         return
 
     # append load_combine op to load parameters,
@@ -675,10 +700,8 @@ def deserialize_persistables(program, data, executor):
         inputs={},
         outputs={"Out": load_var_list},
         # if load from memory, file_path is data
-        attrs={
-            'file_path': data,
-            'model_from_memory': True
-        })
+        attrs={'file_path': data, 'model_from_memory': True},
+    )
     executor.run(load_program)
     # check var shape
     for var in check_vars:
@@ -693,7 +716,9 @@ def deserialize_persistables(program, data, executor):
             raise RuntimeError(
                 "Shape mismatch, program needs a parameter with shape ({}), "
                 "but the loaded parameter ('{}') has a shape of ({}).".format(
-                    origin_shape, var.name, new_shape))
+                    origin_shape, var.name, new_shape
+                )
+            )
 
 
 def load_from_file(path):
@@ -801,7 +826,7 @@ def load_inference_model(path_prefix, executor, **kwargs):
     """
     # check kwargs
     supported_args = ('model_filename', 'params_filename')
-    deprecated_args = ('pserver_endpoints', )
+    deprecated_args = ('pserver_endpoints',)
     caller = inspect.currentframe().f_code.co_name
     _check_args(caller, kwargs, supported_args, deprecated_args)
 
@@ -812,7 +837,8 @@ def load_inference_model(path_prefix, executor, **kwargs):
         params_filename = kwargs.get('params_filename', None)
         if params_filename is None:
             raise ValueError(
-                "params_filename cannot be None when path_prefix is None.")
+                "params_filename cannot be None when path_prefix is None."
+            )
         load_dirname = ''
         program_bytes = model_filename
         params_bytes = params_filename
@@ -835,21 +861,26 @@ def load_inference_model(path_prefix, executor, **kwargs):
             if model_filename is None:
                 model_path = os.path.join(path_prefix, "__model__")
             else:
-                model_path = os.path.join(path_prefix,
-                                          model_filename + ".pdmodel")
+                model_path = os.path.join(
+                    path_prefix, model_filename + ".pdmodel"
+                )
                 if not os.path.exists(model_path):
                     model_path = os.path.join(path_prefix, model_filename)
             # set params_path
             if params_filename is None:
                 params_path = os.path.join(path_prefix, "")
             else:
-                params_path = os.path.join(path_prefix,
-                                           params_filename + ".pdiparams")
+                params_path = os.path.join(
+                    path_prefix, params_filename + ".pdiparams"
+                )
                 if not os.path.exists(params_path):
                     params_path = os.path.join(path_prefix, params_filename)
-            _logger.warning("The old way to load inference model is deprecated."
-                            " model path: {}, params path: {}".format(
-                                model_path, params_path))
+            _logger.warning(
+                "The old way to load inference model is deprecated."
+                " model path: {}, params path: {}".format(
+                    model_path, params_path
+                )
+            )
         program_bytes = load_from_file(model_path)
         load_dirname = os.path.dirname(params_path)
         params_filename = os.path.basename(params_path)
diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py
index b8133872aa9..05a06596c71 100755
--- a/python/paddle/static/nn/common.py
+++ b/python/paddle/static/nn/common.py
@@ -19,13 +19,15 @@ __all__ = []
 
 
 @static_only
-def fc(x,
-       size,
-       num_flatten_dims=1,
-       weight_attr=None,
-       bias_attr=None,
-       activation=None,
-       name=None):
+def fc(
+    x,
+    size,
+    num_flatten_dims=1,
+    weight_attr=None,
+    bias_attr=None,
+    activation=None,
+    name=None,
+):
     r"""
 
     Fully-Connected layer can take a tensor or a list of tensor as its inputs.
@@ -107,12 +109,12 @@ def fc(x,
             The default value is None, and the weight will be initialized to zero.
             For detailed information, please refer to :attr:`paddle.ParamAttr`.
             Warning, if x is a list of tensor, weight_attr should also be a list of same length.
-        bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias. 
+        bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias.
             If it is set to False, no bias will be added to the output.
             If it is set to None or one kind of ParamAttr, a bias parameter will
             be created according to ParamAttr. For detailed information, please refer
             to :attr:`paddle.ParamAttr`. The default value is None and the bias will be
-            initialized to zero. 
+            initialized to zero.
         activation (str, optional): Activation to be applied to the output of
             this layer, such as tanh, softmax, sigmoid, relu. For more information,
             please refer to :ref:`api_guide_activations_en` . Default: None.
@@ -157,30 +159,34 @@ def fc(x,
               bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0)))
           # out: [[1.8 1.8]]
     """
-    return paddle.fluid.layers.fc(input=x,
-                                  size=size,
-                                  num_flatten_dims=num_flatten_dims,
-                                  param_attr=weight_attr,
-                                  bias_attr=bias_attr,
-                                  act=activation,
-                                  name=name)
+    return paddle.fluid.layers.fc(
+        input=x,
+        size=size,
+        num_flatten_dims=num_flatten_dims,
+        param_attr=weight_attr,
+        bias_attr=bias_attr,
+        act=activation,
+        name=name,
+    )
 
 
 @static_only
-def deform_conv2d(x,
-                  offset,
-                  mask,
-                  num_filters,
-                  filter_size,
-                  stride=1,
-                  padding=0,
-                  dilation=1,
-                  groups=1,
-                  deformable_groups=1,
-                  im2col_step=1,
-                  weight_attr=None,
-                  bias_attr=None,
-                  name=None):
+def deform_conv2d(
+    x,
+    offset,
+    mask,
+    num_filters,
+    filter_size,
+    stride=1,
+    padding=0,
+    dilation=1,
+    groups=1,
+    deformable_groups=1,
+    im2col_step=1,
+    weight_attr=None,
+    bias_attr=None,
+    name=None,
+):
     r"""
 
     Compute 2-D deformable convolution on 4-D input.
@@ -323,7 +329,8 @@ def deform_conv2d(x,
             param_attr=weight_attr,
             bias_attr=bias_attr,
             modulated=False,
-            name=name)
+            name=name,
+        )
     else:
         return paddle.fluid.layers.deformable_conv(
             input=x,
@@ -340,4 +347,5 @@ def deform_conv2d(x,
             param_attr=weight_attr,
             bias_attr=bias_attr,
             modulated=True,
-            name=name)
+            name=name,
+        )
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 540c03e36ba..dc1b268cb88 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -24,12 +24,25 @@ from ..framework import dygraph_only
 from ..framework import core
 from ..framework import in_dygraph_mode, _non_static_mode
 from ..framework import LayerHelper
-from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
-from ..framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder
+from ..fluid.data_feeder import (
+    check_variable_and_dtype,
+    check_type,
+    check_dtype,
+    convert_dtype,
+)
+from ..framework import (
+    convert_np_dtype_to_dtype_,
+    _varbase_creator,
+    OpProtoHolder,
+)
+
 # TODO: define functions to get create a tensor
 import paddle
 from paddle import _C_ops, _legacy_C_ops
-from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check
+from ..fluid.framework import (
+    _in_legacy_dygraph,
+    _in_eager_without_dygraph_check,
+)
 import warnings
 
 __all__ = []
@@ -100,11 +113,17 @@ def linspace(start, stop, num, dtype=None, name=None):
         with device_guard("cpu"):
             tensor_num = fill_constant([1], 'int32', num, force_cpu=True)
     if in_dygraph_mode():
-        return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, dtype,
-                               _current_expected_place())
+        return _C_ops.linspace(
+            tensor_start,
+            tensor_stop,
+            tensor_num,
+            dtype,
+            _current_expected_place(),
+        )
     if _in_legacy_dygraph():
-        return _legacy_C_ops.linspace(tensor_start, tensor_stop, tensor_num,
-                                      'dtype', dtype)
+        return _legacy_C_ops.linspace(
+            tensor_start, tensor_stop, tensor_num, 'dtype', dtype
+        )
 
     helper = LayerHelper("linspace", **locals())
 
@@ -112,41 +131,53 @@ def linspace(start, stop, num, dtype=None, name=None):
     stop_dtype = convert_dtype(tensor_stop.dtype)
     out_dtype = convert_dtype(dtype)
     if isinstance(start, Variable):
-        check_dtype(start.dtype, 'start',
-                    ['float32', 'float64', 'int32', 'int64'], 'linspace')
+        check_dtype(
+            start.dtype,
+            'start',
+            ['float32', 'float64', 'int32', 'int64'],
+            'linspace',
+        )
     else:
         check_type(start, 'start', (int, float), 'linspace')
 
     if isinstance(stop, Variable):
-        check_dtype(stop.dtype, 'stop',
-                    ['float32', 'float64', 'int32', 'int64'], 'linspace')
+        check_dtype(
+            stop.dtype,
+            'stop',
+            ['float32', 'float64', 'int32', 'int64'],
+            'linspace',
+        )
     else:
         check_type(stop, 'stop', (int, float), 'linspace')
     if isinstance(num, Variable):
         check_dtype(num.dtype, 'num', ['int32'], 'linspace')
-    check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'],
-                'linspace')
-    if ((stop_dtype == "float64" or start_dtype == "float64")
-            and out_dtype in ["float32", "int32"]) or (
-                (stop_dtype == "int64" or start_dtype == "int64")
-                and out_dtype == "int32"):
+    check_dtype(
+        dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], 'linspace'
+    )
+    if (
+        (stop_dtype == "float64" or start_dtype == "float64")
+        and out_dtype in ["float32", "int32"]
+    ) or (
+        (stop_dtype == "int64" or start_dtype == "int64")
+        and out_dtype == "int32"
+    ):
         raise ValueError(
             "The dtype of start/stop is {}/{} but the attr(dtype) of linspace is {}, "
-            "which may cause data type overflows. Please reset attr(dtype) of linspace."
-            .format(start_dtype, stop_dtype, dtype))
+            "which may cause data type overflows. Please reset attr(dtype) of linspace.".format(
+                start_dtype, stop_dtype, dtype
+            )
+        )
 
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
-    helper.append_op(type='linspace',
-                     inputs={
-                         'Start': tensor_start,
-                         'Stop': tensor_stop,
-                         'Num': tensor_num
-                     },
-                     attrs={'dtype': dtype},
-                     outputs={'Out': [out]})
+    helper.append_op(
+        type='linspace',
+        inputs={'Start': tensor_start, 'Stop': tensor_stop, 'Num': tensor_num},
+        attrs={'dtype': dtype},
+        outputs={'Out': [out]},
+    )
     if isinstance(num, int):
-        out.desc.set_shape((num, ))
+        out.desc.set_shape((num,))
     return out
 
 
@@ -212,8 +243,9 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None):
         with device_guard("cpu"):
             tensor_base = fill_constant([1], dtype, base)
     if _non_static_mode():
-        return _legacy_C_ops.logspace(tensor_start, tensor_stop, tensor_num,
-                                      tensor_base, 'dtype', dtype)
+        return _legacy_C_ops.logspace(
+            tensor_start, tensor_stop, tensor_num, tensor_base, 'dtype', dtype
+        )
 
     helper = LayerHelper("logspace", **locals())
 
@@ -222,14 +254,22 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None):
     base_dtype = convert_dtype(tensor_base.dtype)
     out_dtype = convert_dtype(dtype)
     if isinstance(start, Variable):
-        check_dtype(start.dtype, 'start',
-                    ['float32', 'float64', 'int32', 'int64'], 'logspace')
+        check_dtype(
+            start.dtype,
+            'start',
+            ['float32', 'float64', 'int32', 'int64'],
+            'logspace',
+        )
     else:
         check_type(start, 'start', (int, float), 'logspace')
 
     if isinstance(stop, Variable):
-        check_dtype(stop.dtype, 'stop',
-                    ['float32', 'float64', 'int32', 'int64'], 'logspace')
+        check_dtype(
+            stop.dtype,
+            'stop',
+            ['float32', 'float64', 'int32', 'int64'],
+            'logspace',
+        )
     else:
         check_type(stop, 'stop', (int, float), 'logspace')
 
@@ -237,37 +277,55 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None):
         check_dtype(num.dtype, 'num', ['int32'], 'logspace')
 
     if isinstance(base, Variable):
-        check_dtype(base.dtype, 'base',
-                    ['float32', 'float64', 'int32', 'int64'], 'logspace')
+        check_dtype(
+            base.dtype,
+            'base',
+            ['float32', 'float64', 'int32', 'int64'],
+            'logspace',
+        )
     else:
         check_type(base, 'base', (int, float), 'logspace')
 
-    check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'],
-                'logspace')
-    if ((stop_dtype == "float64" or start_dtype == "float64"
-                                 or base_dtype == "float64")
-                                 and out_dtype in ["float32", "int32"]) or \
-       ((stop_dtype == "int64" or start_dtype == "int64"
-                               or base_dtype == "int64")
-                               and out_dtype == "int32"):
+    check_dtype(
+        dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], 'logspace'
+    )
+    if (
+        (
+            stop_dtype == "float64"
+            or start_dtype == "float64"
+            or base_dtype == "float64"
+        )
+        and out_dtype in ["float32", "int32"]
+    ) or (
+        (
+            stop_dtype == "int64"
+            or start_dtype == "int64"
+            or base_dtype == "int64"
+        )
+        and out_dtype == "int32"
+    ):
         raise ValueError(
             "The dtype of start/stop/base is {}/{}/{} but the attr(dtype) of logspace is {}, "
-            "which may cause data type overflows. Please reset attr(dtype) of logspace."
-            .format(start_dtype, stop_dtype, base_dtype, dtype))
+            "which may cause data type overflows. Please reset attr(dtype) of logspace.".format(
+                start_dtype, stop_dtype, base_dtype, dtype
+            )
+        )
 
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
-    helper.append_op(type='logspace',
-                     inputs={
-                         'Start': tensor_start,
-                         'Stop': tensor_stop,
-                         'Num': tensor_num,
-                         'Base': tensor_base
-                     },
-                     attrs={'dtype': dtype},
-                     outputs={'Out': [out]})
+    helper.append_op(
+        type='logspace',
+        inputs={
+            'Start': tensor_start,
+            'Stop': tensor_stop,
+            'Num': tensor_num,
+            'Base': tensor_base,
+        },
+        attrs={'dtype': dtype},
+        outputs={'Out': [out]},
+    )
     if isinstance(num, int):
-        out.desc.set_shape((num, ))
+        out.desc.set_shape((num,))
     return out
 
 
@@ -315,17 +373,25 @@ def _to_tensor_non_static(data, dtype=None, place=None, stop_gradient=True):
             return data
         else:
             raise TypeError(
-                "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|np.ndarray|paddle.Tensor"
-                .format(type(data)))
+                "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|np.ndarray|paddle.Tensor".format(
+                    type(data)
+                )
+            )
         if not dtype:
             if data.dtype in [
-                    'float16', 'float32', 'float64', 'complex64', 'complex128'
+                'float16',
+                'float32',
+                'float64',
+                'complex64',
+                'complex128',
             ]:
                 default_type = paddle.get_default_dtype()
                 if np.iscomplexobj(data):
-                    default_type = 'complex64' if default_type in [
-                        'float16', 'float32'
-                    ] else 'complex128'
+                    default_type = (
+                        'complex64'
+                        if default_type in ['float16', 'float32']
+                        else 'complex128'
+                    )
                 data = data.astype(default_type)
             # Windows default type is 'int32', while Linux/Mac is 'int64'. Unify they.
             if data.dtype in ['int32']:
@@ -336,18 +402,22 @@ def _to_tensor_non_static(data, dtype=None, place=None, stop_gradient=True):
         data = data.astype(convert_dtype(dtype))
 
     if _in_eager_without_dygraph_check() and isinstance(data, np.ndarray):
-        return core.eager.Tensor(value=data,
-                                 place=place,
-                                 persistable=False,
-                                 zero_copy=False,
-                                 name=None,
-                                 stop_gradient=stop_gradient)
+        return core.eager.Tensor(
+            value=data,
+            place=place,
+            persistable=False,
+            zero_copy=False,
+            name=None,
+            stop_gradient=stop_gradient,
+        )
     else:
-        return paddle.Tensor(value=data,
-                             place=place,
-                             persistable=False,
-                             zero_copy=False,
-                             stop_gradient=stop_gradient)
+        return paddle.Tensor(
+            value=data,
+            place=place,
+            persistable=False,
+            zero_copy=False,
+            stop_gradient=stop_gradient,
+        )
 
 
 def _to_tensor_static(data, dtype=None, stop_gradient=None):
@@ -362,8 +432,11 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None):
             elif isinstance(data, (list, tuple)):
                 data = np.array(data)
 
-            if isinstance(data,
-                          np.ndarray) and not dtype and data.dtype != 'object':
+            if (
+                isinstance(data, np.ndarray)
+                and not dtype
+                and data.dtype != 'object'
+            ):
                 if data.dtype in ['float16', 'float32', 'float64']:
                     data = data.astype(paddle.get_default_dtype())
                 elif data.dtype in ['int32']:
@@ -378,10 +451,14 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None):
 
         target_dtype = convert_dtype(target_dtype)
 
-        if isinstance(data, np.ndarray) and len(data.shape) > 0 and any(
-                isinstance(x, Variable) for x in data):
+        if (
+            isinstance(data, np.ndarray)
+            and len(data.shape) > 0
+            and any(isinstance(x, Variable) for x in data)
+        ):
             if not all(
-                [x.shape == (1, ) for x in data if isinstance(x, Variable)]):
+                [x.shape == (1,) for x in data if isinstance(x, Variable)]
+            ):
                 raise TypeError(
                     "Unsupport paddle.to_tensor([Variable, Variable...]) with non-scalar variable."
                 )
@@ -405,7 +482,7 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None):
 
 def to_tensor(data, dtype=None, place=None, stop_gradient=True):
     r"""
-    Constructs a ``paddle.Tensor`` from ``data`` , 
+    Constructs a ``paddle.Tensor`` from ``data`` ,
     which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor.
 
     If the ``data`` is already a Tensor, copy will be performed and return a new tensor.
@@ -414,13 +491,13 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
     Args:
         data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor.
             Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor.
-        dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 
+        dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' ,
             'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8',
-            'complex64' , 'complex128'. Default: None, infers dtype from ``data`` 
+            'complex64' , 'complex128'. Default: None, infers dtype from ``data``
             except for python float number which gets dtype from ``get_default_type`` .
-        place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be  
-            CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is 
-            string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. 
+        place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be
+            CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is
+            string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs.
         stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True.
 
     Returns:
@@ -431,7 +508,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
     .. code-block:: python
 
         import paddle
-                
+
         type(paddle.to_tensor(1))
         # <class 'paddle.Tensor'>
 
@@ -446,7 +523,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
 
         paddle.to_tensor(x)  # A new tensor will be created with default stop_gradient=True
         # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=True,
-        #        [1])        
+        #        [1])
 
         paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CPUPlace(), stop_gradient=False)
         # Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=False,
@@ -487,18 +564,18 @@ def full_like(x, fill_value, dtype=None, name=None):
         x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64.
         fill_value(bool|float|int): The value to fill the tensor with. Note: this value shouldn't exceed the range of the output data type.
         dtype(np.dtype|str, optional): The data type of output. The data type can be one
-            of bool, float16, float32, float64, int32, int64. The default value is None, which means the output 
+            of bool, float16, float32, float64, int32, int64. The default value is None, which means the output
             data type is the same as input.
         name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
-    
+
     Returns:
         Tensor: Tensor which is created according to ``x``, ``fill_value`` and ``dtype``.
-    
+
     Examples:
         .. code-block:: python
 
           import paddle
-          
+
           input = paddle.full(shape=[2, 3], fill_value=0.0, dtype='float32', name='input')
           output = paddle.full_like(input, 2.0)
           # [[2. 2. 2.]
@@ -515,27 +592,31 @@ def full_like(x, fill_value, dtype=None, name=None):
         return _C_ops.full_like(x, fill_value, dtype, x.place)
 
     if _in_legacy_dygraph():
-        return _legacy_C_ops.fill_any_like(x, 'value', fill_value, 'dtype',
-                                           dtype)
+        return _legacy_C_ops.fill_any_like(
+            x, 'value', fill_value, 'dtype', dtype
+        )
 
     helper = LayerHelper("full_like", **locals())
     check_variable_and_dtype(
-        x, 'x',
+        x,
+        'x',
         ['bool', 'float16', 'float32', 'float64', 'int16', 'int32', 'int64'],
-        'full_like')
+        'full_like',
+    )
     check_dtype(
-        dtype, 'dtype',
+        dtype,
+        'dtype',
         ['bool', 'float16', 'float32', 'float64', 'int16', 'int32', 'int64'],
-        'full_like/zeros_like/ones_like')
+        'full_like/zeros_like/ones_like',
+    )
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
-    helper.append_op(type='fill_any_like',
-                     inputs={'X': [x]},
-                     attrs={
-                         'value': fill_value,
-                         "dtype": dtype
-                     },
-                     outputs={'Out': [out]})
+    helper.append_op(
+        type='fill_any_like',
+        inputs={'X': [x]},
+        attrs={'value': fill_value, "dtype": dtype},
+        outputs={'Out': [out]},
+    )
     out.stop_gradient = True
     return out
 
@@ -549,28 +630,28 @@ def ones(shape, dtype=None, name=None):
         dtype (np.dtype|str, optional): Data type of output Tensor, it should be one of
             bool, float16, float32, float64, int32 and int64. If it is set to None, the data type will be float32.
         name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
-    
+
     Returns:
         Tensor: A Tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements are 1.
 
     Examples:
         .. code-block:: python
 
-            import paddle 
+            import paddle
 
             # default dtype for ones OP
-            data1 = paddle.ones(shape=[3, 2]) 
+            data1 = paddle.ones(shape=[3, 2])
             # [[1. 1.]
             #  [1. 1.]
             #  [1. 1.]]
 
-            data2 = paddle.ones(shape=[2, 2], dtype='int32') 
+            data2 = paddle.ones(shape=[2, 2], dtype='int32')
             # [[1 1]
             #  [1 1]]
 
             # shape is a Tensor
             shape = paddle.full(shape=[2], dtype='int32', fill_value=2)
-            data3 = paddle.ones(shape=shape, dtype='int32') 
+            data3 = paddle.ones(shape=shape, dtype='int32')
             # [[1 1]
             #  [1 1]]
     """
@@ -628,18 +709,18 @@ def zeros(shape, dtype=None, name=None):
         .. code-block:: python
 
           import paddle
-          
-          data = paddle.zeros(shape=[3, 2], dtype='float32') 
+
+          data = paddle.zeros(shape=[3, 2], dtype='float32')
           # [[0. 0.]
           #  [0. 0.]
           #  [0. 0.]]
-          data = paddle.zeros(shape=[2, 2]) 
+          data = paddle.zeros(shape=[2, 2])
           # [[0. 0.]
           #  [0. 0.]]
-          
+
           # shape is a Tensor
           shape = paddle.full(shape=[2], dtype='int32', fill_value=2)
-          data3 = paddle.zeros(shape=shape, dtype='int32') 
+          data3 = paddle.zeros(shape=shape, dtype='int32')
           # [[0 0]
           #  [0 0]]
     """
@@ -682,7 +763,7 @@ def zeros_like(x, dtype=None, name=None):
 
 def eye(num_rows, num_columns=None, dtype=None, name=None):
     """
-    
+
     This function constructs 2-D Tensor with ones on the diagonal and zeros elsewhere.
 
     Args:
@@ -699,7 +780,7 @@ def eye(num_rows, num_columns=None, dtype=None, name=None):
 
     Examples:
         .. code-block:: python
-          
+
           import paddle
 
           data = paddle.eye(3, dtype='int32')
@@ -730,26 +811,34 @@ def eye(num_rows, num_columns=None, dtype=None, name=None):
 
     if _non_static_mode():
         if in_dygraph_mode():
-            out = _C_ops.eye(num_rows, num_columns, dtype,
-                             _current_expected_place())
+            out = _C_ops.eye(
+                num_rows, num_columns, dtype, _current_expected_place()
+            )
         elif _in_legacy_dygraph():
-            out = _legacy_C_ops.eye('dtype', dtype, 'num_rows', num_rows,
-                                    'num_columns', num_columns)
+            out = _legacy_C_ops.eye(
+                'dtype', dtype, 'num_rows', num_rows, 'num_columns', num_columns
+            )
 
     else:
         helper = LayerHelper("eye", **locals())
-        check_dtype(dtype, 'dtype',
-                    ['float16', 'float32', 'float64', 'int32', 'int64'], 'eye')
+        check_dtype(
+            dtype,
+            'dtype',
+            ['float16', 'float32', 'float64', 'int32', 'int64'],
+            'eye',
+        )
         out = helper.create_variable_for_type_inference(dtype=dtype)
-        helper.append_op(type='eye',
-                         inputs={},
-                         outputs={'Out': [out]},
-                         attrs={
-                             'num_rows': num_rows,
-                             'num_columns': num_columns,
-                             'dtype': dtype
-                         },
-                         stop_gradient=True)
+        helper.append_op(
+            type='eye',
+            inputs={},
+            outputs={'Out': [out]},
+            attrs={
+                'num_rows': num_rows,
+                'num_columns': num_columns,
+                'dtype': dtype,
+            },
+            stop_gradient=True,
+        )
 
     out.stop_gradient = True
     return out
@@ -759,7 +848,7 @@ def full(shape, fill_value, dtype=None, name=None):
     """
 
     Return a Tensor with the ``fill_value`` which size is same as ``shape``.
-    
+
     Args:
         shape(list|tuple|Tensor): Shape of the Tensor to be created.
                 The data type is ``int32`` or ``int64`` . If ``shape`` is a list or tuple,
@@ -771,7 +860,7 @@ def full(shape, fill_value, dtype=None, name=None):
             which can be float16, float32, float64, int32, int64, if dytpe is `None`, the data
             type of created Tensor is `float32`.
         name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
-    
+
     Returns:
         Tensor: Tensor which is created according to ``shape``, ``fill_value`` and ``dtype``.
 
@@ -780,7 +869,7 @@ def full(shape, fill_value, dtype=None, name=None):
 
             import paddle
 
-            data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') 
+            data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64')
             #[[0]
             # [0]]
 
@@ -791,14 +880,14 @@ def full(shape, fill_value, dtype=None, name=None):
 
             # attr shape is a Tensor.
             shape = paddle.full([2], 2, "int32")
-            data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) 
-            # [[True True] 
+            data4 = paddle.full(shape=shape, dtype='bool', fill_value=True)
+            # [[True True]
             #  [True True]]
-            
+
             # attr fill_value is a Tensor.
             val = paddle.full([1], 2.0, "float32")
             data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32')
-            # [[2.0] 
+            # [[2.0]
             #  [2.0]]
     """
 
@@ -836,7 +925,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
             If ``dytpe`` is None, the data type is float32. Default is None.
         name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
-    Returns: 
+    Returns:
         Tensor: A 1-D Tensor with values from the interval [``start``, ``end``)
         taken with common difference ``step`` beginning from ``start``. Its
         data type is set by ``dtype``.
@@ -859,7 +948,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
             start_var = paddle.to_tensor([3])
             out4 = paddle.arange(start_var, 7)
             # [3, 4, 5, 6]
-             
+
     """
     if dtype is None:
         dtype = 'int64'
@@ -868,8 +957,11 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
         start = 0
 
     out_shape = None
-    if not isinstance(start, Variable) and not isinstance(
-            end, Variable) and not isinstance(step, Variable):
+    if (
+        not isinstance(start, Variable)
+        and not isinstance(end, Variable)
+        and not isinstance(step, Variable)
+    ):
         out_shape = [int(math.ceil((end - start) / step))]
 
     if not isinstance(dtype, core.VarDesc.VarType):
@@ -901,17 +993,16 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
         out.stop_gradient = True
         return out
 
-    check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'],
-                'range/arange')
+    check_dtype(
+        dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'range/arange'
+    )
     helper = LayerHelper('range', **locals())
     out = helper.create_variable_for_type_inference(dtype, shape=out_shape)
-    helper.append_op(type='range',
-                     inputs={
-                         'Start': start,
-                         'End': end,
-                         'Step': step
-                     },
-                     outputs={'Out': out})
+    helper.append_op(
+        type='range',
+        inputs={'Start': start, 'End': end, 'Step': step},
+        outputs={'Out': out},
+    )
     out.stop_gradient = True
     if out_shape is not None:
         out.desc.set_shape(out_shape)
@@ -919,28 +1010,30 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
 
 
 def _tril_triu_op(helper):
-    """Base op of tril_op and triu_op
-    """
+    """Base op of tril_op and triu_op"""
     op_type = helper.layer_type
     x = helper.kwargs.get('x', None)
 
     assert x is not None, 'x cannot be None in {}'.format(op_type)
     check_variable_and_dtype(
-        x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'],
-        op_type)
+        x,
+        'x',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'],
+        op_type,
+    )
     if len(x.shape) < 2:
         raise ValueError("x shape in {} must be at least 2-D".format(op_type))
     diagonal = helper.kwargs.get('diagonal', 0)
-    if not isinstance(diagonal, (int, )):
+    if not isinstance(diagonal, (int,)):
         raise TypeError("diagonal in {} must be a python Int".format(op_type))
     name = helper.kwargs.get('name', None)
 
     if name is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
-        out = helper.create_variable(name=name,
-                                     dtype=x.dtype,
-                                     persistable=False)
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False
+        )
 
     helper.append_op(
         type="tril_triu",
@@ -958,8 +1051,8 @@ def _tril_triu_op(helper):
 def tril(x, diagonal=0, name=None):
     r"""
     Returns the lower triangular part of a matrix (2-D tensor) or batch
-    of matrices :attr:`x`, the other elements of the result tensor are set 
-    to 0. The lower triangular part of the matrix is defined as the elements 
+    of matrices :attr:`x`, the other elements of the result tensor are set
+    to 0. The lower triangular part of the matrix is defined as the elements
     on and below the diagonal.
 
     Args:
@@ -1087,14 +1180,14 @@ def triu(x, diagonal=0, name=None):
 def meshgrid(*args, **kwargs):
     """
     Takes a list of N tensors as input *args, each of which is 1-dimensional vector, and creates N-dimensional grids.
-    
+
     Args:
-        *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), 
+        *args(Tensor|list of Tensor): Input tensors, passed separately or as a tuple/list of tensors. The shapes of the k input tensors are (N1,),
             (N2,),..., (Nk,). Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
-        **kwargs (optional): Currently, only accept name in **kwargs 
+        **kwargs (optional): Currently, only ``name`` is accepted in **kwargs.
             The default value is None. Normally there is no need for
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
- 
+
     Returns:
          Tensor: k tensors. The shape of each tensor is (N1, N2, ..., Nk)
 
@@ -1132,18 +1225,21 @@ def meshgrid(*args, **kwargs):
         raise TypeError("The type of input args in meshgrid should be list.")
 
     for id, input_ in enumerate(args):
-        check_dtype(input_.dtype, 'create data type',
-                    ['float16', 'float32', 'float64', 'int32', 'int64'],
-                    'meshgrid')
+        check_dtype(
+            input_.dtype,
+            'create data type',
+            ['float16', 'float32', 'float64', 'int32', 'int64'],
+            'meshgrid',
+        )
 
     num = len(args)
     out = [
         helper.create_variable_for_type_inference(dtype=args[i].dtype)
         for i in range(num)
     ]
-    helper.append_op(type='meshgrid',
-                     inputs={'X': list(args)},
-                     outputs={'Out': out})
+    helper.append_op(
+        type='meshgrid', inputs={'X': list(args)}, outputs={'Out': out}
+    )
 
     return out
 
@@ -1237,17 +1333,21 @@ def diagflat(x, offset=0, name=None):
 
     if _in_legacy_dygraph():
         if len(x.shape) == 1:
-            return _legacy_C_ops.diag_v2(x, "offset", offset, "padding_value",
-                                         padding_value)
+            return _legacy_C_ops.diag_v2(
+                x, "offset", offset, "padding_value", padding_value
+            )
         else:
             y, _ = _legacy_C_ops.flatten_contiguous_range(
-                x, "start_axis", 0, "stop_axis", -1)
-            return _legacy_C_ops.diag_v2(y, "offset", offset, "padding_value",
-                                         padding_value)
+                x, "start_axis", 0, "stop_axis", -1
+            )
+            return _legacy_C_ops.diag_v2(
+                y, "offset", offset, "padding_value", padding_value
+            )
 
     check_type(x, 'x', (Variable), 'diagflat')
-    check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'],
-                'diagflat')
+    check_dtype(
+        x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'], 'diagflat'
+    )
     check_type(offset, 'offset', (int), 'diagflat')
 
     helper = LayerHelper("diagflat", **locals())
@@ -1256,33 +1356,27 @@ def diagflat(x, offset=0, name=None):
     out2 = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     if len(x.shape) == 1:
-        helper.append_op(type='diag_v2',
-                         inputs={'X': x},
-                         outputs={'Out': out2},
-                         attrs={
-                             'offset': offset,
-                             'padding_value': padding_value
-                         })
+        helper.append_op(
+            type='diag_v2',
+            inputs={'X': x},
+            outputs={'Out': out2},
+            attrs={'offset': offset, 'padding_value': padding_value},
+        )
     else:
-        helper.append_op(type='flatten_contiguous_range',
-                         inputs={'X': x},
-                         outputs={
-                             'Out': out1,
-                             'XShape': out1_shape
-                         },
-                         attrs={
-                             'start_axis': 0,
-                             'stop_axis': -1
-                         })
+        helper.append_op(
+            type='flatten_contiguous_range',
+            inputs={'X': x},
+            outputs={'Out': out1, 'XShape': out1_shape},
+            attrs={'start_axis': 0, 'stop_axis': -1},
+        )
         out1.stop_gradient = True
 
-        helper.append_op(type='diag_v2',
-                         inputs={'X': out1},
-                         outputs={'Out': out2},
-                         attrs={
-                             'offset': offset,
-                             'padding_value': padding_value
-                         })
+        helper.append_op(
+            type='diag_v2',
+            inputs={'X': out1},
+            outputs={'Out': out2},
+            attrs={'offset': offset, 'padding_value': padding_value},
+        )
     out2.stop_gradient = True
     return out2
 
@@ -1306,7 +1400,7 @@ def diag(x, offset=0, padding_value=0, name=None):
         offset (int, optional): The diagonal offset. A positive value represents superdiagonal, 0 represents the main diagonal, and a negative value represents subdiagonal.
         padding_value (int|float, optional): Use this value to fill the area outside the specified diagonal band. Only takes effect when the input is a 1-D Tensor. The default value is 0.
         name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
-        
+
     Returns:
         Tensor, a square matrix or a vector. The output data type is the same as input data type.
 
@@ -1360,30 +1454,36 @@ def diag(x, offset=0, padding_value=0, name=None):
         return _C_ops.diag(x, offset, padding_value)
     else:
         if _in_legacy_dygraph():
-            return _legacy_C_ops.diag_v2(x, "offset", offset, "padding_value",
-                                         padding_value)
+            return _legacy_C_ops.diag_v2(
+                x, "offset", offset, "padding_value", padding_value
+            )
         else:
             check_type(x, 'x', (Variable), 'diag_v2')
-            check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'],
-                        'diag_v2')
+            check_dtype(
+                x.dtype,
+                'x',
+                ['float32', 'float64', 'int32', 'int64'],
+                'diag_v2',
+            )
             check_type(offset, 'offset', (int), 'diag_v2')
             check_type(padding_value, 'padding_value', (int, float), 'diag_v2')
             if len(x.shape) != 1 and len(x.shape) != 2:
                 raise ValueError(
-                    "The dimension of input x must be either 1 or 2, but received {}"
-                    .format(len(x.shape)))
+                    "The dimension of input x must be either 1 or 2, but received {}".format(
+                        len(x.shape)
+                    )
+                )
 
             helper = LayerHelper("diag_v2", **locals())
 
             out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-            helper.append_op(type='diag_v2',
-                             inputs={'X': x},
-                             outputs={'Out': out},
-                             attrs={
-                                 'offset': offset,
-                                 'padding_value': padding_value
-                             })
+            helper.append_op(
+                type='diag_v2',
+                inputs={'X': x},
+                outputs={'Out': out},
+                attrs={'offset': offset, 'padding_value': padding_value},
+            )
 
             out.stop_gradient = True
             return out
@@ -1392,7 +1492,7 @@ def diag(x, offset=0, padding_value=0, name=None):
 def empty(shape, dtype=None, name=None):
     """
     Returns a Tensor with uninitialized data which size is same as ``shape``.
-    
+
     Args:
         shape(list|tuple|Tensor): Shape of the Tensor to be created.
                 The data type of dimension of shape is ``int32`` or ``int64`` . If ``shape`` is a list or tuple,
@@ -1403,7 +1503,7 @@ def empty(shape, dtype=None, name=None):
             type of created Tensor use global default dtype (see ``get_default_dtype``
             for details).
         name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
-    
+
     Returns:
         Tensor: Tensor which is created according to ``shape`` and ``dtype``, and is uninitialized.
 
@@ -1445,42 +1545,48 @@ def empty(shape, dtype=None, name=None):
 
     if in_dygraph_mode():
         shape = utils.convert_shape_to_list(shape)
-        out = _C_ops.empty(shape, convert_np_dtype_to_dtype_(dtype),
-                           _current_expected_place())
+        out = _C_ops.empty(
+            shape, convert_np_dtype_to_dtype_(dtype), _current_expected_place()
+        )
         out.stop_gradient = True
         return out
 
     if _in_legacy_dygraph():
         shape = utils.convert_shape_to_list(shape)
-        out = _legacy_C_ops.empty('shape', shape, 'dtype',
-                                  convert_np_dtype_to_dtype_(dtype))
+        out = _legacy_C_ops.empty(
+            'shape', shape, 'dtype', convert_np_dtype_to_dtype_(dtype)
+        )
         out.stop_gradient = True
         return out
 
     helper = LayerHelper("empty", **locals())
     inputs = {}
 
-    check_dtype(dtype, 'dtype',
-                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-                'empty')
+    check_dtype(
+        dtype,
+        'dtype',
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        'empty',
+    )
     check_type(shape, 'shape', (Variable, list, tuple), 'empty')
 
     if isinstance(shape, Variable):
         check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'empty')
 
     attrs = {}
-    utils.get_shape_tensor_inputs(inputs=inputs,
-                                  attrs=attrs,
-                                  shape=shape,
-                                  op_type='empty')
+    utils.get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='empty'
+    )
 
     out = helper.create_variable_for_type_inference(dtype=dtype)
     attrs['dtype'] = convert_np_dtype_to_dtype_(dtype)
-    helper.append_op(type='empty',
-                     inputs=inputs,
-                     outputs={'Out': [out]},
-                     attrs=attrs,
-                     stop_gradient=True)
+    helper.append_op(
+        type='empty',
+        inputs=inputs,
+        outputs={'Out': [out]},
+        attrs=attrs,
+        stop_gradient=True,
+    )
     out.stop_gradient = True
     return out
 
@@ -1489,14 +1595,14 @@ def empty_like(x, dtype=None, name=None):
     """
     Returns a Tensor with uninitialized data which has identical shape of ``x`` and ``dtype``.
     If the ``dtype`` is None, the data type of Tensor is same with ``x``.
-    
+
     Args:
         x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64.
         dtype(np.dtype|str, optional): The data type of output. The data type can be one
-            of bool, float16, float32, float64, int32, int64. The default value is None, which means the output 
+            of bool, float16, float32, float64, int32, int64. The default value is None, which means the output
             data type is the same as input.
         name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
-    
+
     Returns:
         Tensor: Tensor which is created according to ``x`` and ``dtype``, and is uninitialized.
 
@@ -1518,40 +1624,51 @@ def empty_like(x, dtype=None, name=None):
     dtype = convert_dtype(dtype)
 
     if in_dygraph_mode():
-        out = _C_ops.empty(x.shape, convert_np_dtype_to_dtype_(dtype),
-                           _current_expected_place())
+        out = _C_ops.empty(
+            x.shape,
+            convert_np_dtype_to_dtype_(dtype),
+            _current_expected_place(),
+        )
         out.stop_gradient = True
         return out
 
     if _in_legacy_dygraph():
-        out = _legacy_C_ops.empty('shape', x.shape, 'dtype',
-                                  convert_np_dtype_to_dtype_(dtype))
+        out = _legacy_C_ops.empty(
+            'shape', x.shape, 'dtype', convert_np_dtype_to_dtype_(dtype)
+        )
         out.stop_gradient = True
         return out
 
     helper = LayerHelper("empty_like", **locals())
     check_variable_and_dtype(
-        x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-        'empty_like')
-    check_dtype(dtype, 'dtype',
-                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-                'empty_like')
+        x,
+        'x',
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        'empty_like',
+    )
+    check_dtype(
+        dtype,
+        'dtype',
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        'empty_like',
+    )
     out = helper.create_variable_for_type_inference(dtype=dtype)
 
     inputs = {}
     attrs = {}
     attrs['dtype'] = convert_np_dtype_to_dtype_(dtype)
     shape = paddle.shape(x)
-    utils.get_shape_tensor_inputs(inputs=inputs,
-                                  attrs=attrs,
-                                  shape=shape,
-                                  op_type='empty_like')
-
-    helper.append_op(type='empty',
-                     inputs=inputs,
-                     outputs={'Out': [out]},
-                     attrs=attrs,
-                     stop_gradient=True)
+    utils.get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='empty_like'
+    )
+
+    helper.append_op(
+        type='empty',
+        inputs=inputs,
+        outputs={'Out': [out]},
+        attrs=attrs,
+        stop_gradient=True,
+    )
     out.stop_gradient = True
     return out
 
@@ -1560,16 +1677,16 @@ def assign(x, output=None):
     """
 
     Copy value of the :attr:`x` to the :attr:`output`.
- 
+
     Parameters:
         x (Tensor|np.ndarray|list|tuple|scalar): A Tensor, numpy ndarray, tuple/list of scalar,
             or scalar. Its data type can be float16, float32, float64, int32, int64 or bool. Note: the float64 data will be converted to float32 because of current platform protobuf
             data limitation.
         output (Tensor, optional): A Tensor. If :attr:`output` is None, a new Tensor will be created as :attr:`output`. Default: None.
- 
+
     Returns:
         Tensor: A Tensor with the same shape, data type and value as :attr:`x`.
- 
+
     Examples:
         .. code-block:: python
 
@@ -1586,8 +1703,12 @@ def assign(x, output=None):
     """
     input = x
     helper = LayerHelper('assign', **locals())
-    check_type(input, 'input',
-               (Variable, np.ndarray, list, tuple, float, int, bool), 'assign')
+    check_type(
+        input,
+        'input',
+        (Variable, np.ndarray, list, tuple, float, int, bool),
+        'assign',
+    )
     is_inplace = True if output is not None else False
 
     if np.isscalar(input) and not isinstance(input, str):
@@ -1610,24 +1731,40 @@ def assign(x, output=None):
                 output = core.VarBase()
             _legacy_C_ops.assign(input, output)
         else:
-            check_dtype(input.dtype, 'input', [
-                'float16', 'uint16', 'float32', 'float64', 'int32', 'int64',
-                'uint8', 'bool'
-            ], 'assign', '(When the type of input in assign is Variable.)')
+            check_dtype(
+                input.dtype,
+                'input',
+                [
+                    'float16',
+                    'uint16',
+                    'float32',
+                    'float64',
+                    'int32',
+                    'int64',
+                    'uint8',
+                    'bool',
+                ],
+                'assign',
+                '(When the type of input in assign is Variable.)',
+            )
             if output is None:
                 output = helper.create_variable_for_type_inference(
-                    dtype=input.dtype)
-            helper.append_op(type='assign',
-                             inputs={'X': [input]},
-                             outputs={'Out': [output]})
+                    dtype=input.dtype
+                )
+            helper.append_op(
+                type='assign', inputs={'X': [input]}, outputs={'Out': [output]}
+            )
     elif isinstance(input, np.ndarray):
         # We now support the form of [var, VAR...] if the Var.shape=[1,]
         if len(input.shape) > 0 and any(isinstance(x, Variable) for x in input):
             # We only deal with the case where the list is nested one level, convert all scalars into variables, and then use stack to process. It is necessary to ensure the consistency of types.
-            if not all([
-                    x.shape == (1, ) for x in input
+            if not all(
+                [
+                    x.shape == (1,)
+                    for x in input
                     if isinstance(x, (Variable, core.eager.Tensor))
-            ]):
+                ]
+            ):
                 raise TypeError(
                     "Unsupport paddle.assign([Variable, Variable...]) with non-scalar variable."
                 )
@@ -1643,8 +1780,7 @@ def assign(x, output=None):
             return ret
 
         if input.dtype == 'object':
-            """ may be this form [[Var], [Var], [3], [4]], we reject them.
-            """
+            """may be this form [[Var], [Var], [3], [4]], we reject them."""
             raise TypeError(
                 "The type of received input == `object`, it is not supported to convert to tensor, such as [[Var], [Var], [3], [4]]"
             )
@@ -1656,7 +1792,8 @@ def assign(x, output=None):
             warnings.warn(
                 "paddle.assign doesn't support float64 input now due "
                 "to current platform protobuf data limitation, we convert "
-                "it to float32")
+                "it to float32"
+            )
             dtype = core.VarDesc.VarType.FP32
         if dtype == core.VarDesc.VarType.BOOL:
             value_name = "bool_values"
@@ -1674,31 +1811,49 @@ def assign(x, output=None):
             raise TypeError(
                 "When the type of 'input' in assign is numpy.ndarray, "
                 "the data type of 'input' must be bool, float32, int32 or int64, but "
-                "received %s." % convert_dtype(dtype))
+                "received %s." % convert_dtype(dtype)
+            )
         if input.size > 1024 * 1024:
-            raise ValueError("The size of input is too big. Please consider "
-                             "saving it to file and 'load_op' to load it")
+            raise ValueError(
+                "The size of input is too big. Please consider "
+                "saving it to file and 'load_op' to load it"
+            )
         if in_dygraph_mode():
             if output is None:
                 output = zeros(list(input.shape), dtype)
-            _C_ops.assign_value_(output, list(input.shape), dtype, values,
-                                 _current_expected_place())
+            _C_ops.assign_value_(
+                output,
+                list(input.shape),
+                dtype,
+                values,
+                _current_expected_place(),
+            )
         elif _in_legacy_dygraph():
             if output is None:
                 output = core.VarBase()
-            _legacy_C_ops.assign_value(output, 'shape', list(input.shape),
-                                       'dtype', dtype, value_name, values)
+            _legacy_C_ops.assign_value(
+                output,
+                'shape',
+                list(input.shape),
+                'dtype',
+                dtype,
+                value_name,
+                values,
+            )
         else:
             if output is None:
                 output = helper.create_variable_for_type_inference(
-                    dtype=input.dtype)
-            helper.append_op(type='assign_value',
-                             outputs={'Out': [output]},
-                             attrs={
-                                 'dtype': dtype,
-                                 'shape': list(input.shape),
-                                 value_name: values
-                             })
+                    dtype=input.dtype
+                )
+            helper.append_op(
+                type='assign_value',
+                outputs={'Out': [output]},
+                attrs={
+                    'dtype': dtype,
+                    'shape': list(input.shape),
+                    value_name: values,
+                },
+            )
 
     if is_inplace and _in_legacy_dygraph():
         output._bump_inplace_version()
@@ -1708,15 +1863,15 @@ def assign(x, output=None):
 
 def clone(x, name=None):
     """
-    Returns a copy of input Tensor. It will always have a Tensor copy. 
-    
+    Returns a copy of input Tensor. It will always have a Tensor copy.
+
     In addition, This function is derivable, so gradients will flow back from the output to input.
 
     Parameters:
         x (Tensor): The input Tensor.
         name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
-    Returns: 
+    Returns:
         Tensor, A Tensor copied from ``input``.
 
     Examples:
@@ -1736,7 +1891,7 @@ def clone(x, name=None):
     return x.clone()
 
 
-#NOTE(zhiqiu): not public
+# NOTE(zhiqiu): not public
 def _memcpy(input, place=None, output=None):
     """
 
@@ -1764,10 +1919,22 @@ def _memcpy(input, place=None, output=None):
     check_type(input, 'input', (Variable), 'memcpy')
 
     if isinstance(input, (Variable, core.VarBase)):
-        check_dtype(input.dtype, 'input', [
-            'float16', 'uint16', 'float32', 'float64', 'int32', 'int64',
-            'uint8', 'bool'
-        ], 'memcpy', '(When the type of input in memcpy is Variable.)')
+        check_dtype(
+            input.dtype,
+            'input',
+            [
+                'float16',
+                'uint16',
+                'float32',
+                'float64',
+                'int32',
+                'int64',
+                'uint8',
+                'bool',
+            ],
+            'memcpy',
+            '(When the type of input in memcpy is Variable.)',
+        )
     if output is None:
         output = helper.create_variable_for_type_inference(dtype=input.dtype)
 
@@ -1789,10 +1956,12 @@ def _memcpy(input, place=None, output=None):
             dst_place_type = 4
 
     attrs = {'dst_place_type': dst_place_type}
-    helper.append_op(type='memcpy',
-                     inputs={'X': [input]},
-                     outputs={'Out': [output]},
-                     attrs=attrs)
+    helper.append_op(
+        type='memcpy',
+        inputs={'X': [input]},
+        outputs={'Out': [output]},
+        attrs=attrs,
+    )
     return output
 
 
@@ -1835,7 +2004,8 @@ def complex(real, imag, name=None):
     helper = LayerHelper(op_type, **locals())
     inputs = {"X": real, "Y": imag}
     out = helper.create_variable_for_type_inference(
-        dtype=_real_to_complex_dtype(real.dtype))
+        dtype=_real_to_complex_dtype(real.dtype)
+    )
     outputs = {"Out": out}
     attrs = {}
     helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs)
@@ -1844,20 +2014,20 @@ def complex(real, imag, name=None):
 
 def tril_indices(row, col, offset=0, dtype='int64'):
     """
-    Return the indices of the lower triangular part of the 2-D matrix 
-    whose row and col is knowed.Indices are ordered based on row and then columns. 
+    Return the indices of the lower triangular part of the 2-D matrix
+    whose row and col are known. Indices are ordered by row and then by column.
     The lower triangular part of the matrix is defined as the elements on
     and below the diagonal.
-    
+
     Args:
         row (int): The input x which is a int number describe the number of row of the matrix.
         col (int): The input x which is a int number describe the number of col of the matrix.
         offset (int, optional): The offset to consider, default value is 0.
 
-            - If offset = 0, all elements on and below the main diagonal are retained.  
-            - If offset > 0, include just as many diagonals above the main diagonal.  
-            - If offset < 0, excludes just as many diagonals below the main diagonal.  
- 
+            - If offset = 0, all elements on and below the main diagonal are retained.
+            - If offset > 0, include just as many diagonals above the main diagonal.
+            - If offset < 0, excludes just as many diagonals below the main diagonal.
+
         dtype (int, optional): the data type of the output tensor, can be int32, int64.
 
     Returns:
@@ -1868,17 +2038,17 @@ def tril_indices(row, col, offset=0, dtype='int64'):
         .. code-block:: python
 
             import paddle
-            
+
             # example 1, default offset value
             data1 = paddle.tril_indices(4,4,0)
             print(data1)
-            # [[0, 1, 1, 2, 2, 2, 3, 3, 3, 3], 
+            # [[0, 1, 1, 2, 2, 2, 3, 3, 3, 3],
             #  [0, 0, 1, 0, 1, 2, 0, 1, 2, 3]]
 
             # example 2, positive offset value
             data2 = paddle.tril_indices(4,4,2)
             print(data2)
-            # [[0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], 
+            # [[0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
             #  [0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]]
 
             # example 3, negative offset value
@@ -1903,13 +2073,15 @@ def tril_indices(row, col, offset=0, dtype='int64'):
         dtype = convert_np_dtype_to_dtype_(dtype)
 
     if in_dygraph_mode():
-        out = _C_ops.tril_indices(row, col, offset, dtype,
-                                  _current_expected_place())
+        out = _C_ops.tril_indices(
+            row, col, offset, dtype, _current_expected_place()
+        )
         return out
 
     if _in_legacy_dygraph():
-        out = _legacy_C_ops.tril_indices('rows', row, 'cols', col, 'offset',
-                                         offset, "dtype", dtype)
+        out = _legacy_C_ops.tril_indices(
+            'rows', row, 'cols', col, 'offset', offset, "dtype", dtype
+        )
         return out
 
     else:
@@ -1917,15 +2089,12 @@ def tril_indices(row, col, offset=0, dtype='int64'):
 
         out = helper.create_variable_for_type_inference(dtype=dtype)
 
-        helper.append_op(type='tril_indices',
-                         inputs={},
-                         outputs={'out': [out]},
-                         attrs={
-                             'rows': row,
-                             'cols': col,
-                             'offset': offset,
-                             'dtype': dtype
-                         })
+        helper.append_op(
+            type='tril_indices',
+            inputs={},
+            outputs={'out': [out]},
+            attrs={'rows': row, 'cols': col, 'offset': offset, 'dtype': dtype},
+        )
     return out
 
 
@@ -1988,13 +2157,15 @@ def triu_indices(row, col=None, offset=0, dtype='int64'):
         dtype = convert_np_dtype_to_dtype_(dtype)
 
     if in_dygraph_mode():
-        out = _C_ops.triu_indices(row, col, offset, dtype,
-                                  _current_expected_place())
+        out = _C_ops.triu_indices(
+            row, col, offset, dtype, _current_expected_place()
+        )
         return out
 
     if _in_legacy_dygraph():
-        out = _legacy_C_ops.triu_indices('row', row, 'col', col, 'offset',
-                                         offset, "dtype", dtype)
+        out = _legacy_C_ops.triu_indices(
+            'row', row, 'col', col, 'offset', offset, "dtype", dtype
+        )
         return out
 
     else:
@@ -2002,13 +2173,10 @@ def triu_indices(row, col=None, offset=0, dtype='int64'):
 
         out = helper.create_variable_for_type_inference(dtype=dtype)
 
-        helper.append_op(type='triu_indices',
-                         inputs={},
-                         outputs={'out': [out]},
-                         attrs={
-                             'row': row,
-                             'col': col,
-                             'offset': offset,
-                             'dtype': dtype
-                         })
+        helper.append_op(
+            type='triu_indices',
+            inputs={},
+            outputs={'out': [out]},
+            attrs={'row': row, 'col': col, 'offset': offset, 'dtype': dtype},
+        )
     return out
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 8765c7a5049..934a345c97e 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -14,8 +14,17 @@
 
 import numpy as np
 from ..framework import LayerHelper
-from ..framework import _varbase_creator, _dygraph_tracer, in_dygraph_mode, _non_static_mode
-from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
+from ..framework import (
+    _varbase_creator,
+    _dygraph_tracer,
+    in_dygraph_mode,
+    _non_static_mode,
+)
+from ..fluid.data_feeder import (
+    check_variable_and_dtype,
+    check_type,
+    check_dtype,
+)
 from ..static import Variable
 from ..fluid.framework import _in_legacy_dygraph
 from .manipulation import cast
@@ -92,10 +101,21 @@ def transpose(x, perm, name=None):
             out, _ = _legacy_C_ops.transpose2(x, 'axis', perm)
             return out
 
-    check_variable_and_dtype(x, 'x', [
-        'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64',
-        'complex128'
-    ], 'transpose')
+    check_variable_and_dtype(
+        x,
+        'x',
+        [
+            'bool',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'complex64',
+            'complex128',
+        ],
+        'transpose',
+    )
     check_type(perm, 'perm', (list, tuple), 'transpose')
     if isinstance(perm, tuple):
         perm = list(perm)
@@ -104,24 +124,25 @@ def transpose(x, perm, name=None):
             "Input(perm) is the permutation of dimensions of Input(x), "
             "its length should be equal to dimensions of Input(x), "
             "but received dimension of Input(x) is %s, "
-            "the length of Input(perm) is %s." % (len(x.shape), len(perm)))
+            "the length of Input(perm) is %s." % (len(x.shape), len(perm))
+        )
     for idx, dim in enumerate(perm):
         if dim >= len(x.shape):
             raise ValueError(
                 "Each element in Input(perm) should be less than Input(x)'s dimension, "
                 "but %d-th element in Input(perm) is %d which exceeds Input(x)'s "
-                "dimension %d." % (idx, perm[idx], len(x.shape)))
+                "dimension %d." % (idx, perm[idx], len(x.shape))
+            )
 
     helper = LayerHelper('transpose', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='transpose2',
-                     inputs={'X': [x]},
-                     outputs={
-                         'Out': [out],
-                         'XShape': [x_shape]
-                     },
-                     attrs={'axis': perm})
+    helper.append_op(
+        type='transpose2',
+        inputs={'X': [x]},
+        outputs={'Out': [out], 'XShape': [x_shape]},
+        attrs={'axis': perm},
+    )
     return out
 
 
@@ -236,21 +257,22 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
         var_names = {'x': x, 'y': y}
         for name, val in var_names.items():
             check_variable_and_dtype(
-                val, name,
+                val,
+                name,
                 ['float16', 'float32', 'float64', 'complex64', 'complex128'],
-                'matmul')
+                'matmul',
+            )
 
     __check_input(x, y)
 
     helper = LayerHelper('matmul_v2', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type='matmul_v2',
-                     inputs={
-                         'X': x,
-                         'Y': y
-                     },
-                     outputs={'Out': out},
-                     attrs=attrs)
+    helper.append_op(
+        type='matmul_v2',
+        inputs={'X': x, 'Y': y},
+        outputs={'Out': out},
+        attrs=attrs,
+    )
     return out
 
 
@@ -345,32 +367,35 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
             return _C_ops.frobenius_norm(input, dim, keepdim, False)
         if _in_legacy_dygraph():
             if dim is None:
-                return _legacy_C_ops.frobenius_norm(input, 'keep_dim', keepdim,
-                                                    'reduce_all', True)
-            return _legacy_C_ops.frobenius_norm(input, 'dim', dim, 'keep_dim',
-                                                keepdim, 'reduce_all', False)
+                return _legacy_C_ops.frobenius_norm(
+                    input, 'keep_dim', keepdim, 'reduce_all', True
+                )
+            return _legacy_C_ops.frobenius_norm(
+                input, 'dim', dim, 'keep_dim', keepdim, 'reduce_all', False
+            )
         attrs = {'dim': dim, 'keep_dim': keepdim, 'reduce_all': False}
         if dim is None:
             attrs['reduce_all'] = True
-        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                 'frobenius_norm')
+        check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'frobenius_norm'
+        )
 
         helper = LayerHelper('frobenius_norm', **locals())
         out = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
+            dtype=helper.input_dtype()
+        )
 
-        helper.append_op(type='frobenius_norm',
-                         inputs={'X': input},
-                         outputs={'Out': out},
-                         attrs=attrs)
+        helper.append_op(
+            type='frobenius_norm',
+            inputs={'X': input},
+            outputs={'Out': out},
+            attrs=attrs,
+        )
         return out
 
-    def vector_norm(input,
-                    porder=None,
-                    axis=None,
-                    keepdim=False,
-                    asvector=False,
-                    name=None):
+    def vector_norm(
+        input, porder=None, axis=None, keepdim=False, asvector=False, name=None
+    ):
         """
         Calculate the p-order vector norm for certain  dimension of Tensor `input`.
         Args:
@@ -380,21 +405,32 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
           keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False.
         """
         if in_dygraph_mode():
-            if axis is None: axis = -1
+            if axis is None:
+                axis = -1
             return _C_ops.p_norm(input, porder, axis, 1e-12, keepdim, asvector)
 
         if _in_legacy_dygraph():
-            if axis is None: axis = -1
-            return _legacy_C_ops.p_norm(input, 'porder', porder, 'axis', axis,
-                                        'keepdim', keepdim, 'asvector',
-                                        asvector)
+            if axis is None:
+                axis = -1
+            return _legacy_C_ops.p_norm(
+                input,
+                'porder',
+                porder,
+                'axis',
+                axis,
+                'keepdim',
+                keepdim,
+                'asvector',
+                asvector,
+            )
 
         if porder is not None:
             check_type(porder, 'porder', (float, int), 'p_norm')
         if axis is not None:
             check_type(axis, 'axis', (int), 'p_norm')
-        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
-                                 'p_norm')
+        check_variable_and_dtype(
+            input, 'input', ['float32', 'float64'], 'p_norm'
+        )
 
         attrs = {
             'axis': axis if axis is not None else -1,
@@ -405,23 +441,27 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
         }
         helper = LayerHelper('p_norm', **locals())
         out = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
+            dtype=helper.input_dtype()
+        )
 
-        helper.append_op(type='p_norm',
-                         inputs={'X': input},
-                         outputs={'Out': out},
-                         attrs=attrs)
+        helper.append_op(
+            type='p_norm',
+            inputs={'X': input},
+            outputs={'Out': out},
+            attrs=attrs,
+        )
         return out
 
-    def inf_norm(input,
-                 porder=None,
-                 axis=axis,
-                 keepdim=False,
-                 asvector=False,
-                 name=None):
+    def inf_norm(
+        input, porder=None, axis=axis, keepdim=False, asvector=False, name=None
+    ):
         if in_dygraph_mode():
             out = _C_ops.abs(input)
-            reduce_all = True if axis == None or axis == [] or asvector == True else False
+            reduce_all = (
+                True
+                if axis == None or axis == [] or asvector == True
+                else False
+            )
             axis = axis if axis != None and axis != [] else [0]
             if reduce_all:
                 assert (axis == []) or (axis is None)
@@ -432,28 +472,31 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
 
         helper = LayerHelper('inf_norm', **locals())
         out = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
+            dtype=helper.input_dtype()
+        )
         helper.append_op(type='abs', inputs={'X': input}, outputs={'Out': out})
         reduce_out = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
+            dtype=helper.input_dtype()
+        )
 
-        reduce_all = True if axis == None or axis == [] or asvector == True else False
+        reduce_all = (
+            True if axis == None or axis == [] or asvector == True else False
+        )
         axis = axis if axis != None and axis != [] else [0]
 
-        reduce_type = 'reduce_max' if porder == np.float64(
-            'inf') else 'reduce_min'
-        helper.append_op(type=reduce_type,
-                         inputs={'X': out},
-                         outputs={'Out': reduce_out},
-                         attrs={
-                             'dim': axis,
-                             'keep_dim': keepdim,
-                             'reduce_all': reduce_all
-                         })
+        reduce_type = (
+            'reduce_max' if porder == np.float64('inf') else 'reduce_min'
+        )
+        helper.append_op(
+            type=reduce_type,
+            inputs={'X': out},
+            outputs={'Out': reduce_out},
+            attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all},
+        )
 
         return reduce_out
 
-    def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None):
+    def p_matrix_norm(input, porder=1.0, axis=axis, keepdim=False, name=None):
         """
         NOTE:
             This function actually treats the matrix as flattened vector to calculate vector norm instead of matrix norm.
@@ -462,38 +505,48 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
             abs_out = _C_ops.abs(input)
             pow_out = _C_ops.pow(abs_out, porder)
             sum_out = _C_ops.sum(pow_out, axis, None, keepdim)
-            out = _C_ops.pow(sum_out, float(1. / porder))
+            out = _C_ops.pow(sum_out, float(1.0 / porder))
             return out
 
         block = LayerHelper('norm', **locals())
         out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
+            dtype=block.input_dtype()
+        )
         abs_out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
-        block.append_op(type='abs',
-                        inputs={'X': input},
-                        outputs={'Out': abs_out})
+            dtype=block.input_dtype()
+        )
+        block.append_op(
+            type='abs', inputs={'X': input}, outputs={'Out': abs_out}
+        )
         pow_out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
+            dtype=block.input_dtype()
+        )
 
-        block.append_op(type='pow',
-                        inputs={'X': abs_out},
-                        outputs={'Out': pow_out},
-                        attrs={'factor': porder})
+        block.append_op(
+            type='pow',
+            inputs={'X': abs_out},
+            outputs={'Out': pow_out},
+            attrs={'factor': porder},
+        )
         sum_out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
-        block.append_op(type='reduce_sum',
-                        inputs={'X': pow_out},
-                        outputs={'Out': sum_out},
-                        attrs={
-                            'dim': axis,
-                            'keep_dim': keepdim,
-                            'reduce_all': True if axis is None else False
-                        })
-        block.append_op(type='pow',
-                        inputs={'X': sum_out},
-                        outputs={'Out': out},
-                        attrs={'factor': float(1. / porder)})
+            dtype=block.input_dtype()
+        )
+        block.append_op(
+            type='reduce_sum',
+            inputs={'X': pow_out},
+            outputs={'Out': sum_out},
+            attrs={
+                'dim': axis,
+                'keep_dim': keepdim,
+                'reduce_all': True if axis is None else False,
+            },
+        )
+        block.append_op(
+            type='pow',
+            inputs={'X': sum_out},
+            outputs={'Out': out},
+            attrs={'factor': float(1.0 / porder)},
+        )
         return out
 
     if axis is None and p is not None:
@@ -502,50 +555,60 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
                 return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name)
             else:
                 raise ValueError(
-                    "only valid string values are 'fro', found {}".format(p))
+                    "only valid string values are 'fro', found {}".format(p)
+                )
         elif isinstance(p, (int, float)):
-            return vector_norm(x,
-                               porder=p,
-                               axis=axis,
-                               keepdim=keepdim,
-                               asvector=True,
-                               name=name)
+            return vector_norm(
+                x,
+                porder=p,
+                axis=axis,
+                keepdim=keepdim,
+                asvector=True,
+                name=name,
+            )
         else:
             raise ValueError(
-                "only valid p type is string or float, found {}".format(
-                    type(p)))
+                "only valid p type is string or float, found {}".format(type(p))
+            )
 
     if isinstance(axis, tuple):
         axis = list(axis)
     if isinstance(axis, list) and len(axis) == 1:
         axis = axis[0]
 
-    #calculate vector norm, where axis is int or list with only one integer
+    # calculate vector norm, where axis is int or list with only one integer
     if isinstance(axis, int):
         if isinstance(p, str):
             if p == "fro":
-                return vector_norm(x,
-                                   porder=2,
-                                   axis=axis,
-                                   keepdim=keepdim,
-                                   asvector=False,
-                                   name=name)
+                return vector_norm(
+                    x,
+                    porder=2,
+                    axis=axis,
+                    keepdim=keepdim,
+                    asvector=False,
+                    name=name,
+                )
 
             else:
                 raise ValueError(
-                    "only valid string values are 'fro', found {}".format(p))
+                    "only valid string values are 'fro', found {}".format(p)
+                )
         elif isinstance(p, (int, float)):
-            return vector_norm(x,
-                               axis=axis,
-                               porder=p,
-                               keepdim=keepdim,
-                               asvector=False,
-                               name=name)
+            return vector_norm(
+                x,
+                axis=axis,
+                porder=p,
+                keepdim=keepdim,
+                asvector=False,
+                name=name,
+            )
         else:
             raise ValueError(
-                "unspport p for p-order vector norm. except float, found {}".
-                format(p))
-    #calculate matrix norm, where axis is list with two integers
+                "unspport p for p-order vector norm. except float, found {}".format(
+                    p
+                )
+            )
+    # calculate matrix norm, where axis is list with two integers
     elif isinstance(axis, list) and len(axis) == 2:
         if p == "fro":
             return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name)
@@ -553,18 +616,20 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
             return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name)
         elif p == 0:
             raise ValueError(
-                "just suport axis type int or list (length of list <=1) if p = 0, found {}"
-                .format(axis))
+                "just suport axis type int or list (length of list <=1) if p = 0, found {}".format(
+                    axis
+                )
+            )
         else:
-            return p_matrix_norm(x,
-                                 porder=p,
-                                 axis=axis,
-                                 keepdim=keepdim,
-                                 name=name)
+            return p_matrix_norm(
+                x, porder=p, axis=axis, keepdim=keepdim, name=name
+            )
     else:
         raise ValueError(
-            "except axis type int or list (length of list <=2), found {}".
-            format(axis))
+            "except axis type int or list (length of list <=2), found {}".format(
+                axis
+            )
+        )
 
 
 def dist(x, y, p=2, name=None):
@@ -662,10 +727,9 @@ def dist(x, y, p=2, name=None):
     inputs = {"X": [x], "Y": [y]}
     outputs = {'Out': [out]}
     attrs = {"p": float(p)}
-    helper.append_op(type='dist',
-                     inputs=inputs,
-                     outputs={'Out': out},
-                     attrs=attrs)
+    helper.append_op(
+        type='dist', inputs=inputs, outputs={'Out': out}, attrs=attrs
+    )
     return out
 
 
@@ -756,7 +820,7 @@ def cond(x, p=None, name=None):
 
     """
 
-    def mat_norm(input, porder=1., axis=None):
+    def mat_norm(input, porder=1.0, axis=None):
         """
         NOTE:
             Calculate the matrix norm of a square matrix or batches of square matrices,
@@ -777,54 +841,81 @@ def cond(x, p=None, name=None):
 
         elif _in_legacy_dygraph():
             abs_out = _legacy_C_ops.abs(input)
-            sum_out = _legacy_C_ops.reduce_sum(abs_out, 'dim', axis, 'keepdim',
-                                               keepdim, 'reduce_all',
-                                               reduce_all)
+            sum_out = _legacy_C_ops.reduce_sum(
+                abs_out,
+                'dim',
+                axis,
+                'keepdim',
+                keepdim,
+                'reduce_all',
+                reduce_all,
+            )
             if porder == 1 or porder == np.inf:
-                return _legacy_C_ops.reduce_max(sum_out, 'dim', [-1], 'keepdim',
-                                                keepdim, 'reduce_all',
-                                                reduce_all)
+                return _legacy_C_ops.reduce_max(
+                    sum_out,
+                    'dim',
+                    [-1],
+                    'keepdim',
+                    keepdim,
+                    'reduce_all',
+                    reduce_all,
+                )
             if porder == -1 or porder == -np.inf:
-                return _legacy_C_ops.reduce_min(sum_out, 'dim', [-1], 'keepdim',
-                                                keepdim, 'reduce_all',
-                                                reduce_all)
+                return _legacy_C_ops.reduce_min(
+                    sum_out,
+                    'dim',
+                    [-1],
+                    'keepdim',
+                    keepdim,
+                    'reduce_all',
+                    reduce_all,
+                )
         else:
             block = LayerHelper('norm', **locals())
             abs_out = block.create_variable_for_type_inference(
-                dtype=block.input_dtype())
+                dtype=block.input_dtype()
+            )
             sum_out = block.create_variable_for_type_inference(
-                dtype=block.input_dtype())
+                dtype=block.input_dtype()
+            )
             out = block.create_variable_for_type_inference(
-                dtype=block.input_dtype())
-            block.append_op(type='abs',
-                            inputs={'X': input},
-                            outputs={'Out': abs_out})
-            block.append_op(type='reduce_sum',
-                            inputs={'X': abs_out},
-                            outputs={'Out': sum_out},
-                            attrs={
-                                'dim': axis,
-                                'keep_dim': keepdim,
-                                'reduce_all': reduce_all
-                            })
+                dtype=block.input_dtype()
+            )
+            block.append_op(
+                type='abs', inputs={'X': input}, outputs={'Out': abs_out}
+            )
+            block.append_op(
+                type='reduce_sum',
+                inputs={'X': abs_out},
+                outputs={'Out': sum_out},
+                attrs={
+                    'dim': axis,
+                    'keep_dim': keepdim,
+                    'reduce_all': reduce_all,
+                },
+            )
             if porder == 1 or porder == np.inf:
-                block.append_op(type='reduce_max',
-                                inputs={'X': sum_out},
-                                outputs={'Out': out},
-                                attrs={
-                                    'dim': [-1],
-                                    'keep_dim': keepdim,
-                                    'reduce_all': reduce_all
-                                })
+                block.append_op(
+                    type='reduce_max',
+                    inputs={'X': sum_out},
+                    outputs={'Out': out},
+                    attrs={
+                        'dim': [-1],
+                        'keep_dim': keepdim,
+                        'reduce_all': reduce_all,
+                    },
+                )
             if porder == -1 or porder == -np.inf:
-                block.append_op(type='reduce_min',
-                                inputs={'X': sum_out},
-                                outputs={'Out': out},
-                                attrs={
-                                    'dim': [-1],
-                                    'keep_dim': keepdim,
-                                    'reduce_all': reduce_all
-                                })
+                block.append_op(
+                    type='reduce_min',
+                    inputs={'X': sum_out},
+                    outputs={'Out': out},
+                    attrs={
+                        'dim': [-1],
+                        'keep_dim': keepdim,
+                        'reduce_all': reduce_all,
+                    },
+                )
             return out
 
     def fro_norm(input, porder=2, axis=[-1]):
@@ -839,50 +930,66 @@ def cond(x, p=None, name=None):
             pow_out = _C_ops.pow(input, porder)
             sum_out_1 = _C_ops.sum(pow_out, axis, None, keepdim)
             sum_out_2 = _C_ops.sum(sum_out_1, axis, None, keepdim)
-            return _C_ops.pow(sum_out_2, float(1. / porder))
+            return _C_ops.pow(sum_out_2, float(1.0 / porder))
         elif paddle.in_dynamic_mode():
             pow_out = _legacy_C_ops.pow(input, 'factor', porder)
-            sum_out_1 = _legacy_C_ops.reduce_sum(pow_out, 'dim', axis,
-                                                 'keepdim', keepdim,
-                                                 'reduce_all', reduce_all)
-            sum_out_2 = _legacy_C_ops.reduce_sum(sum_out_1, 'dim', axis,
-                                                 'keepdim', keepdim,
-                                                 'reduce_all', reduce_all)
-            return _legacy_C_ops.pow(sum_out_2, 'factor', float(1. / porder))
+            sum_out_1 = _legacy_C_ops.reduce_sum(
+                pow_out,
+                'dim',
+                axis,
+                'keepdim',
+                keepdim,
+                'reduce_all',
+                reduce_all,
+            )
+            sum_out_2 = _legacy_C_ops.reduce_sum(
+                sum_out_1,
+                'dim',
+                axis,
+                'keepdim',
+                keepdim,
+                'reduce_all',
+                reduce_all,
+            )
+            return _legacy_C_ops.pow(sum_out_2, 'factor', float(1.0 / porder))
 
         block = LayerHelper('norm', **locals())
         pow_out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
+            dtype=block.input_dtype()
+        )
         sum_out_1 = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
+            dtype=block.input_dtype()
+        )
         sum_out_2 = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
+            dtype=block.input_dtype()
+        )
         out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
-        block.append_op(type='pow',
-                        inputs={'X': input},
-                        outputs={'Out': pow_out},
-                        attrs={'factor': porder})
-        block.append_op(type='reduce_sum',
-                        inputs={'X': pow_out},
-                        outputs={'Out': sum_out_1},
-                        attrs={
-                            'dim': axis,
-                            'keep_dim': keepdim,
-                            'reduce_all': reduce_all
-                        })
-        block.append_op(type='reduce_sum',
-                        inputs={'X': sum_out_1},
-                        outputs={'Out': sum_out_2},
-                        attrs={
-                            'dim': axis,
-                            'keep_dim': keepdim,
-                            'reduce_all': reduce_all
-                        })
-        block.append_op(type='pow',
-                        inputs={'X': sum_out_2},
-                        outputs={'Out': out},
-                        attrs={'factor': float(1. / porder)})
+            dtype=block.input_dtype()
+        )
+        block.append_op(
+            type='pow',
+            inputs={'X': input},
+            outputs={'Out': pow_out},
+            attrs={'factor': porder},
+        )
+        block.append_op(
+            type='reduce_sum',
+            inputs={'X': pow_out},
+            outputs={'Out': sum_out_1},
+            attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all},
+        )
+        block.append_op(
+            type='reduce_sum',
+            inputs={'X': sum_out_1},
+            outputs={'Out': sum_out_2},
+            attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all},
+        )
+        block.append_op(
+            type='pow',
+            inputs={'X': sum_out_2},
+            outputs={'Out': out},
+            attrs={'factor': float(1.0 / porder)},
+        )
         return out
 
     def svd_norm(input, porder, axis=[-1]):
@@ -901,9 +1008,15 @@ def cond(x, p=None, name=None):
                 if in_dygraph_mode():
                     return _C_ops.sum(s, axis, None, keepdim)
                 else:
-                    return _legacy_C_ops.reduce_sum(s, 'dim', axis, 'keepdim',
-                                                    keepdim, 'reduce_all',
-                                                    reduce_all)
+                    return _legacy_C_ops.reduce_sum(
+                        s,
+                        'dim',
+                        axis,
+                        'keepdim',
+                        keepdim,
+                        'reduce_all',
+                        reduce_all,
+                    )
             if in_dygraph_mode():
                 max_out = _C_ops.max(s, axis, keepdim)
                 min_out = _C_ops.min(s, axis, keepdim)
@@ -913,75 +1026,70 @@ def cond(x, p=None, name=None):
                     return _C_ops.divide(min_out, max_out)
 
             else:
-                max_out = _legacy_C_ops.reduce_max(s, 'dim', axis, 'keepdim',
-                                                   keepdim, 'reduce_all',
-                                                   reduce_all)
-                min_out = _legacy_C_ops.reduce_min(s, 'dim', axis, 'keepdim',
-                                                   keepdim, 'reduce_all',
-                                                   reduce_all)
+                max_out = _legacy_C_ops.reduce_max(
+                    s, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all
+                )
+                min_out = _legacy_C_ops.reduce_min(
+                    s, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all
+                )
                 if porder == 2:
                     return _legacy_C_ops.elementwise_div(
-                        max_out, min_out, 'aixs', axis, 'use_mkldnn', False)
+                        max_out, min_out, 'aixs', axis, 'use_mkldnn', False
+                    )
                 if porder == -2:
                     return _legacy_C_ops.elementwise_div(
-                        min_out, max_out, 'aixs', axis, 'use_mkldnn', False)
+                        min_out, max_out, 'aixs', axis, 'use_mkldnn', False
+                    )
 
         block = LayerHelper('norm', **locals())
         out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
+            dtype=block.input_dtype()
+        )
         if porder == "nuc":
-            block.append_op(type='reduce_sum',
-                            inputs={'X': s},
-                            outputs={'Out': out},
-                            attrs={
-                                'dim': axis,
-                                'keep_dim': keepdim,
-                                'reduce_all': reduce_all
-                            })
+            block.append_op(
+                type='reduce_sum',
+                inputs={'X': s},
+                outputs={'Out': out},
+                attrs={
+                    'dim': axis,
+                    'keep_dim': keepdim,
+                    'reduce_all': reduce_all,
+                },
+            )
             return out
         max_out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
+            dtype=block.input_dtype()
+        )
         min_out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
-        block.append_op(type='reduce_max',
-                        inputs={'X': s},
-                        outputs={'Out': max_out},
-                        attrs={
-                            'dim': axis,
-                            'keep_dim': keepdim,
-                            'reduce_all': reduce_all
-                        })
-        block.append_op(type='reduce_min',
-                        inputs={'X': s},
-                        outputs={'Out': min_out},
-                        attrs={
-                            'dim': axis,
-                            'keep_dim': keepdim,
-                            'reduce_all': reduce_all
-                        })
+            dtype=block.input_dtype()
+        )
+        block.append_op(
+            type='reduce_max',
+            inputs={'X': s},
+            outputs={'Out': max_out},
+            attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all},
+        )
+        block.append_op(
+            type='reduce_min',
+            inputs={'X': s},
+            outputs={'Out': min_out},
+            attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all},
+        )
         if porder == 2:
-            block.append_op(type='elementwise_div',
-                            inputs={
-                                'X': max_out,
-                                'Y': min_out
-                            },
-                            outputs={'Out': out},
-                            attrs={
-                                'aixs': axis,
-                                'use_mkldnn': False
-                            })
+            block.append_op(
+                type='elementwise_div',
+                inputs={'X': max_out, 'Y': min_out},
+                outputs={'Out': out},
+                attrs={'aixs': axis, 'use_mkldnn': False},
+            )
             return out
         if porder == -2:
-            block.append_op(type='elementwise_div',
-                            inputs={
-                                'X': min_out,
-                                'Y': max_out
-                            },
-                            outputs={'Out': out},
-                            attrs={
-                                'aixs': axis,
-                                'use_mkldnn': False
-                            })
+            block.append_op(
+                type='elementwise_div',
+                inputs={'X': min_out, 'Y': max_out},
+                outputs={'Out': out},
+                attrs={'aixs': axis, 'use_mkldnn': False},
+            )
             return out
 
     def empty_tensor(input, shape):
@@ -992,8 +1100,9 @@ def cond(x, p=None, name=None):
     x_shape = list(x.shape)
     if not len(x_shape) >= 2:
         raise ValueError(
-            "input should be a matrix or batches of matrices, " +
-            "but the dimention of received input is {}".format(len(x_shape)))
+            "input should be a matrix or batches of matrices, "
+            + "but the dimention of received input is {}".format(len(x_shape))
+        )
     if p == None:
         p = 2
     x_size = 0 if (0 in x_shape) else 1
@@ -1008,21 +1117,26 @@ def cond(x, p=None, name=None):
                 return svd_norm(x, p) * svd_norm(x_inv, p)
             if p in (1, -1):
                 return mat_norm(x, porder=p, axis=[-2]) * mat_norm(
-                    x_inv, porder=p, axis=[-2])
+                    x_inv, porder=p, axis=[-2]
+                )
             if p in (np.inf, -np.inf):
                 return mat_norm(x, porder=p, axis=[-1]) * mat_norm(
-                    x_inv, porder=p, axis=[-1])
+                    x_inv, porder=p, axis=[-1]
+                )
         else:
-            raise ValueError("only support p is {} when input is a ".format(p) +
-                             "square matrix or batches of square matrices")
+            raise ValueError(
+                "only support p is {} when input is a ".format(p)
+                + "square matrix or batches of square matrices"
+            )
     elif p in (2, -2):
         if x_size == 0:
             return empty_tensor(x, x_shape[:-2])
         return svd_norm(x, porder=p)
     else:
         raise ValueError(
-            "unsupported {} for p, only supporting ('fro', 'nuc', ".format(p) +
-            "1, -1, 2, -2, inf, -inf) or none")
+            "unsupported {} for p, only supporting ('fro', 'nuc', ".format(p)
+            + "1, -1, 2, -2, inf, -inf) or none"
+        )
 
 
 def dot(x, y, name=None):
@@ -1066,25 +1180,23 @@ def dot(x, y, name=None):
     assert x is not None, 'x cannot be None in {}'.format(op_type)
     assert y is not None, 'y cannot be None in {}'.format(op_type)
 
-    check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
-                             op_type)
-    check_variable_and_dtype(y, 'y', ['float32', 'float64', 'int32', 'int64'],
-                             op_type)
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int32', 'int64'], op_type
+    )
+    check_variable_and_dtype(
+        y, 'y', ['float32', 'float64', 'int32', 'int64'], op_type
+    )
 
     helper = LayerHelper(op_type, **locals())
     if name is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
     else:
-        out = helper.create_variable(name=name,
-                                     dtype=x.dtype,
-                                     persistable=False)
-    helper.append_op(type="dot",
-                     inputs={
-                         'X': x,
-                         'Y': y
-                     },
-                     attrs={},
-                     outputs={"Out": out})
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False
+        )
+    helper.append_op(
+        type="dot", inputs={'X': x, 'Y': y}, attrs={}, outputs={"Out": out}
+    )
     return out
 
 
@@ -1093,7 +1205,7 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None):
     Estimate the covariance matrix of the input variables, given data and weights.
 
     A covariance matrix is a square matrix, indicate the covariance of each pair variables in the input matrix.
-    For example, for an N-dimensional samples X=[x1,x2,…xN]T, then the covariance matrix 
+    For example, for N-dimensional samples X=[x1,x2,…xN]T, the covariance matrix
     element Cij is the covariance of xi and xj. The element Cii is the variance of xi itself.
 
     Parameters:
@@ -1127,7 +1239,8 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None):
     if len(x.shape) > 2 or len(x.shape) < 1:
         raise ValueError(
             "Input(x) only support N-D (1<=N<=2) tensor in cov, but received "
-            "length of Input(input) is %s." % len(x.shape))
+            "length of Input(input) is %s." % len(x.shape)
+        )
     check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'cov')
     nx = x
     if len(x.shape) == 1:
@@ -1141,16 +1254,20 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None):
         if len(w.shape) > 1:
             raise ValueError(
                 "Input(fweights) only support N-D (N<=1) tensor in cov, but received "
-                "shape of Input(input) is %s." % len(fweights.shape))
+                "shape of Input(input) is %s." % len(fweights.shape)
+            )
         if fweights.shape[0] != observation_num:
             raise ValueError(
                 "The number of Input(fweights) should equal to x's dim[1]: {}, but received "
-                "size of Input(fweights) is {}.".format(observation_num,
-                                                        fweights.shape[0]))
+                "size of Input(fweights) is {}.".format(
+                    observation_num, fweights.shape[0]
+                )
+            )
         if fweights.min() < 0:
             raise ValueError(
                 "The value of Input(fweights) cannot be negtive, but received "
-                "min of Input(fweights) is {}.".format(fweights.min()))
+                "min of Input(fweights) is {}.".format(fweights.min())
+            )
         if not paddle.all(fweights == paddle.round(fweights.astype('float64'))):
             raise ValueError("Input(fweights) must be integer ")
 
@@ -1159,18 +1276,23 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None):
         if len(aw.shape) > 1:
             raise ValueError(
                 "Input(aweights) only support N-D (N<=1) tensor in cov, but received "
-                "length of Input(input) is %s." % len(aweights.shape))
-        check_variable_and_dtype(aweights, 'dtype', ['float32', 'float64'],
-                                 'cov')
+                "length of Input(input) is %s." % len(aweights.shape)
+            )
+        check_variable_and_dtype(
+            aweights, 'dtype', ['float32', 'float64'], 'cov'
+        )
         if aweights.shape[0] != observation_num:
             raise ValueError(
                 "The number of Input(aweights) should equal to x's dim[1]: {}, but received "
-                "size of Input(aweights) is {}.".format(observation_num,
-                                                        aweights.shape[0]))
+                "size of Input(aweights) is {}.".format(
+                    observation_num, aweights.shape[0]
+                )
+            )
         if aweights.min() < 0:
             raise ValueError(
                 "The value of Input(aweights) cannot be negtive, but received "
-                "min of Input(aweights) is {}.".format(aweights.min()))
+                "min of Input(aweights) is {}.".format(aweights.min())
+            )
         if w is not None:
             w = w * aw
         else:
@@ -1219,11 +1341,11 @@ def t(input, name=None):
         .. code-block:: python
            :name: code-example
              import paddle
-             
+
              # Example 1 (0-D tensor)
              x = paddle.to_tensor([0.79])
              paddle.t(x) # [0.79]
-             
+
              # Example 2 (1-D tensor)
              x = paddle.to_tensor([0.79, 0.84, 0.32])
              paddle.t(x) # [0.79000002, 0.83999997, 0.31999999]
@@ -1244,7 +1366,8 @@ def t(input, name=None):
         raise ValueError(
             "Input(input) only support N-D (N<=2) tensor, but received "
             "length of Input(input) is %s. Perhaps you can use paddle."
-            "tensor.transpose() instead." % len(input.shape))
+            "tensor.transpose() instead." % len(input.shape)
+        )
     if in_dygraph_mode():
         if len(input.shape) == 1:
             return input
@@ -1262,8 +1385,11 @@ def t(input, name=None):
         return out
 
     check_variable_and_dtype(
-        input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'],
-        'transpose')
+        input,
+        'input',
+        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'transpose',
+    )
 
     helper = LayerHelper('t', **locals())
     out = helper.create_variable_for_type_inference(input.dtype)
@@ -1271,13 +1397,12 @@ def t(input, name=None):
     if len(input.shape) == 1:
         out = input
     else:
-        helper.append_op(type='transpose2',
-                         inputs={'X': [input]},
-                         outputs={
-                             'Out': [out],
-                             'XShape': [input_shape]
-                         },
-                         attrs={'axis': [1, 0]})
+        helper.append_op(
+            type='transpose2',
+            inputs={'X': [input]},
+            outputs={'Out': [out], 'XShape': [input_shape]},
+            attrs={'axis': [1, 0]},
+        )
     return out
 
 
@@ -1334,13 +1459,12 @@ def cross(x, y, axis=9, name=None):
             attrs = dict()
             attrs['dim'] = axis
 
-            helper.append_op(type='cross',
-                             inputs={
-                                 'X': x,
-                                 'Y': y
-                             },
-                             outputs={'Out': out},
-                             attrs=attrs)
+            helper.append_op(
+                type='cross',
+                inputs={'X': x, 'Y': y},
+                outputs={'Out': out},
+                attrs=attrs,
+            )
             return out
 
 
@@ -1393,10 +1517,12 @@ def cholesky(x, upper=False, name=None):
     check_type(upper, 'upper', bool, 'cholesky')
     helper = LayerHelper('cholesky', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type='cholesky',
-                     inputs={'X': [x]},
-                     outputs={'Out': out},
-                     attrs={'upper': upper})
+    helper.append_op(
+        type='cholesky',
+        inputs={'X': [x]},
+        outputs={'Out': out},
+        attrs={'upper': upper},
+    )
     return out
 
 
@@ -1446,8 +1572,9 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
             else:
                 tol_tensor = tol
             use_default_tol = False
-            return _C_ops.matrix_rank_tol(x, tol_tensor, use_default_tol,
-                                          hermitian)
+            return _C_ops.matrix_rank_tol(
+                x, tol_tensor, use_default_tol, hermitian
+            )
 
         if tol is None:
             tol_attr = 0.0
@@ -1473,9 +1600,16 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
             tol_tensor = None
             tol_attr = float(tol)
             use_default_tol = False
-        return _legacy_C_ops.matrix_rank(x, tol_tensor, "tol", tol_attr,
-                                         'hermitian', hermitian,
-                                         'use_default_tol', use_default_tol)
+        return _legacy_C_ops.matrix_rank(
+            x,
+            tol_tensor,
+            "tol",
+            tol_attr,
+            'hermitian',
+            hermitian,
+            'use_default_tol',
+            use_default_tol,
+        )
 
     inputs = {}
     attrs = {}
@@ -1498,10 +1632,9 @@ def matrix_rank(x, tol=None, hermitian=False, name=None):
 
     helper = LayerHelper('matrix_rank', **locals())
     out = helper.create_variable_for_type_inference(dtype='int32')
-    helper.append_op(type='matrix_rank',
-                     inputs=inputs,
-                     outputs={'Out': out},
-                     attrs=attrs)
+    helper.append_op(
+        type='matrix_rank', inputs=inputs, outputs={'Out': out}, attrs=attrs
+    )
     return out
 
 
@@ -1548,16 +1681,22 @@ def bmm(x, y, name=None):
     y_shape = y.shape
     if not len(x_shape) == len(y_shape) == 3:
         raise ValueError(
-            "x and y should be 3-dimensional. But received x's dimention: {}, y's dimention: {}"
-            .format(x_shape, y_shape))
+            "x and y should be 3-dimensional. But received x's dimention: {}, y's dimention: {}".format(
+                x_shape, y_shape
+            )
+        )
     if x_shape[2] != y_shape[1]:
         raise ValueError(
-            "x's width must be equal with y's height. But received x's shape: {}, y's shape: {}"
-            .format(x_shape, y_shape))
+            "x's width must be equal with y's height. But received x's shape: {}, y's shape: {}".format(
+                x_shape, y_shape
+            )
+        )
     if x_shape[0] != y_shape[0]:
         raise ValueError(
-            "x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {}, y's shape: {}"
-            .format(x_shape, y_shape))
+            "x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {}, y's shape: {}".format(
+                x_shape, y_shape
+            )
+        )
 
     if in_dygraph_mode():
         return _C_ops.bmm(x, y)
@@ -1600,28 +1739,27 @@ def histogram(input, bins=100, min=0, max=0, name=None):
         return _C_ops.histogram(input, bins, min, max)
 
     if _in_legacy_dygraph():
-        return _legacy_C_ops.histogram(input, "bins", bins, "min", min, "max",
-                                       max)
+        return _legacy_C_ops.histogram(
+            input, "bins", bins, "min", min, "max", max
+        )
 
     helper = LayerHelper('histogram', **locals())
-    check_variable_and_dtype(input, 'X',
-                             ['int32', 'int64', 'float32', 'float64'],
-                             'histogram')
+    check_variable_and_dtype(
+        input, 'X', ['int32', 'int64', 'float32', 'float64'], 'histogram'
+    )
     out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64)
-    helper.append_op(type='histogram',
-                     inputs={'X': input},
-                     outputs={'Out': out},
-                     attrs={
-                         'bins': bins,
-                         'min': min,
-                         'max': max
-                     })
+    helper.append_op(
+        type='histogram',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'bins': bins, 'min': min, 'max': max},
+    )
     return out
 
 
 def bincount(x, weights=None, minlength=0, name=None):
     """
-    Computes frequency of each value in the input tensor. 
+    Computes frequency of each value in the input tensor.
 
     Args:
         x (Tensor): A Tensor with non-negative integer. Should be 1-D tensor.
@@ -1657,19 +1795,21 @@ def bincount(x, weights=None, minlength=0, name=None):
     check_variable_and_dtype(x, 'X', ['int32', 'int64'], 'bincount')
 
     if weights is not None:
-        check_variable_and_dtype(weights, 'Weights',
-                                 ['int32', 'int64', 'float32', 'float64'],
-                                 'bincount')
+        check_variable_and_dtype(
+            weights,
+            'Weights',
+            ['int32', 'int64', 'float32', 'float64'],
+            'bincount',
+        )
         out = helper.create_variable_for_type_inference(dtype=weights.dtype)
     else:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type='bincount',
-                     inputs={
-                         'X': x,
-                         'Weights': weights
-                     },
-                     outputs={'Out': out},
-                     attrs={'minlength': minlength})
+    helper.append_op(
+        type='bincount',
+        inputs={'X': x, 'Weights': weights},
+        outputs={'Out': out},
+        attrs={'minlength': minlength},
+    )
     return out
 
 
@@ -1714,29 +1854,31 @@ def mv(x, vec, name=None):
             def __check_input(x, vec):
                 var_names = {'x': x, 'vec': vec}
                 for name, val in var_names.items():
-                    check_variable_and_dtype(val, name, ['float32', 'float64'],
-                                             'mv')
+                    check_variable_and_dtype(
+                        val, name, ['float32', 'float64'], 'mv'
+                    )
                 x_shape = list(x.shape)
                 vec_shape = list(vec.shape)
                 if len(x_shape) != 2:
                     raise ValueError(
-                        "x should be 2-dimensional. But received x's dimention: {}"
-                        .format(x_shape))
+                        "x should be 2-dimensional. But received x's dimention: {}".format(
+                            x_shape
+                        )
+                    )
                 if len(vec_shape) != 1:
                     raise ValueError(
-                        "vec should be 1-dimensional. But received vec's dimention: {}"
-                        .format(vec_shape))
+                        "vec should be 1-dimensional. But received vec's dimention: {}".format(
+                            vec_shape
+                        )
+                    )
 
             __check_input(x, vec)
 
             helper = LayerHelper('mv', **locals())
             out = helper.create_variable_for_type_inference(dtype=x.dtype)
-            helper.append_op(type='mv',
-                             inputs={
-                                 'X': x,
-                                 'Vec': vec
-                             },
-                             outputs={'Out': out})
+            helper.append_op(
+                type='mv', inputs={'X': x, 'Vec': vec}, outputs={'Out': out}
+            )
             return out
 
 
@@ -1773,22 +1915,23 @@ def det(x, name=None):
     check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'det')
 
     input_shape = list(x.shape)
-    assert len(input_shape) >= 2,                     \
-            "The x must be at least 2-dimensional, "   \
-            "but received Input x's dimensional: %s.\n" %  \
-            len(input_shape)
-
-    assert (input_shape[-1] == input_shape[-2]),    \
-            "Expect squared input," \
-            "but received %s by %s matrix.\n" \
-            %(input_shape[-2], input_shape[-1]) \
+    assert len(input_shape) >= 2, (
+        "The x must be at least 2-dimensional, "
+        "but received Input x's dimensional: %s.\n" % len(input_shape)
+    )
 
+    assert (
+        input_shape[-1] == input_shape[-2]
+    ), "Expect squared input," "but received %s by %s matrix.\n" % (
+        input_shape[-2],
+        input_shape[-1],
+    )
     helper = LayerHelper('determinant', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(type='determinant',
-                     inputs={'Input': [x]},
-                     outputs={'Out': [out]})
+    helper.append_op(
+        type='determinant', inputs={'Input': [x]}, outputs={'Out': [out]}
+    )
     return out
 
 
@@ -1832,22 +1975,23 @@ def slogdet(x, name=None):
     check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'slogdet')
 
     input_shape = list(x.shape)
-    assert len(input_shape) >= 2,                     \
-            "The x must be at least 2-dimensional, "   \
-            "but received Input x's dimensional: %s.\n" %  \
-            len(input_shape)
-
-    assert (input_shape[-1] == input_shape[-2]),    \
-            "Expect squared input," \
-            "but received %s by %s matrix.\n" \
-            %(input_shape[-2], input_shape[-1]) \
+    assert len(input_shape) >= 2, (
+        "The x must be at least 2-dimensional, "
+        "but received Input x's dimensional: %s.\n" % len(input_shape)
+    )
 
+    assert (
+        input_shape[-1] == input_shape[-2]
+    ), "Expect squared input," "but received %s by %s matrix.\n" % (
+        input_shape[-2],
+        input_shape[-1],
+    )
     helper = LayerHelper('slogdeterminant', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(type='slogdeterminant',
-                     inputs={'Input': [x]},
-                     outputs={'Out': [out]})
+    helper.append_op(
+        type='slogdeterminant', inputs={'Input': [x]}, outputs={'Out': [out]}
+    )
     return out
 
 
@@ -1914,11 +2058,7 @@ def svd(x, full_matrices=False, name=None):
     helper.append_op(
         type='svd',
         inputs={'X': [x]},
-        outputs={
-            'U': u,
-            'VH': vh,
-            'S': s
-        },
+        outputs={'U': u, 'VH': vh, 'S': s},
         attrs=attrs,
     )
     return u, s, vh
@@ -1989,10 +2129,12 @@ def matrix_power(x, n, name=None):
     check_type(n, 'n', int, 'matrix_power')
     helper = LayerHelper('matrix_power', **locals())
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type='matrix_power',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs={'n': n})
+    helper.append_op(
+        type='matrix_power',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'n': n},
+    )
     return out
 
 
@@ -2003,26 +2145,26 @@ def qr(x, mode="reduced", name=None):
     Args:
         x (Tensor): The input tensor. Its shape should be `[..., M, N]`,
             where ... is zero or more batch dimensions. M and N can be arbitrary
-            positive number. The data type of x should be float32 or float64. 
-        mode (str, optional): A flag to control the behavior of qr, the default is "reduced". 
+            positive number. The data type of x should be float32 or float64.
+        mode (str, optional): A flag to control the behavior of qr, the default is "reduced".
             Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`:
-            If mode = "reduced", qr op will return reduced Q and R matrices, 
+            If mode = "reduced", qr op will return reduced Q and R matrices,
             which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`.
-            If mode = "complete", qr op will return complete Q and R matrices, 
+            If mode = "complete", qr op will return complete Q and R matrices,
             which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`.
             If mode = "r", qr op will only return reduced R matrix, which means
             R's shape is `[..., K, N]`.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-            
+
     Returns:
-        If mode = "reduced" or mode = "complete", qr will return a two tensor-tuple, which represents Q and R. 
+        If mode = "reduced" or mode = "complete", qr will return a tuple of two tensors, which represent Q and R.
         If mode = "r", qr will return a tensor which represents R.
-        
-    Examples:            
+
+    Examples:
         .. code-block:: python
 
-            import paddle 
+            import paddle
 
             x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
             q, r = paddle.linalg.qr(x)
@@ -2035,8 +2177,8 @@ def qr(x, mode="reduced", name=None):
 
             # R = [[-5.91607978, -7.43735744],
             #      [ 0.        ,  0.82807867]])
-            
-            # one can verify : X = Q * R ;     
+
+            # one can verify : X = Q * R ;
     """
     if in_dygraph_mode():
         q, r = _C_ops.qr(x, mode)
@@ -2057,13 +2199,9 @@ def qr(x, mode="reduced", name=None):
     r = helper.create_variable_for_type_inference(dtype=x.dtype)
     attrs = dict()
     attrs['mode'] = mode
-    helper.append_op(type='qr',
-                     inputs={'X': [x]},
-                     outputs={
-                         'Q': q,
-                         'R': r
-                     },
-                     attrs=attrs)
+    helper.append_op(
+        type='qr', inputs={'X': [x]}, outputs={'Q': q, 'R': r}, attrs=attrs
+    )
     if mode == "r":
         return r
     else:
@@ -2072,9 +2210,9 @@ def qr(x, mode="reduced", name=None):
 
 def lu(x, pivot=True, get_infos=False, name=None):
     r"""
-    Computes the LU factorization of an N-D(N>=2) matrix x. 
+    Computes the LU factorization of an N-D(N>=2) matrix x.
 
-    Returns the LU factorization(inplace x) and Pivots. low triangular matrix L and 
+    Returns the LU factorization(inplace x) and Pivots. The lower triangular matrix L and
     upper triangular matrix U are combined to a single LU matrix.
 
     Pivoting is done if pivot is set to True.
@@ -2094,23 +2232,23 @@ def lu(x, pivot=True, get_infos=False, name=None):
 
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-            
+
     Returns:
         factorization (Tensor): LU matrix, the factorization of input X.
 
-        pivots (IntTensor): the pivots of size(∗(N-2), min(m,n)). `pivots` stores all the 
-                    intermediate transpositions of rows. The final permutation `perm` could be 
+        pivots (IntTensor): the pivots of size(∗(N-2), min(m,n)). `pivots` stores all the
+                    intermediate transpositions of rows. The final permutation `perm` could be
                     reconstructed by this, details refer to upper example.
 
-        infos (IntTensor, optional): if `get_infos` is `True`, this is a tensor of size (∗(N-2)) 
-                    where non-zero values indicate whether factorization for the matrix or each minibatch 
+        infos (IntTensor, optional): if `get_infos` is `True`, this is a tensor of size (∗(N-2))
+                    where non-zero values indicate whether factorization for the matrix or each minibatch
                     has succeeded or failed.
 
-        
-    Examples:            
+
+    Examples:
         .. code-block:: python
 
-            import paddle 
+            import paddle
 
             x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
             lu,p,info = paddle.linalg.lu(x, get_infos=True)
@@ -2126,26 +2264,26 @@ def lu(x, pivot=True, get_infos=False, name=None):
             # >>> info
             # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
             #    0)
-            
+
             P,L,U = paddle.linalg.lu_unpack(lu,p)
 
             # >>> P
             # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
             # [[0., 1., 0.],
             # [0., 0., 1.],
-            # [1., 0., 0.]]), 
+            # [1., 0., 0.]]),
             # >>> L
             # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
             # [[1.        , 0.        ],
             # [0.20000000, 1.        ],
-            # [0.60000000, 0.50000000]]), 
+            # [0.60000000, 0.50000000]]),
             # >>> U
             # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
             # [[5.        , 6.        ],
             # [0.        , 0.80000000]]))
-            
 
-            # one can verify : X = P @ L @ U ;     
+
+            # one can verify : X = P @ L @ U ;
     """
 
     if in_dygraph_mode():
@@ -2160,14 +2298,12 @@ def lu(x, pivot=True, get_infos=False, name=None):
         info = helper.create_variable_for_type_inference(dtype='int')
         attrs = dict()
         attrs['pivot'] = pivot
-        helper.append_op(type='lu',
-                         inputs={'X': x},
-                         outputs={
-                             'Out': lu,
-                             'Pivots': p,
-                             'Infos': info
-                         },
-                         attrs=attrs)
+        helper.append_op(
+            type='lu',
+            inputs={'X': x},
+            outputs={'Out': lu, 'Pivots': p, 'Infos': info},
+            attrs=attrs,
+        )
     if get_infos:
         return lu, p, info
     else:
@@ -2176,7 +2312,7 @@ def lu(x, pivot=True, get_infos=False, name=None):
 
 def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
     r"""
-    Unpack L U and P to single matrix tensor . 
+    Unpack L, U and P into individual matrix tensors.
     unpack L and U matrix from LU, unpack permutation matrix P from Pivtos .
 
     P mat can be get by pivots:
@@ -2196,7 +2332,7 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
 
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-            
+
     Returns:
         P (Tensor): Permutation matrix P of lu factorization.
 
@@ -2204,11 +2340,11 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
 
         U (Tensor): The upper triangular matrix tensor of lu factorization.
 
-        
-    Examples:            
+
+    Examples:
         .. code-block:: python
 
-            import paddle 
+            import paddle
 
             x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')
             lu,p,info = paddle.linalg.lu(x, get_infos=True)
@@ -2224,25 +2360,25 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
             # >>> info
             # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True,
             #    0)
-            
+
             P,L,U = paddle.linalg.lu_unpack(lu,p)
 
             # >>> P
             # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
             # [[0., 1., 0.],
             # [0., 0., 1.],
-            # [1., 0., 0.]]), 
+            # [1., 0., 0.]]),
             # >>> L
             # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
             # [[1.        , 0.        ],
             # [0.20000000, 1.        ],
-            # [0.60000000, 0.50000000]]), 
+            # [0.60000000, 0.50000000]]),
             # >>> U
             # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
             # [[5.        , 6.        ],
             # [0.        , 0.80000000]]))
 
-            # one can verify : X = P @ L @ U ;   
+            # one can verify : X = P @ L @ U ;
     """
 
     if in_dygraph_mode():
@@ -2250,8 +2386,9 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
         return P, L, U
 
     if paddle.in_dynamic_mode():
-        P, L, U = _legacy_C_ops.lu_unpack(x, y, 'unpack_ludata', unpack_ludata,
-                                          'unpack_pivots', unpack_pivots)
+        P, L, U = _legacy_C_ops.lu_unpack(
+            x, y, 'unpack_ludata', unpack_ludata, 'unpack_pivots', unpack_pivots
+        )
         return P, L, U
 
     check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'lu_unpack')
@@ -2263,17 +2400,12 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
     attrs = dict()
     attrs['unpack_ludata'] = unpack_ludata
     attrs['unpack_pivots'] = unpack_pivots
-    helper.append_op(type='lu_unpack',
-                     inputs={
-                         'X': x,
-                         'Pivots': y
-                     },
-                     outputs={
-                         'Pmat': p,
-                         'L': l,
-                         'U': u
-                     },
-                     attrs=attrs)
+    helper.append_op(
+        type='lu_unpack',
+        inputs={'X': x, 'Pivots': y},
+        outputs={'Pmat': p, 'L': l, 'U': u},
+        attrs=attrs,
+    )
     return p, l, u
 
 
@@ -2291,7 +2423,7 @@ def eig(x, name=None):
     Args:
         x (Tensor): A tensor with shape math:`[*, N, N]`, The data type of the x should be one of ``float32``,
             ``float64``, ``compplex64`` or ``complex128``.
-        name (str, optional): The default value is `None`. Normally there is no need for user to set 
+        name (str, optional): The default value is `None`. Normally there is no need for user to set
             this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -2331,9 +2463,9 @@ def eig(x, name=None):
         w, v = _legacy_C_ops.eig(x)
         return w, v
 
-    check_variable_and_dtype(x, 'X',
-                             ['float32', 'float64', 'complex64', 'complex128'],
-                             'eig')
+    check_variable_and_dtype(
+        x, 'X', ['float32', 'float64', 'complex64', 'complex128'], 'eig'
+    )
     helper = LayerHelper('eig', **locals())
 
     w = helper.create_variable_for_type_inference(x.dtype)
@@ -2360,7 +2492,7 @@ def eigvals(x, name=None):
             Its data type should be float32, float64, complex64, or complex128.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
-            
+
     Returns:
         Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`.
             The eigenvalues are complex-valued even when `x` is real.
@@ -2382,20 +2514,24 @@ def eigvals(x, name=None):
             # [(-0.27078833542132674+0j), (0.29962280156230725+0j), (0.8824477020120244+0j)] #complex128
     """
 
-    check_variable_and_dtype(x, 'dtype',
-                             ['float32', 'float64', 'complex64', 'complex128'],
-                             'eigvals')
+    check_variable_and_dtype(
+        x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'eigvals'
+    )
 
     x_shape = list(x.shape)
     if len(x_shape) < 2:
         raise ValueError(
-            "The dimension of Input(x) should be at least 2, but received x's dimention = {}, x's shape = {}"
-            .format(len(x_shape), x_shape))
+            "The dimension of Input(x) should be at least 2, but received x's dimention = {}, x's shape = {}".format(
+                len(x_shape), x_shape
+            )
+        )
 
     if x_shape[-1] != x_shape[-2]:
         raise ValueError(
-            "The last two dimensions of Input(x) should be equal, but received x's shape = {}"
-            .format(x_shape))
+            "The last two dimensions of Input(x) should be equal, but received x's shape = {}".format(
+                x_shape
+            )
+        )
 
     if in_dygraph_mode():
         return _C_ops.eigvals(x)
@@ -2477,11 +2613,16 @@ def multi_dot(x, name=None):
 
     check_type(x, 'x', (list, tuple), 'multi_dot')
     for id, item in enumerate(x):
-        check_variable_and_dtype(item, 'x[' + str(id) + ']',
-                                 ['float16', 'float32', 'float64'], 'multi_dot')
+        check_variable_and_dtype(
+            item,
+            'x[' + str(id) + ']',
+            ['float16', 'float32', 'float64'],
+            'multi_dot',
+        )
         if item.dtype != x[0].dtype:
             raise TypeError(
-                "All the Tensors in the input must have the same data type.")
+                "All the Tensors in the input must have the same data type."
+            )
 
     helper = LayerHelper('multi_dot', **locals())
     dtype = helper.input_dtype(input_param_name='x')
@@ -2535,32 +2676,35 @@ def eigh(x, UPLO='L', name=None):
         if len(x.shape) < 2:
             raise ValueError(
                 "Input(input) only support >=2 tensor, but received "
-                "length of Input(input) is %s." % len(x.shape))
+                "length of Input(input) is %s." % len(x.shape)
+            )
         if x_shape[-1] != x_shape[-2]:
             raise ValueError(
-                "The input matrix must be batches of square matrices. But received x's dimention: {}"
-                .format(x_shape))
+                "The input matrix must be batches of square matrices. But received x's dimention: {}".format(
+                    x_shape
+                )
+            )
         if UPLO != 'L' and UPLO != 'U':
             raise ValueError(
-                "UPLO must be L or U. But received UPLO is: {}".format(UPLO))
+                "UPLO must be L or U. But received UPLO is: {}".format(UPLO)
+            )
 
     __check_input(x, UPLO)
 
     helper = LayerHelper('eigh', **locals())
-    check_variable_and_dtype(x, 'dtype',
-                             ['float32', 'float64', 'complex64', 'complex128'],
-                             'eigh')
+    check_variable_and_dtype(
+        x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'eigh'
+    )
 
     out_value = helper.create_variable_for_type_inference(dtype=x.dtype)
     out_vector = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(type='eigh',
-                     inputs={'X': x},
-                     outputs={
-                         'Eigenvalues': out_value,
-                         'Eigenvectors': out_vector
-                     },
-                     attrs={'UPLO': UPLO})
+    helper.append_op(
+        type='eigh',
+        inputs={'X': x},
+        outputs={'Eigenvalues': out_value, 'Eigenvectors': out_vector},
+        attrs={'UPLO': UPLO},
+    )
     return out_value, out_vector
 
 
@@ -2679,8 +2823,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
         if not hermitian:
             # combine svd and matmul op
             u, s, vt = _legacy_C_ops.svd(x, 'full_matrices', False)
-            max_singular_val = _legacy_C_ops.reduce_max(s, 'dim', [-1], 'keep_dim', True, \
-                'reduce_all', False)
+            max_singular_val = _legacy_C_ops.reduce_max(
+                s, 'dim', [-1], 'keep_dim', True, 'reduce_all', False
+            )
             rcond = paddle.to_tensor(rcond, dtype=x.dtype)
             cutoff = rcond * max_singular_val
             y = float('inf')
@@ -2702,15 +2847,17 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
             if in_dygraph_mode():
                 out_2 = _C_ops.matmul(out_1, u, False, True)
             else:
-                out_2 = _legacy_C_ops.matmul_v2(out_1, u, 'trans_x', False,
-                                                'trans_y', True)
+                out_2 = _legacy_C_ops.matmul_v2(
+                    out_1, u, 'trans_x', False, 'trans_y', True
+                )
             return out_2
         else:
             # combine eigh and matmul op
             s, u = _legacy_C_ops.eigh(x, 'UPLO', 'L')
             s_abs = paddle.abs(s)
-            max_singular_val = _legacy_C_ops.reduce_max(s_abs, 'dim', [-1], 'keep_dim', True, \
-                'reduce_all', False)
+            max_singular_val = _legacy_C_ops.reduce_max(
+                s_abs, 'dim', [-1], 'keep_dim', True, 'reduce_all', False
+            )
             rcond = paddle.to_tensor(rcond, dtype=s.dtype)
             cutoff = rcond * max_singular_val
             y = float('inf')
@@ -2729,8 +2876,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
             if in_dygraph_mode():
                 out_2 = _C_ops.matmul(out_1, u_conj, False, True)
             else:
-                out_2 = _legacy_C_ops.matmul_v2(out_1, u_conj, 'trans_x', False,
-                                                'trans_y', True)
+                out_2 = _legacy_C_ops.matmul_v2(
+                    out_1, u_conj, 'trans_x', False, 'trans_y', True
+                )
             return out_2
     else:
         if not hermitian:
@@ -2744,23 +2892,17 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
             helper.append_op(
                 type='svd',
                 inputs={'X': [x]},
-                outputs={
-                    'U': u,
-                    'VH': vt,
-                    'S': s
-                },
+                outputs={'U': u, 'VH': vt, 'S': s},
                 attrs={'full_matrices': False},
             )
 
             max_singular_val = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type='reduce_max',
-                             inputs={'X': s},
-                             outputs={'Out': max_singular_val},
-                             attrs={
-                                 'dim': [-1],
-                                 'keep_dim': True,
-                                 'reduce_all': False
-                             })
+            helper.append_op(
+                type='reduce_max',
+                inputs={'X': s},
+                outputs={'Out': max_singular_val},
+                attrs={'dim': [-1], 'keep_dim': True, 'reduce_all': False},
+            )
 
             rcond = full(shape=[1], fill_value=rcond, dtype=dtype)
             cutoff = rcond * max_singular_val
@@ -2776,59 +2918,50 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
 
             st = helper.create_variable_for_type_inference(dtype=dtype)
             st_shape = helper.create_variable_for_type_inference(dtype=dtype)
-            helper.append_op(type='unsqueeze2',
-                             inputs={'X': singular},
-                             attrs={'axes': [-2]},
-                             outputs={
-                                 'Out': st,
-                                 'XShape': st_shape
-                             })
+            helper.append_op(
+                type='unsqueeze2',
+                inputs={'X': singular},
+                attrs={'axes': [-2]},
+                outputs={'Out': st, 'XShape': st_shape},
+            )
 
             dims = list(range(len(vt.shape)))
             perm = dims[:-2] + [dims[-1]] + [dims[-2]]
             v = helper.create_variable_for_type_inference(dtype)
             v_shape = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type='transpose2',
-                             inputs={'X': [vt]},
-                             outputs={
-                                 'Out': [v],
-                                 'XShape': [v_shape]
-                             },
-                             attrs={'axis': perm})
+            helper.append_op(
+                type='transpose2',
+                inputs={'X': [vt]},
+                outputs={'Out': [v], 'XShape': [v_shape]},
+                attrs={'axis': perm},
+            )
 
             out_1 = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type='elementwise_mul',
-                             inputs={
-                                 'X': v,
-                                 'Y': st
-                             },
-                             outputs={'Out': out_1},
-                             attrs={
-                                 'axis': -1,
-                                 'use_mkldnn': False
-                             })
+            helper.append_op(
+                type='elementwise_mul',
+                inputs={'X': v, 'Y': st},
+                outputs={'Out': out_1},
+                attrs={'axis': -1, 'use_mkldnn': False},
+            )
             out_1 = helper.append_activation(out_1)
 
             out_2 = helper.create_variable_for_type_inference(dtype)
             helper.append_op(
                 type='matmul_v2',
-                inputs={
-                    'X': out_1,
-                    'Y': u
-                },
+                inputs={'X': out_1, 'Y': u},
                 outputs={'Out': out_2},
-                attrs={
-                    'trans_x': False,
-                    'trans_y': True
-                },
+                attrs={'trans_x': False, 'trans_y': True},
             )
             return out_2
         else:
             helper = LayerHelper('pinv', **locals())
             dtype = x.dtype
             check_variable_and_dtype(
-                x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'],
-                'pinv')
+                x,
+                'dtype',
+                ['float32', 'float64', 'complex64', 'complex128'],
+                'pinv',
+            )
 
             if dtype == paddle.complex128:
                 s_type = 'float64'
@@ -2839,26 +2972,23 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
 
             u = helper.create_variable_for_type_inference(dtype)
             s = helper.create_variable_for_type_inference(s_type)
-            helper.append_op(type='eigh',
-                             inputs={'X': x},
-                             outputs={
-                                 'Eigenvalues': s,
-                                 'Eigenvectors': u
-                             },
-                             attrs={'UPLO': 'L'})
+            helper.append_op(
+                type='eigh',
+                inputs={'X': x},
+                outputs={'Eigenvalues': s, 'Eigenvectors': u},
+                attrs={'UPLO': 'L'},
+            )
             s_abs = helper.create_variable_for_type_inference(s_type)
-            helper.append_op(type='abs',
-                             inputs={'X': s},
-                             outputs={'Out': s_abs})
+            helper.append_op(
+                type='abs', inputs={'X': s}, outputs={'Out': s_abs}
+            )
             max_singular_val = helper.create_variable_for_type_inference(s_type)
-            helper.append_op(type='reduce_max',
-                             inputs={'X': s_abs},
-                             outputs={'Out': max_singular_val},
-                             attrs={
-                                 'dim': [-1],
-                                 'keep_dim': True,
-                                 'reduce_all': False
-                             })
+            helper.append_op(
+                type='reduce_max',
+                inputs={'X': s_abs},
+                outputs={'Out': max_singular_val},
+                attrs={'dim': [-1], 'keep_dim': True, 'reduce_all': False},
+            )
 
             rcond = full(shape=[1], fill_value=rcond, dtype=s_type)
             cutoff = rcond * max_singular_val
@@ -2874,44 +3004,33 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
 
             st = helper.create_variable_for_type_inference(dtype=s_type)
             st_shape = helper.create_variable_for_type_inference(dtype=s_type)
-            helper.append_op(type='unsqueeze2',
-                             inputs={'X': singular},
-                             attrs={'axes': [-2]},
-                             outputs={
-                                 'Out': st,
-                                 'XShape': st_shape
-                             })
+            helper.append_op(
+                type='unsqueeze2',
+                inputs={'X': singular},
+                attrs={'axes': [-2]},
+                outputs={'Out': st, 'XShape': st_shape},
+            )
 
             out_1 = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type='elementwise_mul',
-                             inputs={
-                                 'X': u,
-                                 'Y': st
-                             },
-                             outputs={'Out': out_1},
-                             attrs={
-                                 'axis': -1,
-                                 'use_mkldnn': False
-                             })
+            helper.append_op(
+                type='elementwise_mul',
+                inputs={'X': u, 'Y': st},
+                outputs={'Out': out_1},
+                attrs={'axis': -1, 'use_mkldnn': False},
+            )
             out_1 = helper.append_activation(out_1)
 
             u_conj = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type='conj',
-                             inputs={'X': u},
-                             outputs={'Out': [u_conj]})
+            helper.append_op(
+                type='conj', inputs={'X': u}, outputs={'Out': [u_conj]}
+            )
 
             out_2 = helper.create_variable_for_type_inference(dtype)
             helper.append_op(
                 type='matmul_v2',
-                inputs={
-                    'X': out_1,
-                    'Y': u_conj
-                },
+                inputs={'X': out_1, 'Y': u_conj},
                 outputs={'Out': out_2},
-                attrs={
-                    'trans_x': False,
-                    'trans_y': True
-                },
+                attrs={'trans_x': False, 'trans_y': True},
             )
             return out_2
 
@@ -2970,21 +3089,15 @@ def solve(x, y, name=None):
     check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'solve')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(type="solve",
-                     inputs={
-                         "X": x,
-                         "Y": y
-                     },
-                     outputs={"Out": out})
+    helper.append_op(
+        type="solve", inputs={"X": x, "Y": y}, outputs={"Out": out}
+    )
     return out
 
 
-def triangular_solve(x,
-                     y,
-                     upper=True,
-                     transpose=False,
-                     unitriangular=False,
-                     name=None):
+def triangular_solve(
+    x, y, upper=True, transpose=False, unitriangular=False, name=None
+):
     r"""
     Computes the solution of a system of equations with a triangular coefficient matrix `x` and
     multiple right-hand sides `y` .
@@ -2995,12 +3108,12 @@ def triangular_solve(x,
     Args:
         x (Tensor): The input triangular coefficient matrix. Its shape should be `[*, M, M]`, where `*` is zero or
             more batch dimensions. Its data type should be float32 or float64.
-        y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is 
+        y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is
             zero or more batch dimensions. Its data type should be float32 or float64.
-        upper (bool, optional): Whether to solve the upper-triangular system of equations (default) or the lower-triangular 
+        upper (bool, optional): Whether to solve the upper-triangular system of equations (default) or the lower-triangular
             system of equations. Default: True.
         transpose (bool, optional): whether `x` should be transposed before calculation. Default: False.
-        unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed 
+        unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed
             to be 1 and not referenced from `x` . Default: False.
         name(str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
@@ -3019,7 +3132,7 @@ def triangular_solve(x,
         import paddle
         import numpy as np
 
-        x = paddle.to_tensor([[1, 1, 1], 
+        x = paddle.to_tensor([[1, 1, 1],
                               [0, 2, 1],
                               [0, 0,-1]], dtype="float64")
         y = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
@@ -3032,9 +3145,16 @@ def triangular_solve(x,
         return _C_ops.triangular_solve(x, y, upper, transpose, unitriangular)
 
     if paddle.in_dynamic_mode():
-        return _legacy_C_ops.triangular_solve(x, y, 'upper', upper, 'transpose',
-                                              transpose, 'unitriangular',
-                                              unitriangular)
+        return _legacy_C_ops.triangular_solve(
+            x,
+            y,
+            'upper',
+            upper,
+            'transpose',
+            transpose,
+            'unitriangular',
+            unitriangular,
+        )
 
     inputs = {"X": [x], "Y": [y]}
     helper = LayerHelper("triangular_solve", **locals())
@@ -3042,17 +3162,16 @@ def triangular_solve(x,
     check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'triangular_solve')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(type='triangular_solve',
-                     inputs={
-                         'X': x,
-                         'Y': y
-                     },
-                     outputs={'Out': out},
-                     attrs={
-                         'upper': upper,
-                         'transpose': transpose,
-                         'unitriangular': unitriangular
-                     })
+    helper.append_op(
+        type='triangular_solve',
+        inputs={'X': x, 'Y': y},
+        outputs={'Out': out},
+        attrs={
+            'upper': upper,
+            'transpose': transpose,
+            'unitriangular': unitriangular,
+        },
+    )
     return out
 
 
@@ -3066,7 +3185,7 @@ def cholesky_solve(x, y, upper=False, name=None):
     Args:
         x (Tensor): The input matrix which is upper or lower triangular Cholesky factor of square matrix A. Its shape should be `[*, M, M]`, where `*` is zero or
             more batch dimensions. Its data type should be float32 or float64.
-        y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is 
+        y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is
             zero or more batch dimensions. Its data type should be float32 or float64.
         upper (bool, optional): whether to consider the Cholesky factor as a lower or upper triangular matrix. Default: False.
         name(str, optional): Name for the operation (optional, default is None).
@@ -3080,7 +3199,7 @@ def cholesky_solve(x, y, upper=False, name=None):
 
         import paddle
 
-        u = paddle.to_tensor([[1, 1, 1], 
+        u = paddle.to_tensor([[1, 1, 1],
                                 [0, 2, 1],
                                 [0, 0,-1]], dtype="float64")
         b = paddle.to_tensor([[0], [-9], [5]], dtype="float64")
@@ -3100,19 +3219,18 @@ def cholesky_solve(x, y, upper=False, name=None):
     check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'cholesky_solve')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(type='cholesky_solve',
-                     inputs={
-                         'X': x,
-                         'Y': y
-                     },
-                     outputs={'Out': out},
-                     attrs={'upper': upper})
+    helper.append_op(
+        type='cholesky_solve',
+        inputs={'X': x, 'Y': y},
+        outputs={'Out': out},
+        attrs={'upper': upper},
+    )
     return out
 
 
 def eigvalsh(x, UPLO='L', name=None):
     """
-    Computes the eigenvalues of a 
+    Computes the eigenvalues of a
     complex Hermitian (conjugate symmetric) or a real symmetric matrix.
 
     Args:
@@ -3151,36 +3269,39 @@ def eigvalsh(x, UPLO='L', name=None):
         if len(x.shape) < 2:
             raise ValueError(
                 "Input(input) only support >=2 tensor, but received "
-                "length of Input(input) is %s." % len(x.shape))
+                "length of Input(input) is %s." % len(x.shape)
+            )
         if x_shape[-1] != x_shape[-2]:
             raise ValueError(
-                "The input matrix must be batches of square matrices. But received x's dimention: {}"
-                .format(x_shape))
+                "The input matrix must be batches of square matrices. But received x's dimention: {}".format(
+                    x_shape
+                )
+            )
         if UPLO != 'L' and UPLO != 'U':
             raise ValueError(
-                "UPLO must be L or U. But received UPLO is: {}".format(UPLO))
+                "UPLO must be L or U. But received UPLO is: {}".format(UPLO)
+            )
 
     __check_input(x, UPLO)
 
     helper = LayerHelper('eigvalsh', **locals())
-    check_variable_and_dtype(x, 'dtype',
-                             ['float32', 'float64', 'complex64', 'complex128'],
-                             'eigvalsh')
+    check_variable_and_dtype(
+        x,
+        'dtype',
+        ['float32', 'float64', 'complex64', 'complex128'],
+        'eigvalsh',
+    )
 
     out_value = helper.create_variable_for_type_inference(dtype=x.dtype)
     out_vector = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     is_test = x.stop_gradient
-    helper.append_op(type='eigvalsh',
-                     inputs={'X': x},
-                     outputs={
-                         'Eigenvalues': out_value,
-                         'Eigenvectors': out_vector
-                     },
-                     attrs={
-                         'UPLO': UPLO,
-                         'is_test': is_test
-                     })
+    helper.append_op(
+        type='eigvalsh',
+        inputs={'X': x},
+        outputs={'Eigenvalues': out_value, 'Eigenvectors': out_vector},
+        attrs={'UPLO': UPLO, 'is_test': is_test},
+    )
     return out_value
 
 
@@ -3192,26 +3313,26 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
     Args:
         x (Tensor): A tensor with shape ``(*, M, N)`` , the data type of the input Tensor ``x``
             should be one of float32, float64.
-        y (Tensor): A tensor with shape ``(*, M, K)`` , the data type of the input Tensor ``y`` 
+        y (Tensor): A tensor with shape ``(*, M, K)`` , the data type of the input Tensor ``y``
             should be one of float32, float64.
-        rcond(float, optional): The default value is None. A float pointing number used to determine 
-            the effective rank of ``x``. If ``rcond`` is None, it will be set to max(M, N) times the 
+        rcond(float, optional): The default value is None. A floating-point number used to determine
+            the effective rank of ``x``. If ``rcond`` is None, it will be set to max(M, N) times the
             machine precision of x_dtype.
-        driver(str, optional): The default value is None. The name of LAPACK method to be used. For 
-            CPU inputs the valid values are ‘gels’, ‘gelsy’, ‘gelsd, ‘gelss’. For CUDA input, the only 
-            valid driver is ‘gels’. If ``driver`` is None, ‘gelsy’ is used for CPU inputs and ‘gels’ 
+        driver(str, optional): The default value is None. The name of LAPACK method to be used. For
+            CPU inputs the valid values are ‘gels’, ‘gelsy’, ‘gelsd’, ‘gelss’. For CUDA input, the only
+            valid driver is ‘gels’. If ``driver`` is None, ‘gelsy’ is used for CPU inputs and ‘gels’
             for CUDA inputs.
-        name(str, optional): The default value is None. Normally there is no need for user to set 
+        name(str, optional): The default value is None. Normally there is no need for user to set
             this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Tuple: A tuple of 4 Tensors which is (``solution``, ``residuals``, ``rank``, ``singular_values``). 
-        ``solution`` is a tensor with shape ``(*, N, K)``, meaning the least squares solution. ``residuals`` 
-        is a tensor with shape ``(*, K)``, meaning the squared residuals of the solutions, which is computed 
-        when M > N and every matrix in ``x`` is full-rank, otherwise return an empty tensor. ``rank`` is a tensor 
-        with shape ``(*)``, meaning the ranks of the matrices in ``x``, which is computed when ``driver`` in 
-        (‘gelsy’, ‘gelsd’, ‘gelss’), otherwise return an empty tensor. ``singular_values`` is a tensor with 
-        shape ``(*, min(M, N))``, meaning singular values of the matrices in ``x``, which is computed when 
+        Tuple: A tuple of 4 Tensors which is (``solution``, ``residuals``, ``rank``, ``singular_values``).
+        ``solution`` is a tensor with shape ``(*, N, K)``, meaning the least squares solution. ``residuals``
+        is a tensor with shape ``(*, K)``, meaning the squared residuals of the solutions, which is computed
+        when M > N and every matrix in ``x`` is full-rank, otherwise return an empty tensor. ``rank`` is a tensor
+        with shape ``(*)``, meaning the ranks of the matrices in ``x``, which is computed when ``driver`` in
+        (‘gelsy’, ‘gelsd’, ‘gelss’), otherwise return an empty tensor. ``singular_values`` is a tensor with
+        shape ``(*, min(M, N))``, meaning singular values of the matrices in ``x``, which is computed when
         ``driver`` in (‘gelsd’, ‘gelss’), otherwise return an empty tensor.
 
     Examples:
@@ -3247,14 +3368,18 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
     if device == "cpu":
         if driver not in (None, "gels", "gelss", "gelsd", "gelsy"):
             raise ValueError(
-                "Only support valid driver is 'gels', 'gelss', 'gelsd', 'gelsy' or None for CPU inputs. But got {}"
-                .format(driver))
+                "Only support valid driver is 'gels', 'gelss', 'gelsd', 'gelsy' or None for CPU inputs. But got {}".format(
+                    driver
+                )
+            )
         driver = "gelsy" if driver is None else driver
     elif "gpu" in device:
         if driver not in (None, "gels"):
             raise ValueError(
-                "Only support valid driver is 'gels' or None for CUDA inputs. But got {}"
-                .format(driver))
+                "Only support valid driver is 'gels' or None for CUDA inputs. But got {}".format(
+                    driver
+                )
+            )
         driver = "gels" if driver is None else driver
     else:
         raise RuntimeError("Only support lstsq api for CPU or CUDA device.")
@@ -3275,10 +3400,12 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
     if _non_static_mode():
         if in_dygraph_mode():
             solution, residuals, rank, singular_values = _C_ops.lstsq(
-                x, y, rcond, driver)
+                x, y, rcond, driver
+            )
         else:
             solution, residuals, rank, singular_values = _legacy_C_ops.lstsq(
-                x, y, 'rcond', rcond, 'driver', driver)
+                x, y, 'rcond', rcond, 'driver', driver
+            )
 
         if driver == "gels":
             rank = paddle.empty(shape=[0], dtype=paddle.int32)
@@ -3289,33 +3416,29 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
         return solution, residuals, rank, singular_values
 
     helper = LayerHelper('lstsq', **locals())
-    check_variable_and_dtype(x, 'dtype',
-                             ['float32', 'float64', 'complex64', 'complex128'],
-                             'lstsq')
-    check_variable_and_dtype(y, 'dtype',
-                             ['float32', 'float64', 'complex64', 'complex128'],
-                             'lstsq')
+    check_variable_and_dtype(
+        x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'lstsq'
+    )
+    check_variable_and_dtype(
+        y, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'lstsq'
+    )
 
     solution = helper.create_variable_for_type_inference(dtype=x.dtype)
     residuals = helper.create_variable_for_type_inference(dtype=x.dtype)
     rank = helper.create_variable_for_type_inference(dtype=paddle.int32)
     singular_values = helper.create_variable_for_type_inference(dtype=x.dtype)
 
-    helper.append_op(type='lstsq',
-                     inputs={
-                         'X': x,
-                         'Y': y
-                     },
-                     outputs={
-                         'Solution': solution,
-                         'Residuals': residuals,
-                         'Rank': rank,
-                         'SingularValues': singular_values
-                     },
-                     attrs={
-                         'rcond': rcond,
-                         'driver': driver
-                     })
+    helper.append_op(
+        type='lstsq',
+        inputs={'X': x, 'Y': y},
+        outputs={
+            'Solution': solution,
+            'Residuals': residuals,
+            'Rank': rank,
+            'SingularValues': singular_values,
+        },
+        attrs={'rcond': rcond, 'driver': driver},
+    )
 
     if driver == "gels":
         rank = paddle.static.data(name='rank', shape=[0])
@@ -3328,7 +3451,7 @@ def lstsq(x, y, rcond=None, driver=None, name=None):
 
 def corrcoef(x, rowvar=True, name=None):
     """
-    
+
     A correlation coefficient matrix indicate the correlation of each pair variables in the input matrix.
     For example, for an N-dimensional samples X=[x1,x2,…xN]T, then the correlation coefficient matrix
     element Rij is the correlation of xi and xj. The element Rii is the covariance of xi itself.
@@ -3367,11 +3490,12 @@ def corrcoef(x, rowvar=True, name=None):
     if len(x.shape) > 2 or len(x.shape) < 1:
         raise ValueError(
             "Input(x) only support N-D (1<=N<=2) tensor in corrcoef, but received "
-            "length of Input(input) is %s." % len(x.shape))
+            "length of Input(input) is %s." % len(x.shape)
+        )
     check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'corrcoef')
 
     c = cov(x, rowvar)
-    if (c.ndim == 0):
+    if c.ndim == 0:
         # scalar covariance
         # nan if incorrect value (nan, inf, 0), 1 otherwise
         return c / c
@@ -3386,8 +3510,9 @@ def corrcoef(x, rowvar=True, name=None):
 
     # Clip to [-1, 1].  This does not guarantee
     if paddle.is_complex(c):
-        return paddle.complex(paddle.clip(c.real(), -1, 1),
-                              paddle.clip(c.imag(), -1, 1))
+        return paddle.complex(
+            paddle.clip(c.real(), -1, 1), paddle.clip(c.imag(), -1, 1)
+        )
     else:
         c = paddle.clip(c, -1, 1)
 
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index 63a89327505..c0a81c1934c 100755
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -16,8 +16,10 @@ import paddle
 from ..fluid.data_feeder import check_type, check_variable_and_dtype
 from .layer_function_generator import templatedoc
 from ..static import Variable
+
 # TODO: define logic functions of a tensor
 from ..fluid.framework import _in_eager_mode_
+
 if _in_eager_mode_:
     Tensor = paddle.fluid.framework.core.eager.Tensor
 else:
@@ -26,6 +28,7 @@ else:
 from ..framework import in_dygraph_mode, _non_static_mode
 from ..framework import LayerHelper
 from ..fluid.framework import _in_legacy_dygraph
+
 # TODO: define logic functions of a tensor
 from paddle import _C_ops, _legacy_C_ops
 from paddle.tensor.creation import full
@@ -47,14 +50,18 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
         else:
             return op(x)
     check_variable_and_dtype(
-        x, "x",
+        x,
+        "x",
         ["bool", "int8", "int16", "int32", "int64", "float32", "float64"],
-        op_name)
+        op_name,
+    )
     if y is not None:
         check_variable_and_dtype(
-            y, "y",
+            y,
+            "y",
             ["bool", "int8", "int16", "int32", "int64", "float32", "float64"],
-            op_name)
+            op_name,
+        )
     if out is not None:
         check_type(out, "out", Variable, op_name)
 
@@ -63,18 +70,16 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
     if binary_op and x.dtype != y.dtype:
         raise ValueError(
             "(InvalidArgument) The DataType of %s Op's Variable must be consistent, but received %s and %s."
-            % (op_name, x.dtype, y.dtype))
+            % (op_name, x.dtype, y.dtype)
+        )
 
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     if binary_op:
-        helper.append_op(type=op_name,
-                         inputs={
-                             "X": x,
-                             "Y": y
-                         },
-                         outputs={"Out": out})
+        helper.append_op(
+            type=op_name, inputs={"X": x, "Y": y}, outputs={"Out": out}
+        )
     else:
         helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out})
 
@@ -116,12 +121,9 @@ def logical_and(x, y, out=None, name=None):
     if in_dygraph_mode():
         return _C_ops.logical_and(x, y)
 
-    return _logical_op(op_name="logical_and",
-                       x=x,
-                       y=y,
-                       name=name,
-                       out=out,
-                       binary_op=True)
+    return _logical_op(
+        op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True
+    )
 
 
 def logical_or(x, y, out=None, name=None):
@@ -136,7 +138,7 @@ def logical_or(x, y, out=None, name=None):
 
     .. note::
         ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`.
-    
+
     Args:
         x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64.
         y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64.
@@ -161,12 +163,9 @@ def logical_or(x, y, out=None, name=None):
     """
     if in_dygraph_mode():
         return _C_ops.logical_or(x, y)
-    return _logical_op(op_name="logical_or",
-                       x=x,
-                       y=y,
-                       name=name,
-                       out=out,
-                       binary_op=True)
+    return _logical_op(
+        op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True
+    )
 
 
 def logical_xor(x, y, out=None, name=None):
@@ -207,12 +206,9 @@ def logical_xor(x, y, out=None, name=None):
     if in_dygraph_mode():
         return _C_ops.logical_xor(x, y)
 
-    return _logical_op(op_name="logical_xor",
-                       x=x,
-                       y=y,
-                       name=name,
-                       out=out,
-                       binary_op=True)
+    return _logical_op(
+        op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True
+    )
 
 
 @templatedoc()
@@ -245,12 +241,9 @@ def logical_not(x, out=None, name=None):
     """
     if in_dygraph_mode():
         return _C_ops.logical_not(x)
-    return _logical_op(op_name="logical_not",
-                       x=x,
-                       y=None,
-                       name=name,
-                       out=out,
-                       binary_op=False)
+    return _logical_op(
+        op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False
+    )
 
 
 def is_empty(x, name=None):
@@ -288,16 +281,17 @@ def is_empty(x, name=None):
     if _in_legacy_dygraph():
         return _legacy_C_ops.is_empty(x)
 
-    check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
-                             'is_empty')
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'is_empty'
+    )
     check_type(name, "name", (str, type(None)), "is_empty")
 
     helper = LayerHelper("is_empty", **locals())
     cond = helper.create_variable_for_type_inference(dtype='bool')
     cond.stop_gradient = True
-    helper.append_op(type='is_empty',
-                     inputs={'X': [x]},
-                     outputs={'Out': [cond]})
+    helper.append_op(
+        type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]}
+    )
     return cond
 
 
@@ -305,7 +299,7 @@ def equal_all(x, y, name=None):
     """
     Returns the truth value of :math:`x == y`. True if two inputs have the same elements, False otherwise.
 
-    Note: 
+    Note:
         The output has no gradient.
 
     Args:
@@ -338,12 +332,9 @@ def equal_all(x, y, name=None):
 
     helper = LayerHelper("equal_all", **locals())
     out = helper.create_variable_for_type_inference(dtype='bool')
-    helper.append_op(type='equal_all',
-                     inputs={
-                         'X': [x],
-                         'Y': [y]
-                     },
-                     outputs={'Out': [out]})
+    helper.append_op(
+        type='equal_all', inputs={'X': [x], 'Y': [y]}, outputs={'Out': [out]}
+    )
     return out
 
 
@@ -396,12 +387,15 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
         # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because
         # C++ backend will cast it into float32 if passing float from python.
         as_tensor = lambda x: paddle.to_tensor(
-            [x], dtype='float64', place='cpu')
-        return _C_ops.allclose(x, y, as_tensor(rtol), as_tensor(atol),
-                               equal_nan)
+            [x], dtype='float64', place='cpu'
+        )
+        return _C_ops.allclose(
+            x, y, as_tensor(rtol), as_tensor(atol), equal_nan
+        )
     if _in_legacy_dygraph():
-        return _legacy_C_ops.allclose(x, y, 'rtol', str(rtol), 'atol',
-                                      str(atol), 'equal_nan', equal_nan)
+        return _legacy_C_ops.allclose(
+            x, y, 'rtol', str(rtol), 'atol', str(atol), 'equal_nan', equal_nan
+        )
     check_variable_and_dtype(x, "input", ['float32', 'float64'], 'allclose')
     check_variable_and_dtype(y, "input", ['float32', 'float64'], 'allclose')
     check_type(rtol, 'rtol', float, 'allclose')
@@ -414,10 +408,9 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     inputs = {'Input': x, 'Other': y}
     outputs = {'Out': out}
     attrs = {'rtol': str(rtol), 'atol': str(atol), 'equal_nan': equal_nan}
-    helper.append_op(type='allclose',
-                     inputs=inputs,
-                     outputs=outputs,
-                     attrs=attrs)
+    helper.append_op(
+        type='allclose', inputs=inputs, outputs=outputs, attrs=attrs
+    )
 
     return out
 
@@ -428,7 +421,7 @@ def equal(x, y, name=None):
 
     This layer returns the truth value of :math:`x == y` elementwise.
 
-    Note: 
+    Note:
         The output has no gradient.
 
     Args:
@@ -439,7 +432,7 @@ def equal(x, y, name=None):
 
     Returns:
         Tensor: output Tensor, it's shape is the same as the input's Tensor,
-        and the data type is bool. The result of this op is stop_gradient. 
+        and the data type is bool. The result of this op is stop_gradient.
 
     Examples:
         .. code-block:: python
@@ -453,8 +446,10 @@ def equal(x, y, name=None):
     """
     if not isinstance(y, (int, bool, float, Variable)):
         raise TypeError(
-            "Type of input args must be float, bool, int or Tensor, but received type {}"
-            .format(type(y)))
+            "Type of input args must be float, bool, int or Tensor, but received type {}".format(
+                type(y)
+            )
+        )
     if not isinstance(y, Variable):
         y = full(shape=[1], dtype=x.dtype, fill_value=y)
 
@@ -466,21 +461,26 @@ def equal(x, y, name=None):
             return _legacy_C_ops.equal(x, y)
         else:
             check_variable_and_dtype(
-                x, "x", ["bool", "float32", "float64", "int32", "int64"],
-                "equal")
+                x,
+                "x",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "equal",
+            )
             check_variable_and_dtype(
-                y, "y", ["bool", "float32", "float64", "int32", "int64"],
-                "equal")
+                y,
+                "y",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "equal",
+            )
             helper = LayerHelper("equal", **locals())
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(type='equal',
-                             inputs={
-                                 'X': [x],
-                                 'Y': [y]
-                             },
-                             outputs={'Out': [out]})
+            helper.append_op(
+                type='equal',
+                inputs={'X': [x], 'Y': [y]},
+                outputs={'Out': [out]},
+            )
             return out
 
 
@@ -489,7 +489,7 @@ def greater_equal(x, y, name=None):
     """
     Returns the truth value of :math:`x >= y` elementwise, which is equivalent function to the overloaded operator `>=`.
 
-    Note: 
+    Note:
         The output has no gradient.
 
     Args:
@@ -518,21 +518,26 @@ def greater_equal(x, y, name=None):
             return _legacy_C_ops.greater_equal(x, y)
         else:
             check_variable_and_dtype(
-                x, "x", ["bool", "float32", "float64", "int32", "int64"],
-                "greater_equal")
+                x,
+                "x",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "greater_equal",
+            )
             check_variable_and_dtype(
-                y, "y", ["bool", "float32", "float64", "int32", "int64"],
-                "greater_equal")
+                y,
+                "y",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "greater_equal",
+            )
             helper = LayerHelper("greater_equal", **locals())
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(type='greater_equal',
-                             inputs={
-                                 'X': [x],
-                                 'Y': [y]
-                             },
-                             outputs={'Out': [out]})
+            helper.append_op(
+                type='greater_equal',
+                inputs={'X': [x], 'Y': [y]},
+                outputs={'Out': [out]},
+            )
             return out
 
 
@@ -541,7 +546,7 @@ def greater_than(x, y, name=None):
     """
     Returns the truth value of :math:`x > y` elementwise, which is equivalent function to the overloaded operator `>`.
 
-    Note: 
+    Note:
         The output has no gradient.
 
     Args:
@@ -569,21 +574,26 @@ def greater_than(x, y, name=None):
             return _legacy_C_ops.greater_than(x, y)
         else:
             check_variable_and_dtype(
-                x, "x", ["bool", "float32", "float64", "int32", "int64"],
-                "greater_than")
+                x,
+                "x",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "greater_than",
+            )
             check_variable_and_dtype(
-                y, "y", ["bool", "float32", "float64", "int32", "int64"],
-                "greater_than")
+                y,
+                "y",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "greater_than",
+            )
             helper = LayerHelper("greater_than", **locals())
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(type='greater_than',
-                             inputs={
-                                 'X': [x],
-                                 'Y': [y]
-                             },
-                             outputs={'Out': [out]})
+            helper.append_op(
+                type='greater_than',
+                inputs={'X': [x], 'Y': [y]},
+                outputs={'Out': [out]},
+            )
             return out
 
 
@@ -592,7 +602,7 @@ def less_equal(x, y, name=None):
     """
     Returns the truth value of :math:`x <= y` elementwise, which is equivalent function to the overloaded operator `<=`.
 
-    Note: 
+    Note:
         The output has no gradient.
 
     Args:
@@ -622,21 +632,26 @@ def less_equal(x, y, name=None):
             return _legacy_C_ops.less_equal(x, y)
         else:
             check_variable_and_dtype(
-                x, "x", ["bool", "float32", "float64", "int32", "int64"],
-                "less_equal")
+                x,
+                "x",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "less_equal",
+            )
             check_variable_and_dtype(
-                y, "y", ["bool", "float32", "float64", "int32", "int64"],
-                "less_equal")
+                y,
+                "y",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "less_equal",
+            )
             helper = LayerHelper("less_equal", **locals())
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(type='less_equal',
-                             inputs={
-                                 'X': [x],
-                                 'Y': [y]
-                             },
-                             outputs={'Out': [out]})
+            helper.append_op(
+                type='less_equal',
+                inputs={'X': [x], 'Y': [y]},
+                outputs={'Out': [out]},
+            )
             return out
 
 
@@ -645,7 +660,7 @@ def less_than(x, y, name=None):
     """
     Returns the truth value of :math:`x < y` elementwise, which is equivalent function to the overloaded operator `<`.
 
-    Note: 
+    Note:
         The output has no gradient.
 
     Args:
@@ -675,21 +690,26 @@ def less_than(x, y, name=None):
             return _legacy_C_ops.less_than(x, y)
         else:
             check_variable_and_dtype(
-                x, "x", ["bool", "float32", "float64", "int32", "int64"],
-                "less_than")
+                x,
+                "x",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "less_than",
+            )
             check_variable_and_dtype(
-                y, "y", ["bool", "float32", "float64", "int32", "int64"],
-                "less_than")
+                y,
+                "y",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "less_than",
+            )
             helper = LayerHelper("less_than", **locals())
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(type='less_than',
-                             inputs={
-                                 'X': [x],
-                                 'Y': [y]
-                             },
-                             outputs={'Out': [out]})
+            helper.append_op(
+                type='less_than',
+                inputs={'X': [x], 'Y': [y]},
+                outputs={'Out': [out]},
+            )
             return out
 
 
@@ -697,8 +717,8 @@ def less_than(x, y, name=None):
 def not_equal(x, y, name=None):
     """
     Returns the truth value of :math:`x != y` elementwise, which is equivalent function to the overloaded operator `!=`.
-    
-    Note: 
+
+    Note:
         The output has no gradient.
 
     Args:
@@ -728,21 +748,26 @@ def not_equal(x, y, name=None):
             return _legacy_C_ops.not_equal(x, y)
         else:
             check_variable_and_dtype(
-                x, "x", ["bool", "float32", "float64", "int32", "int64"],
-                "not_equal")
+                x,
+                "x",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "not_equal",
+            )
             check_variable_and_dtype(
-                y, "y", ["bool", "float32", "float64", "int32", "int64"],
-                "not_equal")
+                y,
+                "y",
+                ["bool", "float32", "float64", "int32", "int64"],
+                "not_equal",
+            )
             helper = LayerHelper("not_equal", **locals())
             out = helper.create_variable_for_type_inference(dtype='bool')
             out.stop_gradient = True
 
-            helper.append_op(type='not_equal',
-                             inputs={
-                                 'X': [x],
-                                 'Y': [y]
-                             },
-                             outputs={'Out': [out]})
+            helper.append_op(
+                type='not_equal',
+                inputs={'X': [x], 'Y': [y]},
+                outputs={'Out': [out]},
+            )
             return out
 
 
@@ -769,7 +794,7 @@ def is_tensor(x):
             input3 = [1, 4]
             check = paddle.is_tensor(input3)
             print(check)  #False
-            
+
     """
     return isinstance(x, (Tensor, paddle.fluid.core.eager.Tensor))
 
@@ -789,11 +814,15 @@ def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True):
             return op(x)
 
     check_variable_and_dtype(
-        x, "x", ["bool", "uint8", "int8", "int16", "int32", "int64"], op_name)
+        x, "x", ["bool", "uint8", "int8", "int16", "int32", "int64"], op_name
+    )
     if y is not None:
         check_variable_and_dtype(
-            y, "y", ["bool", "uint8", "int8", "int16", "int32", "int64"],
-            op_name)
+            y,
+            "y",
+            ["bool", "uint8", "int8", "int16", "int32", "int64"],
+            op_name,
+        )
     if out is not None:
         check_type(out, "out", Variable, op_name)
 
@@ -805,12 +834,9 @@ def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True):
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     if binary_op:
-        helper.append_op(type=op_name,
-                         inputs={
-                             "X": x,
-                             "Y": y
-                         },
-                         outputs={"Out": out})
+        helper.append_op(
+            type=op_name, inputs={"X": x, "Y": y}, outputs={"Out": out}
+        )
     else:
         helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out})
 
@@ -821,7 +847,7 @@ def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True):
 def bitwise_and(x, y, out=None, name=None):
     """
     ${comment}
-    
+
     Args:
         x (Tensor): ${x_comment}
         y (Tensor): ${y_comment}
@@ -829,7 +855,7 @@ def bitwise_and(x, y, out=None, name=None):
 
     Returns:
         Tensor: ${out_comment}
-        
+
     Examples:
         .. code-block:: python
 
@@ -841,19 +867,16 @@ def bitwise_and(x, y, out=None, name=None):
     """
     if in_dygraph_mode() and out is None:
         return _C_ops.bitwise_and(x, y)
-    return _bitwise_op(op_name="bitwise_and",
-                       x=x,
-                       y=y,
-                       name=name,
-                       out=out,
-                       binary_op=True)
+    return _bitwise_op(
+        op_name="bitwise_and", x=x, y=y, name=name, out=out, binary_op=True
+    )
 
 
 @templatedoc()
 def bitwise_or(x, y, out=None, name=None):
     """
     ${comment}
-    
+
     Args:
         x (Tensor): ${x_comment}
         y (Tensor): ${y_comment}
@@ -874,12 +897,9 @@ def bitwise_or(x, y, out=None, name=None):
     if in_dygraph_mode() and out is None:
         return _C_ops.bitwise_or(x, y)
 
-    return _bitwise_op(op_name="bitwise_or",
-                       x=x,
-                       y=y,
-                       name=name,
-                       out=out,
-                       binary_op=True)
+    return _bitwise_op(
+        op_name="bitwise_or", x=x, y=y, name=name, out=out, binary_op=True
+    )
 
 
 @templatedoc()
@@ -906,12 +926,9 @@ def bitwise_xor(x, y, out=None, name=None):
     """
     if in_dygraph_mode() and out is None:
         return _C_ops.bitwise_xor(x, y)
-    return _bitwise_op(op_name="bitwise_xor",
-                       x=x,
-                       y=y,
-                       name=name,
-                       out=out,
-                       binary_op=True)
+    return _bitwise_op(
+        op_name="bitwise_xor", x=x, y=y, name=name, out=out, binary_op=True
+    )
 
 
 @templatedoc()
@@ -922,7 +939,7 @@ def bitwise_not(x, out=None, name=None):
     Args:
         x(Tensor):  ${x_comment}
         out(Tensor): ${out_comment}
-    
+
     Returns:
         Tensor: ${out_comment}
 
@@ -937,12 +954,9 @@ def bitwise_not(x, out=None, name=None):
     if in_dygraph_mode() and out is None:
         return _C_ops.bitwise_not(x)
 
-    return _bitwise_op(op_name="bitwise_not",
-                       x=x,
-                       y=None,
-                       name=name,
-                       out=out,
-                       binary_op=False)
+    return _bitwise_op(
+        op_name="bitwise_not", x=x, y=None, name=name, out=out, binary_op=False
+    )
 
 
 @templatedoc()
@@ -1001,11 +1015,13 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
         # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because
         # C++ backend will cast it into float32 if passing float from python.
         as_tensor = lambda x: paddle.to_tensor(
-            [x], dtype='float64', place='cpu')
+            [x], dtype='float64', place='cpu'
+        )
         return _C_ops.isclose(x, y, as_tensor(rtol), as_tensor(atol), equal_nan)
     if _in_legacy_dygraph():
-        return _legacy_C_ops.isclose(x, y, 'rtol', str(rtol), 'atol', str(atol),
-                                     'equal_nan', equal_nan)
+        return _legacy_C_ops.isclose(
+            x, y, 'rtol', str(rtol), 'atol', str(atol), 'equal_nan', equal_nan
+        )
 
     check_variable_and_dtype(x, "input", ['float32', 'float64'], 'isclose')
     check_variable_and_dtype(y, "input", ['float32', 'float64'], 'isclose')
@@ -1019,8 +1035,7 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
     inputs = {'Input': x, 'Other': y}
     outputs = {'Out': out}
     attrs = {'rtol': str(rtol), 'atol': str(atol), 'equal_nan': equal_nan}
-    helper.append_op(type='isclose',
-                     inputs=inputs,
-                     outputs=outputs,
-                     attrs=attrs)
+    helper.append_op(
+        type='isclose', inputs=inputs, outputs=outputs, attrs=attrs
+    )
     return out
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 5e05a93e905..66c5e3c65f5 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -17,12 +17,22 @@ from collections import Counter
 
 from ..static import Variable, device_guard
 from ..framework import core, in_dygraph_mode
-from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _non_static_mode
+from ..fluid.framework import (
+    _in_legacy_dygraph,
+    _in_eager_without_dygraph_check,
+    _non_static_mode,
+)
 from ..framework import LayerHelper
 from ..framework import OpProtoHolder, convert_np_dtype_to_dtype_, dygraph_only
-from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+from ..fluid.data_feeder import (
+    convert_dtype,
+    check_variable_and_dtype,
+    check_type,
+    check_dtype,
+)
 from ..fluid.layers import utils
 import numpy as np
+
 # TODO: define functions to manipulate a tensor
 from ..fluid.layers.nn import _elementwise_op_in_dygraph
 from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only
@@ -72,25 +82,50 @@ def cast(x, dtype):
         out = _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype)
         return out
 
-    check_variable_and_dtype(x, 'x', [
-        'bool', 'float16', 'float32', 'float64', 'int16', 'int32', 'int64',
-        'uint8', 'uint16'
-    ], 'cast')
-    check_dtype(dtype, 'dtype', [
-        'bool', 'float16', 'float32', 'float64', 'int8', 'int16', 'int32',
-        'int64', 'uint8', 'uint16'
-    ], 'cast')
+    check_variable_and_dtype(
+        x,
+        'x',
+        [
+            'bool',
+            'float16',
+            'float32',
+            'float64',
+            'int16',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+        ],
+        'cast',
+    )
+    check_dtype(
+        dtype,
+        'dtype',
+        [
+            'bool',
+            'float16',
+            'float32',
+            'float64',
+            'int8',
+            'int16',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+        ],
+        'cast',
+    )
 
     helper = LayerHelper('cast', **locals())
     out = helper.create_variable_for_type_inference(
-        dtype=dtype, stop_gradient=x.stop_gradient)
-    helper.append_op(type='cast',
-                     inputs={'X': [x]},
-                     outputs={'Out': [out]},
-                     attrs={
-                         'in_dtype': x.dtype,
-                         'out_dtype': out.dtype
-                     })
+        dtype=dtype, stop_gradient=x.stop_gradient
+    )
+    helper.append_op(
+        type='cast',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={'in_dtype': x.dtype, 'out_dtype': out.dtype},
+    )
     return out
 
 
@@ -128,7 +163,7 @@ def slice(input, axes, starts, ends):
                 ends = [-1, 1000]       # -1 denotes the reverse 0th position of dimension 0.
             Then:
                 result = [ [2, 3, 4], ] # result = data[0:1, 1:4]
-    
+
     Args:
         input (Tensor): A ``Tensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``.
         axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to .
@@ -175,7 +210,8 @@ def slice(input, axes, starts, ends):
             axes = list(axes)
             if len(axes) == 0:
                 raise ValueError(
-                    "Input axes should not be an empty list/tuple.")
+                    "Input axes should not be an empty list/tuple."
+                )
             for i in range(len(axes)):
                 if axes[i] < 0:
                     axes[i] = max(0, axes[i] + len(input.shape))
@@ -184,8 +220,10 @@ def slice(input, axes, starts, ends):
 
         else:
             raise ValueError(
-                "Input axes must be a python list or tuple, but reveived {}".
-                format(type(axes)))
+                "Input axes must be a python list or tuple, but received {}".format(
+                    type(axes)
+                )
+            )
 
         infer_flags = list(1 for i in range(len(axes)))
 
@@ -194,7 +232,8 @@ def slice(input, axes, starts, ends):
         if isinstance(starts, (list, tuple)):
             starts = [
                 item.numpy().item(0)
-                if isinstance(item, tmp_tensor_type) else item
+                if isinstance(item, tmp_tensor_type)
+                else item
                 for item in starts
             ]
         elif isinstance(starts, tmp_tensor_type):
@@ -205,7 +244,9 @@ def slice(input, axes, starts, ends):
         if isinstance(ends, (list, tuple)):
             ends = [
                 item.numpy().item(0)
-                if isinstance(item, tmp_tensor_type) else item for item in ends
+                if isinstance(item, tmp_tensor_type)
+                else item
+                for item in ends
             ]
         elif isinstance(ends, tmp_tensor_type):
             tensor_t = ends.numpy()
@@ -223,7 +264,8 @@ def slice(input, axes, starts, ends):
                 axes = list(axes)
                 if len(axes) == 0:
                     raise ValueError(
-                        "Input axes should not be an empty list/tuple.")
+                        "Input axes should not be an empty list/tuple."
+                    )
                 for i in range(len(axes)):
                     if axes[i] < 0:
                         axes[i] = max(0, axes[i] + len(input.shape))
@@ -232,8 +274,10 @@ def slice(input, axes, starts, ends):
 
             else:
                 raise ValueError(
-                    "Input axes must be a python list or tuple, but reveived {}"
-                    .format(type(axes)))
+                    "Input axes must be a python list or tuple, but received {}".format(
+                        type(axes)
+                    )
+                )
 
             infer_flags = list(1 for i in range(len(axes)))
 
@@ -242,7 +286,8 @@ def slice(input, axes, starts, ends):
             if isinstance(starts, (list, tuple)):
                 starts = [
                     item.numpy().item(0)
-                    if isinstance(item, tmp_tensor_type) else item
+                    if isinstance(item, tmp_tensor_type)
+                    else item
                     for item in starts
                 ]
                 attrs += ('starts', starts)
@@ -254,7 +299,8 @@ def slice(input, axes, starts, ends):
             if isinstance(ends, (list, tuple)):
                 ends = [
                     item.numpy().item(0)
-                    if isinstance(item, tmp_tensor_type) else item
+                    if isinstance(item, tmp_tensor_type)
+                    else item
                     for item in ends
                 ]
                 attrs += ('ends', ends)
@@ -263,16 +309,27 @@ def slice(input, axes, starts, ends):
                 ends_tensor.stop_gradient = True
                 infer_flags = list(-1 for i in range(len(axes)))
 
-            return _legacy_C_ops.slice(input, starts_tensor, ends_tensor, None,
-                                       None, 'axes', axes, 'infer_flags',
-                                       infer_flags, *attrs)
+            return _legacy_C_ops.slice(
+                input,
+                starts_tensor,
+                ends_tensor,
+                None,
+                None,
+                'axes',
+                axes,
+                'infer_flags',
+                infer_flags,
+                *attrs,
+            )
 
     if not isinstance(starts, (list, tuple, Variable)):
         raise ValueError(
-            "Input starts must be an Variable, python list or tuple.")
+            "Input starts must be an Variable, python list or tuple."
+        )
     if not isinstance(ends, (list, tuple, Variable)):
         raise ValueError(
-            "Input ends must be an Variable, python list or tuple.")
+            "Input ends must be an Variable, python list or tuple."
+        )
 
     helper = LayerHelper('slice', **locals())
 
@@ -319,11 +376,11 @@ def slice(input, axes, starts, ends):
     # infer_flags
     attrs['infer_flags'] = infer_flags
     out = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype('input'))
-    helper.append_op(type='slice',
-                     inputs=inputs,
-                     attrs=attrs,
-                     outputs={'Out': out})
+        dtype=helper.input_dtype('input')
+    )
+    helper.append_op(
+        type='slice', inputs=inputs, attrs=attrs, outputs={'Out': out}
+    )
 
     return out
 
@@ -385,10 +442,21 @@ def transpose(x, perm, name=None):
             out, _ = _legacy_C_ops.transpose2(x, 'axis', perm)
             return out
 
-    check_variable_and_dtype(x, 'x', [
-        'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64',
-        'complex128'
-    ], 'transpose')
+    check_variable_and_dtype(
+        x,
+        'x',
+        [
+            'bool',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'complex64',
+            'complex128',
+        ],
+        'transpose',
+    )
     check_type(perm, 'perm', (list, tuple), 'transpose')
     if isinstance(perm, tuple):
         perm = list(perm)
@@ -397,32 +465,33 @@ def transpose(x, perm, name=None):
             "Input(perm) is the permutation of dimensions of Input(x), "
             "its length should be equal to dimensions of Input(x), "
             "but received dimension of Input(x) is %s, "
-            "the length of Input(perm) is %s." % (len(x.shape), len(perm)))
+            "the length of Input(perm) is %s." % (len(x.shape), len(perm))
+        )
     for idx, dim in enumerate(perm):
         if dim >= len(x.shape):
             raise ValueError(
                 "Each element in Input(perm) should be less than Input(x)'s dimension, "
                 "but %d-th element in Input(perm) is %d which exceeds Input(x)'s "
-                "dimension %d." % (idx, perm[idx], len(x.shape)))
+                "dimension %d." % (idx, perm[idx], len(x.shape))
+            )
 
     helper = LayerHelper('transpose', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='transpose2',
-                     inputs={'X': [x]},
-                     outputs={
-                         'Out': [out],
-                         'XShape': [x_shape]
-                     },
-                     attrs={'axis': perm})
+    helper.append_op(
+        type='transpose2',
+        inputs={'X': [x]},
+        outputs={'Out': [out], 'XShape': [x_shape]},
+        attrs={'axis': perm},
+    )
     return out
 
 
 def unstack(x, axis=0, num=None):
     """
     :alias_main: paddle.unstack
-	:alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack
-	:old_api: paddle.fluid.layers.unstack
+    :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack
+    :old_api: paddle.fluid.layers.unstack
 
     **UnStack Layer**
 
@@ -477,13 +546,12 @@ def unstack(x, axis=0, num=None):
     for _ in range(num):
         outs.append(helper.create_variable_for_type_inference(x.dtype))
 
-    helper.append_op(type='unstack',
-                     inputs={'X': [x]},
-                     outputs={'Y': outs},
-                     attrs={
-                         'axis': axis,
-                         'num': num
-                     })
+    helper.append_op(
+        type='unstack',
+        inputs={'X': [x]},
+        outputs={'Y': outs},
+        attrs={'axis': axis, 'num': num},
+    )
     return outs
 
 
@@ -505,7 +573,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
     For each value `v` in `input`, we reset it to a new value according to the
     following formula:
     ::
-   
+
         v = v - shard_id * shard_size if shard_id * shard_size <= v < (shard_id+1) * shard_size else ignore_value
 
     That is, the value `v` is set to the new offset within the range represented by the shard `shard_id`
@@ -534,27 +602,31 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1):
             # [[-1], [1]]
     """
     if in_dygraph_mode():
-        return _C_ops.shard_index(input, index_num, nshards, shard_id,
-                                  ignore_value)
+        return _C_ops.shard_index(
+            input, index_num, nshards, shard_id, ignore_value
+        )
 
     check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index')
     op_type = 'shard_index'
     helper = LayerHelper(op_type, **locals())
     if shard_id < 0 or shard_id >= nshards:
-        raise ValueError('The shard_id(%d) should be in [0, %d)' %
-                         (shard_id, nshards))
+        raise ValueError(
+            'The shard_id(%d) should be in [0, %d)' % (shard_id, nshards)
+        )
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(type=op_type,
-                     inputs={'X': [input]},
-                     outputs={'Out': out},
-                     attrs={
-                         'index_num': index_num,
-                         'nshards': nshards,
-                         'shard_id': shard_id,
-                         'ignore_value': ignore_value
-                     },
-                     stop_gradient=True)
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [input]},
+        outputs={'Out': out},
+        attrs={
+            'index_num': index_num,
+            'nshards': nshards,
+            'shard_id': shard_id,
+            'ignore_value': ignore_value,
+        },
+        stop_gradient=True,
+    )
     return out
 
 
@@ -646,11 +718,13 @@ def crop(x, shape=None, offsets=None, name=None):
     """
 
     helper = LayerHelper('crop_tensor', **locals())
-    check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
-                             'crop_tensor')
+    check_variable_and_dtype(
+        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'crop_tensor'
+    )
     check_type(shape, 'shape', (list, tuple, Variable), 'crop_tensor')
-    check_type(offsets, 'offsets', (list, tuple, Variable, type(None)),
-               'crop_tensor')
+    check_type(
+        offsets, 'offsets', (list, tuple, Variable, type(None)), 'crop_tensor'
+    )
 
     if offsets is None:
         offsets = [0] * len(x.shape)
@@ -666,25 +740,30 @@ def crop(x, shape=None, offsets=None, name=None):
         if not isinstance(shape_val, int):
             raise TypeError(
                 "Attr(shape)'s dtype of Op(crop_tensor) should be int32, but received: %s."
-                % type(shape_val))
+                % type(shape_val)
+            )
         if shape_val == 0:
             raise ValueError(
                 "Attr(shape) of Op(crop_tensor) should not be zero, but received: %s."
-                % str(shape_val))
+                % str(shape_val)
+            )
         if shape_val < -1:
             raise ValueError(
                 "When the element in Attr(shape) of Op(crop_tensor) is negative, only -1 is supported, but received: %s."
-                % str(shape_val))
+                % str(shape_val)
+            )
 
     def _attr_offsets_check(offset_val):
         if not isinstance(offset_val, int):
             raise TypeError(
                 "Attr(offsets)'s dtype of Op(crop_tensor) should be int32, but received: %s."
-                % type(offset_val))
+                % type(offset_val)
+            )
         if offset_val < 0:
             raise ValueError(
                 "Attr(offsets) of Op(crop_tensor) should be greater or equal to zero, but received: %s."
-                % str(offset_val))
+                % str(offset_val)
+            )
 
     if isinstance(offsets, Variable):
         offsets.stop_gradient = True
@@ -725,11 +804,9 @@ def crop(x, shape=None, offsets=None, name=None):
             else:
                 _attr_shape_check(dim_size)
                 temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant([1],
-                              'int32',
-                              dim_size,
-                              force_cpu=True,
-                              out=temp_out)
+                fill_constant(
+                    [1], 'int32', dim_size, force_cpu=True, out=temp_out
+                )
                 new_shape_tensor.append(temp_out)
                 shape_attr.append(dim_size)
         ipts['ShapeTensor'] = new_shape_tensor
@@ -739,10 +816,12 @@ def crop(x, shape=None, offsets=None, name=None):
             _attr_shape_check(dim_size)
         attrs['shape'] = shape
 
-    helper.append_op(type='crop_tensor',
-                     inputs=ipts,
-                     outputs={'Out': out},
-                     attrs=None if len(attrs) == 0 else attrs)
+    helper.append_op(
+        type='crop_tensor',
+        inputs=ipts,
+        outputs={'Out': out},
+        attrs=None if len(attrs) == 0 else attrs,
+    )
     return out
 
 
@@ -774,13 +853,15 @@ def fill_(x, value):
     """
     if not isinstance(value, (float, int)):
         raise TypeError(
-            "The type of 'value'  must be int or float, but received %s." %
-            (type(value)))
+            "The type of 'value'  must be int or float, but received %s."
+            % (type(value))
+        )
     if in_dygraph_mode():
         return _C_ops.fill_(x, value)
     else:
-        return _legacy_C_ops.fill_any_(x, "value_float", float(value),
-                                       "value_int", int(value))
+        return _legacy_C_ops.fill_any_(
+            x, "value_float", float(value), "value_int", int(value)
+        )
 
 
 @dygraph_only
@@ -809,10 +890,11 @@ def zero_(x):
 
     """
     if in_dygraph_mode():
-        return _C_ops.fill_(x, 0.)
+        return _C_ops.fill_(x, 0.0)
     else:
-        return _legacy_C_ops.fill_any_(x, "value_float", 0., "value_int",
-                                       int(0))
+        return _legacy_C_ops.fill_any_(
+            x, "value_float", 0.0, "value_int", int(0)
+        )
 
 
 @dygraph_only
@@ -820,16 +902,16 @@ def fill_diagonal_(x, value, offset=0, wrap=False, name=None):
     """
     Note:
         This API is ONLY available in Dygraph mode.
-	
+
     This function fill the value into the x Tensor's diagonal inplace.
-    
+
     Args:
         x(Tensor): ``x`` is the original Tensor
         value(Scale): ``value`` is the value to filled in x
         offset(int,optional): the offset to the main diagonal. Default: 0 (main diagonal).
         wrap(bool,optional): the diagonal 'wrapped' after N columns for tall matrices.
         name(str,optional): Name for the operation (optional, default is None)
-    
+
     Returns:
         Tensor: Tensor with diagonal filled with value.
 
@@ -844,39 +926,45 @@ def fill_diagonal_(x, value, offset=0, wrap=False, name=None):
     helper = LayerHelper("fill_diagonal_", **locals())
     check_type(x, 'X', (Variable), 'fill_diagonal_')
     dtype = helper.input_dtype('x')
-    check_dtype(dtype, 'X',
-                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-                'fill_diagonal_')
+    check_dtype(
+        dtype,
+        'X',
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        'fill_diagonal_',
+    )
     check_type(value, 'value', (bool, int, float), 'fill_diagonal_')
     check_type(wrap, 'wrap', (bool), 'fill_diagonal_')
 
     inshape = x.shape
     inshapeset = set(inshape)
-    assert len(inshape) >= 2, ('Tensor dims should >= 2 in fill_diagonal_ API')
+    assert len(inshape) >= 2, 'Tensor dims should >= 2 in fill_diagonal_ API'
     if len(inshape) > 2:
-        assert len(inshapeset) == 1, (
-            'Tensor dims should be equal while input dims > 2 in fill_diagonal_ API'
-        )
+        assert (
+            len(inshapeset) == 1
+        ), 'Tensor dims should be equal while input dims > 2 in fill_diagonal_ API'
     if in_dygraph_mode():
         if len(inshape) == 2:
             return _C_ops.fill_diagonal_(x, value, offset, wrap)
         return _C_ops.fill_diagonal_(x, value, offset, True)
 
     if len(inshape) == 2:
-        return _legacy_C_ops.fill_diagonal_(x, 'value', value, 'offset', offset,
-                                            'wrap', wrap)
-    return _legacy_C_ops.fill_diagonal_(x, 'value', value, 'offset', offset,
-                                        'wrap', True)
+        return _legacy_C_ops.fill_diagonal_(
+            x, 'value', value, 'offset', offset, 'wrap', wrap
+        )
+    return _legacy_C_ops.fill_diagonal_(
+        x, 'value', value, 'offset', offset, 'wrap', True
+    )
 
 
 def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False):
     inshape = x.shape
-    assert dim1 < len(inshape) and dim1 >= -len(inshape), (
-        'dim1 should between [-rank,rank) in fill_diagonal_tensor_')
-    assert dim2 < len(inshape) and dim2 >= -len(inshape), (
-        'dim2 should between [-rank,rank) in fill_diagonal_tensor_')
-    assert len(inshape) >= 2, (
-        'Tensor dims should >= 2 in fill_diagonal_tensor_')
+    assert dim1 < len(inshape) and dim1 >= -len(
+        inshape
+    ), 'dim1 should between [-rank,rank) in fill_diagonal_tensor_'
+    assert dim2 < len(inshape) and dim2 >= -len(
+        inshape
+    ), 'dim2 should between [-rank,rank) in fill_diagonal_tensor_'
+    assert len(inshape) >= 2, 'Tensor dims should >= 2 in fill_diagonal_tensor_'
     dim1 %= len(inshape)
     dim2 %= len(inshape)
 
@@ -884,11 +972,14 @@ def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False):
     for i in range(len(inshape)):
         if i != dim1 and i != dim2:
             predshape.append(inshape[i])
-    diaglen = min(min(inshape[dim1], inshape[dim1] + offset),
-                  min(inshape[dim2], inshape[dim2] - offset))
+    diaglen = min(
+        min(inshape[dim1], inshape[dim1] + offset),
+        min(inshape[dim2], inshape[dim2] - offset),
+    )
     predshape.append(diaglen)
     assert tuple(predshape) == tuple(
-        y.shape), ("the y shape should be {}".format(predshape))
+        y.shape
+    ), "the y shape should be {}".format(predshape)
     if len(y.shape) == 1:
         y = y.reshape([1, -1])
 
@@ -896,14 +987,15 @@ def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False):
         if in_dygraph_mode():
             return _C_ops.fill_diagonal_tensor_(x, y, offset, dim1, dim2)
         else:
-            return _legacy_C_ops.fill_diagonal_tensor_(x, y, 'offset', offset,
-                                                       'dim1', dim1, 'dim2',
-                                                       dim2)
+            return _legacy_C_ops.fill_diagonal_tensor_(
+                x, y, 'offset', offset, 'dim1', dim1, 'dim2', dim2
+            )
     if in_dygraph_mode():
         return _C_ops.fill_diagonal_tensor(x, y, offset, dim1, dim2)
     else:
-        return _legacy_C_ops.fill_diagonal_tensor(x, y, 'offset', offset,
-                                                  'dim1', dim1, 'dim2', dim2)
+        return _legacy_C_ops.fill_diagonal_tensor(
+            x, y, 'offset', offset, 'dim1', dim1, 'dim2', dim2
+        )
 
 
 def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None):
@@ -935,12 +1027,9 @@ def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None):
             print(x.tolist())   #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]]
 
     """
-    return _fill_diagonal_tensor_impl(x,
-                                      y,
-                                      offset=offset,
-                                      dim1=dim1,
-                                      dim2=dim2,
-                                      inplace=True)
+    return _fill_diagonal_tensor_impl(
+        x, y, offset=offset, dim1=dim1, dim2=dim2, inplace=True
+    )
 
 
 def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None):
@@ -969,12 +1058,9 @@ def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None):
             print(nx.tolist())   #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]]
 
     """
-    return _fill_diagonal_tensor_impl(x,
-                                      y,
-                                      offset=offset,
-                                      dim1=dim1,
-                                      dim2=dim2,
-                                      inplace=False)
+    return _fill_diagonal_tensor_impl(
+        x, y, offset=offset, dim1=dim1, dim2=dim2, inplace=False
+    )
 
 
 @dygraph_only
@@ -1017,7 +1103,7 @@ def concat(x, axis=0, name=None):
         x (list|tuple): ``x`` is a Tensor list or Tensor tuple which is with data type bool, float16,
             float32, float64, int32, int64, int8, uint8. All the Tensors in ``x`` must have same data type.
         axis (int|Tensor, optional): Specify the axis to operate on the input Tensors.
-            It's a scalar with data type int or a Tensor with shape [1] and data type int32 
+            It's a scalar with data type int or a Tensor with shape [1] and data type int32
             or int64. The effective range is [-R, R), where R is Rank(x). When ``axis < 0``,
             it works the same way as ``axis+R``. Default is 0.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
@@ -1027,9 +1113,9 @@ def concat(x, axis=0, name=None):
 
     Examples:
         .. code-block:: python
-            
+
             import paddle
-            
+
             x1 = paddle.to_tensor([[1, 2, 3],
                                    [4, 5, 6]])
             x2 = paddle.to_tensor([[11, 12, 13],
@@ -1073,10 +1159,21 @@ def concat(x, axis=0, name=None):
     check_type(input, 'input', (list, tuple, Variable), 'concat')
     if not isinstance(input, Variable):
         for id, x in enumerate(input):
-            check_variable_and_dtype(x, 'input[' + str(id) + ']', [
-                'bool', 'float16', 'float32', 'float64', 'int32', 'int64',
-                'int8', 'unit8'
-            ], 'concat')
+            check_variable_and_dtype(
+                x,
+                'input[' + str(id) + ']',
+                [
+                    'bool',
+                    'float16',
+                    'float32',
+                    'float64',
+                    'int32',
+                    'int64',
+                    'int8',
+                    'unit8',
+                ],
+                'concat',
+            )
             if x.dtype != input[0].dtype:
                 raise TypeError(
                     "All the Tensors in the input must have the same data type."
@@ -1087,8 +1184,11 @@ def concat(x, axis=0, name=None):
 
     if isinstance(axis, Variable):
         check_dtype(
-            axis.dtype, 'axis', ['int32', 'int64'], 'concat',
-            "The data type of axis must be int32 or int64 when axis is a Tensor"
+            axis.dtype,
+            'axis',
+            ['int32', 'int64'],
+            'concat',
+            "The data type of axis must be int32 or int64 when axis is a Tensor",
         )
 
     helper = LayerHelper('concat', **locals())
@@ -1099,19 +1199,17 @@ def concat(x, axis=0, name=None):
         # This feature is supported for Dynamic-to-Static, because after transformed, the type of inputs[0]
         # is LOD_TENSOR_ARRAY in some scenarios. And this feature can be used in static mode.
 
-        assert len(input) == 1, "If the elements of 'input' in concat are Variable(LoDTensorArray), " \
-                "number of the elements must be 1, but received %s." % len(input)
+        assert len(input) == 1, (
+            "If the elements of 'input' in concat are Variable(LoDTensorArray), "
+            "number of the elements must be 1, but received %s." % len(input)
+        )
         out_index = helper.create_variable_for_type_inference(dtype="int32")
-        helper.append_op(type='tensor_array_to_tensor',
-                         inputs={'X': input[0]},
-                         outputs={
-                             'Out': [out],
-                             'OutIndex': [out_index]
-                         },
-                         attrs={
-                             'axis': axis,
-                             'use_stack': False
-                         })
+        helper.append_op(
+            type='tensor_array_to_tensor',
+            inputs={'X': input[0]},
+            outputs={'Out': [out], 'OutIndex': [out_index]},
+            attrs={'axis': axis, 'use_stack': False},
+        )
     else:
         inputs = {'X': input}
         attrs = {}
@@ -1121,10 +1219,9 @@ def concat(x, axis=0, name=None):
         else:
             attrs['axis'] = axis
 
-        helper.append_op(type='concat',
-                         inputs=inputs,
-                         outputs={'Out': [out]},
-                         attrs=attrs)
+        helper.append_op(
+            type='concat', inputs=inputs, outputs={'Out': [out]}, attrs=attrs
+        )
     return out
 
 
@@ -1164,17 +1261,21 @@ def broadcast_tensors(input, name=None):
     check_type(input, 'input', (list, tuple), 'broadcast_tensors')
     if num_inputs < 1:
         raise TypeError(
-            "At least 1 tensor is needed to perform broadcast_tensors")
+            "At least 1 tensor is needed to perform broadcast_tensors"
+        )
 
     # Check input types
     for id, x in enumerate(input):
         check_variable_and_dtype(
-            x, 'input[' + str(id) + ']',
+            x,
+            'input[' + str(id) + ']',
             ['bool', 'float32', 'float64', 'int32', 'int64'],
-            'broadcast_tensors')
+            'broadcast_tensors',
+        )
         if x.dtype != input[0].dtype:
             raise TypeError(
-                "All the Tensors in the input must have the same data type.")
+                "All the Tensors in the input must have the same data type."
+            )
 
     # Check bcast semantics
     output_shape_r_last_tensor_index = []
@@ -1192,8 +1293,11 @@ def broadcast_tensors(input, name=None):
                 output_shape_r.append(shape[i])
                 output_shape_r_last_tensor_index.append(j)
             else:
-                invalid = (output_shape_r[i] != shape[i]
-                           and output_shape_r[i] != 1 and shape[i] != 1)
+                invalid = (
+                    output_shape_r[i] != shape[i]
+                    and output_shape_r[i] != 1
+                    and shape[i] != 1
+                )
                 if invalid:
                     last_index = output_shape_r_last_tensor_index[i]
                     raise TypeError(
@@ -1212,14 +1316,15 @@ def broadcast_tensors(input, name=None):
     while i < num_inputs:
         out.append(
             helper.create_variable_for_type_inference(
-                dtype=helper.input_dtype()))
+                dtype=helper.input_dtype()
+            )
+        )
         i += 1
 
     inputs = {'X': input}
-    helper.append_op(type='broadcast_tensors',
-                     inputs=inputs,
-                     outputs={'Out': out},
-                     attrs={})
+    helper.append_op(
+        type='broadcast_tensors', inputs=inputs, outputs={'Out': out}, attrs={}
+    )
 
     return out
 
@@ -1265,19 +1370,21 @@ def flip(x, axis, name=None):
     helper = LayerHelper("flip", **locals())
     check_type(x, 'X', (Variable), 'flip')
     dtype = helper.input_dtype('x')
-    check_dtype(dtype, 'X',
-                ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'],
-                'flip')
+    check_dtype(
+        dtype,
+        'X',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'],
+        'flip',
+    )
     check_type(axis, 'axis', (list, tuple), 'flip')
     if name is None:
         out = helper.create_variable_for_type_inference(dtype)
     else:
         out = helper.create_variable(name=name, dtype=dtype, persistable=False)
 
-    helper.append_op(type="flip",
-                     inputs={"X": x},
-                     outputs={"Out": out},
-                     attrs={"axis": axis})
+    helper.append_op(
+        type="flip", inputs={"X": x}, outputs={"Out": out}, attrs={"axis": axis}
+    )
     return out
 
 
@@ -1303,23 +1410,23 @@ def rot90(x, k=1, axes=[0, 1], name=None):
 
           data = paddle.arange(4)
           data = paddle.reshape(data, (2, 2))
-          print(data) 
+          print(data)
           #[[0, 1],
           # [2, 3]]
 
           y = paddle.rot90(data, 1, [0, 1])
-          print(y) 
+          print(y)
           #[[1, 3],
           # [0, 2]]
 
           y= paddle.rot90(data, -1, [0, 1])
-          print(y) 
+          print(y)
           #[[2, 0],
           # [3, 1]]
 
           data2 = paddle.arange(8)
           data2 = paddle.reshape(data2, (2,2,2))
-          print(data2) 
+          print(data2)
           #[[[0, 1],
           #  [2, 3]],
           # [[4, 5],
@@ -1336,9 +1443,12 @@ def rot90(x, k=1, axes=[0, 1], name=None):
     helper = LayerHelper("rot90", **locals())
     check_type(x, 'X', (Variable), 'rot90')
     dtype = helper.input_dtype('x')
-    check_dtype(dtype, 'X',
-                ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'],
-                'rot90')
+    check_dtype(
+        dtype,
+        'X',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'],
+        'rot90',
+    )
     check_type(axes, 'axes', (list, tuple), 'rot90')
 
     input_total_dims = len(x.shape)
@@ -1346,23 +1456,31 @@ def rot90(x, k=1, axes=[0, 1], name=None):
     if total_rot_dims != 2:
         raise ValueError(
             "expected total rotation axes == 2, but got axes = {}".format(
-                total_rot_dims))
+                total_rot_dims
+            )
+        )
     if input_total_dims < 2:
         raise ValueError(
             "expected total dims >= 2, but got total dims = {}".format(
-                input_total_dims))
+                input_total_dims
+            )
+        )
 
     if not (axes[0] != axes[1] and abs(axes[0] - axes[1]) != input_total_dims):
         raise ValueError(
-            "expected rotation axes to be different, but got axis0 = {}, and axis1 = {}"
-            .format(axes[0], axes[1]))
+            "expected rotation axes to be different, but got axis0 = {}, and axis1 = {}".format(
+                axes[0], axes[1]
+            )
+        )
 
     if not (axes[0] < input_total_dims and axes[0] >= -input_total_dims):
-        raise ValueError("Rotation axis0 out of range, axis0 = {}".format(
-            axes[0]))
+        raise ValueError(
+            "Rotation axis0 out of range, axis0 = {}".format(axes[0])
+        )
     if not (axes[1] < input_total_dims and axes[1] >= -input_total_dims):
-        raise ValueError("Rotation axis1 out of range, axis1 = {}".format(
-            axes[1]))
+        raise ValueError(
+            "Rotation axis1 out of range, axis1 = {}".format(axes[1])
+        )
 
     k %= 4
     if k == 0:
@@ -1371,8 +1489,10 @@ def rot90(x, k=1, axes=[0, 1], name=None):
         return flip(flip(x, axes[0]), axes[1])
 
     axes_list = list(range(0, input_total_dims))
-    (axes_list[axes[0]], axes_list[axes[1]]) = (axes_list[axes[1]],
-                                                axes_list[axes[0]])
+    (axes_list[axes[0]], axes_list[axes[1]]) = (
+        axes_list[axes[1]],
+        axes_list[axes[0]],
+    )
     if k == 1:
         return transpose(flip(x, axes[1]), axes_list)
     else:
@@ -1455,19 +1575,29 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None):
 
     if not paddle.in_dynamic_mode():
         check_variable_and_dtype(
-            x, 'x',
+            x,
+            'x',
             ['float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8'],
-            'flatten')
+            'flatten',
+        )
 
     x_dim = len(x.shape)
-    if not (isinstance(start_axis,
-                       int)) or (start_axis > x_dim - 1) or start_axis < -x_dim:
+    if (
+        not (isinstance(start_axis, int))
+        or (start_axis > x_dim - 1)
+        or start_axis < -x_dim
+    ):
         raise ValueError(
-            "The start_axis should be a int, and in range [-rank(x), rank(x))")
-    if not (isinstance(stop_axis,
-                       int)) or (stop_axis > x_dim - 1) or stop_axis < -x_dim:
+            "The start_axis should be a int, and in range [-rank(x), rank(x))"
+        )
+    if (
+        not (isinstance(stop_axis, int))
+        or (stop_axis > x_dim - 1)
+        or stop_axis < -x_dim
+    ):
         raise ValueError(
-            "The stop_axis should be a int, and in range [-rank(x), rank(x))")
+            "The stop_axis should be a int, and in range [-rank(x), rank(x))"
+        )
     if start_axis < 0:
         start_axis = start_axis + x_dim
     if stop_axis < 0:
@@ -1480,22 +1610,19 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None):
 
     if _in_legacy_dygraph():
         dy_out, _ = _legacy_C_ops.flatten_contiguous_range(
-            x, 'start_axis', start_axis, 'stop_axis', stop_axis)
+            x, 'start_axis', start_axis, 'stop_axis', stop_axis
+        )
         return dy_out
 
     helper = LayerHelper('flatten', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='flatten_contiguous_range',
-                     inputs={"X": x},
-                     outputs={
-                         'Out': out,
-                         'XShape': x_shape
-                     },
-                     attrs={
-                         "start_axis": start_axis,
-                         "stop_axis": stop_axis
-                     })
+    helper.append_op(
+        type='flatten_contiguous_range',
+        inputs={"X": x},
+        outputs={'Out': out, 'XShape': x_shape},
+        attrs={"start_axis": start_axis, "stop_axis": stop_axis},
+    )
     return out
 
 
@@ -1509,14 +1636,22 @@ def flatten_(x, start_axis=0, stop_axis=-1, name=None):
         raise ValueError("The input x should be a Tensor")
 
     x_dim = len(x.shape)
-    if not (isinstance(start_axis,
-                       int)) or (start_axis > x_dim - 1) or start_axis < -x_dim:
+    if (
+        not (isinstance(start_axis, int))
+        or (start_axis > x_dim - 1)
+        or start_axis < -x_dim
+    ):
         raise ValueError(
-            "The start_axis should be a int, and in range [-rank(x), rank(x))")
-    if not (isinstance(stop_axis,
-                       int)) or (stop_axis > x_dim - 1) or stop_axis < -x_dim:
+            "The start_axis should be a int, and in range [-rank(x), rank(x))"
+        )
+    if (
+        not (isinstance(stop_axis, int))
+        or (stop_axis > x_dim - 1)
+        or stop_axis < -x_dim
+    ):
         raise ValueError(
-            "The stop_axis should be a int, and in range [-rank(x), rank(x))")
+            "The stop_axis should be a int, and in range [-rank(x), rank(x))"
+        )
     if start_axis < 0:
         start_axis = start_axis + x_dim
     if stop_axis < 0:
@@ -1529,15 +1664,16 @@ def flatten_(x, start_axis=0, stop_axis=-1, name=None):
 
     if _in_legacy_dygraph():
         dy_out, _ = _legacy_C_ops.flatten_contiguous_range_(
-            x, 'start_axis', start_axis, 'stop_axis', stop_axis)
+            x, 'start_axis', start_axis, 'stop_axis', stop_axis
+        )
         return dy_out
 
 
 def roll(x, shifts, axis=None, name=None):
     """
-    Roll the `x` tensor along the given axis(axes). With specific 'shifts', Elements that 
-    roll beyond the last position are re-introduced at the first according to 'shifts'. 
-    If a axis is not specified, 
+    Roll the `x` tensor along the given axis(axes). With specific 'shifts', Elements that
+    roll beyond the last position are re-introduced at the first according to 'shifts'.
+    If an axis is not specified,
     the tensor will be flattened before rolling and then restored to the original shape.
 
     Args:
@@ -1554,7 +1690,7 @@ def roll(x, shifts, axis=None, name=None):
 
     Examples:
         .. code-block:: python
-            
+
             import paddle
 
             x = paddle.to_tensor([[1.0, 2.0, 3.0],
@@ -1587,8 +1723,10 @@ def roll(x, shifts, axis=None, name=None):
         for i in range(len(axis)):
             if axis[i] >= len_origin_shape or axis[i] < -len_origin_shape:
                 raise ValueError(
-                    "axis is out of range, it should be in range [{}, {}), but received {}"
-                    .format(-len_origin_shape, len_origin_shape, axis))
+                    "axis is out of range, it should be in range [{}, {}), but received {}".format(
+                        -len_origin_shape, len_origin_shape, axis
+                    )
+                )
     else:
         axis = []
 
@@ -1604,34 +1742,32 @@ def roll(x, shifts, axis=None, name=None):
     out = helper.create_variable_for_type_inference(x.dtype)
 
     if isinstance(shifts, Variable):
-        helper.append_op(type='roll',
-                         inputs={
-                             'X': x,
-                             "ShiftsTensor": shifts
-                         },
-                         outputs={'Out': out},
-                         attrs={'axis': axis})
+        helper.append_op(
+            type='roll',
+            inputs={'X': x, "ShiftsTensor": shifts},
+            outputs={'Out': out},
+            attrs={'axis': axis},
+        )
     else:
         check_type(shifts, 'shifts', (list, tuple), 'roll')
-        helper.append_op(type='roll',
-                         inputs={'X': x},
-                         outputs={'Out': out},
-                         attrs={
-                             'axis': axis,
-                             'shifts': shifts
-                         })
+        helper.append_op(
+            type='roll',
+            inputs={'X': x},
+            outputs={'Out': out},
+            attrs={'axis': axis, 'shifts': shifts},
+        )
     return out
 
 
 def stack(x, axis=0, name=None):
     """
-    Stacks all the input tensors ``x`` along ``axis`` dimemsion. 
+    Stacks all the input tensors ``x`` along ``axis`` dimension.
     All tensors must be of the same shape and same dtype.
-    
-    For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked 
-    tensor is [N, A, B]; if ``axis == 1``, the shape of stacked 
+
+    For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked
+    tensor is [N, A, B]; if ``axis == 1``, the shape of stacked
     tensor is [A, N, B], etc.
-    
+
 
     .. code-block:: text
 
@@ -1679,35 +1815,35 @@ def stack(x, axis=0, name=None):
         x (list[Tensor]|tuple[Tensor]): Input ``x`` can be a ``list`` or ``tuple`` of tensors, the Tensors in ``x``
                                      must be of the same shape and dtype. Supported data types: float32, float64, int32, int64.
         axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
-                              where ``R`` is the number of dimensions of the first input tensor ``x[0]``. 
+                              where ``R`` is the number of dimensions of the first input tensor ``x[0]``.
                               If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
-        
+
     Returns:
         Tensor: The stacked tensor with same data type as input.
 
-    Example:    
+    Example:
         .. code-block:: python
 
             import paddle
-            
+
             x1 = paddle.to_tensor([[1.0, 2.0]])
             x2 = paddle.to_tensor([[3.0, 4.0]])
             x3 = paddle.to_tensor([[5.0, 6.0]])
-	    
+
             out = paddle.stack([x1, x2, x3], axis=0)
             print(out.shape)  # [3, 1, 2]
             print(out)
             # [[[1., 2.]],
             #  [[3., 4.]],
             #  [[5., 6.]]]
-	    
-	    out = paddle.stack([x1, x2, x3], axis=-2)
-	    print(out.shape)  # [1, 3, 2]
-	    print(out)
-	    # [[[1., 2.],
-	    #   [3., 4.],
-	    #   [5., 6.]]]
+
+            out = paddle.stack([x1, x2, x3], axis=-2)
+            print(out.shape)  # [1, 3, 2]
+            print(out)
+            # [[[1., 2.],
+            #   [3., 4.],
+            #   [5., 6.]]]
     """
     axis = 0 if axis is None else axis
 
@@ -1720,42 +1856,53 @@ def stack(x, axis=0, name=None):
     if not isinstance(x, list) and not isinstance(x, tuple):
         # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc.
         # In that case, Variable is array of tensors indeed.
-        if isinstance(x, Variable) and x.desc.type(
-        ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+        if (
+            isinstance(x, Variable)
+            and x.desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY
+        ):
             x = [x]
         else:
             raise TypeError(
-                "The type of '%s' in %s must be %s, but received %s" %
-                ('x', 'stack', 'list[Tensor], tuple[Tensor] or TensorArray',
-                 type(x)))
+                "The type of '%s' in %s must be %s, but received %s"
+                % (
+                    'x',
+                    'stack',
+                    'list[Tensor], tuple[Tensor] or TensorArray',
+                    type(x),
+                )
+            )
 
     helper = LayerHelper('stack', **locals())
 
     out = helper.create_variable_for_type_inference(x[0].dtype)
     if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
-        assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
-                            "number of the elements must be 1, but received %s." % len(x)
+        assert len(x) == 1, (
+            "If the elements of 'x' in stack are Variable(LoDTensorArray), "
+            "number of the elements must be 1, but received %s." % len(x)
+        )
         out_index = helper.create_variable_for_type_inference(dtype="int32")
 
         for i in x:
-            check_variable_and_dtype(i, 'x', \
-                ['float16', 'float32', 'float64', 'int32', 'int64'], 'stack')
-
-        helper.append_op(type='tensor_array_to_tensor',
-                         inputs={'X': x[0]},
-                         outputs={
-                             'Out': [out],
-                             'OutIndex': [out_index]
-                         },
-                         attrs={
-                             'axis': axis,
-                             'use_stack': True
-                         })
+            check_variable_and_dtype(
+                i,
+                'x',
+                ['float16', 'float32', 'float64', 'int32', 'int64'],
+                'stack',
+            )
+
+        helper.append_op(
+            type='tensor_array_to_tensor',
+            inputs={'X': x[0]},
+            outputs={'Out': [out], 'OutIndex': [out_index]},
+            attrs={'axis': axis, 'use_stack': True},
+        )
     else:
-        helper.append_op(type='stack',
-                         inputs={'X': x},
-                         outputs={'Y': out},
-                         attrs={'axis': axis})
+        helper.append_op(
+            type='stack',
+            inputs={'X': x},
+            outputs={'Y': out},
+            attrs={'axis': axis},
+        )
 
     return out
 
@@ -1763,27 +1910,27 @@ def stack(x, axis=0, name=None):
 def split(x, num_or_sections, axis=0, name=None):
     """
     Split the input tensor into multiple sub-Tensors.
-    
+
     Args:
         x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, uint8, int8, int32 or int64.
-        num_or_sections (int|list|tuple): If ``num_or_sections`` is an int, then ``num_or_sections`` 
+        num_or_sections (int|list|tuple): If ``num_or_sections`` is an int, then ``num_or_sections``
             indicates the number of equal sized sub-Tensors that the ``x`` will be divided into.
             If ``num_or_sections`` is a list or tuple, the length of it indicates the number of
             sub-Tensors and the elements in it indicate the sizes of sub-Tensors'  dimension orderly.
             The length of the list must not  be larger than the ``x`` 's size of specified ``axis``.
-        axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type 
+        axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type
             ``int`` or a ``Tensor`` with shape [1] and data type  ``int32`` or ``int64``.
             If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0.
         name (str, optional): The default value is None.  Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name` .
     Returns:
         list(Tensor): The list of segmented Tensors.
-    
+
     Example:
         .. code-block:: python
-            
+
             import paddle
-            
+
             # x is a Tensor of shape [3, 9, 5]
             x = paddle.rand([3, 9, 5])
 
@@ -1801,7 +1948,7 @@ def split(x, num_or_sections, axis=0, name=None):
             print(out0.shape)  # [3, 2, 5]
             print(out1.shape)  # [3, 3, 5]
             print(out2.shape)  # [3, 4, 5]
-            
+
             # axis is negative, the real axis is (rank(x) + axis)=1
             out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=-2)
             print(out0.shape)  # [3, 3, 5]
@@ -1829,15 +1976,17 @@ def split(x, num_or_sections, axis=0, name=None):
             if utils._contain_var(num_or_sections):
                 for index, item in enumerate(num_or_sections):
                     if isinstance(item, Variable):
-                        num_or_sections[index] = num_or_sections[index].numpy(
-                        )[0]
+                        num_or_sections[index] = num_or_sections[index].numpy()[
+                            0
+                        ]
                 attrs += ('sections', list(num_or_sections))
             else:
                 attrs += ('sections', list(num_or_sections))
         else:
             raise TypeError(
                 "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but "
-                "received %s." % (type(num_or_sections)))
+                "received %s." % (type(num_or_sections))
+            )
         if in_dygraph_mode():
             if isinstance(num_or_sections, int):
                 return _C_ops.split_with_num(input, num_or_sections, dim)
@@ -1848,10 +1997,21 @@ def split(x, num_or_sections, axis=0, name=None):
             _legacy_C_ops.split(input, out, *attrs)
             return out
 
-    check_variable_and_dtype(input, 'input', [
-        'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'uint8',
-        'int8'
-    ], 'split')
+    check_variable_and_dtype(
+        input,
+        'input',
+        [
+            'bool',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'uint8',
+            'int8',
+        ],
+        'split',
+    )
     check_type(num_or_sections, 'num_or_sections', (list, int, tuple), 'split')
     check_type(dim, 'dim', (int, Variable), 'split')
     if isinstance(dim, Variable):
@@ -1871,19 +2031,18 @@ def split(x, num_or_sections, axis=0, name=None):
                 dim_size.stop_gradient = True
                 tensor_list.append(dim_size)
             else:
-                assert (isinstance(dim_size, int))
+                assert isinstance(dim_size, int)
                 if dim_size == -1:
                     assert unk_dim_idx == -1, (
                         "Only one value of 'num_or_section' in split can "
-                        "be -1. But received num_or_section[%d] is also -1." %
-                        idx)
+                        "be -1. But received num_or_section[%d] is also -1."
+                        % idx
+                    )
                     unk_dim_idx = idx
                 temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant([1],
-                              'int32',
-                              dim_size,
-                              force_cpu=True,
-                              out=temp_out)
+                fill_constant(
+                    [1], 'int32', dim_size, force_cpu=True, out=temp_out
+                )
                 tensor_list.append(temp_out)
         return tensor_list
 
@@ -1898,44 +2057,50 @@ def split(x, num_or_sections, axis=0, name=None):
     if isinstance(num_or_sections, int):
         assert num_or_sections > 1, 'num_or_sections must be more than 1.'
         if isinstance(dim, int) and input_shape[dim] > 0:
-            assert input_shape[dim] % num_or_sections ==0, \
-                "The input's size along the split dimension " \
-                "must be evenly divisible by Attr(num_or_sections). " \
-                "But %d is not evenly divisible by %d. " % (num_or_sections,input_shape[dim])
+            assert input_shape[dim] % num_or_sections == 0, (
+                "The input's size along the split dimension "
+                "must be evenly divisible by Attr(num_or_sections). "
+                "But %d is not evenly divisible by %d. "
+                % (num_or_sections, input_shape[dim])
+            )
         num = num_or_sections
     else:
         if isinstance(dim, int) and input_shape[dim] > 0:
-            assert len(num_or_sections) <= input_shape[
-                dim], 'len(num_or_sections) must not be more than input.shape[dim].'
+            assert (
+                len(num_or_sections) <= input_shape[dim]
+            ), 'len(num_or_sections) must not be more than input.shape[dim].'
         num = len(num_or_sections)
         attrs['sections'] = list(
-            map(lambda ele: -1
-                if isinstance(ele, Variable) else ele, num_or_sections))
+            map(
+                lambda ele: -1 if isinstance(ele, Variable) else ele,
+                num_or_sections,
+            )
+        )
         if utils._contain_var(num_or_sections):
             inputs['SectionsTensorList'] = _get_SectionsTensorList(
-                num_or_sections)
+                num_or_sections
+            )
 
     outs = [
         helper.create_variable_for_type_inference(dtype=helper.input_dtype())
         for i in range(num)
     ]
-    helper.append_op(type='split',
-                     inputs=inputs,
-                     outputs={'Out': outs},
-                     attrs=attrs)
+    helper.append_op(
+        type='split', inputs=inputs, outputs={'Out': outs}, attrs=attrs
+    )
     return outs
 
 
 def squeeze(x, axis=None, name=None):
     """
-    Squeeze the dimension(s) of size 1 of input tensor x's shape. 
-    
-    Note that the output Tensor will share data with origin Tensor and doesn't have a 
-    Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, 
+    Squeeze the dimension(s) of size 1 of input tensor x's shape.
+
+    Note that the output Tensor will share data with origin Tensor and doesn't have a
+    Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version,
     please use `Tensor.clone` like ``squeeze_clone_x = x.squeeze().clone()``.
 
-    If axis is provided, it will remove the dimension(s) by given axis that of size 1. 
-    If the dimension of given axis is not of size 1, the dimension remain unchanged. 
+    If axis is provided, it will remove the dimension(s) at the given axis that are of size 1.
+    If the dimension at the given axis is not of size 1, the dimension remains unchanged.
     If axis is not provided, all dims equal of size 1 will be removed.
 
     .. code-block:: text
@@ -1955,11 +2120,11 @@ def squeeze(x, axis=None, name=None):
             axis = 0
           Output:
             out.shape = [3, 1, 5]
-        
+
         Case4:
 
           Input:
-            x.shape = [1, 3, 1, 5]  # If the dimension of one given axis (3) is not of size 1, the dimension remain unchanged. 
+            x.shape = [1, 3, 1, 5]  # If the dimension of one given axis (3) is not of size 1, the dimension remain unchanged.
             axis = [0, 2, 3]
           Output:
             out.shape = [3, 5]
@@ -1967,7 +2132,7 @@ def squeeze(x, axis=None, name=None):
         Case4:
 
           Input:
-            x.shape = [1, 3, 1, 5]  # If axis is negative, axis = axis + ndim (number of dimensions in x). 
+            x.shape = [1, 3, 1, 5]  # If axis is negative, axis = axis + ndim (number of dimensions in x).
             axis = [-2]
           Output:
             out.shape = [1, 3, 5]
@@ -1987,7 +2152,7 @@ def squeeze(x, axis=None, name=None):
         .. code-block:: python
 
             import paddle
-            
+
             x = paddle.rand([5, 1, 10])
             output = paddle.squeeze(x, axis=1)
 
@@ -2015,10 +2180,22 @@ def squeeze(x, axis=None, name=None):
         return out
 
     helper = LayerHelper("squeeze", **locals())
-    check_variable_and_dtype(input, 'input', [
-        'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64',
-        'complex64', 'complex128'
-    ], 'squeeze')
+    check_variable_and_dtype(
+        input,
+        'input',
+        [
+            'float16',
+            'float32',
+            'float64',
+            'bool',
+            'int8',
+            'int32',
+            'int64',
+            'complex64',
+            'complex128',
+        ],
+        'squeeze',
+    )
 
     check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'squeeze')
     attrs = {}
@@ -2033,13 +2210,12 @@ def squeeze(x, axis=None, name=None):
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(type="squeeze2",
-                     inputs={"X": input},
-                     attrs=attrs,
-                     outputs={
-                         "Out": out,
-                         "XShape": x_shape
-                     })
+    helper.append_op(
+        type="squeeze2",
+        inputs={"X": input},
+        attrs=attrs,
+        outputs={"Out": out, "XShape": x_shape},
+    )
 
     return out
 
@@ -2066,12 +2242,14 @@ def squeeze_(x, axis=None, name=None):
         return out
 
 
-def unique_consecutive(x,
-                       return_inverse=False,
-                       return_counts=False,
-                       axis=None,
-                       dtype="int64",
-                       name=None):
+def unique_consecutive(
+    x,
+    return_inverse=False,
+    return_counts=False,
+    axis=None,
+    dtype="int64",
+    name=None,
+):
     r"""
     Eliminates all but the first element from every consecutive group of equivalent elements.
 
@@ -2097,21 +2275,21 @@ def unique_consecutive(x,
     Example:
         .. code-block:: python
 
-            import paddle 
+            import paddle
 
             x = paddle.to_tensor([1, 1, 2, 2, 3, 1, 1, 2])
-            output = paddle.unique_consecutive(x) # 
+            output = paddle.unique_consecutive(x) #
             np_output = output.numpy() # [1 2 3 1 2]
             _, inverse, counts = paddle.unique_consecutive(x, return_inverse=True, return_counts=True)
             np_inverse = inverse.numpy() # [0 0 1 1 2 3 3 4]
             np_counts = inverse.numpy() # [2 2 1 2 1]
 
             x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]])
-            output = paddle.unique_consecutive(x, axis=0) # 
+            output = paddle.unique_consecutive(x, axis=0) #
             np_output = output.numpy() # [2 1 3 0 1 2 1 3 2 1 3]
 
             x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]])
-            output = paddle.unique_consecutive(x, axis=0) # 
+            output = paddle.unique_consecutive(x, axis=0) #
             np_output = output.numpy()
             # [[2 1 3]
             #  [3 0 1]
@@ -2125,7 +2303,8 @@ def unique_consecutive(x,
     attr_dtype = convert_np_dtype_to_dtype_(dtype)
     if in_dygraph_mode():
         out, inverse, counts = _C_ops.unique_consecutive(
-            x, return_inverse, return_counts, axis, attr_dtype)
+            x, return_inverse, return_counts, axis, attr_dtype
+        )
         outs = [out]
         if return_inverse:
             outs.append(inverse)
@@ -2136,8 +2315,16 @@ def unique_consecutive(x,
         return tuple(outs)
     elif paddle.in_dynamic_mode():
         out, inverse, counts = _legacy_C_ops.unique_consecutive(
-            x, 'dtype', attr_dtype, 'return_inverse', return_inverse,
-            'return_counts', return_counts, 'axis', axis)
+            x,
+            'dtype',
+            attr_dtype,
+            'return_inverse',
+            return_inverse,
+            'return_counts',
+            return_counts,
+            'axis',
+            axis,
+        )
         outs = [out]
         if return_inverse:
             outs.append(inverse)
@@ -2146,9 +2333,12 @@ def unique_consecutive(x,
         if len(outs) == 1:
             return outs[0]
         return tuple(outs)
-    check_variable_and_dtype(x, "input",
-                             ['float32', 'float64', 'int32', 'int64'],
-                             'unique_consecutive')
+    check_variable_and_dtype(
+        x,
+        "input",
+        ['float32', 'float64', 'int32', 'int64'],
+        'unique_consecutive',
+    )
     check_type(return_inverse, 'return_inverse', bool, 'unique_consecutive')
     check_type(return_counts, 'return_counts', bool, 'unique_consecutive')
     check_dtype(dtype, 'dtype', ['int32', 'int64'], 'unique_consecutive')
@@ -2161,34 +2351,38 @@ def unique_consecutive(x,
         "return_counts": return_counts,
         "axis": axis,
     }
-    out = helper.create_variable_for_type_inference(dtype=x.dtype,
-                                                    stop_gradient=True)
-    inverse = helper.create_variable_for_type_inference(dtype=attr_dtype,
-                                                        stop_gradient=True)
-    counts = helper.create_variable_for_type_inference(dtype=attr_dtype,
-                                                       stop_gradient=True)
+    out = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True
+    )
+    inverse = helper.create_variable_for_type_inference(
+        dtype=attr_dtype, stop_gradient=True
+    )
+    counts = helper.create_variable_for_type_inference(
+        dtype=attr_dtype, stop_gradient=True
+    )
     outputs = {"Out": out, "Index": inverse, "Counts": counts}
     outs = [out]
     if return_inverse:
         outs.append(inverse)
     if return_counts:
         outs.append(counts)
-    helper.append_op(type="unique_consecutive",
-                     inputs={"X": x},
-                     attrs=attrs,
-                     outputs=outputs)
+    helper.append_op(
+        type="unique_consecutive", inputs={"X": x}, attrs=attrs, outputs=outputs
+    )
     if len(outs) == 1:
         return outs[0]
     return tuple(outs)
 
 
-def unique(x,
-           return_index=False,
-           return_inverse=False,
-           return_counts=False,
-           axis=None,
-           dtype="int64",
-           name=None):
+def unique(
+    x,
+    return_index=False,
+    return_inverse=False,
+    return_counts=False,
+    axis=None,
+    dtype="int64",
+    name=None,
+):
     r"""
     Returns the unique elements of `x` in ascending order.
 
@@ -2241,13 +2435,24 @@ def unique(x,
     if _non_static_mode():
         if in_dygraph_mode():
             out, indices, inverse, counts = _C_ops.unique(
-                x, return_index, return_inverse, return_counts, axis,
-                attr_dtype)
+                x, return_index, return_inverse, return_counts, axis, attr_dtype
+            )
         if _in_legacy_dygraph():
             out, inverse, indices, counts = _legacy_C_ops.unique(
-                x, 'dtype', attr_dtype, 'return_index', return_index,
-                'return_inverse', return_inverse, 'return_counts',
-                return_counts, 'axis', axis, "is_sorted", True)
+                x,
+                'dtype',
+                attr_dtype,
+                'return_index',
+                return_index,
+                'return_inverse',
+                return_inverse,
+                'return_counts',
+                return_counts,
+                'axis',
+                axis,
+                "is_sorted",
+                True,
+            )
         outs = [out]
         if return_index:
             outs.append(indices)
@@ -2261,8 +2466,9 @@ def unique(x,
 
         return tuple(outs)
 
-    check_variable_and_dtype(x, "input",
-                             ['float32', 'float64', 'int32', 'int64'], 'unique')
+    check_variable_and_dtype(
+        x, "input", ['float32', 'float64', 'int32', 'int64'], 'unique'
+    )
     check_type(return_index, 'return_index', bool, 'unique')
     check_type(return_inverse, 'return_inverse', bool, 'unique')
     check_type(return_counts, 'return_counts', bool, 'unique')
@@ -2277,21 +2483,25 @@ def unique(x,
         "return_inverse": return_inverse,
         "return_counts": return_counts,
         "axis": axis,
-        "is_sorted": True
+        "is_sorted": True,
     }
-    out = helper.create_variable_for_type_inference(dtype=x.dtype,
-                                                    stop_gradient=True)
-    indices = helper.create_variable_for_type_inference(dtype=attr_dtype,
-                                                        stop_gradient=True)
-    inverse = helper.create_variable_for_type_inference(dtype=attr_dtype,
-                                                        stop_gradient=True)
-    counts = helper.create_variable_for_type_inference(dtype=attr_dtype,
-                                                       stop_gradient=True)
+    out = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True
+    )
+    indices = helper.create_variable_for_type_inference(
+        dtype=attr_dtype, stop_gradient=True
+    )
+    inverse = helper.create_variable_for_type_inference(
+        dtype=attr_dtype, stop_gradient=True
+    )
+    counts = helper.create_variable_for_type_inference(
+        dtype=attr_dtype, stop_gradient=True
+    )
     outputs = {
         "Out": out,
         "Indices": indices,
         "Index": inverse,
-        "Counts": counts
+        "Counts": counts,
     }
     outs = [out]
     if return_index:
@@ -2301,10 +2511,9 @@ def unique(x,
     if return_counts:
         outs.append(counts)
 
-    helper.append_op(type="unique",
-                     inputs={"X": x},
-                     attrs=attrs,
-                     outputs=outputs)
+    helper.append_op(
+        type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs
+    )
 
     if len(outs) == 1:
         return outs[0]
@@ -2318,14 +2527,14 @@ def unsqueeze(x, axis, name=None):
     required argument axis, a dimension or list of dimensions that will be inserted.
     Dimension indices in axis are as seen in the output tensor.
 
-    Note that the output Tensor will share data with origin Tensor and doesn't have a 
-    Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, 
+    Note that the output Tensor will share data with origin Tensor and doesn't have a
+    Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version,
     please use `Tensor.clone` like ``unsqueeze_clone_x = x.unsqueeze(-1).clone()``.
 
     Args:
         x (Tensor): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64.
-        axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` . 
-                                    If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. 
+        axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` .
+                                    If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
                                     If ``axis`` is a Tensor, it should be an 1-D Tensor .
                                     If ``axis`` is negative, ``axis = axis + ndim(x) + 1``.
         name (str|None): Name for this layer. Please refer to :ref:`api_guide_Name`, Default None.
@@ -2340,15 +2549,15 @@ def unsqueeze(x, axis, name=None):
 
             x = paddle.rand([5, 10])
             print(x.shape)  # [5, 10]
-            
+
             out1 = paddle.unsqueeze(x, axis=0)
             print(out1.shape)  # [1, 5, 10]
-            
-            out2 = paddle.unsqueeze(x, axis=[0, 2]) 
+
+            out2 = paddle.unsqueeze(x, axis=[0, 2])
             print(out2.shape)  # [1, 5, 1, 10]
 
             axis = paddle.to_tensor([0, 1, 2])
-            out3 = paddle.unsqueeze(x, axis=axis) 
+            out3 = paddle.unsqueeze(x, axis=axis)
             print(out3.shape)  # [1, 1, 1, 5, 10]
 
             # out1, out2, out3 share data with x in dygraph mode
@@ -2356,7 +2565,7 @@ def unsqueeze(x, axis, name=None):
             print(out1[0, 0, 0]) # [10.]
             print(out2[0, 0, 0, 0]) # [10.]
             print(out3[0, 0, 0, 0, 0]) # [10.]
-            
+
     """
     input = x
     axes = axis
@@ -2376,18 +2585,23 @@ def unsqueeze(x, axis, name=None):
         return _C_ops.unsqueeze(input, axes)
 
     check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze')
-    check_variable_and_dtype(input, 'input', [
-        'float16',
-        'float32',
-        'float64',
-        'bool',
-        'int8',
-        'int16',
-        'int32',
-        'int64',
-        'complex64',
-        'complex128',
-    ], 'unsqueeze')
+    check_variable_and_dtype(
+        input,
+        'input',
+        [
+            'float16',
+            'float32',
+            'float64',
+            'bool',
+            'int8',
+            'int16',
+            'int32',
+            'int64',
+            'complex64',
+            'complex128',
+        ],
+        'unsqueeze',
+    )
     helper = LayerHelper("unsqueeze2", **locals())
     inputs = {"X": input}
     attrs = {}
@@ -2405,13 +2619,12 @@ def unsqueeze(x, axis, name=None):
 
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
     x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(type="unsqueeze2",
-                     inputs=inputs,
-                     attrs=attrs,
-                     outputs={
-                         "Out": out,
-                         "XShape": x_shape
-                     })
+    helper.append_op(
+        type="unsqueeze2",
+        inputs=inputs,
+        attrs=attrs,
+        outputs={"Out": out, "XShape": x_shape},
+    )
 
     return out
 
@@ -2459,7 +2672,7 @@ def gather(x, index, axis=None, name=None):
                 Then:
 
                 out = [[3, 4],
-                       [5, 6]] 
+                       [5, 6]]
 
     Args:
         x (Tensor): The source input tensor with rank>=1. Supported data type is
@@ -2472,7 +2685,7 @@ def gather(x, index, axis=None, name=None):
 
     Returns:
         output (Tensor): The output is a tensor with the same rank as ``x``.
-    
+
     Examples:
 
         .. code-block:: python
@@ -2491,13 +2704,16 @@ def gather(x, index, axis=None, name=None):
         return _C_ops.gather(x, index, axis)
     if _in_legacy_dygraph():
         axis = axis.item() if isinstance(axis, paddle.Tensor) else axis
-        return _legacy_C_ops.gather(x, index, None, "axis", axis, "overwrite",
-                                    False)
+        return _legacy_C_ops.gather(
+            x, index, None, "axis", axis, "overwrite", False
+        )
 
     check_variable_and_dtype(
-        x, 'x',
+        x,
+        'x',
         ['float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'uint8'],
-        'gather')
+        'gather',
+    )
     check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather')
 
     if isinstance(axis, Variable):
@@ -2507,25 +2723,19 @@ def gather(x, index, axis=None, name=None):
     dtype = helper.input_dtype('x')
     out = helper.create_variable_for_type_inference(dtype)
     if not isinstance(axis, Variable):
-        helper.append_op(type="gather",
-                         inputs={
-                             "X": x,
-                             "Index": index
-                         },
-                         attrs={
-                             'axis': axis,
-                             'overwrite': False
-                         },
-                         outputs={"Out": out})
+        helper.append_op(
+            type="gather",
+            inputs={"X": x, "Index": index},
+            attrs={'axis': axis, 'overwrite': False},
+            outputs={"Out": out},
+        )
     else:
-        helper.append_op(type="gather",
-                         inputs={
-                             "X": x,
-                             "Index": index,
-                             "Axis": axis
-                         },
-                         attrs={"overwrite": False},
-                         outputs={"Out": out})
+        helper.append_op(
+            type="gather",
+            inputs={"X": x, "Index": index, "Axis": axis},
+            attrs={"overwrite": False},
+            outputs={"Out": out},
+        )
 
     return out
 
@@ -2537,7 +2747,7 @@ def unbind(input, axis=0):
 
     Args:
         input (Tensor): The input variable which is an N-D Tensor, data type being float32, float64, int32 or int64.
-        axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. 
+        axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind.
             If :math:`axis < 0`, the dimension to unbind along is :math:`rank(input) + axis`. Default is 0.
     Returns:
         list(Tensor): The list of segmented Tensor variables.
@@ -2549,7 +2759,7 @@ def unbind(input, axis=0):
 
             # input is a Tensor which shape is [3, 4, 5]
             input = paddle.rand([3, 4, 5])
-       
+
             [x0, x1, x2] = paddle.unbind(input, axis=0)
             # x0.shape [4, 5]
             # x1.shape [4, 5]
@@ -2565,8 +2775,9 @@ def unbind(input, axis=0):
         return _C_ops.unbind(input, axis)
 
     if not isinstance(axis, (int)):
-        raise TypeError("The type of 'axis'  must be int, but received %s." %
-                        (type(axis)))
+        raise TypeError(
+            "The type of 'axis'  must be int, but received %s." % (type(axis))
+        )
     if isinstance(axis, np.generic):
         axis = np.asscalar(axis)
     input_shape = input.shape
@@ -2578,16 +2789,19 @@ def unbind(input, axis=0):
     helper = LayerHelper("unbind", **locals())
     check_type(input, 'input', (Variable), 'unbind')
     dtype = helper.input_dtype()
-    check_dtype(dtype, 'unbind', ['float32', 'float64', 'int32', 'int64'],
-                'unbind')
+    check_dtype(
+        dtype, 'unbind', ['float32', 'float64', 'int32', 'int64'], 'unbind'
+    )
     outs = [
         helper.create_variable_for_type_inference(dtype=helper.input_dtype())
         for i in range(num)
     ]
-    helper.append_op(type="unbind",
-                     inputs={"X": input},
-                     outputs={"Out": outs},
-                     attrs={"axis": axis})
+    helper.append_op(
+        type="unbind",
+        inputs={"X": input},
+        outputs={"Out": outs},
+        attrs={"axis": axis},
+    )
     return outs
 
 
@@ -2595,9 +2809,9 @@ def scatter(x, index, updates, overwrite=True, name=None):
     """
     **Scatter Layer**
     Output is obtained by updating the input on selected indices based on updates.
-    
+
     .. code-block:: python
-    
+
         import numpy as np
         #input:
         x = np.array([[1, 1], [2, 2], [3, 3]])
@@ -2619,32 +2833,32 @@ def scatter(x, index, updates, overwrite=True, name=None):
         out = np.array([[3, 3], [6, 6], [1, 1]])
         out.shape # [3, 2]
 
-    **NOTICE**: The order in which updates are applied is nondeterministic, 
+    **NOTICE**: The order in which updates are applied is nondeterministic,
     so the output will be nondeterministic if index contains duplicates.
 
     Args:
         x (Tensor): The input N-D Tensor with ndim>=1. Data type can be float32, float64.
         index (Tensor): The index 1-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length.
         updates (Tensor): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 should be the same as input.
-        overwrite (bool): The mode that updating the output when there are same indices. 
-            
+        overwrite (bool): The mode that updating the output when there are same indices.
+
             If True, use the overwrite mode to update the output of the same index,
-	        if False, use the accumulate mode to update the output of the same index.Default value is True.
-        
+                if False, use the accumulate mode to update the output of the same index.Default value is True.
+
         name(str, optional): The default value is None. Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name` .
- 
+
     Returns:
         Tensor: The output is a Tensor with the same shape as x.
 
     Examples:
         .. code-block:: python
-            
+
             import paddle
 
             x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32')
             index = paddle.to_tensor([2, 1, 0, 1], dtype='int64')
             updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
-  
+
             output1 = paddle.scatter(x, index, updates, overwrite=False)
             # [[3., 3.],
             #  [6., 6.],
@@ -2669,23 +2883,25 @@ def scatter(x, index, updates, overwrite=True, name=None):
         return _C_ops.scatter(x, index, updates, overwrite)
     else:
         if _in_legacy_dygraph():
-            return _legacy_C_ops.scatter(x, index, updates, 'overwrite',
-                                         overwrite)
+            return _legacy_C_ops.scatter(
+                x, index, updates, 'overwrite', overwrite
+            )
         else:
             check_variable_and_dtype(
-                x, 'dtype', ['float32', 'float64', 'float16', 'int32', 'int64'],
-                'scatter')
+                x,
+                'dtype',
+                ['float32', 'float64', 'float16', 'int32', 'int64'],
+                'scatter',
+            )
             check_type(overwrite, 'overwrite', bool, 'scatter')
             helper = LayerHelper('scatter', **locals())
             out = helper.create_variable_for_type_inference(x.dtype)
-            helper.append_op(type="scatter",
-                             inputs={
-                                 "X": x,
-                                 "Ids": index,
-                                 "Updates": updates
-                             },
-                             attrs={'overwrite': overwrite},
-                             outputs={"Out": out})
+            helper.append_op(
+                type="scatter",
+                inputs={"X": x, "Ids": index, "Updates": updates},
+                attrs={'overwrite': overwrite},
+                outputs={"Out": out},
+            )
             return out
 
 
@@ -2764,7 +2980,7 @@ def scatter_nd_add(x, index, updates, name=None):
             index = paddle.to_tensor([[1, 1],
                                     [0, 1],
                                     [1, 3]], dtype='int64')
-            
+
             output = paddle.scatter_nd_add(x, index, updates)
             print(output.shape)
             # [3, 5, 9, 10]
@@ -2782,13 +2998,11 @@ def scatter_nd_add(x, index, updates, name=None):
             helper = LayerHelper('scatter_nd_add', **locals())
             dtype = helper.input_dtype(input_param_name='x')
             output = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type="scatter_nd_add",
-                             inputs={
-                                 "X": x,
-                                 "Index": index,
-                                 "Updates": updates
-                             },
-                             outputs={"Out": output})
+            helper.append_op(
+                type="scatter_nd_add",
+                inputs={"X": x, "Index": index, "Updates": updates},
+                outputs={"Out": output},
+            )
             return output
 
 
@@ -2838,24 +3052,24 @@ def scatter_nd(index, updates, shape, name=None):
 def chunk(x, chunks, axis=0, name=None):
     """
     Split the input tensor into multiple sub-Tensors.
-    
+
     Args:
         x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64.
         chunks(int): The number of tensor to be split along the certain axis.
-        axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type 
+        axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type
             ``int`` or a ``Tensor`` with shape [1] and data type  ``int32`` or ``int64``.
             If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0.
         name (str, optional): The default value is None.  Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name` .
     Returns:
         list(Tensor): The list of segmented Tensors.
-    
+
     Example:
         .. code-block:: python
-            
+
             import numpy as np
             import paddle
-            
+
             # x is a Tensor which shape is [3, 9, 5]
             x_np = np.random.random([3, 9, 5]).astype("int32")
             x = paddle.to_tensor(x_np)
@@ -2865,7 +3079,7 @@ def chunk(x, chunks, axis=0, name=None):
             # out1.shape [3, 3, 5]
             # out2.shape [3, 3, 5]
 
-            
+
             # axis is negative, the real axis is (rank(x) + axis) which real
             # value is 1.
             out0, out1, out2 = paddle.chunk(x, chunks=3, axis=-2)
@@ -2917,7 +3131,9 @@ def tile(x, repeat_times, name=None):
     """
     if in_dygraph_mode():
         if isinstance(repeat_times, core.eager.Tensor):
-            assert repeat_times.ndim == 1, "Only support ndim == 1 while repeat_times is a Tensor."
+            assert (
+                repeat_times.ndim == 1
+            ), "Only support ndim == 1 while repeat_times is a Tensor."
             repeat_times = repeat_times.numpy().tolist()
 
         return _C_ops.tile(x, repeat_times)
@@ -2927,26 +3143,30 @@ def tile(x, repeat_times, name=None):
 
     check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile')
     if isinstance(repeat_times, Variable):
-        assert len(
-            repeat_times.shape) == 1, ('repeat_times must be an 1-D Tensor.')
+        assert (
+            len(repeat_times.shape) == 1
+        ), 'repeat_times must be an 1-D Tensor.'
     else:
         for elem in repeat_times:
             if isinstance(elem, Variable):
-                assert len(elem.shape) == 1, (
-                    'Elements in repeat_times must be 1-D Tensors or integers.')
+                assert (
+                    len(elem.shape) == 1
+                ), 'Elements in repeat_times must be 1-D Tensors or integers.'
             else:
                 type_tuple = (int, np.int32, np.int64)
-                assert isinstance(elem, type_tuple), (
-                    'Elements in repeat_times must be 1-D Tensors or integers.')
+                assert isinstance(
+                    elem, type_tuple
+                ), 'Elements in repeat_times must be 1-D Tensors or integers.'
 
-    check_variable_and_dtype(x, 'x',
-                             ['bool', 'float32', 'float64', 'int32', 'int64'],
-                             'tile')
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile'
+    )
     if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
         raise ValueError(
             "When the date type is bool for the input 'x' of tile op, you "
             "must set its stop_gradient to be True by "
-            "some_var.stop_gradient == True supporting some_var is the input.")
+            "some_var.stop_gradient == True supporting some_var is the input."
+        )
 
     helper = LayerHelper('tile', **locals())
 
@@ -2960,8 +3180,9 @@ def tile(x, repeat_times, name=None):
                 attrs_repeat_times.append(-1)
             else:
                 attrs_repeat_times.append(times)
-                assert times > 0, (
-                    "All elements in repeat_times must be positive for tile.")
+                assert (
+                    times > 0
+                ), "All elements in repeat_times must be positive for tile."
         return attrs_repeat_times
 
     if isinstance(repeat_times, Variable):
@@ -2972,14 +3193,14 @@ def tile(x, repeat_times, name=None):
         attrs['repeat_times'] = get_attr_repeat_times(repeat_times)
         if utils._contain_var(repeat_times):
             inputs['repeat_times_tensor'] = utils._convert_to_tensor_list(
-                repeat_times)
+                repeat_times
+            )
 
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type='tile',
-                     inputs=inputs,
-                     outputs={'Out': out},
-                     attrs=attrs)
+    helper.append_op(
+        type='tile', inputs=inputs, outputs={'Out': out}, attrs=attrs
+    )
     return out
 
 
@@ -3015,9 +3236,9 @@ def expand_as(x, y, name=None):
     if _non_static_mode():
         return _legacy_C_ops.expand_as_v2(x, 'target_shape', y.shape)
 
-    check_variable_and_dtype(x, 'x',
-                             ['bool', 'float32', 'float64', 'int32', 'int64'],
-                             'expand_as')
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand_as'
+    )
     check_type(y, 'y', Variable, 'expand_as')
 
     if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
@@ -3025,16 +3246,19 @@ def expand_as(x, y, name=None):
             "When the data type of input 'x' for expand_as is bool, "
             "you must set its stop_gradient to be False by "
             "some_var.stop_gradient = True, supporting "
-            "some_var as the input 'x'.")
+            "some_var as the input 'x'."
+        )
     inputs = {"X": [x], "Y": [y]}
 
     helper = LayerHelper('expand_as', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type='expand_as_v2',
-                     inputs=inputs,
-                     attrs={'target_shape': y.shape},
-                     outputs={'Out': out})
+    helper.append_op(
+        type='expand_as_v2',
+        inputs=inputs,
+        attrs={'target_shape': y.shape},
+        outputs={'Out': out},
+    )
     return out
 
 
@@ -3049,7 +3273,7 @@ def broadcast_to(x, shape, name=None):
     Args:
         x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64.
         shape (list|tuple|Tensor): The result shape after broadcasting. The data type is int32. If shape is a list or tuple, all its elements
-            should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. 
+            should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32.
             The value -1 in shape means keeping the corresponding dimension unchanged.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
     Returns:
@@ -3071,27 +3295,30 @@ def broadcast_to(x, shape, name=None):
         return _legacy_C_ops.expand_v2(x, 'shape', shape)
 
     if isinstance(shape, Variable):
-        assert len(shape.shape) == 1, ('shape must be an 1-D Tensor.')
+        assert len(shape.shape) == 1, 'shape must be an 1-D Tensor.'
     else:
         for elem in shape:
             if isinstance(elem, Variable):
-                assert len(elem.shape) == 1, (
-                    'Elements in shape must be 1-D Tensors or integers.')
+                assert (
+                    len(elem.shape) == 1
+                ), 'Elements in shape must be 1-D Tensors or integers.'
             else:
                 type_tuple = (int, np.int32, np.int64)
-                assert isinstance(elem, type_tuple), (
-                    'Elements in shape must be 1-D Tensors or integers.')
+                assert isinstance(
+                    elem, type_tuple
+                ), 'Elements in shape must be 1-D Tensors or integers.'
 
-    check_variable_and_dtype(x, 'x',
-                             ['bool', 'float32', 'float64', 'int32', 'int64'],
-                             'broadcast_to')
+    check_variable_and_dtype(
+        x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'broadcast_to'
+    )
     check_type(shape, 'shape', (list, tuple, Variable), 'broadcast_to')
     if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
         raise ValueError(
             "When the data type of input 'x' for broadcast_to is bool, "
             "you must set its stop_gradient to be False by "
             "some_var.stop_gradient = True, supporting "
-            "some_var as the input.")
+            "some_var as the input."
+        )
 
     inputs = {"X": [x]}
     attrs = {}
@@ -3105,9 +3332,9 @@ def broadcast_to(x, shape, name=None):
                 attrs_expand_shape.append(-1)
             else:
                 attrs_expand_shape.append(shape)
-                assert shape > 0 or shape == -1, (
-                    "All elements in shape of broadcast_to must be positive or -1."
-                )
+                assert (
+                    shape > 0 or shape == -1
+                ), "All elements in shape of broadcast_to must be positive or -1."
         return attrs_expand_shape
 
     if isinstance(shape, Variable):
@@ -3117,14 +3344,14 @@ def broadcast_to(x, shape, name=None):
         attrs['shape'] = get_attr_expand_shape(shape)
         if utils._contain_var(shape):
             inputs['expand_shapes_tensor'] = utils._convert_to_tensor_list(
-                shape)
+                shape
+            )
 
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type='expand_v2',
-                     inputs=inputs,
-                     outputs={'Out': out},
-                     attrs=attrs)
+    helper.append_op(
+        type='expand_v2', inputs=inputs, outputs={'Out': out}, attrs=attrs
+    )
     return out
 
 
@@ -3139,7 +3366,7 @@ def expand(x, shape, name=None):
     Args:
         x (Tensor): The input Tensor, its data type is bool, float32, float64, int32 or int64.
         shape (list|tuple|Tensor): The result shape after expanding. The data type is int32. If shape is a list or tuple, all its elements
-            should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. 
+            should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32.
             The value -1 in shape means keeping the corresponding dimension unchanged.
         name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` .
 
@@ -3163,26 +3390,33 @@ def expand(x, shape, name=None):
         return _legacy_C_ops.expand_v2(x, 'shape', shape)
 
     if isinstance(shape, Variable):
-        assert len(shape.shape) == 1, ('shape must be an 1-D Tensor.')
+        assert len(shape.shape) == 1, 'shape must be an 1-D Tensor.'
     else:
         for elem in shape:
             if isinstance(elem, Variable):
-                assert len(elem.shape) == 1, (
-                    'Elements in shape must be 1-D Tensors or integers.')
+                assert (
+                    len(elem.shape) == 1
+                ), 'Elements in shape must be 1-D Tensors or integers.'
             else:
                 type_tuple = (int, np.int32, np.int64)
-                assert isinstance(elem, type_tuple), (
-                    'Elements in shape must be 1-D Tensors or integers.')
+                assert isinstance(
+                    elem, type_tuple
+                ), 'Elements in shape must be 1-D Tensors or integers.'
 
     check_variable_and_dtype(
-        x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-        'expand')
+        x,
+        'x',
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        'expand',
+    )
     check_type(shape, 'shape', (list, tuple, Variable), 'expand')
     if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False:
-        raise ValueError("When the data type of input 'x' for expand is bool, "
-                         "you must set its stop_gradient to be False by "
-                         "some_var.stop_gradient = True, supporting "
-                         "some_var as the input.")
+        raise ValueError(
+            "When the data type of input 'x' for expand is bool, "
+            "you must set its stop_gradient to be False by "
+            "some_var.stop_gradient = True, supporting "
+            "some_var as the input."
+        )
 
     inputs = {"X": [x]}
     attrs = {}
@@ -3196,8 +3430,9 @@ def expand(x, shape, name=None):
                 attrs_expand_shape.append(-2)
             else:
                 attrs_expand_shape.append(shape)
-                assert shape > 0 or shape == -1, (
-                    "All elements in shape of expand must be positive or -1.")
+                assert (
+                    shape > 0 or shape == -1
+                ), "All elements in shape of expand must be positive or -1."
         return attrs_expand_shape
 
     if isinstance(shape, Variable):
@@ -3207,14 +3442,14 @@ def expand(x, shape, name=None):
         attrs['shape'] = get_attr_expand_shape(shape)
         if utils._contain_var(shape):
             inputs['expand_shapes_tensor'] = utils._convert_to_tensor_list(
-                shape)
+                shape
+            )
 
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type='expand_v2',
-                     inputs=inputs,
-                     outputs={'Out': out},
-                     attrs=attrs)
+    helper.append_op(
+        type='expand_v2', inputs=inputs, outputs={'Out': out}, attrs=attrs
+    )
     return out
 
 
@@ -3223,8 +3458,8 @@ def reshape(x, shape, name=None):
     Changes the shape of ``x`` without changing its data.
 
     Note that the output Tensor will share data with origin Tensor and doesn't
-    have a Tensor copy in ``dygraph`` mode. 
-    If you want to use the Tensor copy version, please use `Tensor.clone` like 
+    have a Tensor copy in ``dygraph`` mode.
+    If you want to use the Tensor copy version, please use `Tensor.clone` like
     ``reshape_clone_x = x.reshape([-1]).clone()``.
 
     Some tricks exist when specifying the target shape.
@@ -3283,7 +3518,7 @@ def reshape(x, shape, name=None):
 
     if in_dygraph_mode():
         tmp_tensor_type = core.eager.Tensor
-        #TODO(zhiqiu): enable inplace in dygraph mode.
+        # TODO(zhiqiu): enable inplace in dygraph mode.
         if inplace:
             warnings.warn(
                 "Inplace on reshape is not allowed and will be discarded in dygraph mode currently."
@@ -3291,7 +3526,9 @@ def reshape(x, shape, name=None):
         if isinstance(shape, (list, tuple)):
             shape = [
                 item.numpy().item(0)
-                if isinstance(item, tmp_tensor_type) else item for item in shape
+                if isinstance(item, tmp_tensor_type)
+                else item
+                for item in shape
             ]
             out = _C_ops.reshape(x, shape)
         elif isinstance(shape, tmp_tensor_type):
@@ -3300,7 +3537,8 @@ def reshape(x, shape, name=None):
         else:
             raise ValueError(
                 "shape must be an instance of `list`, `tuple` or `Variable`,"
-                " got '{}.'".format(type(shape)))
+                " got '{}.'".format(type(shape))
+            )
 
         return dygraph_utils._append_activation_in_dygraph(out, act)
     else:
@@ -3322,14 +3560,26 @@ def reshape(x, shape, name=None):
             else:
                 raise ValueError(
                     "shape must be an instance of `list`, `tuple` or `Variable`,"
-                    " got '{}.'".format(type(shape)))
+                    " got '{}.'".format(type(shape))
+                )
 
             return dygraph_utils._append_activation_in_dygraph(out, act)
 
-    check_variable_and_dtype(x, 'x', [
-        'float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'bool',
-        'uint16'
-    ], 'reshape')
+    check_variable_and_dtype(
+        x,
+        'x',
+        [
+            'float16',
+            'float32',
+            'float64',
+            'int16',
+            'int32',
+            'int64',
+            'bool',
+            'uint16',
+        ],
+        'reshape',
+    )
     check_type(shape, 'shape', (list, tuple, Variable), 'reshape')
     check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape')
 
@@ -3353,20 +3603,23 @@ def reshape(x, shape, name=None):
                         "\t# z.shape is [-1, -1, 4]\n\n"
                         "    If your target shape in Reshape represents dynamic shape, "
                         "please turn it into a Tensor under @to_static. See above example for details."
-                        % dim_idx)
+                        % dim_idx
+                    )
                     unk_dim_idx = dim_idx
                 elif dim_size == 0:
                     assert dim_idx < len(x.shape), (
                         "The index of 0 in `shape` must be less than "
                         "the input tensor X's dimensions. "
-                        "But received shape[%d] = 0, X's dimensions = %d." %
-                        (dim_idx, len(x.shape)))
+                        "But received shape[%d] = 0, X's dimensions = %d."
+                        % (dim_idx, len(x.shape))
+                    )
                 else:
                     assert dim_size > 0, (
                         "Each dimension value of 'shape' in reshape must not "
                         "be negative except one unknown dimension. "
-                        "But received shape[%d] = %s." %
-                        (dim_idx, str(dim_size)))
+                        "But received shape[%d] = %s."
+                        % (dim_idx, str(dim_size))
+                    )
         return attrs_shape
 
     inputs = {"X": x}
@@ -3375,8 +3628,10 @@ def reshape(x, shape, name=None):
         shape.stop_gradient = True
         inputs["Shape"] = shape
     elif isinstance(shape, (list, tuple)):
-        assert len(shape) > 0, ("The size of 'shape' in reshape can't be zero, "
-                                "but received %s." % len(shape))
+        assert len(shape) > 0, (
+            "The size of 'shape' in reshape can't be zero, "
+            "but received %s." % len(shape)
+        )
         attrs["shape"] = get_attr_shape(shape)
         if utils._contain_var(shape):
             inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape)
@@ -3384,16 +3639,18 @@ def reshape(x, shape, name=None):
             actual_shape.stop_gradient = True
             inputs["Shape"] = actual_shape
 
-    out = x if inplace else helper.create_variable_for_type_inference(
-        dtype=x.dtype)
+    out = (
+        x
+        if inplace
+        else helper.create_variable_for_type_inference(dtype=x.dtype)
+    )
     x_shape = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(type="reshape2",
-                     inputs=inputs,
-                     attrs=attrs,
-                     outputs={
-                         "Out": out,
-                         "XShape": x_shape
-                     })
+    helper.append_op(
+        type="reshape2",
+        inputs=inputs,
+        attrs=attrs,
+        outputs={"Out": out, "XShape": x_shape},
+    )
 
     return helper.append_activation(out)
 
@@ -3409,7 +3666,9 @@ def reshape_(x, shape, name=None):
         if isinstance(shape, (list, tuple)):
             shape = [
                 item.numpy().item(0)
-                if isinstance(item, tmp_tensor_type) else item for item in shape
+                if isinstance(item, tmp_tensor_type)
+                else item
+                for item in shape
             ]
             out = _C_ops.reshape_(x, shape)
         elif isinstance(shape, tmp_tensor_type):
@@ -3418,7 +3677,8 @@ def reshape_(x, shape, name=None):
         else:
             raise ValueError(
                 "shape must be an instance of `list`, `tuple` or `Variable`,"
-                " got '{}.'".format(type(shape)))
+                " got '{}.'".format(type(shape))
+            )
 
         return out
     else:
@@ -3499,17 +3759,17 @@ def gather_nd(x, index, name=None):
 
     Returns:
         output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:]
-    
+
     Examples:
 
         .. code-block:: python
-            
+
             import paddle
-            
+
             x = paddle.to_tensor([[[1, 2], [3, 4], [5, 6]],
                                   [[7, 8], [9, 10], [11, 12]]])
             index = paddle.to_tensor([[0, 1]])
-            
+
             output = paddle.gather_nd(x, index) #[[3, 4]]
 
     """
@@ -3519,18 +3779,20 @@ def gather_nd(x, index, name=None):
         if _in_legacy_dygraph():
             return _legacy_C_ops.gather_nd(x, index)
     check_variable_and_dtype(
-        x, 'x', ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'],
-        'gather_np')
+        x,
+        'x',
+        ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'],
+        'gather_np',
+    )
     check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np')
     helper = LayerHelper('gather_nd', **locals())
     dtype = helper.input_dtype()
     output = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type="gather_nd",
-                     inputs={
-                         "X": x,
-                         "Index": index
-                     },
-                     outputs={"Out": output})
+    helper.append_op(
+        type="gather_nd",
+        inputs={"X": x, "Index": index},
+        outputs={"Out": output},
+    )
     return output
 
 
@@ -3609,7 +3871,7 @@ def strided_slice(x, axes, starts, ends, strides, name=None):
             strides_1 = [1, 1, 1]
             strides_2 = [1, 1, 2]
             sliced_1 = paddle.strided_slice(x, axes=axes, starts=starts, ends=ends, strides=strides_1)
-            # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1].                                
+            # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1].
             # example 2:
             # attr starts is a list which contain tensor Tensor.
             minus_3 = paddle.full(shape=[1], fill_value=-3, dtype='int32')
@@ -3622,8 +3884,11 @@ def strided_slice(x, axes, starts, ends, strides, name=None):
     helper = LayerHelper('strided_slice', **locals())
 
     check_variable_and_dtype(
-        x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-        'strided_slice')
+        x,
+        'x',
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+        'strided_slice',
+    )
     check_type(axes, 'axes', (list, tuple), 'strided_slice')
     check_type(starts, 'starts', (list, tuple, Variable), 'strided_slice')
     check_type(ends, 'ends', (list, tuple, Variable), 'strided_slice')
@@ -3631,8 +3896,9 @@ def strided_slice(x, axes, starts, ends, strides, name=None):
 
     def check_list_elements_dtype(list_input, input_name):
         if isinstance(list_input, Variable):
-            check_dtype(list_input.dtype, input_name, ['int32'],
-                        'strided_slice')
+            check_dtype(
+                list_input.dtype, input_name, ['int32'], 'strided_slice'
+            )
         else:
             for i, var in enumerate(list_input):
                 var_name = input_name + '[' + str(i) + ']'
@@ -3651,7 +3917,7 @@ def strided_slice(x, axes, starts, ends, strides, name=None):
                 dim.stop_gradient = True
                 new_list_tensor.append(dim)
             else:
-                assert (isinstance(dim, int))
+                assert isinstance(dim, int)
                 temp_out = helper.create_variable_for_type_inference('int32')
                 fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out)
                 new_list_tensor.append(temp_out)
@@ -3668,7 +3934,7 @@ def strided_slice(x, axes, starts, ends, strides, name=None):
             'starts': starts,
             'ends': ends,
             'strides': strides,
-            'infer_flags': infer_flags
+            'infer_flags': infer_flags,
         }
     else:
         # starts
@@ -3723,55 +3989,55 @@ def strided_slice(x, axes, starts, ends, strides, name=None):
                 attrs['strides'] = strides
         attrs['infer_flags'] = infer_flags
     out = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype('x'))
-    helper.append_op(type='strided_slice',
-                     inputs=inputs,
-                     attrs=attrs,
-                     outputs={'Out': out})
+        dtype=helper.input_dtype('x')
+    )
+    helper.append_op(
+        type='strided_slice', inputs=inputs, attrs=attrs, outputs={'Out': out}
+    )
 
     return out
 
 
 def tensordot(x, y, axes=2, name=None):
     r"""
-    This function computes a contraction, which sum the product of elements from two tensors along the given axes. 
+    This function computes a contraction, which sum the product of elements from two tensors along the given axes.
 
     Args:
         x (Tensor): The left tensor for contraction with data type ``float32`` or ``float64``.
         y (Tensor): The right tensor for contraction with the same data type as ``x``.
         axes (int|tuple|list|Tensor, optional):  The axes to contract for ``x`` and ``y``, defaulted to integer ``2``.
 
-            1. It could be a non-negative integer ``n``, 
+            1. It could be a non-negative integer ``n``,
                in which the function will sum over the last ``n`` axes of ``x`` and the first ``n`` axes of ``y`` in order.
-        
-            2. It could be a 1-d tuple or list with data type ``int``, in which ``x`` and ``y`` will be contracted along the same given axes. 
+
+            2. It could be a 1-d tuple or list with data type ``int``, in which ``x`` and ``y`` will be contracted along the same given axes.
                For example, ``axes`` =[0, 1] applies contraction along the first two axes for ``x`` and the first two axes for ``y``.
-        
-            3. It could be a tuple or list containing one or two 1-d tuple|list|Tensor with data type ``int``. 
-               When containing one tuple|list|Tensor, the data in tuple|list|Tensor specified the same axes for ``x`` and ``y`` to contract. 
-               When containing two tuple|list|Tensor, the first will be applied to ``x`` and the second to ``y``. 
+
+            3. It could be a tuple or list containing one or two 1-d tuple|list|Tensor with data type ``int``.
+               When containing one tuple|list|Tensor, the data in tuple|list|Tensor specified the same axes for ``x`` and ``y`` to contract.
+               When containing two tuple|list|Tensor, the first will be applied to ``x`` and the second to ``y``.
                When containing more than two tuple|list|Tensor, only the first two axis sequences will be used while the others will be ignored.
-        
-            4. It could be a tensor, in which the ``axes`` tensor will be translated to a python list 
-               and applied the same rules described above to determine the contraction axes. 
+
+            4. It could be a tensor, in which the ``axes`` tensor will be translated to a python list
+               and applied the same rules described above to determine the contraction axes.
                Note that the ``axes`` with Tensor type is ONLY available in Dygraph mode.
-        name(str, optional): The default value is None.  Normally there is no need for user to set this property. 
+        name(str, optional): The default value is None.  Normally there is no need for user to set this property.
                              For more information, please refer to :ref:`api_guide_Name` .
 
-    Return: 
-        Output (Tensor): The contraction result with the same data type as ``x`` and ``y``. 
+    Return:
+        Output (Tensor): The contraction result with the same data type as ``x`` and ``y``.
         In general, :math:`output.ndim = x.ndim + y.ndim - 2 \times n_{axes}`, where :math:`n_{axes}` denotes the number of axes to be contracted.
-    
+
     NOTES:
-        1. This function supports tensor broadcast, 
+        1. This function supports tensor broadcast,
            the size in the corresponding dimensions of ``x`` and ``y`` should be equal, or applies to the broadcast rules.
-        2. This function also supports axes expansion, 
-           when the two given axis sequences for ``x`` and ``y`` are of different lengths, 
-           the shorter sequence will expand the same axes as the longer one at the end. 
-           For example, if ``axes`` =[[0, 1, 2, 3], [1, 0]], 
-           the axis sequence for ``x`` is [0, 1, 2, 3], 
+        2. This function also supports axes expansion,
+           when the two given axis sequences for ``x`` and ``y`` are of different lengths,
+           the shorter sequence will expand the same axes as the longer one at the end.
+           For example, if ``axes`` =[[0, 1, 2, 3], [1, 0]],
+           the axis sequence for ``x`` is [0, 1, 2, 3],
            while the corresponding axis sequences for ``y`` will be expanded from [1, 0] to [1, 0, 2, 3].
-  
+
     Examples:
         .. code-block:: python
 
@@ -3780,7 +4046,7 @@ def tensordot(x, y, axes=2, name=None):
             data_type = 'float64'
 
             # For two 2-d tensor x and y, the case axes=0 is equivalent to outer product.
-            # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases.   
+            # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases.
             x = paddle.arange(4, dtype=data_type).reshape([2, 2])
             y = paddle.arange(4, dtype=data_type).reshape([2, 2])
             z = paddle.tensordot(x, y, axes=0)
@@ -3842,7 +4108,7 @@ def tensordot(x, y, axes=2, name=None):
             # z = [[23217330., 24915630., 26613930., 28312230.],
             #      [24915630., 26775930., 28636230., 30496530.],
             #      [26613930., 28636230., 30658530., 32680830.],
-            #      [28312230., 30496530., 32680830., 34865130.]] 
+            #      [28312230., 30496530., 32680830., 34865130.]]
     """
     op_type = 'tensordot'
     input_dtype = ['float32', 'float64']
@@ -3855,8 +4121,9 @@ def tensordot(x, y, axes=2, name=None):
         if paddle.in_dynamic_mode():
             return tolist(var)
         raise TypeError(
-            "The 'axes' with type 'Tensor' in " + op_type +
-            " is not available in static graph mode, "
+            "The 'axes' with type 'Tensor' in "
+            + op_type
+            + " is not available in static graph mode, "
             "please convert its type to int|Tuple|List, or use dynamic graph mode."
         )
 
@@ -3864,8 +4131,10 @@ def tensordot(x, y, axes=2, name=None):
     axes_y = []
     if np.issubdtype(type(axes), np.integer):
         assert axes >= 0, (
-            "The 'axes' in " + op_type +
-            f" should not be negative, but received axes={axes}.")
+            "The 'axes' in "
+            + op_type
+            + f" should not be negative, but received axes={axes}."
+        )
         axes_x = range(x.ndim - axes, x.ndim)
         axes_y = range(axes)
     else:
@@ -3905,7 +4174,11 @@ def tensordot(x, y, axes=2, name=None):
             shape_x[dim_x] = 1
             x = x.sum(dim_x).reshape(shape_x)
         else:
-            assert sx == sy, "The dimensional size for 'x' and 'y' in " + op_type + f" should match each other, but 'x' has size {sx} in dim {dim_x} while 'y' has size {sy} in dim {dim_y}."
+            assert sx == sy, (
+                "The dimensional size for 'x' and 'y' in "
+                + op_type
+                + f" should match each other, but 'x' has size {sx} in dim {dim_x} while 'y' has size {sy} in dim {dim_y}."
+            )
 
         need_contracted_dim_x[dim_x] = True
         need_contracted_dim_y[dim_y] = True
@@ -3933,20 +4206,22 @@ def tensordot(x, y, axes=2, name=None):
         shape_out = [1]
 
     x = x.transpose(perm=perm_x).reshape(
-        [not_contraction_size_x, contraction_size])
+        [not_contraction_size_x, contraction_size]
+    )
     y = y.transpose(perm=perm_y).reshape(
-        [contraction_size, not_contraction_size_y])
+        [contraction_size, not_contraction_size_y]
+    )
     out = x.matmul(y).reshape(shape_out)
     return out
 
 
 def as_complex(x, name=None):
-    """Transform a real tensor to a complex tensor. 
-    
+    """Transform a real tensor to a complex tensor.
+
     The data type of the input tensor is 'float32' or 'float64', and the data
     type of the returned tensor is 'complex64' or 'complex128', respectively.
 
-    The shape of the input tensor is ``(* ,2)``, (``*`` means arbitary shape), i.e. 
+    The shape of the input tensor is ``(* ,2)``, (``*`` means arbitary shape), i.e.
     the size of the last axis shoule be 2, which represent the real and imag part
     of a complex number. The shape of the returned tensor is ``(*,)``.
 
@@ -3956,7 +4231,7 @@ def as_complex(x, name=None):
 
     Returns:
         Tensor: The output. Data type is 'complex64' or 'complex128', with the same precision as the input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -3978,7 +4253,8 @@ def as_complex(x, name=None):
     helper = LayerHelper(op_type, **locals())
     inputs = {"X": x}
     out = helper.create_variable_for_type_inference(
-        dtype=_real_to_complex_dtype(x.dtype))
+        dtype=_real_to_complex_dtype(x.dtype)
+    )
     outputs = {"Out": out}
     attrs = {}
     helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs)
@@ -3986,9 +4262,9 @@ def as_complex(x, name=None):
 
 
 def as_real(x, name=None):
-    """Transform a complex tensor to a real tensor. 
-    
-    The data type of the input tensor is 'complex64' or 'complex128', and the data 
+    """Transform a complex tensor to a real tensor.
+
+    The data type of the input tensor is 'complex64' or 'complex128', and the data
     type of the returned tensor is 'float32' or 'float64', respectively.
 
     When the shape of the input tensor is ``(*, )``, (``*`` means arbitary shape),
@@ -4001,7 +4277,7 @@ def as_real(x, name=None):
 
     Returns:
         Tensor: The output. Data type is 'float32' or 'float64', with the same precision as the input.
-    
+
     Examples:
         .. code-block:: python
 
@@ -4029,7 +4305,8 @@ def as_real(x, name=None):
     helper = LayerHelper(op_type, **locals())
     inputs = {"X": x}
     out = helper.create_variable_for_type_inference(
-        dtype=_complex_to_real_dtype(x.dtype))
+        dtype=_complex_to_real_dtype(x.dtype)
+    )
     outputs = {"Out": out}
     helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
     return out
@@ -4081,23 +4358,27 @@ def repeat_interleave(x, repeats, axis=None, name=None):
         return _C_ops.repeat_interleave(x, repeats, axis)
 
     helper = LayerHelper("repeat_interleave", **locals())
-    check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
-                             'paddle.tensor.manipulation.repeat_interleave')
+    check_variable_and_dtype(
+        x,
+        'x',
+        ['float32', 'float64', 'int32', 'int64'],
+        'paddle.tensor.manipulation.repeat_interleave',
+    )
 
     out = helper.create_variable_for_type_inference(x.dtype)
 
-    helper.append_op(type='repeat_interleave',
-                     inputs={
-                         'X':
-                         x,
-                         'RepeatsTensor':
-                         repeats if isinstance(repeats, Variable) else None
-                     },
-                     outputs={'Out': out},
-                     attrs={
-                         'dim': axis,
-                         'Repeats': repeats if isinstance(repeats, int) else 0
-                     })
+    helper.append_op(
+        type='repeat_interleave',
+        inputs={
+            'X': x,
+            'RepeatsTensor': repeats if isinstance(repeats, Variable) else None,
+        },
+        outputs={'Out': out},
+        attrs={
+            'dim': axis,
+            'Repeats': repeats if isinstance(repeats, int) else 0,
+        },
+    )
     return out
 
 
@@ -4119,7 +4400,7 @@ def moveaxis(x, source, destination, name=None):
 
     Examples:
         .. code-block:: python
-        
+
             import paddle
 
             x = paddle.ones([3, 2, 4])
@@ -4128,13 +4409,14 @@ def moveaxis(x, source, destination, name=None):
 
             x = paddle.ones([2, 3])
             paddle.moveaxis(x, 0, 1).shape # equivalent to paddle.t(x)
-            # [3, 2]  
+            # [3, 2]
     """
     src = [source] if isinstance(source, int) else source
     dst = [destination] if isinstance(destination, int) else destination
 
     assert len(src) == len(
-        dst), "'source' must have the same number with 'destination'"
+        dst
+    ), "'source' must have the same number with 'destination'"
 
     count = Counter(src).most_common(1)
     if count[0][1] > 1:
@@ -4151,29 +4433,31 @@ def moveaxis(x, source, destination, name=None):
     dst_dims = list(range(ndim))
 
     for i, axis in enumerate(zip(src, dst)):
-        assert isinstance(axis[0],
-                          int), "Each elemment of 'source' must be integer."
+        assert isinstance(
+            axis[0], int
+        ), "Each elemment of 'source' must be integer."
         if axis[0] < 0:
-            assert axis[
-                0] >= -ndim, "'source' must be in the range of [-{0}, {0})".format(
-                    ndim)
+            assert (
+                axis[0] >= -ndim
+            ), "'source' must be in the range of [-{0}, {0})".format(ndim)
             src[i] += ndim
         else:
-            assert axis[
-                0] < ndim, "'source' must be in the range of [-{0}, {0})".format(
-                    ndim)
+            assert (
+                axis[0] < ndim
+            ), "'source' must be in the range of [-{0}, {0})".format(ndim)
 
-        assert isinstance(axis[1],
-                          int), "Each elemment of 'source' must be integer."
+        assert isinstance(
+            axis[1], int
+        ), "Each elemment of 'source' must be integer."
         if axis[1] < 0:
-            assert axis[
-                1] >= -ndim, "'source' must be in the range of [-{0}, {0})".format(
-                    ndim)
+            assert (
+                axis[1] >= -ndim
+            ), "'source' must be in the range of [-{0}, {0})".format(ndim)
             dst[i] += ndim
         else:
-            assert axis[
-                1] < ndim, "'source' must be in the range of [-{0}, {0})".format(
-                    ndim)
+            assert (
+                axis[1] < ndim
+            ), "'source' must be in the range of [-{0}, {0})".format(ndim)
         perm[dst[i]] = src[i]
         src_dims.remove(src[i])
         dst_dims.remove(dst[i])
@@ -4189,32 +4473,44 @@ def moveaxis(x, source, destination, name=None):
         out, _ = _legacy_C_ops.transpose2(x, 'axis', perm)
         return out
 
-    check_variable_and_dtype(x, 'x', [
-        'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64',
-        'complex128'
-    ], 'moveaxis')
+    check_variable_and_dtype(
+        x,
+        'x',
+        [
+            'bool',
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'complex64',
+            'complex128',
+        ],
+        'moveaxis',
+    )
 
     helper = LayerHelper('moveaxis', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)
     x_shape = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='transpose2',
-                     inputs={'X': [x]},
-                     outputs={
-                         'Out': [out],
-                         'XShape': [x_shape]
-                     },
-                     attrs={'axis': perm})
+    helper.append_op(
+        type='transpose2',
+        inputs={'X': [x]},
+        outputs={'Out': [out], 'XShape': [x_shape]},
+        attrs={'axis': perm},
+    )
     return out
 
 
 def non_negative_axis(arr, axis):
     ndim = len(arr.shape)
     if axis >= 0:
-        assert axis < ndim, "'axis'  must be in the range of [-{0}, {0})".format(
-            ndim)
+        assert (
+            axis < ndim
+        ), "'axis'  must be in the range of [-{0}, {0})".format(ndim)
     else:
-        assert axis >= -ndim, "'axis'  must be in the range of [-{0}, {0})".format(
-            ndim)
+        assert (
+            axis >= -ndim
+        ), "'axis'  must be in the range of [-{0}, {0})".format(ndim)
         axis += ndim
 
     return axis
@@ -4240,11 +4536,11 @@ def take_along_axis(arr, indices, axis):
         arr (Tensor) : The input Tensor. Supported data types are float32 and float64.
         indices (Tensor) : Indices to take along each 1d slice of arr. This must match the dimension of arr,
             and need to broadcast against arr. Supported data type are int and int64.
-        axis (int) : The axis to take 1d slices along. 
+        axis (int) : The axis to take 1d slices along.
 
-    Returns: 
+    Returns:
         Tensor: The indexed element, same dtype with arr
-    
+
     Examples:
         .. code-block:: python
 
@@ -4257,9 +4553,10 @@ def take_along_axis(arr, indices, axis):
             print(result)
             # [[1, 2, 3]]
     """
-    if (len(arr.shape) != len(indices.shape)):
+    if len(arr.shape) != len(indices.shape):
         raise ValueError(
-            "`indices` and `arr` must have the same number of dimensions!")
+            "`indices` and `arr` must have the same number of dimensions!"
+        )
     axis = non_negative_axis(arr, axis)
     broadcast_shape = infer_broadcast_shape(arr, indices, axis)
     if not broadcast_shape:
@@ -4275,10 +4572,14 @@ def take_along_axis(arr, indices, axis):
             return _C_ops.take_along_axis(arr, indices, axis)
         return _legacy_C_ops.take_along_axis(arr, indices, 'Axis', axis)
     check_variable_and_dtype(
-        arr, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
-        'take_along_axis')
-    check_variable_and_dtype(indices, 'index', ['int32', 'int64'],
-                             'take_along_axis')
+        arr,
+        'x',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        'take_along_axis',
+    )
+    check_variable_and_dtype(
+        indices, 'index', ['int32', 'int64'], 'take_along_axis'
+    )
     indices = paddle.broadcast_to(indices, broadcast_shape)
     broadcast_shape_list = list(broadcast_shape)
     broadcast_shape_list[axis] = list(arr.shape)[axis]
@@ -4287,13 +4588,12 @@ def take_along_axis(arr, indices, axis):
     helper = LayerHelper('take_along_axis', **locals())
     dtype = helper.input_dtype()
     result = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type="take_along_axis",
-                     inputs={
-                         "Input": arr,
-                         "Index": indices
-                     },
-                     attrs={"Axis": axis},
-                     outputs={"Result": result})
+    helper.append_op(
+        type="take_along_axis",
+        inputs={"Input": arr, "Index": indices},
+        attrs={"Axis": axis},
+        outputs={"Result": result},
+    )
     return result
 
 
@@ -4305,11 +4605,11 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'):
         arr (Tensor) : The Destination Tensor. Supported data types are float32 and float64.
         indices (Tensor) : Indices to put along each 1d slice of arr. This must match the dimension of arr,
             and need to broadcast against arr. Supported data type are int and int64.
-        axis (int) : The axis to put 1d slices along. 
+        axis (int) : The axis to put 1d slices along.
         reduce (string | optinal) : The reduce operation, default is 'assign', support 'add', 'assign', 'mul' and 'multiply'.
-    Returns : 
+    Returns :
         Tensor: The indexed element, same dtype with arr
-    
+
     Examples:
         .. code-block:: python
 
@@ -4325,44 +4625,48 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'):
             # [60, 40, 50]]
 
     """
-    if (len(arr.shape) != len(indices.shape)):
+    if len(arr.shape) != len(indices.shape):
         raise ValueError(
-            "`indices` and `arr` must have the same number of dimensions!")
+            "`indices` and `arr` must have the same number of dimensions!"
+        )
     axis = non_negative_axis(arr, axis)
     broadcast_shape = infer_broadcast_shape(arr, indices, axis)
     if _non_static_mode():
-        values = paddle.to_tensor(values) if not isinstance(
-            values, paddle.Tensor) else values
+        values = (
+            paddle.to_tensor(values)
+            if not isinstance(values, paddle.Tensor)
+            else values
+        )
         if broadcast_shape:
             indices = paddle.broadcast_to(indices, broadcast_shape)
         values = paddle.broadcast_to(values, indices.shape)
         if in_dygraph_mode():
             return _C_ops.put_along_axis(arr, indices, values, axis, reduce)
-        return _legacy_C_ops.put_along_axis(arr, indices, values, "Axis", axis,
-                                            "Reduce", reduce)
+        return _legacy_C_ops.put_along_axis(
+            arr, indices, values, "Axis", axis, "Reduce", reduce
+        )
 
     check_variable_and_dtype(
-        arr, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
-        'put_along_axis')
-    check_variable_and_dtype(indices, 'index', ['int32', 'int64'],
-                             'put_along_axis')
+        arr,
+        'x',
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        'put_along_axis',
+    )
+    check_variable_and_dtype(
+        indices, 'index', ['int32', 'int64'], 'put_along_axis'
+    )
     if broadcast_shape:
         indices = paddle.broadcast_to(indices, broadcast_shape)
     values = paddle.broadcast_to(values, indices.shape)
     helper = LayerHelper('put_along_axis', **locals())
     dtype = helper.input_dtype()
     result = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type="put_along_axis",
-                     inputs={
-                         "Input": arr,
-                         "Index": indices,
-                         "Value": values
-                     },
-                     attrs={
-                         "Axis": axis,
-                         "Reduce": reduce
-                     },
-                     outputs={"Result": result})
+    helper.append_op(
+        type="put_along_axis",
+        inputs={"Input": arr, "Index": indices, "Value": values},
+        attrs={"Axis": axis, "Reduce": reduce},
+        outputs={"Result": result},
+    )
     return result
 
 
@@ -4372,20 +4676,25 @@ def put_along_axis_(arr, indices, values, axis, reduce='assign'):
     Inplace version of ``put_along_axis`` API, the output Tensor will be inplaced with input ``arr``.
     Please refer to :ref:`api_tensor_put_along_axis`.
     """
-    if (len(arr.shape) != len(indices.shape)):
+    if len(arr.shape) != len(indices.shape):
         raise ValueError(
-            "`indices` and `arr` must have the same number of dimensions!")
+            "`indices` and `arr` must have the same number of dimensions!"
+        )
     axis = non_negative_axis(arr, axis)
     broadcast_shape = infer_broadcast_shape(arr, indices, axis)
-    values = paddle.to_tensor(values) if not isinstance(
-        values, paddle.Tensor) else values
+    values = (
+        paddle.to_tensor(values)
+        if not isinstance(values, paddle.Tensor)
+        else values
+    )
     if broadcast_shape:
         indices = paddle.broadcast_to(indices, broadcast_shape)
     values = paddle.broadcast_to(values, indices.shape)
     if in_dygraph_mode():
         return _C_ops.put_along_axis_(arr, indices, values, axis, reduce)
-    return _legacy_C_ops.put_along_axis_(arr, indices, values, "Axis", axis,
-                                         "Reduce", reduce)
+    return _legacy_C_ops.put_along_axis_(
+        arr, indices, values, "Axis", axis, "Reduce", reduce
+    )
 
 
 def _index_add_params_check(x, index, input_axis, add_value):
@@ -4426,7 +4735,7 @@ def index_add(x, index, axis, value, name=None):
         x (Tensor) : The Destination Tensor. Supported data types are int32, int64, float16, float32, float64.
         index (Tensor): The 1-D Tensor containing the indices to index.
             The data type of ``index`` must be int32 or int64.
-        axis (int): The dimension in which we index. 
+        axis (int): The dimension in which we index.
         value (Tensor): The tensor used to add the elements along the target axis.
         name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
 
@@ -4455,24 +4764,36 @@ def index_add(x, index, axis, value, name=None):
 
     helper = LayerHelper("index_add", **locals())
     check_variable_and_dtype(
-        x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'],
-        'paddle.tensor.manipulation.index_add')
-    check_variable_and_dtype(index, 'index', ['int32', 'int64'],
-                             'paddle.tensor.manipulation.index_add')
+        x,
+        'x',
+        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'paddle.tensor.manipulation.index_add',
+    )
+    check_variable_and_dtype(
+        index,
+        'index',
+        ['int32', 'int64'],
+        'paddle.tensor.manipulation.index_add',
+    )
     check_variable_and_dtype(
-        value, 'add_value', ['float16', 'float32', 'float64', 'int32', 'int64'],
-        'paddle.tensor.manipulation.index_add')
+        value,
+        'add_value',
+        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        'paddle.tensor.manipulation.index_add',
+    )
 
     out = helper.create_variable_for_type_inference(x.dtype)
 
-    helper.append_op(type='index_add',
-                     inputs={
-                         'X': x,
-                         'Index': index,
-                         'AddValue': value,
-                     },
-                     outputs={'Out': out},
-                     attrs={'axis': axis})
+    helper.append_op(
+        type='index_add',
+        inputs={
+            'X': x,
+            'Index': index,
+            'AddValue': value,
+        },
+        outputs={'Out': out},
+        attrs={'axis': axis},
+    )
     return out
 
 
@@ -4481,7 +4802,7 @@ def index_add_(x, index, axis, value, name=None):
     """
     Inplace version of ``index_add`` API, the output Tensor will be inplaced with input ``x``.
     Please refer to :ref:`api_paddle_tensor_index_add`.
-    
+
     Examples:
         .. code-block:: python
 
@@ -4509,7 +4830,7 @@ __METHODS = {
     'fill_diagonal_': fill_diagonal_,
     'fill_diagonal_tensor_': fill_diagonal_tensor_,
     "fill_diagonal_tensor": fill_diagonal_tensor,
-    'tolist': tolist
+    'tolist': tolist,
 }
 for name, func in __METHODS.items():
     setattr(core.VarBase, name, func)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index c5b995454ae..5c191b7ffc5 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -25,14 +25,29 @@ from paddle.common_ops_import import dygraph_utils
 
 from .manipulation import cast
 from .creation import _complex_to_real_dtype
-from .layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn
+from .layer_function_generator import (
+    _generate_doc_string_,
+    generate_activation_fn,
+    generate_layer_fn,
+)
 
 import paddle
 from ..static import Variable
-from ..framework import core, in_dygraph_mode, _non_static_mode, LayerHelper, _in_legacy_dygraph
+from ..framework import (
+    core,
+    in_dygraph_mode,
+    _non_static_mode,
+    LayerHelper,
+    _in_legacy_dygraph,
+)
 from ..fluid.framework import _in_legacy_dygraph
 from ..framework import _varbase_creator, convert_np_dtype_to_dtype_
-from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
+from ..fluid.data_feeder import (
+    check_variable_and_dtype,
+    check_type,
+    check_dtype,
+    convert_dtype,
+)
 from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only
 from ..fluid.layers import utils
 
diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py
index 1d5027ce0d6..ab81f8c4d92 100644
--- a/python/paddle/tensor/stat.py
+++ b/python/paddle/tensor/stat.py
@@ -20,7 +20,12 @@ from ..framework import LayerHelper
 from ..framework import core
 from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
 from .search import where
-from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
+from ..fluid.data_feeder import (
+    convert_dtype,
+    check_variable_and_dtype,
+    check_type,
+    check_dtype,
+)
 from ..fluid.layers import utils
 import paddle
 from paddle import _C_ops, _legacy_C_ops
@@ -86,9 +91,11 @@ def mean(x, axis=None, keepdim=False, name=None):
     else:
         if isinstance(axis, int):
             axis = [axis]
-        reduce_all = True if axis is None \
-            or len(axis)==0 \
-            or len(axis) == len(x.shape) else False
+        reduce_all = (
+            True
+            if axis is None or len(axis) == 0 or len(axis) == len(x.shape)
+            else False
+        )
         if axis is None or len(axis) == 0:
             axis = [0]
 
@@ -97,18 +104,27 @@ def mean(x, axis=None, keepdim=False, name=None):
             axis = list(range(len(x.shape)))
         return _C_ops.mean(x, axis, keepdim)
     if _in_legacy_dygraph():
-        return _legacy_C_ops.reduce_mean(x, 'dim', axis, 'keep_dim', keepdim,
-                                         'reduce_all', reduce_all)
-
-    check_variable_and_dtype(x, 'x/input',
-                             ['uint16', 'float16', 'float32', 'float64'],
-                             'mean/reduce_mean')
-    check_type(axis, 'axis/dim', (int, list, tuple, Variable),
-               'mean/reduce_mean')
+        return _legacy_C_ops.reduce_mean(
+            x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all
+        )
+
+    check_variable_and_dtype(
+        x,
+        'x/input',
+        ['uint16', 'float16', 'float32', 'float64'],
+        'mean/reduce_mean',
+    )
+    check_type(
+        axis, 'axis/dim', (int, list, tuple, Variable), 'mean/reduce_mean'
+    )
     if isinstance(axis, (list, tuple)):
         for item in axis:
-            check_type(item, 'elements of axis/dim', (int, Variable),
-                       'mean/reduce_mean')
+            check_type(
+                item,
+                'elements of axis/dim',
+                (int, Variable),
+                'mean/reduce_mean',
+            )
 
     helper = LayerHelper('mean', **locals())
 
@@ -116,10 +132,9 @@ def mean(x, axis=None, keepdim=False, name=None):
         axis = utils._convert_to_tensor_list(axis)
     attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}
     out = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='reduce_mean',
-                     inputs={'X': x},
-                     outputs={'Out': out},
-                     attrs=attrs)
+    helper.append_op(
+        type='reduce_mean', inputs={'X': x}, outputs={'Out': out}, attrs=attrs
+    )
     return out
 
 
@@ -129,10 +144,10 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None):
 
     Args:
         x (Tensor): The input Tensor with data type float32, float64.
-        axis (int|list|tuple, optional): The axis along which to perform variance calculations. ``axis`` should be int, list(int) or tuple(int). 
-        
-            - If ``axis`` is a list/tuple of dimension(s), variance is calculated along all element(s) of ``axis`` . ``axis`` or element(s) of ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . 
-            - If ``axis`` or element(s) of ``axis`` is less than 0, it works the same way as :math:`axis + D` . 
+        axis (int|list|tuple, optional): The axis along which to perform variance calculations. ``axis`` should be int, list(int) or tuple(int).
+
+            - If ``axis`` is a list/tuple of dimension(s), variance is calculated along all element(s) of ``axis`` . ``axis`` or element(s) of ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` .
+            - If ``axis`` or element(s) of ``axis`` is less than 0, it works the same way as :math:`axis + D` .
             - If ``axis`` is None, variance is calculated over all elements of ``x``. Default is None.
 
         unbiased (bool, optional): Whether to use the unbiased estimation. If ``unbiased`` is True, the divisor used in the computation is :math:`N - 1`, where :math:`N` represents the number of elements along ``axis`` , otherwise the divisor is :math:`N`. Default is True.
@@ -157,15 +172,16 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None):
         check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'var')
 
     u = mean(x, axis, True, name)
-    out = paddle.sum((x - u)**2, axis, keepdim=keepdim, name=name)
+    out = paddle.sum((x - u) ** 2, axis, keepdim=keepdim, name=name)
 
     dtype = x.dtype
-    n = paddle.cast(paddle.numel(x), paddle.int64) \
-        / paddle.cast(paddle.numel(out), paddle.int64)
+    n = paddle.cast(paddle.numel(x), paddle.int64) / paddle.cast(
+        paddle.numel(out), paddle.int64
+    )
     n = n.astype(dtype)
     if unbiased:
         one_const = paddle.ones([1], x.dtype)
-        n = where(n > one_const, n - 1., one_const)
+        n = where(n > one_const, n - 1.0, one_const)
     out /= n
     return out
 
@@ -238,7 +254,7 @@ def numel(x, name=None):
         .. code-block:: python
 
             import paddle
-            
+
             x = paddle.full(shape=[4, 5, 7], fill_value=0, dtype='int32')
             numel = paddle.numel(x) # 140
 
@@ -253,7 +269,8 @@ def numel(x, name=None):
         raise TypeError("x must be a Tensor in numel")
     helper = LayerHelper('numel', **locals())
     out = helper.create_variable_for_type_inference(
-        dtype=core.VarDesc.VarType.INT64)
+        dtype=core.VarDesc.VarType.INT64
+    )
     helper.append_op(type='size', inputs={'Input': x}, outputs={'Out': out})
     return out
 
@@ -321,8 +338,9 @@ def nanmedian(x, axis=None, keepdim=True, name=None):
         )
 
     for i in range(len(axis)):
-        if not isinstance(axis[i], int) or not (axis[i] < dims
-                                                and axis[i] >= -dims):
+        if not isinstance(axis[i], int) or not (
+            axis[i] < dims and axis[i] >= -dims
+        ):
             raise ValueError(
                 "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))."
             )
@@ -333,25 +351,25 @@ def nanmedian(x, axis=None, keepdim=True, name=None):
         raise ValueError("Axis has duplicated elements.")
 
     if _in_legacy_dygraph():
-        median_index, out = _legacy_C_ops.nanmedian(x, 'axis', axis, 'keepdim',
-                                                    keepdim)
+        median_index, out = _legacy_C_ops.nanmedian(
+            x, 'axis', axis, 'keepdim', keepdim
+        )
         return out
 
     check_variable_and_dtype(
-        x, 'X', ['int32', 'int64', 'float16', 'float32', 'float64'],
-        'nanmedian')
+        x, 'X', ['int32', 'int64', 'float16', 'float32', 'float64'], 'nanmedian'
+    )
 
     helper = LayerHelper('nanmedian', **locals())
     attrs = {'axis': axis, 'keepdim': keepdim}
     out = helper.create_variable_for_type_inference(x.dtype)
     medians = helper.create_variable_for_type_inference(x.dtype)
-    helper.append_op(type='nanmedian',
-                     inputs={'X': x},
-                     outputs={
-                         'Out': out,
-                         'MedianIndex': medians
-                     },
-                     attrs=attrs)
+    helper.append_op(
+        type='nanmedian',
+        inputs={'X': x},
+        outputs={'Out': out, 'MedianIndex': medians},
+        attrs=attrs,
+    )
     return out
 
 
@@ -424,21 +442,22 @@ def median(x, axis=None, keepdim=False, name=None):
     dtype = 'float64' if x.dtype == core.VarDesc.VarType.FP64 else 'float32'
     if sz & 1 == 0:
         out_tensor = paddle.slice(
-            tensor_topk, axes=[axis], starts=[kth - 1],
-            ends=[kth]) + paddle.slice(
-                tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1])
+            tensor_topk, axes=[axis], starts=[kth - 1], ends=[kth]
+        ) + paddle.slice(tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1])
         out_tensor = paddle.cast(out_tensor, dtype=dtype) / 2
     else:
-        out_tensor = paddle.cast(paddle.slice(tensor_topk,
-                                              axes=[axis],
-                                              starts=[kth],
-                                              ends=[kth + 1]),
-                                 dtype=dtype)
+        out_tensor = paddle.cast(
+            paddle.slice(
+                tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1]
+            ),
+            dtype=dtype,
+        )
     out_tensor = out_tensor + paddle.sum(
-        paddle.cast(paddle.isnan(x), dtype=dtype) * x, axis=axis, keepdim=True)
+        paddle.cast(paddle.isnan(x), dtype=dtype) * x, axis=axis, keepdim=True
+    )
     if not keepdim or is_flatten:
         if not is_flatten:
-            newshape = x.shape[:axis] + x.shape[axis + 1:]
+            newshape = x.shape[:axis] + x.shape[axis + 1 :]
         elif not keepdim:
             newshape = [1]
         else:
@@ -502,7 +521,8 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False):
             axis_src, axis_dst = [], []
             for axis_single in axis:
                 if not isinstance(axis_single, int) or not (
-                        axis_single < dims and axis_single >= -dims):
+                    axis_single < dims and axis_single >= -dims
+                ):
                     raise ValueError(
                         "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))."
                     )
@@ -524,9 +544,9 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False):
             out_shape[axis] = 1
 
     mask = x.isnan()
-    valid_counts = mask.logical_not().sum(axis=axis,
-                                          keepdim=True,
-                                          dtype='float64')
+    valid_counts = mask.logical_not().sum(
+        axis=axis, keepdim=True, dtype='float64'
+    )
 
     indices = []
 
@@ -553,15 +573,18 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False):
     for index in indices:
         indices_below = paddle.floor(index).astype(paddle.int32)
         indices_upper = paddle.ceil(index).astype(paddle.int32)
-        tensor_upper = paddle.take_along_axis(sorted_tensor,
-                                              indices_upper,
-                                              axis=axis)
-        tensor_below = paddle.take_along_axis(sorted_tensor,
-                                              indices_below,
-                                              axis=axis)
-        weights = (index - indices_below.astype('float64'))
-        out = paddle.lerp(tensor_below.astype('float64'),
-                          tensor_upper.astype('float64'), weights)
+        tensor_upper = paddle.take_along_axis(
+            sorted_tensor, indices_upper, axis=axis
+        )
+        tensor_below = paddle.take_along_axis(
+            sorted_tensor, indices_below, axis=axis
+        )
+        weights = index - indices_below.astype('float64')
+        out = paddle.lerp(
+            tensor_below.astype('float64'),
+            tensor_upper.astype('float64'),
+            weights,
+        )
         if not keepdim:
             out = paddle.squeeze(out, axis=axis)
         else:
diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index 7943f5f0e96..0e265d51a7e 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -22,14 +22,46 @@ from setuptools.command.easy_install import easy_install
 from setuptools.command.build_ext import build_ext
 from distutils.command.build import build
 
-from .extension_utils import find_cuda_home, find_rocm_home, normalize_extension_kwargs, add_compile_flag, run_cmd
-from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags
-from .extension_utils import _import_module_from_library, _write_setup_file, _jit_compile
-from .extension_utils import check_abi_compatibility, log_v, CustomOpInfo, parse_op_name_from
-from .extension_utils import clean_object_if_change_cflags, _reset_so_rpath, _get_fluid_path
-from .extension_utils import bootstrap_context, get_build_directory, add_std_without_repeat
-
-from .extension_utils import IS_WINDOWS, OS_NAME, MSVC_COMPILE_FLAGS, MSVC_COMPILE_FLAGS
+from .extension_utils import (
+    find_cuda_home,
+    find_rocm_home,
+    normalize_extension_kwargs,
+    add_compile_flag,
+    run_cmd,
+)
+from .extension_utils import (
+    is_cuda_file,
+    prepare_unix_cudaflags,
+    prepare_win_cudaflags,
+)
+from .extension_utils import (
+    _import_module_from_library,
+    _write_setup_file,
+    _jit_compile,
+)
+from .extension_utils import (
+    check_abi_compatibility,
+    log_v,
+    CustomOpInfo,
+    parse_op_name_from,
+)
+from .extension_utils import (
+    clean_object_if_change_cflags,
+    _reset_so_rpath,
+    _get_fluid_path,
+)
+from .extension_utils import (
+    bootstrap_context,
+    get_build_directory,
+    add_std_without_repeat,
+)
+
+from .extension_utils import (
+    IS_WINDOWS,
+    OS_NAME,
+    MSVC_COMPILE_FLAGS,
+    MSVC_COMPILE_FLAGS,
+)
 from .extension_utils import CLANG_COMPILE_FLAGS, CLANG_LINK_FLAGS
 
 from ...fluid import core
@@ -40,6 +72,7 @@ from ...fluid import core
 if IS_WINDOWS and six.PY3:
     from distutils.command.build_ext import build_ext as _du_build_ext
     from unittest.mock import Mock
+
     _du_build_ext.get_export_symbols = Mock(return_value=None)
 
 CUDA_HOME = find_cuda_home()
@@ -51,33 +84,33 @@ if core.is_compiled_with_rocm():
 def setup(**attr):
     """
     The interface is used to config the process of compiling customized operators,
-    mainly includes how to compile shared library, automatically generate python API 
+    mainly includes how to compile shared library, automatically generate python API
     and install it into site-package. It supports using customized operators directly with
     ``import`` statement.
 
     It encapsulates the python built-in ``setuptools.setup`` function and keeps arguments
     and usage same as the native interface. Meanwhile, it hiddens Paddle inner framework
     concepts, such as necessary compiling flags, included paths of head files, and linking
-    flags. It also will automatically search and valid local environment and versions of 
-    ``cc(Linux)`` , ``cl.exe(Windows)`` and ``nvcc`` , then compiles customized operators 
+    flags. It also will automatically search and validate local environment and versions of
+    ``cc(Linux)`` , ``cl.exe(Windows)`` and ``nvcc`` , then compiles customized operators
     supporting CPU or GPU device according to the specified Extension type.
 
-    Moreover, `ABI compatibility <https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html>`_ 
+    Moreover, `ABI compatibility <https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html>`_
     will be checked to ensure that compiler version from ``cc(Linux)`` , ``cl.exe(Windows)``
     on local machine is compatible with pre-installed Paddle whl in python site-packages.
 
-    For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, 
-    then the version of user's local machine should satisfy GCC >= 8.2. 
-    For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of 
-    PaddlePaddle (Visual Studio 2017). 
-    If the above conditions are not met, the corresponding warning will be printed, and a fatal error may 
+    For Linux, GCC version will be checked. For example, if Paddle with CUDA 10.1 is built with GCC 8.2,
+    then the version of user's local machine should satisfy GCC >= 8.2.
+    For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of
+    PaddlePaddle (Visual Studio 2017).
+    If the above conditions are not met, the corresponding warning will be printed, and a fatal error may
     occur because of ABI compatibility.
 
     .. note::
-        
+
         1. Currently we support Linux, MacOS and Windows platfrom.
         2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` .
-           Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking 
+           Then, use ``which cc`` to check the location of ``cc`` and ``cc --version`` to check the linked
            GCC version.
         3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017).
 
@@ -86,11 +119,11 @@ def setup(**attr):
     ``python setup.py install`` . Then customized operators API will be available everywhere
     after importing it.
 
-    A simple example of ``setup.py`` as followed: 
+    A simple example of ``setup.py`` is as follows:
 
     .. code-block:: text
 
-        # setup.py 
+        # setup.py
 
         # Case 1: Compiling customized operators supporting CPU and GPU devices
         from paddle.utils.cpp_extension import CUDAExtension, setup
@@ -124,11 +157,11 @@ def setup(**attr):
         x = paddle.randn([4, 10], dtype='float32')
         relu_out = relu(x)
         tanh_out = tanh(x)
-    
+
 
     Args:
         name(str): Specify the name of shared library file and installed python package.
-        ext_modules(Extension): Specify the Extension instance including customized operator source files, compiling flags et.al. 
+        ext_modules(Extension): Specify the Extension instance including customized operator source files, compiling flags, etc.
                                 If only compile operator supporting CPU device, please use ``CppExtension`` ; If compile operator
                                 supporting CPU and GPU devices, please use ``CUDAExtension`` .
         include_dirs(list[str], optional): Specify the extra include directories to search head files. The interface will automatically add
@@ -139,7 +172,7 @@ def setup(**attr):
                                 compiler using dict type with ``{'cxx': [...], 'nvcc': [...]}`` . Default is None.
         **attr(dict, optional): Specify other arguments same as ``setuptools.setup`` .
 
-    Returns: 
+    Returns:
         None
 
     """
@@ -148,7 +181,8 @@ def setup(**attr):
     # if not specific cmdclass in setup, add it automatically.
     if 'build_ext' not in cmdclass:
         cmdclass['build_ext'] = BuildExtension.with_options(
-            no_python_abi_suffix=True)
+            no_python_abi_suffix=True
+        )
         attr['cmdclass'] = cmdclass
 
     error_msg = """
@@ -168,17 +202,19 @@ def setup(**attr):
     if 'name' not in attr:
         raise ValueError(error_msg)
 
-    assert not attr['name'].endswith('module'),  \
-    "Please don't use 'module' as suffix in `name` argument, "
+    assert not attr['name'].endswith(
+        'module'
+    ), "Please don't use 'module' as suffix in `name` argument, "
     "it will be stripped in setuptools.bdist_egg and cause import error."
 
     ext_modules = attr.get('ext_modules', [])
     if not isinstance(ext_modules, list):
         ext_modules = [ext_modules]
-    assert len(
-        ext_modules
-    ) == 1, "Required only one Extension, but received {}. If you want to compile multi operators, you can include all necessary source files in one Extension.".format(
-        len(ext_modules))
+    assert (
+        len(ext_modules) == 1
+    ), "Required only one Extension, but received {}. If you want to compile multi operators, you can include all necessary source files in one Extension.".format(
+        len(ext_modules)
+    )
     # replace Extension.name with attr['name] to keep consistant with Package name.
     for ext_module in ext_modules:
         ext_module.name = attr['name']
@@ -219,7 +255,7 @@ def CppExtension(sources, *args, **kwargs):
 
     .. code-block:: text
 
-        # setup.py 
+        # setup.py
 
         # Compiling customized operators supporting only CPU device
         from paddle.utils.cpp_extension import CppExtension, setup
@@ -269,7 +305,7 @@ def CUDAExtension(sources, *args, **kwargs):
 
     .. code-block:: text
 
-        # setup.py 
+        # setup.py
 
         # Compiling customized operators supporting CPU and GPU devices
         from paddle.utils.cpp_extension import CUDAExtension, setup
@@ -336,7 +372,6 @@ class BuildExtension(build_ext, object):
         """
 
         class cls_with_options(cls):
-
             def __init__(self, *args, **kwargs):
                 kwargs.update(options)
                 cls.__init__(self, *args, **kwargs)
@@ -381,8 +416,9 @@ class BuildExtension(build_ext, object):
         # cflags have changed and delete the built shared library to re-compile the source
         # even though source file content keep unchanged.
         so_name = self.get_ext_fullpath(self.extensions[0].name)
-        clean_object_if_change_cflags(os.path.abspath(so_name),
-                                      self.extensions[0])
+        clean_object_if_change_cflags(
+            os.path.abspath(so_name), self.extensions[0]
+        )
 
         # Consider .cu, .cu.cc as valid source extensions.
         self.compiler.src_extensions += ['.cu', '.cu.cc']
@@ -394,8 +430,9 @@ class BuildExtension(build_ext, object):
         else:
             original_compile = self.compiler._compile
 
-        def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs,
-                                        pp_opts):
+        def unix_custom_single_compiler(
+            obj, src, ext, cc_args, extra_postargs, pp_opts
+        ):
             """
             Monkey patch machanism to replace inner compiler to custom complie process on Unix platform.
             """
@@ -408,7 +445,9 @@ class BuildExtension(build_ext, object):
                 # nvcc or hipcc compile CUDA source
                 if is_cuda_file(src):
                     if core.is_compiled_with_rocm():
-                        assert ROCM_HOME is not None, "Not found ROCM runtime, \
+                        assert (
+                            ROCM_HOME is not None
+                        ), "Not found ROCM runtime, \
                             please use `export ROCM_PATH= XXX` to specify it."
 
                         hipcc_cmd = os.path.join(ROCM_HOME, 'bin', 'hipcc')
@@ -417,7 +456,9 @@ class BuildExtension(build_ext, object):
                         if isinstance(cflags, dict):
                             cflags = cflags['hipcc']
                     else:
-                        assert CUDA_HOME is not None, "Not found CUDA runtime, \
+                        assert (
+                            CUDA_HOME is not None
+                        ), "Not found CUDA runtime, \
                             please use `export CUDA_HOME= XXX` to specify it."
 
                         nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc')
@@ -436,7 +477,8 @@ class BuildExtension(build_ext, object):
                     cflags.append('-D__HIP_PLATFORM_HCC__')
                     cflags.append('-D__HIP_NO_HALF_CONVERSIONS__=1')
                     cflags.append(
-                        '-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP')
+                        '-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP'
+                    )
 
                 # NOTE(Aurelius84): Since Paddle 2.0, we require gcc version > 5.x,
                 # so we add this flag to ensure the symbol names from user compiled
@@ -450,22 +492,24 @@ class BuildExtension(build_ext, object):
                     else:
                         cflags.append('-DPADDLE_WITH_CUDA')
 
-                add_std_without_repeat(cflags,
-                                       self.compiler.compiler_type,
-                                       use_std14=True)
+                add_std_without_repeat(
+                    cflags, self.compiler.compiler_type, use_std14=True
+                )
                 original_compile(obj, src, ext, cc_args, cflags, pp_opts)
             finally:
                 # restore original_compiler
                 self.compiler.set_executable('compiler_so', original_compiler)
 
-        def win_custom_single_compiler(sources,
-                                       output_dir=None,
-                                       macros=None,
-                                       include_dirs=None,
-                                       debug=0,
-                                       extra_preargs=None,
-                                       extra_postargs=None,
-                                       depends=None):
+        def win_custom_single_compiler(
+            sources,
+            output_dir=None,
+            macros=None,
+            include_dirs=None,
+            debug=0,
+            extra_preargs=None,
+            extra_postargs=None,
+            depends=None,
+        ):
 
             self.cflags = copy.deepcopy(extra_postargs)
             extra_postargs = None
@@ -482,27 +526,32 @@ class BuildExtension(build_ext, object):
                 # Using regex to match src, obj and include files
                 src_regex = re.compile('/T(p|c)(.*)')
                 src_list = [
-                    m.group(2) for m in (src_regex.match(elem) for elem in cmd)
+                    m.group(2)
+                    for m in (src_regex.match(elem) for elem in cmd)
                     if m
                 ]
 
                 obj_regex = re.compile('/Fo(.*)')
                 obj_list = [
-                    m.group(1) for m in (obj_regex.match(elem) for elem in cmd)
+                    m.group(1)
+                    for m in (obj_regex.match(elem) for elem in cmd)
                     if m
                 ]
 
                 include_regex = re.compile(r'((\-|\/)I.*)')
                 include_list = [
                     m.group(1)
-                    for m in (include_regex.match(elem) for elem in cmd) if m
+                    for m in (include_regex.match(elem) for elem in cmd)
+                    if m
                 ]
 
                 assert len(src_list) == 1 and len(obj_list) == 1
                 src = src_list[0]
                 obj = obj_list[0]
                 if is_cuda_file(src):
-                    assert CUDA_HOME is not None, "Not found CUDA runtime, \
+                    assert (
+                        CUDA_HOME is not None
+                    ), "Not found CUDA runtime, \
                         please use `export CUDA_HOME= XXX` to specify it."
 
                     nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc')
@@ -516,8 +565,9 @@ class BuildExtension(build_ext, object):
                     cflags = prepare_win_cudaflags(cflags) + ['--use-local-env']
                     for flag in MSVC_COMPILE_FLAGS:
                         cflags = ['-Xcompiler', flag] + cflags
-                    cmd = [nvcc_cmd, '-c', src, '-o', obj
-                           ] + include_list + cflags
+                    cmd = (
+                        [nvcc_cmd, '-c', src, '-o', obj] + include_list + cflags
+                    )
                 elif isinstance(self.cflags, dict):
                     cflags = MSVC_COMPILE_FLAGS + self.cflags['cxx']
                     cmd += cflags
@@ -532,9 +582,16 @@ class BuildExtension(build_ext, object):
 
             try:
                 self.compiler.spawn = win_custom_spawn
-                return original_compile(sources, output_dir, macros,
-                                        include_dirs, debug, extra_preargs,
-                                        extra_postargs, depends)
+                return original_compile(
+                    sources,
+                    output_dir,
+                    macros,
+                    include_dirs,
+                    debug,
+                    extra_preargs,
+                    extra_postargs,
+                    depends,
+                )
             finally:
                 self.compiler.spawn = original_spawn
 
@@ -547,8 +604,9 @@ class BuildExtension(build_ext, object):
 
             def wrapper(source_filenames, strip_dir=0, output_dir=''):
                 try:
-                    objects = origina_func(source_filenames, strip_dir,
-                                           output_dir)
+                    objects = origina_func(
+                        source_filenames, strip_dir, output_dir
+                    )
                     for i, source in enumerate(source_filenames):
                         # modify xx.o -> xx.cu.o/xx.cu.obj
                         if is_cuda_file(source):
@@ -579,7 +637,8 @@ class BuildExtension(build_ext, object):
             self.compiler._compile = unix_custom_single_compiler
 
         self.compiler.object_filenames = object_filenames_with_cuda(
-            self.compiler.object_filenames, self.build_lib)
+            self.compiler.object_filenames, self.build_lib
+        )
         self._record_op_info()
 
         print("Compiling user custom op, it will cost a few seconds.....")
@@ -595,10 +654,11 @@ class BuildExtension(build_ext, object):
         split_str = '.'
         name_items = ext_name.split(split_str)
         if self.no_python_abi_suffix and six.PY3:
-            assert len(
-                name_items
-            ) > 2, "Expected len(name_items) > 2, but received {}".format(
-                len(name_items))
+            assert (
+                len(name_items) > 2
+            ), "Expected len(name_items) > 2, but received {}".format(
+                len(name_items)
+            )
             name_items.pop(-2)
             ext_name = split_str.join(name_items)
 
@@ -614,11 +674,13 @@ class BuildExtension(build_ext, object):
         """
         compiler_infos = ['clang'] + CLANG_COMPILE_FLAGS
         linker_infos = ['clang'] + CLANG_LINK_FLAGS
-        self.compiler.set_executables(compiler=compiler_infos,
-                                      compiler_so=compiler_infos,
-                                      compiler_cxx=['clang'],
-                                      linker_exe=['clang'],
-                                      linker_so=linker_infos)
+        self.compiler.set_executables(
+            compiler=compiler_infos,
+            compiler_so=compiler_infos,
+            compiler_cxx=['clang'],
+            linker_exe=['clang'],
+            linker_so=linker_infos,
+        )
 
     def _check_abi(self):
         """
@@ -633,11 +695,16 @@ class BuildExtension(build_ext, object):
 
         check_abi_compatibility(compiler)
         # Warn user if VC env is activated but `DISTUTILS_USE_SDK` is not set.
-        if IS_WINDOWS and 'VSCMD_ARG_TGT_ARCH' in os.environ and 'DISTUTILS_USE_SDK' not in os.environ:
+        if (
+            IS_WINDOWS
+            and 'VSCMD_ARG_TGT_ARCH' in os.environ
+            and 'DISTUTILS_USE_SDK' not in os.environ
+        ):
             msg = (
                 'It seems that the VC environment is activated but DISTUTILS_USE_SDK is not set.'
                 'This may lead to multiple activations of the VC env.'
-                'Please run `set DISTUTILS_USE_SDK=1` and try again.')
+                'Please run `set DISTUTILS_USE_SDK=1` and try again.'
+            )
             raise UserWarning(msg)
 
     def _record_op_info(self):
@@ -658,9 +725,9 @@ class BuildExtension(build_ext, object):
             op_names = parse_op_name_from(sources)
 
             for op_name in op_names:
-                CustomOpInfo.instance().add(op_name,
-                                            so_name=so_name,
-                                            so_path=so_path)
+                CustomOpInfo.instance().add(
+                    op_name, so_name=so_name, so_path=so_path
+                )
 
 
 class EasyInstallCommand(easy_install, object):
@@ -713,7 +780,6 @@ class BuildCommand(build, object):
         """
 
         class cls_with_options(cls):
-
             def __init__(self, *args, **kwargs):
                 kwargs.update(options)
                 cls.__init__(self, *args, **kwargs)
@@ -736,36 +802,38 @@ class BuildCommand(build, object):
             self.build_base = self._specified_build_base
 
 
-def load(name,
-         sources,
-         extra_cxx_cflags=None,
-         extra_cuda_cflags=None,
-         extra_ldflags=None,
-         extra_include_paths=None,
-         build_directory=None,
-         verbose=False):
+def load(
+    name,
+    sources,
+    extra_cxx_cflags=None,
+    extra_cuda_cflags=None,
+    extra_ldflags=None,
+    extra_include_paths=None,
+    build_directory=None,
+    verbose=False,
+):
     """
     An Interface to automatically compile C++/CUDA source files Just-In-Time
     and return callable python function as other Paddle layers API. It will
     append user defined custom operators in background while building models.
 
     It will perform compiling, linking, Python API generation and module loading
-    processes under a individual subprocess. It does not require CMake or Ninja 
-    environment. On Linux platform, it requires GCC compiler whose version is 
-    greater than 5.4 and it should be soft linked to ``/usr/bin/cc`` . On Windows 
+    processes under an individual subprocess. It does not require CMake or Ninja
+    environment. On Linux platform, it requires GCC compiler whose version is
+    greater than 5.4 and it should be soft linked to ``/usr/bin/cc`` . On Windows
     platform, it requires Visual Studio whose version is greater than 2017.
-    On MacOS, clang++ is requited. In addition, if compiling Operators supporting 
+    On MacOS, clang++ is required. In addition, if compiling Operators supporting
     GPU device, please make sure ``nvcc`` compiler is installed in local environment.
-    
-    Moreover, `ABI compatibility <https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html>`_ 
+
+    Moreover, `ABI compatibility <https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html>`_
     will be checked to ensure that compiler version from ``cc(Linux)`` , ``cl.exe(Windows)``
     on local machine is compatible with pre-installed Paddle whl in python site-packages.
 
-    For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, 
-    then the version of user's local machine should satisfy GCC >= 8.2. 
-    For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of 
-    PaddlePaddle (Visual Studio 2017). 
-    If the above conditions are not met, the corresponding warning will be printed, and a fatal error may 
+    For Linux, GCC version will be checked. For example, if Paddle with CUDA 10.1 is built with GCC 8.2,
+    then the version of user's local machine should satisfy GCC >= 8.2.
+    For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of
+    PaddlePaddle (Visual Studio 2017).
+    If the above conditions are not met, the corresponding warning will be printed, and a fatal error may
     occur because of ABI compatibility.
 
     Compared with ``setup`` interface, it doesn't need extra ``setup.py`` and excute
@@ -776,7 +844,7 @@ def load(name,
 
         1. Currently we support Linux, MacOS and Windows platfrom.
         2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` .
-           Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking 
+           Then, use ``which cc`` to check the location of ``cc`` and ``cc --version`` to check the linked
            GCC version.
         3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017).
 
@@ -784,7 +852,7 @@ def load(name,
     **A simple example:**
 
     .. code-block:: text
-    
+
         import paddle
         from paddle.utils.cpp_extension import load
 
@@ -807,7 +875,7 @@ def load(name,
         extra_cxx_cflags(list[str], optional): Specify additional flags used to compile CPP files. By default
                                all basic and framework related flags have been included.
         extra_cuda_cflags(list[str], optional): Specify additional flags used to compile CUDA files. By default
-                               all basic and framework related flags have been included. 
+                               all basic and framework related flags have been included.
                                See `Cuda Compiler Driver NVCC <https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html>`_
                                for details. Default is None.
         extra_ldflags(list[str], optional): Specify additional flags used to link shared library. See
@@ -837,27 +905,42 @@ def load(name,
     file_path = os.path.join(build_directory, "{}_setup.py".format(name))
     sources = [os.path.abspath(source) for source in sources]
 
-    if extra_cxx_cflags is None: extra_cxx_cflags = []
-    if extra_cuda_cflags is None: extra_cuda_cflags = []
+    if extra_cxx_cflags is None:
+        extra_cxx_cflags = []
+    if extra_cuda_cflags is None:
+        extra_cuda_cflags = []
     assert isinstance(
         extra_cxx_cflags, list
     ), "Required type(extra_cxx_cflags) == list[str], but received {}".format(
-        extra_cxx_cflags)
+        extra_cxx_cflags
+    )
     assert isinstance(
         extra_cuda_cflags, list
     ), "Required type(extra_cuda_cflags) == list[str], but received {}".format(
-        extra_cuda_cflags)
+        extra_cuda_cflags
+    )
 
     log_v(
         "additional extra_cxx_cflags: [{}], extra_cuda_cflags: [{}]".format(
-            ' '.join(extra_cxx_cflags), ' '.join(extra_cuda_cflags)), verbose)
+            ' '.join(extra_cxx_cflags), ' '.join(extra_cuda_cflags)
+        ),
+        verbose,
+    )
 
     # write setup.py file and compile it
     build_base_dir = os.path.join(build_directory, name)
 
-    _write_setup_file(name, sources, file_path, build_base_dir,
-                      extra_include_paths, extra_cxx_cflags, extra_cuda_cflags,
-                      extra_ldflags, verbose)
+    _write_setup_file(
+        name,
+        sources,
+        file_path,
+        build_base_dir,
+        extra_include_paths,
+        extra_cxx_cflags,
+        extra_cuda_cflags,
+        extra_ldflags,
+        verbose,
+    )
     _jit_compile(file_path, verbose)
 
     # import as callable python api
diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py
index f083d01c5a8..8c4d9ac7815 100644
--- a/python/paddle/vision/datasets/cifar.py
+++ b/python/paddle/vision/datasets/cifar.py
@@ -36,7 +36,7 @@ MODE_FLAG_MAP = {
     'train10': 'data_batch',
     'test10': 'test_batch',
     'train100': 'train',
-    'test100': 'test'
+    'test100': 'test',
 }
 
 
@@ -105,14 +105,22 @@ class Cifar10(Dataset):
                 # <class 'paddle.Tensor'> [3, 64, 64] 3
     """
 
-    def __init__(self,
-                 data_file=None,
-                 mode='train',
-                 transform=None,
-                 download=True,
-                 backend=None):
-        assert mode.lower() in ['train', 'test', 'train', 'test'], \
-            "mode should be 'train10', 'test10', 'train100' or 'test100', but got {}".format(mode)
+    def __init__(
+        self,
+        data_file=None,
+        mode='train',
+        transform=None,
+        download=True,
+        backend=None,
+    ):
+        assert mode.lower() in [
+            'train',
+            'test',
+            'train',
+            'test',
+        ], "mode should be 'train10', 'test10', 'train100' or 'test100', but got {}".format(
+            mode
+        )
         self.mode = mode.lower()
 
         if backend is None:
@@ -120,18 +128,21 @@ class Cifar10(Dataset):
         if backend not in ['pil', 'cv2']:
             raise ValueError(
                 "Expected backend are one of ['pil', 'cv2'], but got {}".format(
-                    backend))
+                    backend
+                )
+            )
         self.backend = backend
 
         self._init_url_md5_flag()
 
         self.data_file = data_file
         if self.data_file is None:
-            assert download, "data_file is not set and downloading automatically is disabled"
-            self.data_file = _check_exists_and_download(data_file,
-                                                        self.data_url,
-                                                        self.data_md5, 'cifar',
-                                                        download)
+            assert (
+                download
+            ), "data_file is not set and downloading automatically is disabled"
+            self.data_file = _check_exists_and_download(
+                data_file, self.data_url, self.data_md5, 'cifar', download
+            )
 
         self.transform = transform
 
@@ -148,8 +159,9 @@ class Cifar10(Dataset):
     def _load_data(self):
         self.data = []
         with tarfile.open(self.data_file, mode='r') as f:
-            names = (each_item.name for each_item in f
-                     if self.flag in each_item.name)
+            names = (
+                each_item.name for each_item in f if self.flag in each_item.name
+            )
 
             names = sorted(list(names))
 
@@ -157,8 +169,9 @@ class Cifar10(Dataset):
                 batch = pickle.load(f.extractfile(name), encoding='bytes')
 
                 data = batch[six.b('data')]
-                labels = batch.get(six.b('labels'),
-                                   batch.get(six.b('fine_labels'), None))
+                labels = batch.get(
+                    six.b('labels'), batch.get(six.b('fine_labels'), None)
+                )
                 assert labels is not None
                 for sample, label in six.moves.zip(data, labels):
                     self.data.append((sample, label))
@@ -247,14 +260,17 @@ class Cifar100(Cifar10):
                 # <class 'paddle.Tensor'> [3, 64, 64] 49
     """
 
-    def __init__(self,
-                 data_file=None,
-                 mode='train',
-                 transform=None,
-                 download=True,
-                 backend=None):
-        super(Cifar100, self).__init__(data_file, mode, transform, download,
-                                       backend)
+    def __init__(
+        self,
+        data_file=None,
+        mode='train',
+        transform=None,
+        download=True,
+        backend=None,
+    ):
+        super(Cifar100, self).__init__(
+            data_file, mode, transform, download, backend
+        )
 
     def _init_url_md5_flag(self):
         self.data_url = CIFAR100_URL
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index f9a0f4785e5..663b2eabe35 100755
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -14,16 +14,25 @@
 
 import numpy as np
 from ..fluid.layer_helper import LayerHelper
-from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype
+from ..fluid.data_feeder import (
+    check_variable_and_dtype,
+    check_type,
+    check_dtype,
+)
 from ..fluid import core, layers
 from ..fluid.layers import nn, utils
 from ..nn import Layer, Conv2D, Sequential, ReLU, BatchNorm2D
 from ..fluid.initializer import Normal
-from ..fluid.framework import Variable, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph
+from ..fluid.framework import (
+    Variable,
+    _non_static_mode,
+    in_dygraph_mode,
+    _in_legacy_dygraph,
+)
 from paddle.common_ops_import import *
 from paddle import _C_ops, _legacy_C_ops
 
-__all__ = [  #noqa
+__all__ = [  # noqa
     'yolo_loss',
     'yolo_box',
     'prior_box',
@@ -45,30 +54,32 @@ __all__ = [  #noqa
 ]
 
 
-def yolo_loss(x,
-              gt_box,
-              gt_label,
-              anchors,
-              anchor_mask,
-              class_num,
-              ignore_thresh,
-              downsample_ratio,
-              gt_score=None,
-              use_label_smooth=True,
-              name=None,
-              scale_x_y=1.):
+def yolo_loss(
+    x,
+    gt_box,
+    gt_label,
+    anchors,
+    anchor_mask,
+    class_num,
+    ignore_thresh,
+    downsample_ratio,
+    gt_score=None,
+    use_label_smooth=True,
+    name=None,
+    scale_x_y=1.0,
+):
     r"""
 
     This operator generates YOLOv3 loss based on given predict result and ground
     truth boxes.
-    
+
     The output of previous network is in shape [N, C, H, W], while H and W
-    should be the same, H and W specify the grid size, each grid point predict 
+    should be the same, H and W specify the grid size, each grid point predict
     given number bounding boxes, this given number, which following will be represented as S,
     is specified by the number of anchor clusters in each scale. In the second dimension(the channel
-    dimension), C should be equal to S * (class_num + 5), class_num is the object 
-    category number of source dataset(such as 80 in coco dataset), so in the 
-    second(channel) dimension, apart from 4 box location coordinates x, y, w, h, 
+    dimension), C should be equal to S * (class_num + 5), class_num is the object
+    category number of source dataset(such as 80 in coco dataset), so in the
+    second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
     also includes confidence score of the box and class one-hot key of each anchor box.
 
     Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions
@@ -91,21 +102,21 @@ def yolo_loss(x,
     and :math:`p_w, p_h` is specified by anchors.
 
     As for confidence score, it is the logistic regression value of IoU between
-    anchor boxes and ground truth boxes, the score of the anchor box which has 
-    the max IoU should be 1, and if the anchor box has IoU bigger than ignore 
+    anchor boxes and ground truth boxes, the score of the anchor box which has
+    the max IoU should be 1, and if the anchor box has IoU bigger than ignore
     thresh, the confidence score loss of this anchor box will be ignored.
 
     Therefore, the YOLOv3 loss consists of three major parts: box location loss,
-    objectness loss and classification loss. The L1 loss is used for 
-    box coordinates (w, h), sigmoid cross entropy loss is used for box 
+    objectness loss and classification loss. The L1 loss is used for
+    box coordinates (w, h), sigmoid cross entropy loss is used for box
     coordinates (x, y), objectness loss and classification loss.
 
-    Each groud truth box finds a best matching anchor box in all anchors. 
+    Each ground truth box finds a best matching anchor box in all anchors.
     Prediction of this anchor box will incur all three parts of losses, and
     prediction of anchor boxes with no GT box matched will only incur objectness
     loss.
 
-    In order to trade off box coordinate losses between big boxes and small 
+    In order to trade off box coordinate losses between big boxes and small
     boxes, box coordinate losses will be mutiplied by scale weight, which is
     calculated as follows.
 
@@ -120,12 +131,12 @@ def yolo_loss(x,
     $$
 
     While :attr:`use_label_smooth` is set to be :attr:`True`, the classification
-    target will be smoothed when calculating classification loss, target of 
+    target will be smoothed when calculating classification loss, target of
     positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of
     negetive samples will be smoothed to :math:`1.0 / class\_num`.
 
-    While :attr:`gt_score` is given, which means the mixup score of ground truth 
-    boxes, all losses incured by a ground truth box will be multiplied by its 
+    While :attr:`gt_score` is given, which means the mixup score of ground truth
+    boxes, all losses incurred by a ground truth box will be multiplied by its
     mixup score.
 
     Args:
@@ -133,16 +144,16 @@ def yolo_loss(x,
                       tensor with shape of [N, C, H, W]. H and W should be same,
                       and the second dimension(C) stores box locations, confidence
                       score and classification one-hot keys of each anchor box.
-                      The data type is float32 or float64. 
+                      The data type is float32 or float64.
         gt_box (Tensor): groud truth boxes, should be in shape of [N, B, 4],
-                          in the third dimension, x, y, w, h should be stored. 
+                          in the third dimension, x, y, w, h should be stored.
                           x,y is the center coordinate of boxes, w, h are the
-                          width and height, x, y, w, h should be divided by 
+                          width and height, x, y, w, h should be divided by
                           input image height to scale to [0, 1].
-                          N is the batch number and B is the max box number in 
-                          an image.The data type is float32 or float64. 
+                          N is the batch number and B is the max box number in
+                          an image. The data type is float32 or float64.
         gt_label (Tensor): class id of ground truth boxes, should be in shape
-                            of [N, B].The data type is int32. 
+                            of [N, B]. The data type is int32.
         anchors (list|tuple): The anchor width and height, it will be parsed
                               pair by pair.
         anchor_mask (list|tuple): The mask index of anchors used in current
@@ -151,13 +162,13 @@ def yolo_loss(x,
         ignore_thresh (float): The ignore threshold to ignore confidence loss.
         downsample_ratio (int): The downsample ratio from network input to YOLOv3
                                 loss input, so 32, 16, 8 should be set for the
-                                first, second, and thrid YOLOv3 loss operators. 
-        name (string): The default value is None.  Normally there is no need 
-                       for user to set this property.  For more information, 
+                                first, second, and third YOLOv3 loss operators.
+        name (string): The default value is None.  Normally there is no need
+                       for user to set this property.  For more information,
                        please refer to :ref:`api_guide_Name`
         gt_score (Tensor): mixup score of ground truth boxes, should be in shape
                             of [N, B]. Default None.
-        use_label_smooth (bool): Whether to use label smooth. Default True. 
+        use_label_smooth (bool): Whether to use label smooth. Default True.
         scale_x_y (float): Scale the center point of decoded bounding box.
                            Default 1.0
 
@@ -166,9 +177,9 @@ def yolo_loss(x,
 
     Raises:
         TypeError: Input x of yolov3_loss must be Tensor
-        TypeError: Input gtbox of yolov3_loss must be Tensor 
-        TypeError: Input gtlabel of yolov3_loss must be Tensor 
-        TypeError: Input gtscore of yolov3_loss must be None or Tensor 
+        TypeError: Input gtbox of yolov3_loss must be Tensor
+        TypeError: Input gtlabel of yolov3_loss must be Tensor
+        TypeError: Input gtscore of yolov3_loss must be None or Tensor
         TypeError: Attr anchors of yolov3_loss must be list or tuple
         TypeError: Attr class_num of yolov3_loss must be an integer
         TypeError: Attr ignore_thresh of yolov3_loss must be a float number
@@ -201,25 +212,50 @@ def yolo_loss(x,
     """
 
     if in_dygraph_mode():
-        loss, _, _ = _C_ops.yolov3_loss(x, gt_box, gt_label, gt_score, anchors,
-                                        anchor_mask, class_num, ignore_thresh,
-                                        downsample_ratio, use_label_smooth,
-                                        scale_x_y)
+        loss, _, _ = _C_ops.yolov3_loss(
+            x,
+            gt_box,
+            gt_label,
+            gt_score,
+            anchors,
+            anchor_mask,
+            class_num,
+            ignore_thresh,
+            downsample_ratio,
+            use_label_smooth,
+            scale_x_y,
+        )
         return loss
 
     if _non_static_mode():
         loss, _, _ = _legacy_C_ops.yolov3_loss(
-            x, gt_box, gt_label, gt_score, 'anchors', anchors, 'anchor_mask',
-            anchor_mask, 'class_num', class_num, 'ignore_thresh', ignore_thresh,
-            'downsample_ratio', downsample_ratio, 'use_label_smooth',
-            use_label_smooth, 'scale_x_y', scale_x_y)
+            x,
+            gt_box,
+            gt_label,
+            gt_score,
+            'anchors',
+            anchors,
+            'anchor_mask',
+            anchor_mask,
+            'class_num',
+            class_num,
+            'ignore_thresh',
+            ignore_thresh,
+            'downsample_ratio',
+            downsample_ratio,
+            'use_label_smooth',
+            use_label_smooth,
+            'scale_x_y',
+            scale_x_y,
+        )
         return loss
 
     helper = LayerHelper('yolov3_loss', **locals())
 
     check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'yolo_loss')
-    check_variable_and_dtype(gt_box, 'gt_box', ['float32', 'float64'],
-                             'yolo_loss')
+    check_variable_and_dtype(
+        gt_box, 'gt_box', ['float32', 'float64'], 'yolo_loss'
+    )
     check_variable_and_dtype(gt_label, 'gt_label', 'int32', 'yolo_loss')
     check_type(anchors, 'anchors', (list, tuple), 'yolo_loss')
     check_type(anchor_mask, 'anchor_mask', (list, tuple), 'yolo_loss')
@@ -250,28 +286,32 @@ def yolo_loss(x,
         "scale_x_y": scale_x_y,
     }
 
-    helper.append_op(type='yolov3_loss',
-                     inputs=inputs,
-                     outputs={
-                         'Loss': loss,
-                         'ObjectnessMask': objectness_mask,
-                         'GTMatchMask': gt_match_mask
-                     },
-                     attrs=attrs)
+    helper.append_op(
+        type='yolov3_loss',
+        inputs=inputs,
+        outputs={
+            'Loss': loss,
+            'ObjectnessMask': objectness_mask,
+            'GTMatchMask': gt_match_mask,
+        },
+        attrs=attrs,
+    )
     return loss
 
 
-def yolo_box(x,
-             img_size,
-             anchors,
-             class_num,
-             conf_thresh,
-             downsample_ratio,
-             clip_bbox=True,
-             name=None,
-             scale_x_y=1.,
-             iou_aware=False,
-             iou_aware_factor=0.5):
+def yolo_box(
+    x,
+    img_size,
+    anchors,
+    class_num,
+    conf_thresh,
+    downsample_ratio,
+    clip_bbox=True,
+    name=None,
+    scale_x_y=1.0,
+    iou_aware=False,
+    iou_aware_factor=0.5,
+):
     r"""
 
     This operator generates YOLO detection boxes from output of YOLOv3 network.
@@ -390,18 +430,41 @@ def yolo_box(x,
                                                    scale_x_y=1.)
     """
     if in_dygraph_mode():
-        boxes, scores = _C_ops.yolo_box(x, img_size, anchors, class_num,
-                                        conf_thresh, downsample_ratio,
-                                        clip_bbox, scale_x_y, iou_aware,
-                                        iou_aware_factor)
+        boxes, scores = _C_ops.yolo_box(
+            x,
+            img_size,
+            anchors,
+            class_num,
+            conf_thresh,
+            downsample_ratio,
+            clip_bbox,
+            scale_x_y,
+            iou_aware,
+            iou_aware_factor,
+        )
         return boxes, scores
 
     if _non_static_mode():
         boxes, scores = _legacy_C_ops.yolo_box(
-            x, img_size, 'anchors', anchors, 'class_num', class_num,
-            'conf_thresh', conf_thresh, 'downsample_ratio', downsample_ratio,
-            'clip_bbox', clip_bbox, 'scale_x_y', scale_x_y, 'iou_aware',
-            iou_aware, 'iou_aware_factor', iou_aware_factor)
+            x,
+            img_size,
+            'anchors',
+            anchors,
+            'class_num',
+            class_num,
+            'conf_thresh',
+            conf_thresh,
+            'downsample_ratio',
+            downsample_ratio,
+            'clip_bbox',
+            clip_bbox,
+            'scale_x_y',
+            scale_x_y,
+            'iou_aware',
+            iou_aware,
+            'iou_aware_factor',
+            iou_aware_factor,
+        )
         return boxes, scores
 
     helper = LayerHelper('yolo_box', **locals())
@@ -422,34 +485,38 @@ def yolo_box(x,
         "clip_bbox": clip_bbox,
         "scale_x_y": scale_x_y,
         "iou_aware": iou_aware,
-        "iou_aware_factor": iou_aware_factor
+        "iou_aware_factor": iou_aware_factor,
     }
 
-    helper.append_op(type='yolo_box',
-                     inputs={
-                         "X": x,
-                         "ImgSize": img_size,
-                     },
-                     outputs={
-                         'Boxes': boxes,
-                         'Scores': scores,
-                     },
-                     attrs=attrs)
+    helper.append_op(
+        type='yolo_box',
+        inputs={
+            "X": x,
+            "ImgSize": img_size,
+        },
+        outputs={
+            'Boxes': boxes,
+            'Scores': scores,
+        },
+        attrs=attrs,
+    )
     return boxes, scores
 
 
-def prior_box(input,
-              image,
-              min_sizes,
-              max_sizes=None,
-              aspect_ratios=[1.],
-              variance=[0.1, 0.1, 0.2, 0.2],
-              flip=False,
-              clip=False,
-              steps=[0.0, 0.0],
-              offset=0.5,
-              min_max_aspect_ratios_order=False,
-              name=None):
+def prior_box(
+    input,
+    image,
+    min_sizes,
+    max_sizes=None,
+    aspect_ratios=[1.0],
+    variance=[0.1, 0.1, 0.2, 0.2],
+    flip=False,
+    clip=False,
+    steps=[0.0, 0.0],
+    offset=0.5,
+    min_max_aspect_ratios_order=False,
+    name=None,
+):
     r"""
 
     This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
@@ -509,17 +576,21 @@ def prior_box(input,
     """
     helper = LayerHelper("prior_box", **locals())
     dtype = helper.input_dtype()
-    check_variable_and_dtype(input, 'input',
-                             ['uint8', 'int8', 'float32', 'float64'],
-                             'prior_box')
+    check_variable_and_dtype(
+        input, 'input', ['uint8', 'int8', 'float32', 'float64'], 'prior_box'
+    )
 
     def _is_list_or_tuple_(data):
-        return (isinstance(data, list) or isinstance(data, tuple))
+        return isinstance(data, list) or isinstance(data, tuple)
 
-    if not _is_list_or_tuple_(min_sizes): min_sizes = [min_sizes]
-    if not _is_list_or_tuple_(aspect_ratios): aspect_ratios = [aspect_ratios]
-    if not _is_list_or_tuple_(steps): steps = [steps]
-    if not len(steps) == 2: raise ValueError('steps should be (step_w, step_h)')
+    if not _is_list_or_tuple_(min_sizes):
+        min_sizes = [min_sizes]
+    if not _is_list_or_tuple_(aspect_ratios):
+        aspect_ratios = [aspect_ratios]
+    if not _is_list_or_tuple_(steps):
+        steps = [steps]
+    if not len(steps) == 2:
+        raise ValueError('steps should be (step_w, step_h)')
 
     min_sizes = list(map(float, min_sizes))
     aspect_ratios = list(map(float, aspect_ratios))
@@ -527,23 +598,53 @@ def prior_box(input,
 
     cur_max_sizes = None
     if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
-        if not _is_list_or_tuple_(max_sizes): max_sizes = [max_sizes]
+        if not _is_list_or_tuple_(max_sizes):
+            max_sizes = [max_sizes]
         cur_max_sizes = max_sizes
 
     if in_dygraph_mode():
         step_w, step_h = steps
-        if max_sizes == None: max_sizes = []
-        box, var = _C_ops.prior_box(input, image, min_sizes, aspect_ratios,
-                                    variance, max_sizes, flip, clip, step_w,
-                                    step_h, offset, min_max_aspect_ratios_order)
+        if max_sizes == None:
+            max_sizes = []
+        box, var = _C_ops.prior_box(
+            input,
+            image,
+            min_sizes,
+            aspect_ratios,
+            variance,
+            max_sizes,
+            flip,
+            clip,
+            step_w,
+            step_h,
+            offset,
+            min_max_aspect_ratios_order,
+        )
         return box, var
 
     if _in_legacy_dygraph():
-        attrs = ('min_sizes', min_sizes, 'aspect_ratios', aspect_ratios,
-                 'variances', variance, 'flip', flip, 'clip', clip, 'step_w',
-                 steps[0], 'step_h', steps[1], 'offset', offset,
-                 'min_max_aspect_ratios_order', min_max_aspect_ratios_order)
-        if cur_max_sizes is not None: attrs += ('max_sizes', cur_max_sizes)
+        attrs = (
+            'min_sizes',
+            min_sizes,
+            'aspect_ratios',
+            aspect_ratios,
+            'variances',
+            variance,
+            'flip',
+            flip,
+            'clip',
+            clip,
+            'step_w',
+            steps[0],
+            'step_h',
+            steps[1],
+            'offset',
+            offset,
+            'min_max_aspect_ratios_order',
+            min_max_aspect_ratios_order,
+        )
+        if cur_max_sizes is not None:
+            attrs += ('max_sizes', cur_max_sizes)
         box, var = _legacy_C_ops.prior_box(input, image, *attrs)
         return box, var
     else:
@@ -556,22 +657,17 @@ def prior_box(input,
             'step_w': steps[0],
             'step_h': steps[1],
             'offset': offset,
-            'min_max_aspect_ratios_order': min_max_aspect_ratios_order
+            'min_max_aspect_ratios_order': min_max_aspect_ratios_order,
         }
-        if cur_max_sizes is not None: attrs['max_sizes'] = cur_max_sizes
+        if cur_max_sizes is not None:
+            attrs['max_sizes'] = cur_max_sizes
 
         box = helper.create_variable_for_type_inference(dtype)
         var = helper.create_variable_for_type_inference(dtype)
         helper.append_op(
             type="prior_box",
-            inputs={
-                "Input": input,
-                "Image": image
-            },
-            outputs={
-                "Boxes": box,
-                "Variances": var
-            },
+            inputs={"Input": input, "Image": image},
+            outputs={"Boxes": box, "Variances": var},
             attrs=attrs,
         )
         box.stop_gradient = True
@@ -579,13 +675,15 @@ def prior_box(input,
         return box, var
 
 
-def box_coder(prior_box,
-              prior_box_var,
-              target_box,
-              code_type="encode_center_size",
-              box_normalized=True,
-              axis=0,
-              name=None):
+def box_coder(
+    prior_box,
+    prior_box_var,
+    target_box,
+    code_type="encode_center_size",
+    box_normalized=True,
+    axis=0,
+    name=None,
+):
     r"""
     Encode/Decode the target bounding box with the priorbox information.
 
@@ -688,35 +786,65 @@ def box_coder(prior_box,
                 box_normalized=False)
 
     """
-    check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'],
-                             'box_coder')
-    check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'],
-                             'box_coder')
+    check_variable_and_dtype(
+        prior_box, 'prior_box', ['float32', 'float64'], 'box_coder'
+    )
+    check_variable_and_dtype(
+        target_box, 'target_box', ['float32', 'float64'], 'box_coder'
+    )
 
     if in_dygraph_mode():
         if isinstance(prior_box_var, Variable):
-            output_box = _C_ops.box_coder(prior_box, prior_box_var, target_box,
-                                          code_type, box_normalized, axis, [])
+            output_box = _C_ops.box_coder(
+                prior_box,
+                prior_box_var,
+                target_box,
+                code_type,
+                box_normalized,
+                axis,
+                [],
+            )
         elif isinstance(prior_box_var, list):
-            output_box = _C_ops.box_coder(prior_box, None, target_box,
-                                          code_type, box_normalized, axis,
-                                          prior_box_var)
+            output_box = _C_ops.box_coder(
+                prior_box,
+                None,
+                target_box,
+                code_type,
+                box_normalized,
+                axis,
+                prior_box_var,
+            )
         else:
             raise TypeError("Input prior_box_var must be Variable or list")
         return output_box
 
     if _in_legacy_dygraph():
         if isinstance(prior_box_var, Variable):
-            output_box = _legacy_C_ops.box_coder(prior_box, prior_box_var,
-                                                 target_box, "code_type",
-                                                 code_type, "box_normalized",
-                                                 box_normalized, "axis", axis)
+            output_box = _legacy_C_ops.box_coder(
+                prior_box,
+                prior_box_var,
+                target_box,
+                "code_type",
+                code_type,
+                "box_normalized",
+                box_normalized,
+                "axis",
+                axis,
+            )
         elif isinstance(prior_box_var, list):
-            output_box = _legacy_C_ops.box_coder(prior_box, None, target_box,
-                                                 "code_type", code_type,
-                                                 "box_normalized",
-                                                 box_normalized, "axis", axis,
-                                                 "variance", prior_box_var)
+            output_box = _legacy_C_ops.box_coder(
+                prior_box,
+                None,
+                target_box,
+                "code_type",
+                code_type,
+                "box_normalized",
+                box_normalized,
+                "axis",
+                axis,
+                "variance",
+                prior_box_var,
+            )
         else:
             raise TypeError("Input prior_box_var must be Variable or list")
         return output_box
@@ -724,13 +852,14 @@ def box_coder(prior_box,
         helper = LayerHelper("box_coder", **locals())
 
         output_box = helper.create_variable_for_type_inference(
-            dtype=prior_box.dtype)
+            dtype=prior_box.dtype
+        )
 
         inputs = {"PriorBox": prior_box, "TargetBox": target_box}
         attrs = {
             "code_type": code_type,
             "box_normalized": box_normalized,
-            "axis": axis
+            "axis": axis,
         }
         if isinstance(prior_box_var, Variable):
             inputs['PriorBoxVar'] = prior_box_var
@@ -738,24 +867,28 @@ def box_coder(prior_box,
             attrs['variance'] = prior_box_var
         else:
             raise TypeError("Input prior_box_var must be Variable or list")
-        helper.append_op(type="box_coder",
-                         inputs=inputs,
-                         attrs=attrs,
-                         outputs={"OutputBox": output_box})
+        helper.append_op(
+            type="box_coder",
+            inputs=inputs,
+            attrs=attrs,
+            outputs={"OutputBox": output_box},
+        )
         return output_box
 
 
-def deform_conv2d(x,
-                  offset,
-                  weight,
-                  bias=None,
-                  stride=1,
-                  padding=0,
-                  dilation=1,
-                  deformable_groups=1,
-                  groups=1,
-                  mask=None,
-                  name=None):
+def deform_conv2d(
+    x,
+    offset,
+    weight,
+    bias=None,
+    stride=1,
+    padding=0,
+    dilation=1,
+    deformable_groups=1,
+    groups=1,
+    mask=None,
+    name=None,
+):
     r"""
     Compute 2-D deformable convolution on 4-D input.
     Given input image x, output feature map y, the deformable convolution operation can be expressed as follow:
@@ -877,34 +1010,58 @@ def deform_conv2d(x,
     use_deform_conv2d_v1 = True if mask is None else False
 
     if in_dygraph_mode():
-        pre_bias = _C_ops.deformable_conv(x, offset, weight, mask, stride,
-                                          padding, dilation, deformable_groups,
-                                          groups, 1)
+        pre_bias = _C_ops.deformable_conv(
+            x,
+            offset,
+            weight,
+            mask,
+            stride,
+            padding,
+            dilation,
+            deformable_groups,
+            groups,
+            1,
+        )
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=1)
         else:
             out = pre_bias
     elif _in_legacy_dygraph():
-        attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation,
-                 'deformable_groups', deformable_groups, 'groups', groups,
-                 'im2col_step', 1)
+        attrs = (
+            'strides',
+            stride,
+            'paddings',
+            padding,
+            'dilations',
+            dilation,
+            'deformable_groups',
+            deformable_groups,
+            'groups',
+            groups,
+            'im2col_step',
+            1,
+        )
         if use_deform_conv2d_v1:
             op_type = 'deformable_conv_v1'
-            pre_bias = getattr(_legacy_C_ops, op_type)(x, offset, weight,
-                                                       *attrs)
+            pre_bias = getattr(_legacy_C_ops, op_type)(
+                x, offset, weight, *attrs
+            )
         else:
             op_type = 'deformable_conv'
-            pre_bias = getattr(_legacy_C_ops, op_type)(x, offset, mask, weight,
-                                                       *attrs)
+            pre_bias = getattr(_legacy_C_ops, op_type)(
+                x, offset, mask, weight, *attrs
+            )
         if bias is not None:
             out = nn.elementwise_add(pre_bias, bias, axis=1)
         else:
             out = pre_bias
     else:
-        check_variable_and_dtype(x, "x", ['float32', 'float64'],
-                                 'deform_conv2d')
-        check_variable_and_dtype(offset, "offset", ['float32', 'float64'],
-                                 'deform_conv2d')
+        check_variable_and_dtype(
+            x, "x", ['float32', 'float64'], 'deform_conv2d'
+        )
+        check_variable_and_dtype(
+            offset, "offset", ['float32', 'float64'], 'deform_conv2d'
+        )
 
         num_channels = x.shape[1]
 
@@ -942,20 +1099,18 @@ def deform_conv2d(x,
             'deformable_groups': deformable_groups,
             'im2col_step': 1,
         }
-        helper.append_op(type=op_type,
-                         inputs=inputs,
-                         outputs=outputs,
-                         attrs=attrs)
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=attrs
+        )
 
         if bias is not None:
             out = helper.create_variable_for_type_inference(dtype)
-            helper.append_op(type='elementwise_add',
-                             inputs={
-                                 'X': [pre_bias],
-                                 'Y': [bias]
-                             },
-                             outputs={'Out': [out]},
-                             attrs={'axis': 1})
+            helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [pre_bias], 'Y': [bias]},
+                outputs={'Out': [out]},
+                attrs={'axis': 1},
+            )
         else:
             out = pre_bias
     return out
@@ -1095,19 +1250,23 @@ class DeformConv2D(Layer):
           [8, 16, 26, 26]
     """
 
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 deformable_groups=1,
-                 groups=1,
-                 weight_attr=None,
-                 bias_attr=None):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        deformable_groups=1,
+        groups=1,
+        weight_attr=None,
+        bias_attr=None,
+    ):
         super(DeformConv2D, self).__init__()
-        assert weight_attr is not False, "weight_attr should not be False in Conv."
+        assert (
+            weight_attr is not False
+        ), "weight_attr should not be False in Conv."
         self._weight_attr = weight_attr
         self._bias_attr = bias_attr
         self._deformable_groups = deformable_groups
@@ -1129,46 +1288,51 @@ class DeformConv2D(Layer):
 
         def _get_default_param_initializer():
             filter_elem_num = np.prod(self._kernel_size) * self._in_channels
-            std = (2.0 / filter_elem_num)**0.5
+            std = (2.0 / filter_elem_num) ** 0.5
             return Normal(0.0, std, 0)
 
         self.weight = self.create_parameter(
             shape=filter_shape,
             attr=self._weight_attr,
-            default_initializer=_get_default_param_initializer())
-        self.bias = self.create_parameter(attr=self._bias_attr,
-                                          shape=[self._out_channels],
-                                          is_bias=True)
+            default_initializer=_get_default_param_initializer(),
+        )
+        self.bias = self.create_parameter(
+            attr=self._bias_attr, shape=[self._out_channels], is_bias=True
+        )
 
     def forward(self, x, offset, mask=None):
-        out = deform_conv2d(x=x,
-                            offset=offset,
-                            weight=self.weight,
-                            bias=self.bias,
-                            stride=self._stride,
-                            padding=self._padding,
-                            dilation=self._dilation,
-                            deformable_groups=self._deformable_groups,
-                            groups=self._groups,
-                            mask=mask)
+        out = deform_conv2d(
+            x=x,
+            offset=offset,
+            weight=self.weight,
+            bias=self.bias,
+            stride=self._stride,
+            padding=self._padding,
+            dilation=self._dilation,
+            deformable_groups=self._deformable_groups,
+            groups=self._groups,
+            mask=mask,
+        )
         return out
 
 
-def distribute_fpn_proposals(fpn_rois,
-                             min_level,
-                             max_level,
-                             refer_level,
-                             refer_scale,
-                             pixel_offset=False,
-                             rois_num=None,
-                             name=None):
+def distribute_fpn_proposals(
+    fpn_rois,
+    min_level,
+    max_level,
+    refer_level,
+    refer_scale,
+    pixel_offset=False,
+    rois_num=None,
+    name=None,
+):
     r"""
-        In Feature Pyramid Networks (FPN) models, it is needed to distribute 
-    all proposals into different FPN level, with respect to scale of the proposals, 
-    the referring scale and the referring level. Besides, to restore the order of 
-    proposals, we return an array which indicates the original index of rois 
+        In Feature Pyramid Networks (FPN) models, it is needed to distribute
+    all proposals into different FPN level, with respect to scale of the proposals,
+    the referring scale and the referring level. Besides, to restore the order of
+    proposals, we return an array which indicates the original index of rois
     in current proposals. To compute FPN level for each roi, the formula is given as follows:
-    
+
     .. math::
         roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}
         level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level)
@@ -1177,30 +1341,30 @@ def distribute_fpn_proposals(fpn_rois,
     Args:
         fpn_rois (Tensor): The input fpn_rois. 2-D Tensor with shape [N, 4] and data type can be
             float32 or float64.
-        min_level (int): The lowest level of FPN layer where the proposals come 
+        min_level (int): The lowest level of FPN layer where the proposals come
             from.
         max_level (int): The highest level of FPN layer where the proposals
             come from.
         refer_level (int): The referring level of FPN layer with specified scale.
         refer_scale (int): The referring scale of FPN layer with specified level.
-        pixel_offset (bool, optional): Whether there is pixel offset. If True, the offset of 
+        pixel_offset (bool, optional): Whether there is pixel offset. If True, the offset of
             image shape will be 1. 'False' by default.
-        rois_num (Tensor, optional): 1-D Tensor contains the number of RoIs in each image. 
+        rois_num (Tensor, optional): 1-D Tensor contains the number of RoIs in each image.
             The shape is [B] and data type is int32. B is the number of images.
-            If rois_num not None, it will return a list of 1-D Tensor. Each element 
+            If rois_num is not None, it will return a list of 1-D Tensor. Each element
             is the output RoIs' number of each image on the corresponding level
             and the shape is [B]. None by default.
-        name (str, optional): For detailed information, please refer 
-            to :ref:`api_guide_Name`. Usually name is no need to set and 
-            None by default. 
+        name (str, optional): For detailed information, please refer
+            to :ref:`api_guide_Name`. Usually name is no need to set and
+            None by default.
 
     Returns:
         multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
-            and data type is same as `fpn_rois` . The length is max_level-min_level+1.         
+            and data type is same as `fpn_rois` . The length is max_level-min_level+1.
         restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
-            , where N is the number of total rois. The data type is int32. 
-        rois_num_per_level (List): A list of 1-D Tensor and each Tensor is 
-            the RoIs' number in each image on the corresponding level. The shape 
+            , where N is the number of total rois. The data type is int32.
+        rois_num_per_level (List): A list of 1-D Tensor and each Tensor is
+            the RoIs' number in each image on the corresponding level. The shape
             is [B] and data type of int32, where B is the number of images.
 
     Examples:
@@ -1222,24 +1386,56 @@ def distribute_fpn_proposals(fpn_rois,
     num_lvl = max_level - min_level + 1
 
     if in_dygraph_mode():
-        assert rois_num is not None, "rois_num should not be None in dygraph mode."
-        multi_rois, rois_num_per_level, restore_ind = _C_ops.distribute_fpn_proposals(
-            fpn_rois, rois_num, min_level, max_level, refer_level, refer_scale,
-            pixel_offset)
+        assert (
+            rois_num is not None
+        ), "rois_num should not be None in dygraph mode."
+        (
+            multi_rois,
+            rois_num_per_level,
+            restore_ind,
+        ) = _C_ops.distribute_fpn_proposals(
+            fpn_rois,
+            rois_num,
+            min_level,
+            max_level,
+            refer_level,
+            refer_scale,
+            pixel_offset,
+        )
         return multi_rois, restore_ind, rois_num_per_level
 
     if _non_static_mode():
-        assert rois_num is not None, "rois_num should not be None in dygraph mode."
-        attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level',
-                 refer_level, 'refer_scale', refer_scale, 'pixel_offset',
-                 pixel_offset)
-        multi_rois, restore_ind, rois_num_per_level = _legacy_C_ops.distribute_fpn_proposals(
-            fpn_rois, rois_num, num_lvl, num_lvl, *attrs)
+        assert (
+            rois_num is not None
+        ), "rois_num should not be None in dygraph mode."
+        attrs = (
+            'min_level',
+            min_level,
+            'max_level',
+            max_level,
+            'refer_level',
+            refer_level,
+            'refer_scale',
+            refer_scale,
+            'pixel_offset',
+            pixel_offset,
+        )
+        (
+            multi_rois,
+            restore_ind,
+            rois_num_per_level,
+        ) = _legacy_C_ops.distribute_fpn_proposals(
+            fpn_rois, rois_num, num_lvl, num_lvl, *attrs
+        )
         return multi_rois, restore_ind, rois_num_per_level
 
     else:
-        check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'],
-                                 'distribute_fpn_proposals')
+        check_variable_and_dtype(
+            fpn_rois,
+            'fpn_rois',
+            ['float32', 'float64'],
+            'distribute_fpn_proposals',
+        )
         helper = LayerHelper('distribute_fpn_proposals', **locals())
         dtype = helper.input_dtype('fpn_rois')
         multi_rois = [
@@ -1265,16 +1461,18 @@ def distribute_fpn_proposals(fpn_rois,
         else:
             rois_num_per_level = None
 
-        helper.append_op(type='distribute_fpn_proposals',
-                         inputs=inputs,
-                         outputs=outputs,
-                         attrs={
-                             'min_level': min_level,
-                             'max_level': max_level,
-                             'refer_level': refer_level,
-                             'refer_scale': refer_scale,
-                             'pixel_offset': pixel_offset
-                         })
+        helper.append_op(
+            type='distribute_fpn_proposals',
+            inputs=inputs,
+            outputs=outputs,
+            attrs={
+                'min_level': min_level,
+                'max_level': max_level,
+                'refer_level': refer_level,
+                'refer_scale': refer_scale,
+                'pixel_offset': pixel_offset,
+            },
+        )
         return multi_rois, restore_ind, rois_num_per_level
 
 
@@ -1298,12 +1496,12 @@ def read_file(filename, name=None):
             import cv2
             import paddle
 
-            fake_img = (paddle.rand((400, 300, 3)).numpy() * 255).astype('uint8')            
+            fake_img = (paddle.rand((400, 300, 3)).numpy() * 255).astype('uint8')
 
             cv2.imwrite('fake.jpg', fake_img)
 
             img_bytes = paddle.vision.ops.read_file('fake.jpg')
-            
+
             print(img_bytes.shape)
             # [142915]
     """
@@ -1316,24 +1514,23 @@ def read_file(filename, name=None):
 
     helper = LayerHelper("read_file", **locals())
     out = helper.create_variable_for_type_inference('uint8')
-    helper.append_op(type="read_file",
-                     inputs=inputs,
-                     attrs=attrs,
-                     outputs={"Out": out})
+    helper.append_op(
+        type="read_file", inputs=inputs, attrs=attrs, outputs={"Out": out}
+    )
 
     return out
 
 
 def decode_jpeg(x, mode='unchanged', name=None):
     """
-    Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. 
-    Optionally converts the image to the desired format. 
+    Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor.
+    Optionally converts the image to the desired format.
     The values of the output tensor are uint8 between 0 and 255.
 
     Args:
-        x (Tensor): A one dimensional uint8 tensor containing the raw bytes 
+        x (Tensor): A one dimensional uint8 tensor containing the raw bytes
             of the JPEG image.
-        mode (str): The read mode used for optionally converting the image. 
+        mode (str): The read mode used for optionally converting the image.
             Default: 'unchanged'.
         name (str, optional): The default value is None. Normally there is no
             need for user to set this property. For more information, please
@@ -1367,10 +1564,9 @@ def decode_jpeg(x, mode='unchanged', name=None):
 
     helper = LayerHelper("decode_jpeg", **locals())
     out = helper.create_variable_for_type_inference('uint8')
-    helper.append_op(type="decode_jpeg",
-                     inputs=inputs,
-                     attrs=attrs,
-                     outputs={"Out": out})
+    helper.append_op(
+        type="decode_jpeg", inputs=inputs, attrs=attrs, outputs={"Out": out}
+    )
 
     return out
 
@@ -1378,7 +1574,7 @@ def decode_jpeg(x, mode='unchanged', name=None):
 def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
     """
     Position sensitive region of interest pooling (also known as PSROIPooling) is to perform
-    position-sensitive average pooling on regions of interest specified by input. It performs 
+    position-sensitive average pooling on regions of interest specified by input. It performs
     on inputs of nonuniform sizes to obtain fixed-size feature maps.
 
     PSROIPooling is proposed by R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details.
@@ -1386,13 +1582,13 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
     Args:
         x (Tensor): Input features with shape (N, C, H, W). The data type can be float32 or float64.
         boxes (Tensor): Box coordinates of ROIs (Regions of Interest) to pool over. It should be
-                         a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...], 
+                         a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...],
                          (x1, y1) is the top left coordinates, and (x2, y2) is the bottom
                          right coordinates.
         boxes_num (Tensor): The number of boxes contained in each picture in the batch.
-        output_size (int|Tuple(int, int))  The pooled output size(H, W), data type 
+        output_size (int|Tuple(int, int))  The pooled output size(H, W), data type
                                is int32. If int, H and W are both equal to output_size.
-        spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their 
+        spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their
                                input scale to the scale used when pooling. Default: 1.0
         name(str, optional): The default value is None.
                              Normally there is no need for user to set this property.
@@ -1418,34 +1614,47 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
     if isinstance(output_size, int):
         output_size = (output_size, output_size)
     pooled_height, pooled_width = output_size
-    assert len(x.shape) == 4, \
-            "Input features with shape should be (N, C, H, W)"
+    assert len(x.shape) == 4, "Input features with shape should be (N, C, H, W)"
     output_channels = int(x.shape[1] / (pooled_height * pooled_width))
     if in_dygraph_mode():
-        return _C_ops.psroi_pool(x, boxes, boxes_num, pooled_height,
-                                 pooled_width, output_channels, spatial_scale)
+        return _C_ops.psroi_pool(
+            x,
+            boxes,
+            boxes_num,
+            pooled_height,
+            pooled_width,
+            output_channels,
+            spatial_scale,
+        )
     if _in_legacy_dygraph():
-        return _legacy_C_ops.psroi_pool(x, boxes, boxes_num, "output_channels",
-                                        output_channels, "spatial_scale",
-                                        spatial_scale, "pooled_height",
-                                        pooled_height, "pooled_width",
-                                        pooled_width)
+        return _legacy_C_ops.psroi_pool(
+            x,
+            boxes,
+            boxes_num,
+            "output_channels",
+            output_channels,
+            "spatial_scale",
+            spatial_scale,
+            "pooled_height",
+            pooled_height,
+            "pooled_width",
+            pooled_width,
+        )
 
     helper = LayerHelper('psroi_pool', **locals())
     dtype = helper.input_dtype()
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type='psroi_pool',
-                     inputs={
-                         'X': x,
-                         'ROIs': boxes
-                     },
-                     outputs={'Out': out},
-                     attrs={
-                         'output_channels': output_channels,
-                         'spatial_scale': spatial_scale,
-                         'pooled_height': pooled_height,
-                         'pooled_width': pooled_width
-                     })
+    helper.append_op(
+        type='psroi_pool',
+        inputs={'X': x, 'ROIs': boxes},
+        outputs={'Out': out},
+        attrs={
+            'output_channels': output_channels,
+            'spatial_scale': spatial_scale,
+            'pooled_height': pooled_height,
+            'pooled_width': pooled_width,
+        },
+    )
     return out
 
 
@@ -1455,9 +1664,9 @@ class PSRoIPool(Layer):
     refer to :ref:`api_paddle_vision_ops_psroi_pool`.
 
     Args:
-        output_size (int|Tuple(int, int))  The pooled output size(H, W), data type 
+        output_size (int|Tuple(int, int))  The pooled output size(H, W), data type
                                is int32. If int, H and W are both equal to output_size.
-        spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their 
+        spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their
                                input scale to the scale used when pooling. Default: 1.0.
 
     Shape:
@@ -1474,7 +1683,7 @@ class PSRoIPool(Layer):
         .. code-block:: python
 
             import paddle
-            
+
             psroi_module = paddle.vision.ops.PSRoIPool(7, 1.0)
             x = paddle.uniform([2, 490, 28, 28], dtype='float32')
             boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32')
@@ -1489,24 +1698,25 @@ class PSRoIPool(Layer):
         self.spatial_scale = spatial_scale
 
     def forward(self, x, boxes, boxes_num):
-        return psroi_pool(x, boxes, boxes_num, self.output_size,
-                          self.spatial_scale)
+        return psroi_pool(
+            x, boxes, boxes_num, self.output_size, self.spatial_scale
+        )
 
 
 def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
     """
     This operator implements the roi_pooling layer.
     Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7).
-    The operator has three steps: 1. Dividing each region proposal into equal-sized sections with output_size(h, w) 2. Finding the largest value in each section 3. Copying these max values to the output buffer  
+    The operator has three steps: 1. Dividing each region proposal into equal-sized sections with output_size(h, w) 2. Finding the largest value in each section 3. Copying these max values to the output buffer
     For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn.
 
     Args:
-        x (Tensor): input feature, 4D-Tensor with the shape of [N,C,H,W], 
-            where N is the batch size, C is the input channel, H is Height, W is weight. 
+        x (Tensor): input feature, 4D-Tensor with the shape of [N,C,H,W],
+            where N is the batch size, C is the input channel, H is Height, W is weight.
             The data type is float32 or float64.
-        boxes (Tensor): boxes (Regions of Interest) to pool over. 
-            2D-Tensor with the shape of [num_boxes,4]. 
-            Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, 
+        boxes (Tensor): boxes (Regions of Interest) to pool over.
+            2D-Tensor with the shape of [num_boxes,4].
+            Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates,
             and (x2, y2) is the bottom right coordinates.
         boxes_num (Tensor): the number of RoIs in each image, data type is int32. Default: None
         output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size.
@@ -1514,7 +1724,7 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
         name(str, optional): for detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default.
 
     Returns:
-        pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]].  
+        pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]].
 
     Examples:
         .. code-block:: python
@@ -1537,14 +1747,27 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
 
     pooled_height, pooled_width = output_size
     if in_dygraph_mode():
-        assert boxes_num is not None, "boxes_num should not be None in dygraph mode."
-        return _C_ops.roi_pool(x, boxes, boxes_num, pooled_height, pooled_width,
-                               spatial_scale)
+        assert (
+            boxes_num is not None
+        ), "boxes_num should not be None in dygraph mode."
+        return _C_ops.roi_pool(
+            x, boxes, boxes_num, pooled_height, pooled_width, spatial_scale
+        )
     if _in_legacy_dygraph():
-        assert boxes_num is not None, "boxes_num should not be None in dygraph mode."
+        assert (
+            boxes_num is not None
+        ), "boxes_num should not be None in dygraph mode."
         pool_out, argmaxes = _legacy_C_ops.roi_pool(
-            x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width",
-            pooled_width, "spatial_scale", spatial_scale)
+            x,
+            boxes,
+            boxes_num,
+            "pooled_height",
+            pooled_height,
+            "pooled_width",
+            pooled_width,
+            "spatial_scale",
+            spatial_scale,
+        )
         return pool_out
 
     else:
@@ -1561,38 +1784,37 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None):
         }
         if boxes_num is not None:
             inputs['RoisNum'] = boxes_num
-        helper.append_op(type="roi_pool",
-                         inputs=inputs,
-                         outputs={
-                             "Out": pool_out,
-                             "Argmax": argmaxes
-                         },
-                         attrs={
-                             "pooled_height": pooled_height,
-                             "pooled_width": pooled_width,
-                             "spatial_scale": spatial_scale
-                         })
+        helper.append_op(
+            type="roi_pool",
+            inputs=inputs,
+            outputs={"Out": pool_out, "Argmax": argmaxes},
+            attrs={
+                "pooled_height": pooled_height,
+                "pooled_width": pooled_width,
+                "spatial_scale": spatial_scale,
+            },
+        )
         return pool_out
 
 
 class RoIPool(Layer):
     """
     This interface is used to construct a callable object of the `RoIPool` class. Please
-    refer to :ref:`api_paddle_vision_ops_roi_pool`.  
+    refer to :ref:`api_paddle_vision_ops_roi_pool`.
 
     Args:
         output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size.
         spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0.
 
     Returns:
-        pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]].  
+        pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]].
 
     Examples:
         .. code-block:: python
 
             import paddle
             from paddle.vision.ops import RoIPool
-            
+
             data = paddle.rand([1, 256, 32, 32])
             boxes = paddle.rand([3, 4])
             boxes[:, 2] += boxes[:, 0] + 3
@@ -1609,25 +1831,29 @@ class RoIPool(Layer):
         self._spatial_scale = spatial_scale
 
     def forward(self, x, boxes, boxes_num):
-        return roi_pool(x=x,
-                        boxes=boxes,
-                        boxes_num=boxes_num,
-                        output_size=self._output_size,
-                        spatial_scale=self._spatial_scale)
+        return roi_pool(
+            x=x,
+            boxes=boxes,
+            boxes_num=boxes_num,
+            output_size=self._output_size,
+            spatial_scale=self._spatial_scale,
+        )
 
     def extra_repr(self):
         main_str = 'output_size={_output_size}, spatial_scale={_spatial_scale}'
         return main_str.format(**self.__dict__)
 
 
-def roi_align(x,
-              boxes,
-              boxes_num,
-              output_size,
-              spatial_scale=1.0,
-              sampling_ratio=-1,
-              aligned=True,
-              name=None):
+def roi_align(
+    x,
+    boxes,
+    boxes_num,
+    output_size,
+    spatial_scale=1.0,
+    sampling_ratio=-1,
+    aligned=True,
+    name=None,
+):
     """
     Implementing the roi_align layer.
     Region of Interest (RoI) Align operator (also known as RoI Align) is to
@@ -1639,13 +1865,13 @@ def roi_align(x,
 
     In each ROI bin, the value of the four regularly sampled locations are
     computed directly through bilinear interpolation. The output is the mean of
-    four locations. Thus avoid the misaligned problem. 
+    four locations. Thus avoid the misaligned problem.
 
     Args:
-        x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], 
+        x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W],
             where N is the batch size, C is the input channel, H is Height,
             W is weight. The data type is float32 or float64.
-        boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It 
+        boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It
             should be a 2-D Tensor of shape (num_boxes, 4). The data type is
             float32 or float64. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
             the top left coordinates, and (x2, y2) is the bottom right coordinates.
@@ -1696,24 +1922,45 @@ def roi_align(x,
 
     pooled_height, pooled_width = output_size
     if in_dygraph_mode():
-        assert boxes_num is not None, "boxes_num should not be None in dygraph mode."
-        return _C_ops.roi_align(x, boxes, boxes_num, pooled_height,
-                                pooled_width, spatial_scale, sampling_ratio,
-                                aligned)
+        assert (
+            boxes_num is not None
+        ), "boxes_num should not be None in dygraph mode."
+        return _C_ops.roi_align(
+            x,
+            boxes,
+            boxes_num,
+            pooled_height,
+            pooled_width,
+            spatial_scale,
+            sampling_ratio,
+            aligned,
+        )
     if _in_legacy_dygraph():
-        assert boxes_num is not None, "boxes_num should not be None in dygraph mode."
-        align_out = _legacy_C_ops.roi_align(x, boxes, boxes_num,
-                                            "pooled_height", pooled_height,
-                                            "pooled_width", pooled_width,
-                                            "spatial_scale", spatial_scale,
-                                            "sampling_ratio", sampling_ratio,
-                                            "aligned", aligned)
+        assert (
+            boxes_num is not None
+        ), "boxes_num should not be None in dygraph mode."
+        align_out = _legacy_C_ops.roi_align(
+            x,
+            boxes,
+            boxes_num,
+            "pooled_height",
+            pooled_height,
+            "pooled_width",
+            pooled_width,
+            "spatial_scale",
+            spatial_scale,
+            "sampling_ratio",
+            sampling_ratio,
+            "aligned",
+            aligned,
+        )
         return align_out
 
     else:
         check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'roi_align')
-        check_variable_and_dtype(boxes, 'boxes', ['float32', 'float64'],
-                                 'roi_align')
+        check_variable_and_dtype(
+            boxes, 'boxes', ['float32', 'float64'], 'roi_align'
+        )
         helper = LayerHelper('roi_align', **locals())
         dtype = helper.input_dtype()
         align_out = helper.create_variable_for_type_inference(dtype)
@@ -1723,16 +1970,18 @@ def roi_align(x,
         }
         if boxes_num is not None:
             inputs['RoisNum'] = boxes_num
-        helper.append_op(type="roi_align",
-                         inputs=inputs,
-                         outputs={"Out": align_out},
-                         attrs={
-                             "pooled_height": pooled_height,
-                             "pooled_width": pooled_width,
-                             "spatial_scale": spatial_scale,
-                             "sampling_ratio": sampling_ratio,
-                             "aligned": aligned,
-                         })
+        helper.append_op(
+            type="roi_align",
+            inputs=inputs,
+            outputs={"Out": align_out},
+            attrs={
+                "pooled_height": pooled_height,
+                "pooled_width": pooled_width,
+                "spatial_scale": spatial_scale,
+                "sampling_ratio": sampling_ratio,
+                "aligned": aligned,
+            },
+        )
         return align_out
 
 
@@ -1774,12 +2023,14 @@ class RoIAlign(Layer):
         self._spatial_scale = spatial_scale
 
     def forward(self, x, boxes, boxes_num, aligned=True):
-        return roi_align(x=x,
-                         boxes=boxes,
-                         boxes_num=boxes_num,
-                         output_size=self._output_size,
-                         spatial_scale=self._spatial_scale,
-                         aligned=aligned)
+        return roi_align(
+            x=x,
+            boxes=boxes,
+            boxes_num=boxes_num,
+            output_size=self._output_size,
+            spatial_scale=self._spatial_scale,
+            aligned=aligned,
+        )
 
 
 class ConvNormActivation(Sequential):
@@ -1803,30 +2054,34 @@ class ConvNormActivation(Sequential):
         bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
     """
 
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 stride=1,
-                 padding=None,
-                 groups=1,
-                 norm_layer=BatchNorm2D,
-                 activation_layer=ReLU,
-                 dilation=1,
-                 bias=None):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size=3,
+        stride=1,
+        padding=None,
+        groups=1,
+        norm_layer=BatchNorm2D,
+        activation_layer=ReLU,
+        dilation=1,
+        bias=None,
+    ):
         if padding is None:
             padding = (kernel_size - 1) // 2 * dilation
         if bias is None:
             bias = norm_layer is None
         layers = [
-            Conv2D(in_channels,
-                   out_channels,
-                   kernel_size,
-                   stride,
-                   padding,
-                   dilation=dilation,
-                   groups=groups,
-                   bias_attr=bias)
+            Conv2D(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding,
+                dilation=dilation,
+                groups=groups,
+                bias_attr=bias,
+            )
         ]
         if norm_layer is not None:
             layers.append(norm_layer(out_channels))
@@ -1835,17 +2090,19 @@ class ConvNormActivation(Sequential):
         super().__init__(*layers)
 
 
-def nms(boxes,
-        iou_threshold=0.3,
-        scores=None,
-        category_idxs=None,
-        categories=None,
-        top_k=None):
+def nms(
+    boxes,
+    iou_threshold=0.3,
+    scores=None,
+    category_idxs=None,
+    categories=None,
+    top_k=None,
+):
     r"""
     This operator implements non-maximum suppression. Non-maximum suppression (NMS)
-    is used to select one bounding box out of many overlapping bounding boxes in object detection. 
-    Boxes with IoU > iou_threshold will be considered as overlapping boxes, 
-    just one with highest score can be kept. Here IoU is Intersection Over Union, 
+    is used to select one bounding box out of many overlapping bounding boxes in object detection.
+    Boxes with IoU > iou_threshold will be considered as overlapping boxes,
+    just one with highest score can be kept. Here IoU is Intersection Over Union,
     which can be computed by:
 
     ..  math::
@@ -1854,25 +2111,25 @@ def nms(boxes,
 
     If scores are provided, input boxes will be sorted by their scores firstly.
 
-    If category_idxs and categories are provided, NMS will be performed with a batched style, 
+    If category_idxs and categories are provided, NMS will be performed with a batched style,
     which means NMS will be applied to each category respectively and results of each category
     will be concated and sorted by scores.
-    
+
     If K is provided, only the first k elements will be returned. Otherwise, all box indices sorted by scores will be returned.
 
     Args:
-        boxes(Tensor): The input boxes data to be computed, it's a 2D-Tensor with 
-            the shape of [num_boxes, 4]. The data type is float32 or float64. 
-            Given as [[x1, y1, x2, y2], …],  (x1, y1) is the top left coordinates, 
-            and (x2, y2) is the bottom right coordinates. 
+        boxes(Tensor): The input boxes data to be computed, it's a 2D-Tensor with
+            the shape of [num_boxes, 4]. The data type is float32 or float64.
+            Given as [[x1, y1, x2, y2], …],  (x1, y1) is the top left coordinates,
+            and (x2, y2) is the bottom right coordinates.
             Their relation should be ``0 <= x1 < x2 && 0 <= y1 < y2``.
         iou_threshold(float32, optional): IoU threshold for determine overlapping boxes. Default value: 0.3.
-        scores(Tensor, optional): Scores corresponding to boxes, it's a 1D-Tensor with 
+        scores(Tensor, optional): Scores corresponding to boxes, it's a 1D-Tensor with
             shape of [num_boxes]. The data type is float32 or float64. Default: None.
-        category_idxs(Tensor, optional): Category indices corresponding to boxes. 
+        category_idxs(Tensor, optional): Category indices corresponding to boxes.
             it's a 1D-Tensor with shape of [num_boxes]. The data type is int64. Default: None.
         categories(List, optional): A list of unique id of all categories. The data type is int64. Default: None.
-        top_k(int64, optional): The top K boxes who has higher score and kept by NMS preds to 
+        top_k(int64, optional): The top K boxes who has higher score and kept by NMS preds to
             consider. top_k should be smaller equal than num_boxes. Default: None.
 
     Returns:
@@ -1880,7 +2137,7 @@ def nms(boxes,
 
     Examples:
         .. code-block:: python
-        
+
             import paddle
             import numpy as np
 
@@ -1899,14 +2156,14 @@ def nms(boxes,
             # [0.98015213 0.3156527  0.8199343  0.874901 ]
 
             categories = [0, 1, 2, 3]
-            category_idxs = np.random.choice(categories, 4)                        
+            category_idxs = np.random.choice(categories, 4)
             # [2 0 0 3]
 
-            out =  paddle.vision.ops.nms(paddle.to_tensor(boxes), 
-                                                    0.1, 
-                                                    paddle.to_tensor(scores), 
-                                                    paddle.to_tensor(category_idxs), 
-                                                    categories, 
+            out =  paddle.vision.ops.nms(paddle.to_tensor(boxes),
+                                                    0.1,
+                                                    paddle.to_tensor(scores),
+                                                    paddle.to_tensor(category_idxs),
+                                                    categories,
                                                     4)
             # [0, 3, 2]
     """
@@ -1920,34 +2177,42 @@ def nms(boxes,
 
         helper = LayerHelper('nms', **locals())
         out = helper.create_variable_for_type_inference('int64')
-        helper.append_op(type='nms',
-                         inputs={'Boxes': boxes},
-                         outputs={'KeepBoxesIdxs': out},
-                         attrs={'iou_threshold': iou_threshold})
+        helper.append_op(
+            type='nms',
+            inputs={'Boxes': boxes},
+            outputs={'KeepBoxesIdxs': out},
+            attrs={'iou_threshold': iou_threshold},
+        )
         return out
 
     if scores is None:
         return _nms(boxes, iou_threshold)
 
     import paddle
+
     if category_idxs is None:
         sorted_global_indices = paddle.argsort(scores, descending=True)
-        sorted_keep_boxes_indices = _nms(boxes[sorted_global_indices],
-                                         iou_threshold)
+        sorted_keep_boxes_indices = _nms(
+            boxes[sorted_global_indices], iou_threshold
+        )
         return sorted_global_indices[sorted_keep_boxes_indices]
 
     if top_k is not None:
-        assert top_k <= scores.shape[
-            0], "top_k should be smaller equal than the number of boxes"
-    assert categories is not None, "if category_idxs is given, categories which is a list of unique id of all categories is necessary"
+        assert (
+            top_k <= scores.shape[0]
+        ), "top_k should be smaller equal than the number of boxes"
+    assert (
+        categories is not None
+    ), "if category_idxs is given, categories which is a list of unique id of all categories is necessary"
 
     mask = paddle.zeros_like(scores, dtype=paddle.int32)
 
     for category_id in categories:
         cur_category_boxes_idxs = paddle.where(category_idxs == category_id)[0]
         shape = cur_category_boxes_idxs.shape[0]
-        cur_category_boxes_idxs = paddle.reshape(cur_category_boxes_idxs,
-                                                 [shape])
+        cur_category_boxes_idxs = paddle.reshape(
+            cur_category_boxes_idxs, [shape]
+        )
         if shape == 0:
             continue
         elif shape == 1:
@@ -1955,27 +2220,33 @@ def nms(boxes,
             continue
         cur_category_boxes = boxes[cur_category_boxes_idxs]
         cur_category_scores = scores[cur_category_boxes_idxs]
-        cur_category_sorted_indices = paddle.argsort(cur_category_scores,
-                                                     descending=True)
+        cur_category_sorted_indices = paddle.argsort(
+            cur_category_scores, descending=True
+        )
         cur_category_sorted_boxes = cur_category_boxes[
-            cur_category_sorted_indices]
+            cur_category_sorted_indices
+        ]
 
-        cur_category_keep_boxes_sub_idxs = cur_category_sorted_indices[_nms(
-            cur_category_sorted_boxes, iou_threshold)]
+        cur_category_keep_boxes_sub_idxs = cur_category_sorted_indices[
+            _nms(cur_category_sorted_boxes, iou_threshold)
+        ]
 
         updates = paddle.ones_like(
             cur_category_boxes_idxs[cur_category_keep_boxes_sub_idxs],
-            dtype=paddle.int32)
+            dtype=paddle.int32,
+        )
         mask = paddle.scatter(
             mask,
             cur_category_boxes_idxs[cur_category_keep_boxes_sub_idxs],
             updates,
-            overwrite=True)
+            overwrite=True,
+        )
     keep_boxes_idxs = paddle.where(mask)[0]
     shape = keep_boxes_idxs.shape[0]
     keep_boxes_idxs = paddle.reshape(keep_boxes_idxs, [shape])
-    sorted_sub_indices = paddle.argsort(scores[keep_boxes_idxs],
-                                        descending=True)
+    sorted_sub_indices = paddle.argsort(
+        scores[keep_boxes_idxs], descending=True
+    )
 
     if top_k is None:
         return keep_boxes_idxs[sorted_sub_indices]
@@ -1988,32 +2259,34 @@ def nms(boxes,
     return keep_boxes_idxs[sorted_sub_indices][:top_k]
 
 
-def generate_proposals(scores,
-                       bbox_deltas,
-                       img_size,
-                       anchors,
-                       variances,
-                       pre_nms_top_n=6000,
-                       post_nms_top_n=1000,
-                       nms_thresh=0.5,
-                       min_size=0.1,
-                       eta=1.0,
-                       pixel_offset=False,
-                       return_rois_num=False,
-                       name=None):
+def generate_proposals(
+    scores,
+    bbox_deltas,
+    img_size,
+    anchors,
+    variances,
+    pre_nms_top_n=6000,
+    post_nms_top_n=1000,
+    nms_thresh=0.5,
+    min_size=0.1,
+    eta=1.0,
+    pixel_offset=False,
+    return_rois_num=False,
+    name=None,
+):
     """
     This operation proposes RoIs according to each box with their
-    probability to be a foreground object. And 
-    the proposals of RPN output are  calculated by anchors, bbox_deltas and scores. Final proposals 
+    probability to be a foreground object. And
+    the proposals of RPN output are  calculated by anchors, bbox_deltas and scores. Final proposals
     could be used to train detection net.
 
     For generating proposals, this operation performs following steps:
 
     1. Transpose and resize scores and bbox_deltas in size of
        (H * W * A, 1) and (H * W * A, 4)
-    2. Calculate box locations as proposals candidates. 
+    2. Calculate box locations as proposals candidates.
     3. Clip boxes to image
-    4. Remove predicted boxes with small area. 
+    4. Remove predicted boxes with small area.
     5. Apply non-maximum suppression (NMS) to get final proposals as output.
 
     Args:
@@ -2071,40 +2344,74 @@ def generate_proposals(scores,
     """
 
     if in_dygraph_mode():
-        assert return_rois_num, "return_rois_num should be True in dygraph mode."
-        attrs = (pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta,
-                 pixel_offset)
+        assert (
+            return_rois_num
+        ), "return_rois_num should be True in dygraph mode."
+        attrs = (
+            pre_nms_top_n,
+            post_nms_top_n,
+            nms_thresh,
+            min_size,
+            eta,
+            pixel_offset,
+        )
         rpn_rois, rpn_roi_probs, rpn_rois_num = _C_ops.generate_proposals_v2(
-            scores, bbox_deltas, img_size, anchors, variances, *attrs)
+            scores, bbox_deltas, img_size, anchors, variances, *attrs
+        )
 
         return rpn_rois, rpn_roi_probs, rpn_rois_num
     elif _non_static_mode():
-        assert return_rois_num, "return_rois_num should be True in dygraph mode."
-        attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n,
-                 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta,
-                 'pixel_offset', pixel_offset)
-        rpn_rois, rpn_roi_probs, rpn_rois_num = _legacy_C_ops.generate_proposals_v2(
-            scores, bbox_deltas, img_size, anchors, variances, *attrs)
+        assert (
+            return_rois_num
+        ), "return_rois_num should be True in dygraph mode."
+        attrs = (
+            'pre_nms_topN',
+            pre_nms_top_n,
+            'post_nms_topN',
+            post_nms_top_n,
+            'nms_thresh',
+            nms_thresh,
+            'min_size',
+            min_size,
+            'eta',
+            eta,
+            'pixel_offset',
+            pixel_offset,
+        )
+        (
+            rpn_rois,
+            rpn_roi_probs,
+            rpn_rois_num,
+        ) = _legacy_C_ops.generate_proposals_v2(
+            scores, bbox_deltas, img_size, anchors, variances, *attrs
+        )
 
         return rpn_rois, rpn_roi_probs, rpn_rois_num
 
     helper = LayerHelper('generate_proposals_v2', **locals())
 
-    check_variable_and_dtype(scores, 'scores', ['float32'],
-                             'generate_proposals_v2')
-    check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'],
-                             'generate_proposals_v2')
-    check_variable_and_dtype(img_size, 'img_size', ['float32', 'float64'],
-                             'generate_proposals_v2')
-    check_variable_and_dtype(anchors, 'anchors', ['float32'],
-                             'generate_proposals_v2')
-    check_variable_and_dtype(variances, 'variances', ['float32'],
-                             'generate_proposals_v2')
+    check_variable_and_dtype(
+        scores, 'scores', ['float32'], 'generate_proposals_v2'
+    )
+    check_variable_and_dtype(
+        bbox_deltas, 'bbox_deltas', ['float32'], 'generate_proposals_v2'
+    )
+    check_variable_and_dtype(
+        img_size, 'img_size', ['float32', 'float64'], 'generate_proposals_v2'
+    )
+    check_variable_and_dtype(
+        anchors, 'anchors', ['float32'], 'generate_proposals_v2'
+    )
+    check_variable_and_dtype(
+        variances, 'variances', ['float32'], 'generate_proposals_v2'
+    )
 
     rpn_rois = helper.create_variable_for_type_inference(
-        dtype=bbox_deltas.dtype)
+        dtype=bbox_deltas.dtype
+    )
     rpn_roi_probs = helper.create_variable_for_type_inference(
-        dtype=scores.dtype)
+        dtype=scores.dtype
+    )
     outputs = {
         'RpnRois': rpn_rois,
         'RpnRoiProbs': rpn_roi_probs,
@@ -2114,23 +2421,25 @@ def generate_proposals(scores,
         rpn_rois_num.stop_gradient = True
         outputs['RpnRoisNum'] = rpn_rois_num
 
-    helper.append_op(type="generate_proposals_v2",
-                     inputs={
-                         'Scores': scores,
-                         'BboxDeltas': bbox_deltas,
-                         'ImShape': img_size,
-                         'Anchors': anchors,
-                         'Variances': variances
-                     },
-                     attrs={
-                         'pre_nms_topN': pre_nms_top_n,
-                         'post_nms_topN': post_nms_top_n,
-                         'nms_thresh': nms_thresh,
-                         'min_size': min_size,
-                         'eta': eta,
-                         'pixel_offset': pixel_offset
-                     },
-                     outputs=outputs)
+    helper.append_op(
+        type="generate_proposals_v2",
+        inputs={
+            'Scores': scores,
+            'BboxDeltas': bbox_deltas,
+            'ImShape': img_size,
+            'Anchors': anchors,
+            'Variances': variances,
+        },
+        attrs={
+            'pre_nms_topN': pre_nms_top_n,
+            'post_nms_topN': post_nms_top_n,
+            'nms_thresh': nms_thresh,
+            'min_size': min_size,
+            'eta': eta,
+            'pixel_offset': pixel_offset,
+        },
+        outputs=outputs,
+    )
     rpn_rois.stop_gradient = True
     rpn_roi_probs.stop_gradient = True
     if not return_rois_num:
@@ -2139,19 +2448,21 @@ def generate_proposals(scores,
     return rpn_rois, rpn_roi_probs, rpn_rois_num
 
 
-def matrix_nms(bboxes,
-               scores,
-               score_threshold,
-               post_threshold,
-               nms_top_k,
-               keep_top_k,
-               use_gaussian=False,
-               gaussian_sigma=2.,
-               background_label=0,
-               normalized=True,
-               return_index=False,
-               return_rois_num=True,
-               name=None):
+def matrix_nms(
+    bboxes,
+    scores,
+    score_threshold,
+    post_threshold,
+    nms_top_k,
+    keep_top_k,
+    use_gaussian=False,
+    gaussian_sigma=2.0,
+    background_label=0,
+    normalized=True,
+    return_index=False,
+    return_rois_num=True,
+    name=None,
+):
     """
     This operator does matrix non maximum suppression (NMS).
     First selects a subset of candidate bounding boxes that have higher scores
@@ -2214,10 +2525,12 @@ def matrix_nms(bboxes,
                                  score_threshold=0.5, post_threshold=0.1,
                                  nms_top_k=400, keep_top_k=200, normalized=False)
     """
-    check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'],
-                             'matrix_nms')
-    check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'],
-                             'matrix_nms')
+    check_variable_and_dtype(
+        bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms'
+    )
+    check_variable_and_dtype(
+        scores, 'Scores', ['float32', 'float64'], 'matrix_nms'
+    )
     check_type(score_threshold, 'score_threshold', float, 'matrix_nms')
     check_type(post_threshold, 'post_threshold', float, 'matrix_nms')
     check_type(nms_top_k, 'nums_top_k', int, 'matrix_nms')
@@ -2228,22 +2541,42 @@ def matrix_nms(bboxes,
     check_type(background_label, 'background_label', int, 'matrix_nms')
 
     if in_dygraph_mode():
-        out, index, rois_num = _C_ops.matrix_nms(bboxes, scores,
-                                                 score_threshold, nms_top_k,
-                                                 keep_top_k, post_threshold,
-                                                 use_gaussian, gaussian_sigma,
-                                                 background_label, normalized)
+        out, index, rois_num = _C_ops.matrix_nms(
+            bboxes,
+            scores,
+            score_threshold,
+            nms_top_k,
+            keep_top_k,
+            post_threshold,
+            use_gaussian,
+            gaussian_sigma,
+            background_label,
+            normalized,
+        )
         if not return_index:
             index = None
         if not return_rois_num:
             rois_num = None
         return out, rois_num, index
     elif _in_legacy_dygraph():
-        attrs = ('background_label', background_label, 'score_threshold',
-                 score_threshold, 'post_threshold', post_threshold, 'nms_top_k',
-                 nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian',
-                 use_gaussian, 'keep_top_k', keep_top_k, 'normalized',
-                 normalized)
+        attrs = (
+            'background_label',
+            background_label,
+            'score_threshold',
+            score_threshold,
+            'post_threshold',
+            post_threshold,
+            'nms_top_k',
+            nms_top_k,
+            'gaussian_sigma',
+            gaussian_sigma,
+            'use_gaussian',
+            use_gaussian,
+            'keep_top_k',
+            keep_top_k,
+            'normalized',
+            normalized,
+        )
         out, index, rois_num = _legacy_C_ops.matrix_nms(bboxes, scores, *attrs)
         if not return_index:
             index = None
@@ -2259,22 +2592,21 @@ def matrix_nms(bboxes,
             rois_num = helper.create_variable_for_type_inference(dtype='int32')
             outputs['RoisNum'] = rois_num
 
-        helper.append_op(type="matrix_nms",
-                         inputs={
-                             'BBoxes': bboxes,
-                             'Scores': scores
-                         },
-                         attrs={
-                             'background_label': background_label,
-                             'score_threshold': score_threshold,
-                             'post_threshold': post_threshold,
-                             'nms_top_k': nms_top_k,
-                             'gaussian_sigma': gaussian_sigma,
-                             'use_gaussian': use_gaussian,
-                             'keep_top_k': keep_top_k,
-                             'normalized': normalized
-                         },
-                         outputs=outputs)
+        helper.append_op(
+            type="matrix_nms",
+            inputs={'BBoxes': bboxes, 'Scores': scores},
+            attrs={
+                'background_label': background_label,
+                'score_threshold': score_threshold,
+                'post_threshold': post_threshold,
+                'nms_top_k': nms_top_k,
+                'gaussian_sigma': gaussian_sigma,
+                'use_gaussian': use_gaussian,
+                'keep_top_k': keep_top_k,
+                'normalized': normalized,
+            },
+            outputs=outputs,
+        )
         output.stop_gradient = True
 
         if not return_index:
-- 
GitLab