Unverified commit cfee9c13, authored by Ligoml, committed by GitHub

[cherry-pick2.4]for CodeStyle (#47608)

* only run pre-commit

* only run pre-commit
Parent 99c872fa
@@ -54,16 +54,16 @@ class LegacyPyLayerContext(object):

    def save_for_backward(self, *tensors):
        """
        Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.

        .. note::
            This API should be called at most once, and only inside `forward`.

        Args:
            tensors(list of Tensors): Tensors to be stored.

        Returns:
            None

        Examples:
            .. code-block:: python

@@ -94,7 +94,7 @@ class LegacyPyLayerContext(object):
        Get the tensors stored by ``save_for_backward``.

        Returns:
            list of Tensors or None: If context contains tensors stored by `save_for_backward`,
            then return these tensors, otherwise return None.

        Examples:

@@ -124,9 +124,7 @@ class LegacyPyLayerContext(object):
def with_mateclass(meta, *bases):
    class impl(meta):
        def __new__(cls, name, temp_bases, attrs):
            return meta(name, bases, attrs)

@@ -134,7 +132,6 @@ def with_mateclass(meta, *bases):
class CPyLayer(object):
    @classmethod
    @dygraph_only
    def apply(cls, *args, **kwargs):

@@ -147,7 +144,7 @@ class CPyLayer(object):
        Returns:
            tensors or other types : output of PyLayer.

        Examples:
            .. code-block:: python

@@ -182,12 +179,14 @@ class CPyLayer(object):
class PyLayerBackward(LegacyPyLayerContext):
    def backward(self, *args, **kwargs):
        with paddle.fluid.dygraph.guard():
            with paddle.fluid.dygraph.no_grad():
                if (
                    self._amp_state
                    and 'enable' in self._amp_state
                    and self._amp_state['enable']
                ):
                    with auto_cast(**args[0]._amp_state):
                        return self._forward_cls.backward(*args, **kwargs)
                else:

@@ -197,10 +196,10 @@ class PyLayerBackward(LegacyPyLayerContext):
class LayerMeta(type):
    def __init__(cls, name, bases, attrs):
        cls._backward_function = type(
            name + '_backward', (PyLayerBackward,), {"_forward_cls": cls}
        )

        return super(LayerMeta, cls).__init__(name, bases, attrs)

@@ -210,15 +209,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
    Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules:
    1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod.
    Their first argument should be a context and `None` can not be included in the returned result.
    2. Input of backward contains a context as the first argument, and the rest arguments are the
    gradient of forward's output tensors. So the number of backward's input tensors equals
    the number of forward output tensors. If you need the forward's inputs or outputs in `backward`,
    you can use `save_for_backward` to store the required tensors, and then use them in the backward.
    3. Output of backward function can only be `Tensor` or tuple/list of `Tensor`.
    Output tensors of backward are the gradient of forward's input tensors,
    so the number of backward's output tensors equals the number of forward input tensors.

    After building the custom Layer, run it through the `apply` method.

    Examples:
        .. code-block:: python

@@ -259,8 +258,8 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
    @staticmethod
    def forward(ctx, *args, **kwargs):
        """
        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
        the first argument, followed by any number of arguments (tensors or other types).
        `None` can not be included in the returned result.

        Args:

@@ -269,7 +268,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
        Returns:
            tensors or other types : output of PyLayer.

        Examples:
            .. code-block:: python

@@ -292,14 +291,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
                    return grad
        """
        raise NotImplementedError(
            "You must implement the forward function for PyLayer."
        )

    @staticmethod
    def backward(ctx, *args, **kwargs):
        """
        This is a function to calculate the gradient. It is to be overloaded by subclasses.
        It must accept an object of `PyLayerContext` as the first argument, and the rest
        arguments are the gradient of forward's output tensors. Output tensors of backward
        are the gradient of forward's input tensors.

        Args:

@@ -308,7 +308,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
        Returns:
            Tensor or list of Tensors: The gradient of forward's input tensor(s).

        Examples:
            .. code-block:: python

@@ -332,24 +332,24 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
        """
        raise NotImplementedError(
            "You must implement the backward function for PyLayer."
        )


class EagerPyLayerContext(object):
    def save_for_backward(self, *tensors):
        """
        Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.

        .. note::
            This API should be called at most once, and only inside `forward`.

        Args:
            tensors(list of Tensors): Tensors to be stored.

        Returns:
            None

        Examples:
            .. code-block:: python

@@ -380,7 +380,7 @@ class EagerPyLayerContext(object):
        Get the tensors stored by ``save_for_backward``.

        Returns:
            list of Tensors or None: If context contains tensors stored by `save_for_backward`,
            then return these tensors, otherwise return None.

        Examples:

@@ -410,11 +410,11 @@ class EagerPyLayerContext(object):
    def mark_not_inplace(self, *args):
        """
        Marks inputs as not inplace.
        This should be called at most once, only from inside the `forward` method,
        and all arguments should be Tensor inputs.

        If the Tensor returned by `forward` method is the same as the Tensor input of forward,
        and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output.
        Thereby preventing the auto grad information of the input Tensor from being overwritten.

        Examples:

@@ -427,7 +427,7 @@ class EagerPyLayerContext(object):
                def forward(ctx, x):
                    ctx.mark_not_inplace(x)
                    return x

                @staticmethod
                def backward(ctx, grad_output):
                    out = grad_output.exp()

@@ -438,7 +438,7 @@ class EagerPyLayerContext(object):
            attn_layers = []
            for idx in range(0, 2):
                attn_layers.append(Exp())

            for step in range(0, 2):
                a = x
                for j in range(0,2):

@@ -450,7 +450,7 @@ class EagerPyLayerContext(object):
    def mark_non_differentiable(self, *args):
        """
        Marks outputs as non-differentiable.
        This should be called at most once, only from inside the `forward` method,
        and all arguments should be tensor outputs.

        This will mark outputs as not requiring gradients, increasing the

@@ -542,30 +542,27 @@ class EagerPyLayerContext(object):
class EagerPyLayerBackward(core.eager.PyLayer, EagerPyLayerContext):
    def backward(self, *args):
        return self._forward_cls.backward(self, *args)


class EagerPyLayerMeta(type):
    def __init__(cls, name, bases, attrs):
        cls._backward_function = type(
            name + '_backward', (EagerPyLayerBackward,), {"_forward_cls": cls}
        )

        return super(EagerPyLayerMeta, cls).__init__(name, bases, attrs)


class EagerPyLayer(
    with_mateclass(EagerPyLayerMeta, core.eager.PyLayer, EagerPyLayerContext)
):
    @staticmethod
    def forward(ctx, *args, **kwargs):
        """
        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
        the first argument, followed by any number of arguments (tensors or other types).
        `None` can not be included in the returned result.

        Args:

@@ -574,7 +571,7 @@ class EagerPyLayer(
        Returns:
            tensors or other types : output of PyLayer.

        Examples:
            .. code-block:: python

@@ -597,14 +594,15 @@ class EagerPyLayer(
                    return grad
        """
        raise NotImplementedError(
            "You must implement the forward function for PyLayer."
        )

    @staticmethod
    def backward(ctx, *args):
        """
        This is a function to calculate the gradient. It is to be overloaded by subclasses.
        It must accept an object of `PyLayerContext` as the first argument, and the rest
        arguments are the gradient of forward's output tensors. Output tensors of backward
        are the gradient of forward's input tensors.

        Args:

@@ -613,7 +611,7 @@ class EagerPyLayer(
        Returns:
            Tensor or list of Tensors: The gradient of forward's input tensor(s).

        Examples:
            .. code-block:: python

@@ -637,11 +635,11 @@ class EagerPyLayer(
        """
        raise NotImplementedError(
            "You must implement the backward function for PyLayer."
        )


def once_differentiable(backward):
    def wrapper(ctx, *args):
        with paddle.fluid.dygraph.no_grad():
            outputs = backward(ctx, *args)
......
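For reference (not part of the diff), below is a minimal sketch of the subclassing pattern the PyLayer docstrings above describe. It assumes the public `paddle.autograd.PyLayer` entry point and dygraph mode; the `CustomTanh` class, tensor shapes, and values are illustrative only.

import paddle
from paddle.autograd import PyLayer


class CustomTanh(PyLayer):
    @staticmethod
    def forward(ctx, x):
        y = paddle.tanh(x)
        # Stash tensors that backward will need; read them back with saved_tensor().
        ctx.save_for_backward(y)
        return y

    @staticmethod
    def backward(ctx, dy):
        # One gradient input per forward output, one gradient output per forward input.
        (y,) = ctx.saved_tensor()
        # d tanh(x) / dx = 1 - tanh(x)^2
        return dy * (1 - paddle.square(y))


x = paddle.randn([4], dtype='float32')
x.stop_gradient = False
out = CustomTanh.apply(x)   # run the custom layer through apply()
out.sum().backward()
print(x.grad)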
@@ -42,12 +42,12 @@ def current_stream(device=None):
    Return the current CUDA stream by the device.

    Parameters:
        device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from.
        If device is None, the device is the current device. Default: None.

    Returns:
        CUDAStream: the stream to the device.

    Examples:
        .. code-block:: python

@@ -82,7 +82,7 @@ def synchronize(device=None):
    Parameters:
        device(paddle.CUDAPlace()|int, optional): The device or the ID of the device.
        If device is None, the device is the current device. Default: None.

    Examples:
        .. code-block:: python

@@ -111,7 +111,7 @@ def synchronize(device=None):
def device_count():
    '''
    Return the number of GPUs available.

    Returns:
        int: the number of GPUs available.

@@ -124,8 +124,11 @@ def device_count():
    '''

    num_gpus = (
        core.get_cuda_device_count()
        if hasattr(core, 'get_cuda_device_count')
        else 0
    )

    return num_gpus

@@ -158,14 +161,14 @@ def extract_cuda_device_id(device, op_name):
    Return the id of the given cuda device. It is just a utility that will not be exposed to users.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'.
            Default: None.

    Return:
        int: The id of the given device. If device is None, return the id of current device.
    '''
    if device is None:
        return core.get_cuda_current_device_id()

    if isinstance(device, int):

@@ -178,15 +181,19 @@ def extract_cuda_device_id(device, op_name):
        else:
            raise ValueError(
                "The current string {} is not expected. Because {} only support string which is like 'gpu:x'. "
                "Please input appropriate string again!".format(device, op_name)
            )
    else:
        raise ValueError(
            "The device type {} is not expected. Because {} only support int, str or paddle.CUDAPlace. "
            "Please input appropriate device again!".format(device, op_name)
        )

    assert (
        device_id >= 0
    ), f"The device id must be not less than 0, but got id = {device_id}."
    assert (
        device_id < device_count()
    ), f"The device id {device_id} exceeds gpu card number {device_count()}"

    return device_id

@@ -197,12 +204,12 @@ def max_memory_allocated(device=None):
    Return the peak size of gpu memory that is allocated to tensor of the given device.

    .. note::
        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:

@@ -232,8 +239,8 @@ def max_memory_reserved(device=None):
    Return the peak size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:

@@ -263,12 +270,12 @@ def memory_allocated(device=None):
    Return the current size of gpu memory that is allocated to tensor of the given device.

    .. note::
        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:

@@ -298,14 +305,14 @@ def memory_reserved(device=None):
    Return the current size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The current size of GPU memory that is held by the allocator of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu

@@ -389,18 +396,18 @@ def get_device_properties(device=None):
    Return the properties of given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x' which to get the properties of the
            device from. If device is None, the device is the current device.
            Default: None.

    Returns:
        _gpuDeviceProperties: The properties of the device which include ASCII string
        identifying device, major compute capability, minor compute capability, global
        memory available and the number of multiprocessors on the device.

    Examples:
        .. code-block:: python

            # required: gpu

@@ -424,7 +431,8 @@ def get_device_properties(device=None):
        raise ValueError(
            "The API paddle.device.cuda.get_device_properties is not supported in "
            "CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support "
            "to call this API."
        )

    if device is not None:
        if isinstance(device, int):

@@ -438,12 +446,14 @@ def get_device_properties(device=None):
            raise ValueError(
                "The current string {} is not expected. Because paddle.device."
                "cuda.get_device_properties only support string which is like 'gpu:x'. "
                "Please input appropriate string again!".format(device)
            )
        else:
            raise ValueError(
                "The device type {} is not expected. Because paddle.device.cuda."
                "get_device_properties only support int, str or paddle.CUDAPlace. "
                "Please input appropriate device again!".format(device)
            )
    else:
        device_id = -1

@@ -484,7 +494,7 @@ def get_device_capability(device=None):
    Return the major and minor revision numbers defining the device's compute capability which are got from CUDA function `cudaDeviceProp <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g1bf9d625a931d657e08db2b4391170f0>`_.

    Parameters:
        device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device.

    Returns:
        tuple(int,int): the major and minor revision numbers defining the device's compute capability.
......
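For reference (not part of the diff), a short sketch of how the device and memory helpers documented above are typically queried. It assumes a GPU build of Paddle with at least one visible device; the printed numbers are machine dependent.

import paddle
from paddle.device import cuda

if cuda.device_count() > 0:
    # A GPU build places new tensors on gpu:0 by default.
    x = paddle.zeros([1024, 1024], dtype='float32')
    print(cuda.memory_allocated('gpu:0'))       # current tensor memory, in bytes (256-byte aligned)
    print(cuda.max_memory_allocated('gpu:0'))   # peak tensor memory since the start of the program
    print(cuda.memory_reserved('gpu:0'))        # memory currently held by the allocator
    print(cuda.get_device_properties(0))        # name, compute capability, total memory, SM count
    print(cuda.get_device_capability(0))        # (major, minor) compute capability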
@@ -24,11 +24,11 @@ def wait_server_ready(endpoints):
    """
    Wait until parameter servers are ready, use connect_ex to detect
    port readiness.

    Args:
        endpoints (list|tuple): endpoints string list, like:
        ["127.0.0.1:8080", "127.0.0.1:8081"]

    Examples:
        .. code-block:: python

@@ -40,8 +40,9 @@ def wait_server_ready(endpoints):
        not_ready_endpoints = []
        for ep in endpoints:
            ip_port = ep.split(":")
            with closing(
                socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            ) as sock:
                sock.settimeout(2)
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                if hasattr(socket, 'SO_REUSEPORT'):

@@ -53,8 +54,9 @@ def wait_server_ready(endpoints):
                not_ready_endpoints.append(ep)
        if not all_ok:
            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
            sys.stderr.write(
                "not ready endpoints:" + str(not_ready_endpoints) + "\n"
            )
            sys.stderr.flush()
            time.sleep(3)
        else:
......
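For reference (not part of the diff), a standalone sketch of the connect_ex probe that `wait_server_ready` is built around. The `port_is_ready` helper name is made up for illustration; the real function additionally retries every 3 seconds and reports the not-ready endpoints on stderr.

import socket
from contextlib import closing


def port_is_ready(endpoint, timeout=2):
    ip, port = endpoint.split(":")
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.settimeout(timeout)
        # connect_ex returns 0 on a successful connection, an errno otherwise.
        return sock.connect_ex((ip, int(port))) == 0


print(port_is_ready("127.0.0.1:8080"))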
@@ -30,7 +30,9 @@ from paddle.fluid.framework import _set_expected_place
from paddle.fluid.dygraph import parallel_helper
from paddle.distributed.fleet.launch_utils import check_backend
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.distributed.fleet.base.private_helper_function import (
    wait_server_ready,
)  # noqa: F401
from paddle.distributed import collective
from paddle.distributed.collective import _set_group_map
from paddle.distributed.collective import _set_group_map_by_name

@@ -63,6 +65,7 @@ def _get_global_parallel_env():
def _start_kv_server(port, http_server_d, size):
    from paddle.distributed.fleet.utils.http_server import KVServer

    http_server = KVServer(int(port), size=size)
    http_server.start()
    wait_seconds = 3

@@ -73,10 +76,15 @@ def _start_kv_server(port, http_server_d, size):
def _is_cpuonly(backend):
    check_backend(backend)
    if (
        backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl']
        and (
            core.is_compiled_with_cuda()
            or core.is_compiled_with_xpu()
            or core.is_compiled_with_npu()
            or core.is_compiled_with_mlu()
        )
    ) or backend is 'xccl':
        # passes 'auto' and can use cuda or xpu, use the default logics. so return False
        return False

@@ -87,9 +95,10 @@ def _is_cpuonly(backend):
def _check_var_exists(var_name):
    var = os.environ.get(var_name, None)
    if var is None:
        raise ValueError(
            "paddle.distributed initialize error, "
            "environment variable %s is needed, but not set." % var_name
        )


def init_parallel_env():

@@ -106,7 +115,7 @@ def init_parallel_env():
    Returns:
        None

    Examples:
        .. code-block:: python

            # required: gpu

@@ -120,7 +129,7 @@ def init_parallel_env():
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)

                def forward(self, x):
                    return self._linear2(self._linear1(x))

@@ -141,7 +150,7 @@ def init_parallel_env():
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)

                loss.backward()
                adam.step()

@@ -167,15 +176,21 @@ def init_parallel_env():
    backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
    is_cpu_only = _is_cpuonly(backend)
    # 1. gpu xpu check, must be gpu or xpu,
    if not (
        is_cpu_only
        or core.is_compiled_with_cuda()
        or core.is_compiled_with_xpu()
        or core.is_compiled_with_npu()
        or core.is_compiled_with_mlu()
    ):
        raise NotImplementedError(
            "If you want to use CPU-only version, please use 'gloo' as backend"
        )

    if backend == "xccl":
        FLAGS_selected_custom_devices = 'FLAGS_selected_{}s'.format(
            parallel_env.device_type
        )
        _check_var_exists(FLAGS_selected_custom_devices)
    else:
        if not is_cpu_only and core.is_compiled_with_cuda():

@@ -203,8 +218,9 @@ def init_parallel_env():
    # they need to call a function to change default place,
    # here just set correctly place to users
    if backend == "xccl":
        place = core.CustomPlace(
            parallel_env.device_type, parallel_env.device_id
        )
    elif is_cpu_only:
        place = core.CPUPlace()
    elif core.is_compiled_with_cuda():

@@ -228,11 +244,15 @@ def init_parallel_env():
        assert rank >= 0 and world_size > rank and world_size > 1, (
            "rank must be non-negative and world_size must be the "
            "maximum rank plus one. Moreover, at least two processes are "
            "required to create a process group."
        )

        master_addr = os.getenv("MASTER_ADDR", None)
        master_port = os.getenv("MASTER_PORT", None)
        endpoints = (
            ":".join([master_addr, master_port])
            if master_addr and master_port
            else None
        )
        if endpoints is None:
            endpoints = os.getenv("PADDLE_MASTER", None)
        if endpoints is None:

@@ -241,23 +261,28 @@ def init_parallel_env():
                "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' "
                "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
                "and 'export MASTER_ADDR=54612'. Or you can start your training"
                "with paddle.distributed.run module."
            )
        master_addr, master_port = endpoints.split(":")
        master_port = int(master_port)
        is_master = rank == 0
        stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
        default_store = core.TCPStore(
            master_addr,
            master_port,
            is_master,
            world_size,
            timeout=stop_check_timeout,
        )
        _set_default_store(default_store)
        pg = _new_process_group_impl(
            backend,
            default_store,
            rank,
            world_size,
            _default_group_name,
            pg_options=None,
        )
        ranks = list(range(world_size))
        group = Group(rank, 0, ranks, pg=pg, name=_default_group_name)
        _set_group_map_by_name(_default_group_name, group)

@@ -283,8 +308,10 @@ def init_parallel_env():
        size = {'_worker': parallel_env.world_size}
        if backend == "heter":
            size = {'_worker': len(node_num)}
        http_server = Process(
            target=_start_kv_server,
            args=(int(ep_rank_0[1]), http_server_d, size),
        )
        http_server.daemon = True
        http_server_d["running"] = True
        http_server.start()

@@ -302,22 +329,28 @@ def init_parallel_env():
    # init nccl or hccl or bkcl or heter context
    if is_cpu_only:
        parallel_helper._set_parallel_ctx(
            core.GLOOParallelContext(strategy, place)
        )
    elif backend == "heter":
        parallel_helper._set_parallel_ctx(
            core.HeterParallelContext(strategy, parallel_env.device_id)
        )
    elif core.is_compiled_with_cuda():
        parallel_helper._set_parallel_ctx(
            core.NCCLParallelContext(strategy, place)
        )
    elif core.is_compiled_with_xpu():
        parallel_helper._set_parallel_ctx(
            core.BKCLParallelContext(strategy, place)
        )
    elif core.is_compiled_with_npu():
        parallel_helper._set_parallel_ctx(
            core.HCCLParallelContext(strategy, place)
        )
    elif core.is_compiled_with_mlu():
        parallel_helper._set_parallel_ctx(
            core.CNCLParallelContext(strategy, place)
        )

    if backend != "heter":
        other_endpoints = strategy.trainer_endpoints[:]
......
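For reference (not part of the diff), a condensed sketch of the data-parallel flow the `init_parallel_env` docstring outlines. It assumes a machine with at least two GPUs and mirrors the documented `LinearNet` example; launch it with `paddle.distributed.spawn` as shown, or via `python -m paddle.distributed.launch`.

import paddle
import paddle.nn as nn
import paddle.distributed as dist


class LinearNet(nn.Layer):
    def __init__(self):
        super(LinearNet, self).__init__()
        self._linear1 = nn.Linear(10, 10)
        self._linear2 = nn.Linear(10, 1)

    def forward(self, x):
        return self._linear2(self._linear1(x))


def train():
    dist.init_parallel_env()                      # set up the NCCL/GLOO context for this process
    dp_layer = paddle.DataParallel(LinearNet())   # wrap the model for gradient all-reduce
    adam = paddle.optimizer.Adam(parameters=dp_layer.parameters())
    loss_fn = nn.MSELoss()

    inputs = paddle.randn([10, 10], 'float32')
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(dp_layer(inputs), labels)
    loss.backward()
    adam.step()
    adam.clear_grad()


if __name__ == '__main__':
    dist.spawn(train, nprocs=2)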
@@ -23,30 +23,48 @@ from paddle.distributed.utils.log_utils import get_logger
from paddle.fluid.framework import in_dygraph_mode

# Old version
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
    ShardingOptimizerStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import (
    ShardingStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import (
    ShardingStage3,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import (
    ShardingScaler,
)

# New version
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
    GroupShardedOptimizerStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import (
    GroupShardedStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import (
    GroupShardedStage3,
)
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
    GroupShardedScaler,
)

logger_ = get_logger(logging.WARNING)


def group_sharded_parallel(
    model,
    optimizer,
    level,
    scaler=None,
    group=None,
    offload=False,
    sync_buffers=False,
    buffer_max_size=2**23,
    segment_size=2**20,
    sync_comm=False,
):
    """
    Use group_sharded_parallel to perform group sharded configuration on the model, optimizer and GradScaler. Level has three string options, 'os', 'os_g' and 'p_g_os' corresponds to three different usage scenarios: optimizer state segmentation, optimizer state + gradient segmentation, and parameter + gradient + optimizer state segmentation.
    Usually, optimizer state + gradient segmentation is actually a re-optimization of optimizer state segmentation, so optimizer state + gradient segmentation can be used to realize optimizer state segmentation.

@@ -62,12 +80,12 @@ def group_sharded_parallel(
        buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23.
        segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20.
        sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used.

    Returns:
        model: A wrapper for group sharded given model.
        optimizer: A wrapper for group sharded given optimizer.
        scaler: A wrapper for group sharded given scaler.

    Examples:
        .. code-block:: python

@@ -100,13 +118,16 @@ def group_sharded_parallel(
    """
    # check option type
    assert isinstance(
        model, paddle.nn.Layer
    ), "The model must be the instance of paddle.nn.Layer."
    assert isinstance(
        optimizer, Optimizer
    ), "The optimizer must be the instance of paddle.optimizer.Optimizer."
    assert level in [
        'os',
        'os_g',
        'p_g_os',
    ], "The level must be os, os_g or p_g_os."

    def check_dtype(param):
        return param.dtype == paddle.float16

@@ -124,39 +145,50 @@ def group_sharded_parallel(
                params=optimizer._parameter_list,
                optim=optimizer,
                group=group,
                offload=offload,
            )
            model = GroupShardedStage2(
                model,
                optimizer,
                group=group,
                sync_buffers=sync_buffers,
                buffer_max_size=buffer_max_size,
            )
        else:
            optimizer = ShardingOptimizerStage2(
                params=model.parameters(),
                optim=optimizer,
                group=group,
                offload=offload,
            )
            model = ShardingStage2(
                model,
                optimizer,
                group=group,
                sync_buffers=sync_buffers,
                buffer_max_size=buffer_max_size,
            )
    elif level == 'p_g_os':
        if in_dygraph_mode():
            model = GroupShardedStage3(
                model,
                optimizer=optimizer,
                group=group,
                sync_buffers=sync_buffers,
                segment_size=segment_size,
                offload=offload,
                sync_comm=sync_comm,
            )
        else:
            model = ShardingStage3(
                model,
                optimizer=optimizer,
                group=group,
                sync_buffers=sync_buffers,
                segment_size=segment_size,
                offload=offload,
                sync_comm=sync_comm,
            )
    else:
        raise ValueError("Please enter the correct level.")

    if isinstance(scaler, paddle.amp.GradScaler):

@@ -184,7 +216,7 @@ def save_group_sharded_model(model, output, optimizer=None):
        model (Layer): A wrapper for group sharded given model.
        output (str): Save directory.
        optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved.

    Examples:
        .. code-block:: python

@@ -219,7 +251,8 @@ def save_group_sharded_model(model, output, optimizer=None):
            save_group_sharded_model(model, optimizer, output=output_dir)
    """
    logger_.info(
        "==========Begin to save group sharded model and optimizer=========="
    )
    assert not os.path.isfile(
        output
    ), "Saving directory ({}) should be a directory, not a file".format(output)

@@ -243,4 +276,5 @@ def save_group_sharded_model(model, output, optimizer=None):
        output_opt = os.path.join(output, "model.pdopt")
        paddle.save(optimizer._optim.state_dict(), output_opt)
    logger_.info(
        "==========End to save group sharded model and optimizer=========="
    )
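For reference (not part of the diff), a sketch of the `group_sharded_parallel` / `save_group_sharded_model` flow described by the docstrings above. It must run as a distributed job with at least two processes (e.g. under `python -m paddle.distributed.launch`); the model, the 'os_g' level, and the checkpoint path are illustrative choices.

import paddle
from paddle.distributed import init_parallel_env
from paddle.distributed.sharding import (
    group_sharded_parallel,
    save_group_sharded_model,
)

init_parallel_env()

model = paddle.nn.Linear(1000, 1000)
optimizer = paddle.optimizer.AdamW(
    learning_rate=0.001, parameters=model.parameters()
)
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

# 'os' shards optimizer state, 'os_g' additionally shards gradients,
# 'p_g_os' also shards the parameters themselves.
model, optimizer, scaler = group_sharded_parallel(
    model, optimizer, "os_g", scaler=scaler
)

img = paddle.randn([4, 1000], 'float32')
loss = paddle.mean(model(img))
loss.backward()
optimizer.step()
optimizer.clear_grad()

# Persist model.pdmodel / model.pdopt under the output directory.
save_group_sharded_model(model, output="./checkpoint", optimizer=optimizer)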
...@@ -28,35 +28,56 @@ import numpy as np ...@@ -28,35 +28,56 @@ import numpy as np
import paddle import paddle
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.data_feeder import (check_dtype, check_type, from paddle.fluid.data_feeder import (
check_variable_and_dtype, convert_dtype) check_dtype,
from paddle.fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph check_type,
from paddle.fluid.layers import (control_flow, elementwise_add, elementwise_div, check_variable_and_dtype,
elementwise_mul, elementwise_sub, nn, ops, convert_dtype,
tensor) )
from paddle.fluid.framework import (
_non_static_mode,
in_dygraph_mode,
_in_legacy_dygraph,
)
from paddle.fluid.layers import (
control_flow,
elementwise_add,
elementwise_div,
elementwise_mul,
elementwise_sub,
nn,
ops,
tensor,
)
from paddle.tensor import arange, concat, gather_nd, multinomial from paddle.tensor import arange, concat, gather_nd, multinomial
class Distribution(object): class Distribution(object):
""" """
The abstract base class for probability distributions. Functions are The abstract base class for probability distributions. Functions are
implemented in specific distributions. implemented in specific distributions.
Args: Args:
batch_shape(Sequence[int], optional): independent, not identically batch_shape(Sequence[int], optional): independent, not identically
distributed draws, aka a "collection" or "bunch" of distributions. distributed draws, aka a "collection" or "bunch" of distributions.
event_shape(Sequence[int], optional): the shape of a single event_shape(Sequence[int], optional): the shape of a single
draw from the distribution; it may be dependent across dimensions. draw from the distribution; it may be dependent across dimensions.
For scalar distributions, the event shape is []. For n-dimension For scalar distributions, the event shape is []. For n-dimension
multivariate distribution, the event shape is [n]. multivariate distribution, the event shape is [n].
""" """
def __init__(self, batch_shape=(), event_shape=()): def __init__(self, batch_shape=(), event_shape=()):
self._batch_shape = batch_shape if isinstance( self._batch_shape = (
            batch_shape
            if isinstance(batch_shape, tuple)
            else tuple(batch_shape)
        )
        self._event_shape = (
            event_shape
            if isinstance(event_shape, tuple)
            else tuple(event_shape)
        )

        super(Distribution, self).__init__()

@@ -118,16 +139,16 @@ class Distribution(object):
    def probs(self, value):
        """Probability density/mass function.

        .. note::
            This method will be deprecated in the future, please use `prob`
            instead.
        """
        raise NotImplementedError

    def _extend_shape(self, sample_shape):
        """compute shape of the sample

        Args:
            sample_shape (Tensor): sample shape

@@ -155,7 +176,8 @@ class Distribution(object):
        if is_variable and is_number:
            raise ValueError(
                'if one argument is Tensor, all arguments should be Tensor'
            )

        return is_variable

@@ -170,15 +192,17 @@ class Distribution(object):
        """
        numpy_args = []
        variable_args = []
        tmp = 0.0

        for arg in args:
            if isinstance(arg, float):
                arg = [arg]
            if not isinstance(arg, (list, tuple, np.ndarray, tensor.Variable)):
                raise TypeError(
                    "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}".format(
                        type(arg)
                    )
                )

            arg_np = np.array(arg)
            arg_dtype = arg_np.dtype

@@ -216,20 +240,24 @@ class Distribution(object):
            value (Tensor): Change value's dtype if value's dtype is different from param.
        """
        if _non_static_mode():
            if value.dtype != param.dtype and convert_dtype(value.dtype) in [
                'float32',
                'float64',
            ]:
                warnings.warn(
                    "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."
                )
                if in_dygraph_mode():
                    return _C_ops.cast(value, param.dtype)
                if _in_legacy_dygraph():
                    return _legacy_C_ops.cast(
                        value, 'in_dtype', value.dtype, 'out_dtype', param.dtype
                    )
            return value

        check_variable_and_dtype(
            value, 'value', ['float32', 'float64'], 'log_prob'
        )
        if value.dtype != param.dtype:
            warnings.warn(
                "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."

@@ -239,19 +267,25 @@ class Distribution(object):
    def _probs_to_logits(self, probs, is_binary=False):
        r"""
        Converts probabilities into logits. For the binary, probs denotes the
        probability of occurrence of the event indexed by `1`. For the
        multi-dimensional, values of last axis denote the probabilities of
        occurrence of each of the events.
        """
        return (
            (paddle.log(probs) - paddle.log1p(-probs))
            if is_binary
            else paddle.log(probs)
        )

    def _logits_to_probs(self, logits, is_binary=False):
        r"""
        Converts logits into probabilities. For the binary, each value denotes
        log odds, whereas for the multi-dimensional case, the values along the
        last dimension denote the log probabilities of the events.
        """
        return (
            paddle.nn.functional.sigmoid(logits)
            if is_binary
            else paddle.nn.functional.softmax(logits, axis=-1)
        )
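A minimal sketch (not part of this diff) of the probs/logits round trip these two helpers perform, written with the public paddle ops they wrap:

.. code-block:: python

    import paddle

    p = paddle.to_tensor([0.1, 0.7, 0.2])

    # binary case: logits are log-odds, and sigmoid inverts them
    logits_binary = paddle.log(p) - paddle.log1p(-p)
    print(paddle.nn.functional.sigmoid(logits_binary))  # ~[0.1, 0.7, 0.2]

    # multi-class case: log-probabilities, and softmax inverts them
    logits_multi = paddle.log(p)
    print(paddle.nn.functional.softmax(logits_multi, axis=-1))  # ~[0.1, 0.7, 0.2]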
@@ -35,7 +35,7 @@ def kl_divergence(p, q):
    .. math::

        KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x

    Args:
        p (Distribution): ``Distribution`` object.

@@ -64,11 +64,11 @@ def kl_divergence(p, q):
def register_kl(cls_p, cls_q):
    """Decorator for registering a KL divergence implementation function.

    The ``kl_divergence(p, q)`` function will search the concrete implementation
    functions registered by ``register_kl``, according to the multi-dispatch pattern.
    If an implementation function is found, it will return the result; otherwise,
    it will raise a ``NotImplementedError`` exception. Users can register an
    implementation function by the decorator.

    Args:
        cls_p(Distribution): Subclass derived from ``Distribution``.

@@ -83,8 +83,9 @@ def register_kl(cls_p, cls_q):
            def kl_beta_beta():
                pass # insert implementation here
    """
    if not issubclass(cls_p, Distribution) or not issubclass(
        cls_q, Distribution
    ):
        raise TypeError('cls_p and cls_q must be subclass of Distribution')

    def decorator(f):

@@ -98,8 +99,11 @@ def _dispatch(cls_p, cls_q):
    """Multiple dispatch into a concrete implementation function"""
    # find all matched super class pairs of p and q
    matchs = [
        (super_p, super_q)
        for super_p, super_q in _REGISTER_TABLE
        if issubclass(cls_p, super_p) and issubclass(cls_q, super_q)
    ]
    if not matchs:
        raise NotImplementedError

@@ -108,16 +112,20 @@ def _dispatch(cls_p, cls_q):
    if _REGISTER_TABLE[left_p, left_q] is not _REGISTER_TABLE[right_p, right_q]:
        warnings.warn(
            'Ambiguous kl_divergence({}, {}). Please register_kl({}, {})'.format(
                cls_p.__name__,
                cls_q.__name__,
                left_p.__name__,
                right_q.__name__,
            ),
            RuntimeWarning,
        )

    return _REGISTER_TABLE[left_p, left_q]


@functools.total_ordering
class _Compare(object):
    def __init__(self, *classes):
        self.classes = classes

@@ -135,22 +143,33 @@ class _Compare(object):
@register_kl(Beta, Beta)
def _kl_beta_beta(p, q):
    return (
        (q.alpha.lgamma() + q.beta.lgamma() + (p.alpha + p.beta).lgamma())
        - (p.alpha.lgamma() + p.beta.lgamma() + (q.alpha + q.beta).lgamma())
        + ((p.alpha - q.alpha) * p.alpha.digamma())
        + ((p.beta - q.beta) * p.beta.digamma())
        + (
            ((q.alpha + q.beta) - (p.alpha + p.beta))
            * (p.alpha + p.beta).digamma()
        )
    )


@register_kl(Dirichlet, Dirichlet)
def _kl_dirichlet_dirichlet(p, q):
    return (
        (p.concentration.sum(-1).lgamma() - q.concentration.sum(-1).lgamma())
        - ((p.concentration.lgamma() - q.concentration.lgamma()).sum(-1))
        + (
            (
                (p.concentration - q.concentration)
                * (
                    p.concentration.digamma()
                    - p.concentration.sum(-1).digamma().unsqueeze(-1)
                )
            ).sum(-1)
        )
    )
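For context, a small usage sketch (not part of this diff) showing how ``kl_divergence`` reaches a rule registered above, here the Beta/Beta implementation:

.. code-block:: python

    import paddle
    from paddle.distribution import Beta, kl_divergence

    p = Beta(alpha=0.5, beta=0.5)
    q = Beta(alpha=0.3, beta=0.7)
    # dispatched through _dispatch to _kl_beta_beta
    print(kl_divergence(p, q))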
@register_kl(Categorical, Categorical)

@@ -170,8 +189,7 @@ def _kl_uniform_uniform(p, q):
@register_kl(ExponentialFamily, ExponentialFamily)
def _kl_expfamily_expfamily(p, q):
    """Compute kl-divergence using `Bregman divergences <https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf>`_"""
    if not type(p) == type(q):
        raise NotImplementedError

@@ -187,19 +205,22 @@ def _kl_expfamily_expfamily(p, q):
    try:
        if _non_static_mode():
            p_grads = paddle.grad(
                p_log_norm, p_natural_params, create_graph=True
            )
        else:
            p_grads = paddle.static.gradients(p_log_norm, p_natural_params)
    except RuntimeError as e:
        raise TypeError(
            "Can't compute kl_divergence({cls_p}, {cls_q}) using Bregman divergence. Please register_kl({cls_p}, {cls_q}).".format(
                cls_p=type(p).__name__, cls_q=type(q).__name__
            )
        ) from e

    kl = q._log_normalizer(*q_natural_params) - p_log_norm
    for p_param, q_param, p_grad in zip(
        p_natural_params, q_natural_params, p_grads
    ):
        term = (q_param - p_param) * p_grad
        kl -= _sum_rightmost(term, len(q.event_shape))
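For reference, writing :math:`A` for the log-normalizer and :math:`\eta_p`, :math:`\eta_q` for the natural parameters (symbols introduced here only to explain the code), the loop above accumulates the Bregman divergence of :math:`A`:

.. math::

    KL(p||q) = A(\eta_q) - A(\eta_p) - \sum_i (\eta_{q,i} - \eta_{p,i}) \frac{\partial A}{\partial \eta_i}\Big|_{\eta=\eta_p}

which is exactly ``q._log_normalizer(*q_natural_params) - p_log_norm`` minus the summed ``(q_param - p_param) * p_grad`` terms.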
...
@@ -19,12 +19,23 @@ import numpy as np
from paddle import _C_ops, _legacy_C_ops
from paddle.distribution import distribution
from paddle.fluid import core
from paddle.fluid.data_feeder import (
    check_dtype,
    check_type,
    check_variable_and_dtype,
    convert_dtype,
)
from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
from paddle.fluid.layers import (
    control_flow,
    elementwise_add,
    elementwise_div,
    elementwise_mul,
    elementwise_sub,
    nn,
    ops,
    tensor,
)


class Normal(distribution.Distribution):

@@ -55,7 +66,7 @@ class Normal(distribution.Distribution):
    Examples:
        .. code-block:: python

            import paddle
            from paddle.distribution import Normal

@@ -90,12 +101,18 @@ class Normal(distribution.Distribution):
    def __init__(self, loc, scale, name=None):
        if not _non_static_mode():
            check_type(
                loc,
                'loc',
                (int, float, np.ndarray, tensor.Variable, list, tuple),
                'Normal',
            )
            check_type(
                scale,
                'scale',
                (int, float, np.ndarray, tensor.Variable, list, tuple),
                'Normal',
            )

        self.batch_size_unknown = False
        self.all_arg_is_float = False

@@ -115,11 +132,15 @@ class Normal(distribution.Distribution):
        else:
            if isinstance(loc, float) and isinstance(scale, float):
                self.all_arg_is_float = True
            if isinstance(loc, np.ndarray) and str(loc.dtype) in [
                'float32',
                'float64',
            ]:
                self.dtype = loc.dtype
            elif isinstance(scale, np.ndarray) and str(scale.dtype) in [
                'float32',
                'float64',
            ]:
                self.dtype = scale.dtype
            # pylint: disable=unbalanced-tuple-unpacking
            self.loc, self.scale = self._to_tensor(loc, scale)

@@ -149,21 +170,21 @@ class Normal(distribution.Distribution):
        if self.batch_size_unknown:
            output_shape = shape + batch_shape
            zero_tmp = tensor.fill_constant_batch_size_like(
                self.loc + self.scale, batch_shape + shape, self.dtype, 0.0
            )
            zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
            zero_tmp_shape = nn.shape(zero_tmp_reshape)
            normal_random_tmp = nn.gaussian_random(
                zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
            )
            output = normal_random_tmp * (zero_tmp_reshape + self.scale)
            output = elementwise_add(output, self.loc, name=name)
            return output
        else:
            output_shape = shape + batch_shape
            output = nn.gaussian_random(
                output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
            ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale)
            output = elementwise_add(output, self.loc, name=name)
            if self.all_arg_is_float:
                return nn.reshape(output, shape, name=name)

@@ -189,13 +210,14 @@ class Normal(distribution.Distribution):
        """
        name = self.name + '_entropy'
        batch_shape = list((self.loc + self.scale).shape)
        zero_tmp = tensor.fill_constant_batch_size_like(
            self.loc + self.scale, batch_shape, self.dtype, 0.0
        )
        return elementwise_add(
            0.5 + zero_tmp,
            0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
            name=name,
        )

    def log_prob(self, value):
        """Log probability density/mass function.

@@ -212,10 +234,11 @@ class Normal(distribution.Distribution):
        var = self.scale * self.scale
        log_scale = nn.log(self.scale)
        return elementwise_sub(
            -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var),
            log_scale + math.log(math.sqrt(2.0 * math.pi)),
            name=name,
        )

    def probs(self, value):
        """Probability density/mass function.

@@ -231,10 +254,13 @@ class Normal(distribution.Distribution):
        value = self._check_values_dtype_in_probs(self.loc, value)
        var = self.scale * self.scale
        return elementwise_div(
            ops.exp(
                -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var)
            ),
            (math.sqrt(2 * math.pi) * self.scale),
            name=name,
        )

    def kl_divergence(self, other):
        r"""The KL-divergence between two normal distributions.

@@ -248,7 +274,7 @@ class Normal(distribution.Distribution):
        .. math::

            ratio = \\frac{\sigma_0}{\sigma_1}

        .. math::

            diff = \mu_1 - \mu_0
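Combining these two quantities (a restatement added for readability, not a line from the diff), the value computed by the method body in the next hunk is

.. math::

    KL(p || q) = 0.5\,ratio^2 + 0.5\left(\frac{diff^2}{\sigma_1^2} - 1 - 2\ln ratio\right)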
@@ -274,9 +300,9 @@ class Normal(distribution.Distribution):
        name = self.name + '_kl_divergence'
        var_ratio = self.scale / other.scale
        var_ratio = var_ratio * var_ratio
        t1 = (self.loc - other.loc) / other.scale
        t1 = t1 * t1
        return elementwise_add(
            0.5 * var_ratio, 0.5 * (t1 - 1.0 - nn.log(var_ratio)), name=name
        )
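A minimal dygraph usage sketch of the ``Normal`` methods touched above (an illustration, not code from this PR):

.. code-block:: python

    import paddle
    from paddle.distribution import Normal

    p = Normal(loc=0.0, scale=1.0)
    q = Normal(loc=1.0, scale=2.0)

    print(p.sample([3]))                        # three draws from N(0, 1)
    print(p.log_prob(paddle.to_tensor([0.5])))  # log pdf at 0.5
    print(p.entropy())                          # 0.5 * log(2 * pi * e * sigma^2)
    print(p.kl_divergence(q))                   # closed-form KL from the hunk above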
@@ -19,12 +19,27 @@ import numpy as np
from paddle import _C_ops, _legacy_C_ops
from paddle.distribution import distribution
from paddle.fluid import core
from paddle.fluid.data_feeder import (
    check_dtype,
    check_type,
    check_variable_and_dtype,
    convert_dtype,
)
from paddle.fluid.framework import (
    _non_static_mode,
    in_dygraph_mode,
    _in_legacy_dygraph,
)
from paddle.fluid.layers import (
    control_flow,
    elementwise_add,
    elementwise_div,
    elementwise_mul,
    elementwise_sub,
    nn,
    ops,
    tensor,
)
from paddle.tensor import arange, concat, gather_nd, multinomial

@@ -91,12 +106,18 @@ class Uniform(distribution.Distribution):
    def __init__(self, low, high, name=None):
        if not _non_static_mode():
            check_type(
                low,
                'low',
                (int, float, np.ndarray, tensor.Variable, list, tuple),
                'Uniform',
            )
            check_type(
                high,
                'high',
                (int, float, np.ndarray, tensor.Variable, list, tuple),
                'Uniform',
            )

        self.all_arg_is_float = False
        self.batch_size_unknown = False

@@ -116,11 +137,15 @@ class Uniform(distribution.Distribution):
        else:
            if isinstance(low, float) and isinstance(high, float):
                self.all_arg_is_float = True
            if isinstance(low, np.ndarray) and str(low.dtype) in [
                'float32',
                'float64',
            ]:
                self.dtype = low.dtype
            elif isinstance(high, np.ndarray) and str(high.dtype) in [
                'float32',
                'float64',
            ]:
                self.dtype = high.dtype
            # pylint: disable=unbalanced-tuple-unpacking
            self.low, self.high = self._to_tensor(low, high)

@@ -148,27 +173,33 @@ class Uniform(distribution.Distribution):
        if self.batch_size_unknown:
            output_shape = shape + batch_shape
            zero_tmp = tensor.fill_constant_batch_size_like(
                self.low + self.high, batch_shape + shape, self.dtype, 0.0
            )
            uniform_random_tmp = nn.uniform_random_batch_size_like(
                zero_tmp,
                zero_tmp.shape,
                dtype=self.dtype,
                min=0.0,
                max=1.0,
                seed=seed,
            )
            zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
            uniform_random_tmp_reshape = nn.reshape(
                uniform_random_tmp, output_shape
            )
            output = uniform_random_tmp_reshape * (
                zero_tmp_reshape + self.high - self.low
            )
            output = elementwise_add(output, self.low, name=name)
            return output
        else:
            output_shape = shape + batch_shape
            output = nn.uniform_random(
                output_shape, dtype=self.dtype, min=0.0, max=1.0, seed=seed
            ) * (
                tensor.zeros(output_shape, dtype=self.dtype)
                + (self.high - self.low)
            )
            output = elementwise_add(output, self.low, name=name)
            if self.all_arg_is_float:
                return nn.reshape(output, shape, name=name)

@@ -197,10 +228,12 @@ class Uniform(distribution.Distribution):
                return nn.log(lb * ub) - nn.log(self.high - self.low)

            if _in_legacy_dygraph():
                lb = _legacy_C_ops.cast(
                    lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', value.dtype
                )
                ub = _legacy_C_ops.cast(
                    ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', value.dtype
                )
                return nn.log(lb * ub) - nn.log(self.high - self.low)

        name = self.name + '_log_prob'

@@ -208,9 +241,9 @@ class Uniform(distribution.Distribution):
        ub_bool = value < self.high
        lb = tensor.cast(lb_bool, dtype=value.dtype)
        ub = tensor.cast(ub_bool, dtype=value.dtype)
        return elementwise_sub(
            nn.log(lb * ub), nn.log(self.high - self.low), name=name
        )

    def probs(self, value):
        """Probability density/mass function.

@@ -233,10 +266,12 @@ class Uniform(distribution.Distribution):
                return (lb * ub) / (self.high - self.low)

            if _in_legacy_dygraph():
                lb = _legacy_C_ops.cast(
                    lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', value.dtype
                )
                ub = _legacy_C_ops.cast(
                    ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', value.dtype
                )
                return (lb * ub) / (self.high - self.low)

        name = self.name + '_probs'
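Similarly, a short usage sketch for ``Uniform`` (not part of this diff; dynamic graph mode assumed):

.. code-block:: python

    import paddle
    from paddle.distribution import Uniform

    u = Uniform(low=0.0, high=2.0)
    value = paddle.to_tensor([0.5])

    print(u.sample([3]))      # three draws from U(0, 2)
    print(u.log_prob(value))  # log(1/2) inside the support
    print(u.probs(value))     # 1/2 inside the support
    print(u.entropy())        # log(high - low) = log(2)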
...

(This diff has been collapsed.)
@@ -46,11 +46,16 @@ def set_default_dtype(d):
        else:
            raise TypeError(
                "set_default_dtype only supports [float16, float32, float64] "
                ", but received %s" % d.__name__
            )
    else:
        if d in [
            'float16',
            'float32',
            'float64',
            u'float16',
            u'float32',
            u'float64',
        ]:
            # this code is a little bit dangerous, since error could happen
            # when casting no-ascii code to str in python2.

@@ -61,7 +66,8 @@ def set_default_dtype(d):
        else:
            raise TypeError(
                "set_default_dtype only supports [float16, float32, float64] "
                ", but received %s" % str(d)
            )

    LayerHelperBase.set_default_dtype(d)
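A small sketch (not from this diff) of the two accepted argument forms handled above:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.set_default_dtype('float64')     # string form
    print(paddle.get_default_dtype())       # 'float64'
    print(paddle.ones([2]).dtype)           # paddle.float64

    paddle.set_default_dtype(np.float32)    # numpy dtype class form
    print(paddle.get_default_dtype())       # 'float32'

    # paddle.set_default_dtype('int32')     # would raise the TypeError above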
@@ -94,7 +100,7 @@ def set_grad_enabled(mode):
    Examples:
        .. code-block:: python

            import paddle
            x = paddle.ones([3, 2])
            x.stop_gradient = False

@@ -127,9 +133,9 @@ def is_grad_enabled():
    Examples:
        .. code-block:: python

            import paddle

            # Dygraph gradient calculation mode is enabled by default.
            paddle.is_grad_enabled() # True
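A brief sketch combining the two APIs documented above (dynamic graph mode assumed; ``set_grad_enabled`` used as a context manager as in its full docstring):

.. code-block:: python

    import paddle

    print(paddle.is_grad_enabled())        # True: autograd is on by default

    with paddle.set_grad_enabled(False):   # temporarily disable gradient tracking
        print(paddle.is_grad_enabled())    # False inside the block

    print(paddle.is_grad_enabled())        # True again afterwards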
...

(This diff has been collapsed.)
@@ -28,8 +28,8 @@ def forward_grad(outputs, inputs, grad_inputs=None):
    Args:
        outputs(Tensor|Sequence[Tensor]): The output tensor or tensors.
        inputs(Tensor|Sequence[Tensor]): The input tensor or tensors.
        grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or
            Tensors of inputs which have the same shape as inputs. Defaults to
            None, in which case it is equivalent to all ones.

    Returns:

@@ -50,7 +50,7 @@ def forward_grad(outputs, inputs, grad_inputs=None):
            with paddle.static.program_guard(main_program, startup_program):
                x = paddle.static.data('x', shape=[1], dtype='float32')
                y = x * x
                y_grad = paddle.incubate.autograd.forward_grad(y, x)
                paddle.incubate.autograd.prim2orig()

@@ -64,25 +64,35 @@ def forward_grad(outputs, inputs, grad_inputs=None):
            paddle.disable_static()
    """
    if not utils.prim_enabled():
        raise RuntimeError(
            'forward_grad must be running on primitive '
            'operators, use enable_prim to turn it on.'
        )

    if not isinstance(outputs, (framework.Variable, typing.Sequence)):
        raise TypeError(
            f'Expected outputs is Tensor|Sequence[Tensor], '
            f'but got {type(outputs)}.'
        )

    if not isinstance(inputs, (framework.Variable, typing.Sequence)):
        raise TypeError(
            f'Expected inputs is Tensor|Sequence[Tensor], '
            f'but got {type(inputs)}.'
        )

    ys, xs, xs_dot = (
        utils.as_tensors(outputs),
        utils.as_tensors(inputs),
        utils.as_tensors(grad_inputs),
    )
    block = framework.default_main_program().current_block()
    if any(x.block != block for x in xs + ys):
        raise RuntimeError(
            'Variable in inputs and targets should exist in current block of '
            'main program.'
        )

    primx.orig2prim(block)
    ad = primx.Transform(ys[0].block)

@@ -101,12 +111,12 @@ def grad(outputs, inputs, grad_outputs=None):
    Args:
        outputs(Tensor|Sequence[Tensor]): The output Tensor or Tensors.
        inputs(Tensor|Sequence[Tensor]): The input Tensor or Tensors.
        grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or
            Tensors of outputs which have the same shape as outputs. Defaults
            to None, in which case it is equivalent to all ones.

    Returns:
        grad_inputs(Tensor|Tensors): The gradients for inputs.

    Examples:

@@ -123,7 +133,7 @@ def grad(outputs, inputs, grad_outputs=None):
            with paddle.static.program_guard(main_program, startup_program):
                x = paddle.static.data('x', shape=[1], dtype='float32')
                x.stop_gradient = False
                y = x * x
                x_grad = paddle.incubate.autograd.grad(y, x)
                paddle.incubate.autograd.prim2orig()

@@ -132,7 +142,7 @@ def grad(outputs, inputs, grad_outputs=None):
            x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad])
            print(x_grad)
            # [array([4.], dtype=float32)]

            paddle.incubate.autograd.disable_prim()
            paddle.disable_static()
    """

@@ -141,22 +151,32 @@ def grad(outputs, inputs, grad_outputs=None):
        # backward.gradients returns a list even when the inputs is a single Tensor.
        # The following code snippet fixes the problem by returning the first element
        # of grad_inputs when the inputs is a single Tensor.
        if (
            isinstance(inputs, framework.Variable)
            and isinstance(grad_inputs, typing.Sequence)
            and len(grad_inputs) > 0
        ):
            return grad_inputs[0]
        else:
            return grad_inputs

    if not isinstance(outputs, (framework.Variable, typing.Sequence)):
        raise TypeError(
            f'Expected outputs is Tensor|Sequence[Tensor], '
            f'but got {type(outputs)}.'
        )

    if not isinstance(inputs, (framework.Variable, typing.Sequence)):
        raise TypeError(
            f'Expected inputs is Tensor|Sequence[Tensor], '
            f'but got {type(inputs)}.'
        )

    ys, xs, ys_bar = (
        utils.as_tensors(outputs),
        utils.as_tensors(inputs),
        utils.as_tensors(grad_outputs),
    )
    block = framework.default_main_program().current_block()
    if any((x is not None and x.block != block) for x in xs + ys):
        raise RuntimeError(
...
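A hedged sketch (not from this PR) of the non-primitive fallback path described by the comments above: without ``enable_prim``, ``grad`` falls back to ``backward.gradients`` and returns a single Tensor, not a one-element list, when ``inputs`` is a single Tensor.

.. code-block:: python

    import numpy as np
    import paddle

    paddle.enable_static()
    main_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        x = paddle.static.data('x', shape=[1], dtype='float32')
        x.stop_gradient = False
        y = x * x
        x_grad = paddle.incubate.autograd.grad(y, x)  # a Tensor, not [Tensor]

    exe = paddle.static.Executor()
    exe.run(startup_program)
    (result,) = exe.run(
        main_program,
        feed={'x': np.array([3.0], dtype='float32')},
        fetch_list=[x_grad],
    )
    print(result)  # expected: [6.]
    paddle.disable_static()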
(The remaining file diffs have been collapsed.)