Unverified commit cfee9c13, authored by Ligoml, committed by GitHub

[cherry-pick2.4]for CodeStyle (#47608)

* only run pre-commit

* only run pre-commit
Parent 99c872fa
@@ -54,16 +54,16 @@ class LegacyPyLayerContext(object):

    def save_for_backward(self, *tensors):
        """
        Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.

        .. note::
            This API should be called at most once, and only inside `forward`.

        Args:
            tensors(list of Tensors): Tensors to be stored.

        Returns:
            None

        Examples:
            .. code-block:: python

@@ -94,7 +94,7 @@ class LegacyPyLayerContext(object):
        Get the tensors stored by ``save_for_backward``.

        Returns:
            list of Tensors or None: If context contains tensors stored by `save_for_backward`,
            then return these tensors, otherwise return None.

        Examples:

@@ -124,9 +124,7 @@ class LegacyPyLayerContext(object):
def with_mateclass(meta, *bases):
    class impl(meta):
        def __new__(cls, name, temp_bases, attrs):
            return meta(name, bases, attrs)

@@ -134,7 +132,6 @@ def with_mateclass(meta, *bases):
class CPyLayer(object):
    @classmethod
    @dygraph_only
    def apply(cls, *args, **kwargs):

@@ -147,7 +144,7 @@ class CPyLayer(object):
        Returns:
            tensors or other types : output of PyLayer.

        Examples:
            .. code-block:: python

@@ -182,12 +179,14 @@ class CPyLayer(object):
class PyLayerBackward(LegacyPyLayerContext):
    def backward(self, *args, **kwargs):
        with paddle.fluid.dygraph.guard():
            with paddle.fluid.dygraph.no_grad():
                if (
                    self._amp_state
                    and 'enable' in self._amp_state
                    and self._amp_state['enable']
                ):
                    with auto_cast(**args[0]._amp_state):
                        return self._forward_cls.backward(*args, **kwargs)
                else:

@@ -197,10 +196,10 @@ class PyLayerBackward(LegacyPyLayerContext):
class LayerMeta(type):
    def __init__(cls, name, bases, attrs):
        cls._backward_function = type(
            name + '_backward', (PyLayerBackward,), {"_forward_cls": cls}
        )

        return super(LayerMeta, cls).__init__(name, bases, attrs)

@@ -210,15 +209,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
    Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules:
    1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod.
    Their first argument should be a context and `None` can not be included in the returned result.
    2. Input of backward contains a context as the first argument, and the rest arguments are the
    gradient of forward's output tensors. So the number of backward's input tensors equals
    the number of forward output tensors. If you need the forward's inputs or outputs in `backward`,
    you can use `save_for_backward` to store the required tensors, and then use them in the backward.
    3. Output of backward function can only be `Tensor` or tuple/list of `Tensor`.
    Output tensors of backward are the gradient of forward's input tensors,
    so the number of backward's output tensors equals the number of forward input tensors.

    After building the custom Layer, run it through the `apply` method.

    Examples:
        .. code-block:: python

@@ -259,8 +258,8 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
    @staticmethod
    def forward(ctx, *args, **kwargs):
        """
        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
        the first argument, followed by any number of arguments (tensors or other types).
        `None` can not be included in the returned result.

        Args:

@@ -269,7 +268,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
        Returns:
            tensors or other types : output of PyLayer.

        Examples:
            .. code-block:: python

@@ -292,14 +291,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
                    return grad
        """
        raise NotImplementedError(
            "You must implement the forward function for PyLayer."
        )

    @staticmethod
    def backward(ctx, *args, **kwargs):
        """
        This is a function to calculate the gradient. It is to be overloaded by subclasses.
        It must accept an object of `PyLayerContext` as the first argument, and the rest
        arguments are the gradient of forward's output tensors. Output tensors of backward
        are the gradient of forward's input tensors.

        Args:

@@ -308,7 +308,7 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
        Returns:
            Tensor or list of Tensors: The gradient of forward's input tensor(s).

        Examples:
            .. code-block:: python

@@ -332,24 +332,24 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
        """
        raise NotImplementedError(
            "You must implement the backward function for PyLayer."
        )


class EagerPyLayerContext(object):
    def save_for_backward(self, *tensors):
        """
        Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors.

        .. note::
            This API should be called at most once, and only inside `forward`.

        Args:
            tensors(list of Tensors): Tensors to be stored.

        Returns:
            None

        Examples:
            .. code-block:: python

@@ -380,7 +380,7 @@ class EagerPyLayerContext(object):
        Get the tensors stored by ``save_for_backward``.

        Returns:
            list of Tensors or None: If context contains tensors stored by `save_for_backward`,
            then return these tensors, otherwise return None.

        Examples:

@@ -410,11 +410,11 @@ class EagerPyLayerContext(object):
    def mark_not_inplace(self, *args):
        """
        Marks inputs as not inplace.
        This should be called at most once, only from inside the `forward` method,
        and all arguments should be Tensor inputs.

        If the Tensor returned by `forward` method is the same as the Tensor input of forward,
        and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output.
        Thereby preventing the auto grad information of the input Tensor from being overwritten.

        Examples:

@@ -427,7 +427,7 @@ class EagerPyLayerContext(object):
                def forward(ctx, x):
                    ctx.mark_not_inplace(x)
                    return x

                @staticmethod
                def backward(ctx, grad_output):
                    out = grad_output.exp()

@@ -438,7 +438,7 @@ class EagerPyLayerContext(object):
            attn_layers = []
            for idx in range(0, 2):
                attn_layers.append(Exp())

            for step in range(0, 2):
                a = x
                for j in range(0,2):

@@ -450,7 +450,7 @@ class EagerPyLayerContext(object):
    def mark_non_differentiable(self, *args):
        """
        Marks outputs as non-differentiable.
        This should be called at most once, only from inside the `forward` method,
        and all arguments should be tensor outputs.

        This will mark outputs as not requiring gradients, increasing the

@@ -542,30 +542,27 @@ class EagerPyLayerContext(object):
class EagerPyLayerBackward(core.eager.PyLayer, EagerPyLayerContext):
    def backward(self, *args):
        return self._forward_cls.backward(self, *args)


class EagerPyLayerMeta(type):
    def __init__(cls, name, bases, attrs):
        cls._backward_function = type(
            name + '_backward', (EagerPyLayerBackward,), {"_forward_cls": cls}
        )

        return super(EagerPyLayerMeta, cls).__init__(name, bases, attrs)


class EagerPyLayer(
    with_mateclass(EagerPyLayerMeta, core.eager.PyLayer, EagerPyLayerContext)
):
    @staticmethod
    def forward(ctx, *args, **kwargs):
        """
        It is to be overloaded by subclasses. It must accept an object of `PyLayerContext` as
        the first argument, followed by any number of arguments (tensors or other types).
        `None` can not be included in the returned result.

        Args:

@@ -574,7 +571,7 @@ class EagerPyLayer(
        Returns:
            tensors or other types : output of PyLayer.

        Examples:
            .. code-block:: python

@@ -597,14 +594,15 @@ class EagerPyLayer(
                    return grad
        """
        raise NotImplementedError(
            "You must implement the forward function for PyLayer."
        )

    @staticmethod
    def backward(ctx, *args):
        """
        This is a function to calculate the gradient. It is to be overloaded by subclasses.
        It must accept an object of `PyLayerContext` as the first argument, and the rest
        arguments are the gradient of forward's output tensors. Output tensors of backward
        are the gradient of forward's input tensors.

        Args:

@@ -613,7 +611,7 @@ class EagerPyLayer(
        Returns:
            Tensor or list of Tensors: The gradient of forward's input tensor(s).

        Examples:
            .. code-block:: python

@@ -637,11 +635,11 @@ class EagerPyLayer(
        """
        raise NotImplementedError(
            "You must implement the backward function for PyLayer."
        )


def once_differentiable(backward):
    def wrapper(ctx, *args):
        with paddle.fluid.dygraph.no_grad():
            outputs = backward(ctx, *args)
......
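For reference (not part of the diff), below is a minimal sketch of the subclassing pattern the PyLayer docstrings above describe. It assumes the public `paddle.autograd.PyLayer` entry point and dygraph mode; the `CustomTanh` class, tensor shapes, and values are illustrative only.

import paddle
from paddle.autograd import PyLayer


class CustomTanh(PyLayer):
    @staticmethod
    def forward(ctx, x):
        y = paddle.tanh(x)
        # Stash tensors that backward will need; read them back with saved_tensor().
        ctx.save_for_backward(y)
        return y

    @staticmethod
    def backward(ctx, dy):
        # One gradient input per forward output, one gradient output per forward input.
        (y,) = ctx.saved_tensor()
        # d tanh(x) / dx = 1 - tanh(x)^2
        return dy * (1 - paddle.square(y))


x = paddle.randn([4], dtype='float32')
x.stop_gradient = False
out = CustomTanh.apply(x)   # run the custom layer through apply()
out.sum().backward()
print(x.grad)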
@@ -42,12 +42,12 @@ def current_stream(device=None):
    Return the current CUDA stream by the device.

    Parameters:
        device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from.
        If device is None, the device is the current device. Default: None.

    Returns:
        CUDAStream: the stream to the device.

    Examples:
        .. code-block:: python

@@ -82,7 +82,7 @@ def synchronize(device=None):
    Parameters:
        device(paddle.CUDAPlace()|int, optional): The device or the ID of the device.
        If device is None, the device is the current device. Default: None.

    Examples:
        .. code-block:: python

@@ -111,7 +111,7 @@ def synchronize(device=None):
def device_count():
    '''
    Return the number of GPUs available.

    Returns:
        int: the number of GPUs available.

@@ -124,8 +124,11 @@ def device_count():
    '''

    num_gpus = (
        core.get_cuda_device_count()
        if hasattr(core, 'get_cuda_device_count')
        else 0
    )

    return num_gpus

@@ -158,14 +161,14 @@ def extract_cuda_device_id(device, op_name):
    Return the id of the given cuda device. It is just a utility that will not be exposed to users.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'.
            Default: None.

    Return:
        int: The id of the given device. If device is None, return the id of current device.
    '''
    if device is None:
        return core.get_cuda_current_device_id()

    if isinstance(device, int):

@@ -178,15 +181,19 @@ def extract_cuda_device_id(device, op_name):
        else:
            raise ValueError(
                "The current string {} is not expected. Because {} only support string which is like 'gpu:x'. "
                "Please input appropriate string again!".format(device, op_name)
            )
    else:
        raise ValueError(
            "The device type {} is not expected. Because {} only support int, str or paddle.CUDAPlace. "
            "Please input appropriate device again!".format(device, op_name)
        )

    assert (
        device_id >= 0
    ), f"The device id must be not less than 0, but got id = {device_id}."
    assert (
        device_id < device_count()
    ), f"The device id {device_id} exceeds gpu card number {device_count()}"

    return device_id

@@ -197,12 +204,12 @@ def max_memory_allocated(device=None):
    Return the peak size of gpu memory that is allocated to tensor of the given device.

    .. note::
        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:

@@ -232,8 +239,8 @@ def max_memory_reserved(device=None):
    Return the peak size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:

@@ -263,12 +270,12 @@ def memory_allocated(device=None):
    Return the current size of gpu memory that is allocated to tensor of the given device.

    .. note::
        The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need.
        For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:

@@ -298,14 +305,14 @@ def memory_reserved(device=None):
    Return the current size of GPU memory that is held by the allocator of the given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x'. If device is None, the device is the current device.
            Default: None.

    Return:
        int: The current size of GPU memory that is held by the allocator of the given device, in bytes.

    Examples:
        .. code-block:: python

            # required: gpu

@@ -389,18 +396,18 @@ def get_device_properties(device=None):
    Return the properties of given device.

    Args:
        device(paddle.CUDAPlace or int or str): The device, the id of the device or
            the string name of device like 'gpu:x' which to get the properties of the
            device from. If device is None, the device is the current device.
            Default: None.

    Returns:
        _gpuDeviceProperties: The properties of the device which include ASCII string
        identifying device, major compute capability, minor compute capability, global
        memory available and the number of multiprocessors on the device.

    Examples:
        .. code-block:: python

            # required: gpu

@@ -424,7 +431,8 @@ def get_device_properties(device=None):
        raise ValueError(
            "The API paddle.device.cuda.get_device_properties is not supported in "
            "CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support "
            "to call this API."
        )

    if device is not None:
        if isinstance(device, int):

@@ -438,12 +446,14 @@ def get_device_properties(device=None):
            raise ValueError(
                "The current string {} is not expected. Because paddle.device."
                "cuda.get_device_properties only support string which is like 'gpu:x'. "
                "Please input appropriate string again!".format(device)
            )
        else:
            raise ValueError(
                "The device type {} is not expected. Because paddle.device.cuda."
                "get_device_properties only support int, str or paddle.CUDAPlace. "
                "Please input appropriate device again!".format(device)
            )
    else:
        device_id = -1

@@ -484,7 +494,7 @@ def get_device_capability(device=None):
    Return the major and minor revision numbers defining the device's compute capability which are got from CUDA function `cudaDeviceProp <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g1bf9d625a931d657e08db2b4391170f0>`_.

    Parameters:
        device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device.

    Returns:
        tuple(int,int): the major and minor revision numbers defining the device's compute capability.
......
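For reference (not part of the diff), a short sketch of how the device and memory helpers documented above are typically queried. It assumes a GPU build of Paddle with at least one visible device; the printed numbers are machine dependent.

import paddle
from paddle.device import cuda

if cuda.device_count() > 0:
    # A GPU build places new tensors on gpu:0 by default.
    x = paddle.zeros([1024, 1024], dtype='float32')
    print(cuda.memory_allocated('gpu:0'))       # current tensor memory, in bytes (256-byte aligned)
    print(cuda.max_memory_allocated('gpu:0'))   # peak tensor memory since the start of the program
    print(cuda.memory_reserved('gpu:0'))        # memory currently held by the allocator
    print(cuda.get_device_properties(0))        # name, compute capability, total memory, SM count
    print(cuda.get_device_capability(0))        # (major, minor) compute capability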
@@ -24,11 +24,11 @@ def wait_server_ready(endpoints):
    """
    Wait until parameter servers are ready, use connect_ex to detect
    port readiness.

    Args:
        endpoints (list|tuple): endpoints string list, like:
        ["127.0.0.1:8080", "127.0.0.1:8081"]

    Examples:
        .. code-block:: python

@@ -40,8 +40,9 @@ def wait_server_ready(endpoints):
        not_ready_endpoints = []
        for ep in endpoints:
            ip_port = ep.split(":")
            with closing(
                socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            ) as sock:
                sock.settimeout(2)
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                if hasattr(socket, 'SO_REUSEPORT'):

@@ -53,8 +54,9 @@ def wait_server_ready(endpoints):
                not_ready_endpoints.append(ep)
        if not all_ok:
            sys.stderr.write("server not ready, wait 3 sec to retry...\n")
            sys.stderr.write(
                "not ready endpoints:" + str(not_ready_endpoints) + "\n"
            )
            sys.stderr.flush()
            time.sleep(3)
        else:
......
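For reference (not part of the diff), a standalone sketch of the connect_ex probe that `wait_server_ready` is built around. The `port_is_ready` helper name is made up for illustration; the real function additionally retries every 3 seconds and reports the not-ready endpoints on stderr.

import socket
from contextlib import closing


def port_is_ready(endpoint, timeout=2):
    ip, port = endpoint.split(":")
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.settimeout(timeout)
        # connect_ex returns 0 on a successful connection, an errno otherwise.
        return sock.connect_ex((ip, int(port))) == 0


print(port_is_ready("127.0.0.1:8080"))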
@@ -30,7 +30,9 @@ from paddle.fluid.framework import _set_expected_place
from paddle.fluid.dygraph import parallel_helper
from paddle.distributed.fleet.launch_utils import check_backend
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.distributed.fleet.base.private_helper_function import (
    wait_server_ready,
)  # noqa: F401
from paddle.distributed import collective
from paddle.distributed.collective import _set_group_map
from paddle.distributed.collective import _set_group_map_by_name

@@ -63,6 +65,7 @@ def _get_global_parallel_env():
def _start_kv_server(port, http_server_d, size):
    from paddle.distributed.fleet.utils.http_server import KVServer

    http_server = KVServer(int(port), size=size)
    http_server.start()
    wait_seconds = 3

@@ -73,10 +76,15 @@ def _start_kv_server(port, http_server_d, size):
def _is_cpuonly(backend):
    check_backend(backend)
    if (
        backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl']
        and (
            core.is_compiled_with_cuda()
            or core.is_compiled_with_xpu()
            or core.is_compiled_with_npu()
            or core.is_compiled_with_mlu()
        )
    ) or backend is 'xccl':
        # passes 'auto' and can use cuda or xpu, use the default logics. so return False
        return False

@@ -87,9 +95,10 @@ def _is_cpuonly(backend):
def _check_var_exists(var_name):
    var = os.environ.get(var_name, None)
    if var is None:
        raise ValueError(
            "paddle.distributed initialize error, "
            "environment variable %s is needed, but not set." % var_name
        )


def init_parallel_env():

@@ -106,7 +115,7 @@ def init_parallel_env():
    Returns:
        None

    Examples:
        .. code-block:: python

            # required: gpu

@@ -120,7 +129,7 @@ def init_parallel_env():
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)

                def forward(self, x):
                    return self._linear2(self._linear1(x))

@@ -141,7 +150,7 @@ def init_parallel_env():
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)

                loss.backward()
                adam.step()

@@ -167,15 +176,21 @@ def init_parallel_env():
    backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
    is_cpu_only = _is_cpuonly(backend)
    # 1. gpu xpu check, must be gpu or xpu,
    if not (
        is_cpu_only
        or core.is_compiled_with_cuda()
        or core.is_compiled_with_xpu()
        or core.is_compiled_with_npu()
        or core.is_compiled_with_mlu()
    ):
        raise NotImplementedError(
            "If you want to use CPU-only version, please use 'gloo' as backend"
        )

    if backend == "xccl":
        FLAGS_selected_custom_devices = 'FLAGS_selected_{}s'.format(
            parallel_env.device_type
        )
        _check_var_exists(FLAGS_selected_custom_devices)
    else:
        if not is_cpu_only and core.is_compiled_with_cuda():

@@ -203,8 +218,9 @@ def init_parallel_env():
    # they need to call a function to change default place,
    # here just set correctly place to users
    if backend == "xccl":
        place = core.CustomPlace(
            parallel_env.device_type, parallel_env.device_id
        )
    elif is_cpu_only:
        place = core.CPUPlace()
    elif core.is_compiled_with_cuda():

@@ -228,11 +244,15 @@ def init_parallel_env():
        assert rank >= 0 and world_size > rank and world_size > 1, (
            "rank must be non-negative and world_size must be the "
            "maximum rank plus one. Moreover, at least two processes are "
            "required to create a process group."
        )

        master_addr = os.getenv("MASTER_ADDR", None)
        master_port = os.getenv("MASTER_PORT", None)
        endpoints = (
            ":".join([master_addr, master_port])
            if master_addr and master_port
            else None
        )
        if endpoints is None:
            endpoints = os.getenv("PADDLE_MASTER", None)
        if endpoints is None:

@@ -241,23 +261,28 @@ def init_parallel_env():
                "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' "
                "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
                "and 'export MASTER_ADDR=54612'. Or you can start your training"
                "with paddle.distributed.run module."
            )
        master_addr, master_port = endpoints.split(":")
        master_port = int(master_port)
        is_master = rank == 0
        stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
        default_store = core.TCPStore(
            master_addr,
            master_port,
            is_master,
            world_size,
            timeout=stop_check_timeout,
        )
        _set_default_store(default_store)
        pg = _new_process_group_impl(
            backend,
            default_store,
            rank,
            world_size,
            _default_group_name,
            pg_options=None,
        )
        ranks = list(range(world_size))
        group = Group(rank, 0, ranks, pg=pg, name=_default_group_name)
        _set_group_map_by_name(_default_group_name, group)

@@ -283,8 +308,10 @@ def init_parallel_env():
        size = {'_worker': parallel_env.world_size}
        if backend == "heter":
            size = {'_worker': len(node_num)}
        http_server = Process(
            target=_start_kv_server,
            args=(int(ep_rank_0[1]), http_server_d, size),
        )
        http_server.daemon = True
        http_server_d["running"] = True
        http_server.start()

@@ -302,22 +329,28 @@ def init_parallel_env():
    # init nccl or hccl or bkcl or heter context
    if is_cpu_only:
        parallel_helper._set_parallel_ctx(
            core.GLOOParallelContext(strategy, place)
        )
    elif backend == "heter":
        parallel_helper._set_parallel_ctx(
            core.HeterParallelContext(strategy, parallel_env.device_id)
        )
    elif core.is_compiled_with_cuda():
        parallel_helper._set_parallel_ctx(
            core.NCCLParallelContext(strategy, place)
        )
    elif core.is_compiled_with_xpu():
        parallel_helper._set_parallel_ctx(
            core.BKCLParallelContext(strategy, place)
        )
    elif core.is_compiled_with_npu():
        parallel_helper._set_parallel_ctx(
            core.HCCLParallelContext(strategy, place)
        )
    elif core.is_compiled_with_mlu():
        parallel_helper._set_parallel_ctx(
            core.CNCLParallelContext(strategy, place)
        )

    if backend != "heter":
        other_endpoints = strategy.trainer_endpoints[:]
......
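For reference (not part of the diff), a condensed sketch of the data-parallel flow the `init_parallel_env` docstring outlines. It assumes a machine with at least two GPUs and mirrors the documented `LinearNet` example; launch it with `paddle.distributed.spawn` as shown, or via `python -m paddle.distributed.launch`.

import paddle
import paddle.nn as nn
import paddle.distributed as dist


class LinearNet(nn.Layer):
    def __init__(self):
        super(LinearNet, self).__init__()
        self._linear1 = nn.Linear(10, 10)
        self._linear2 = nn.Linear(10, 1)

    def forward(self, x):
        return self._linear2(self._linear1(x))


def train():
    dist.init_parallel_env()                      # set up the NCCL/GLOO context for this process
    dp_layer = paddle.DataParallel(LinearNet())   # wrap the model for gradient all-reduce
    adam = paddle.optimizer.Adam(parameters=dp_layer.parameters())
    loss_fn = nn.MSELoss()

    inputs = paddle.randn([10, 10], 'float32')
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(dp_layer(inputs), labels)
    loss.backward()
    adam.step()
    adam.clear_grad()


if __name__ == '__main__':
    dist.spawn(train, nprocs=2)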
@@ -23,30 +23,48 @@ from paddle.distributed.utils.log_utils import get_logger
from paddle.fluid.framework import in_dygraph_mode

# Old version
from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import (
    ShardingOptimizerStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import (
    ShardingStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import (
    ShardingStage3,
)
from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import (
    ShardingScaler,
)

# New version
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
    GroupShardedOptimizerStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import (
    GroupShardedStage2,
)
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import (
    GroupShardedStage3,
)
from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
    GroupShardedScaler,
)

logger_ = get_logger(logging.WARNING)


def group_sharded_parallel(
    model,
    optimizer,
    level,
    scaler=None,
    group=None,
    offload=False,
    sync_buffers=False,
    buffer_max_size=2**23,
    segment_size=2**20,
    sync_comm=False,
):
    """
    Use group_sharded_parallel to perform group sharded configuration on the model, optimizer and GradScaler. Level has three string options, 'os', 'os_g' and 'p_g_os' corresponds to three different usage scenarios: optimizer state segmentation, optimizer state + gradient segmentation, and parameter + gradient + optimizer state segmentation.
    Usually, optimizer state + gradient segmentation is actually a re-optimization of optimizer state segmentation, so optimizer state + gradient segmentation can be used to realize optimizer state segmentation.

@@ -62,12 +80,12 @@ def group_sharded_parallel(
        buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23.
        segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20.
        sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used.

    Returns:
        model: A wrapper for group sharded given model.
        optimizer: A wrapper for group sharded given optimizer.
        scaler: A wrapper for group sharded given scaler.

    Examples:
        .. code-block:: python

@@ -100,13 +118,16 @@ def group_sharded_parallel(
    """
    # check option type
    assert isinstance(
        model, paddle.nn.Layer
    ), "The model must be the instance of paddle.nn.Layer."
    assert isinstance(
        optimizer, Optimizer
    ), "The optimizer must be the instance of paddle.optimizer.Optimizer."
    assert level in [
        'os',
        'os_g',
        'p_g_os',
    ], "The level must be os, os_g or p_g_os."

    def check_dtype(param):
        return param.dtype == paddle.float16

@@ -124,39 +145,50 @@ def group_sharded_parallel(
                params=optimizer._parameter_list,
                optim=optimizer,
                group=group,
                offload=offload,
            )
            model = GroupShardedStage2(
                model,
                optimizer,
                group=group,
                sync_buffers=sync_buffers,
                buffer_max_size=buffer_max_size,
            )
        else:
            optimizer = ShardingOptimizerStage2(
                params=model.parameters(),
                optim=optimizer,
                group=group,
                offload=offload,
            )
            model = ShardingStage2(
                model,
                optimizer,
                group=group,
                sync_buffers=sync_buffers,
                buffer_max_size=buffer_max_size,
            )
    elif level == 'p_g_os':
        if in_dygraph_mode():
            model = GroupShardedStage3(
                model,
                optimizer=optimizer,
                group=group,
                sync_buffers=sync_buffers,
                segment_size=segment_size,
                offload=offload,
                sync_comm=sync_comm,
            )
        else:
            model = ShardingStage3(
                model,
                optimizer=optimizer,
                group=group,
                sync_buffers=sync_buffers,
                segment_size=segment_size,
                offload=offload,
                sync_comm=sync_comm,
            )
    else:
        raise ValueError("Please enter the correct level.")

    if isinstance(scaler, paddle.amp.GradScaler):

@@ -184,7 +216,7 @@ def save_group_sharded_model(model, output, optimizer=None):
        model (Layer): A wrapper for group sharded given model.
        output (str): Save directory.
        optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved.

    Examples:
        .. code-block:: python

@@ -219,7 +251,8 @@ def save_group_sharded_model(model, output, optimizer=None):
            save_group_sharded_model(model, optimizer, output=output_dir)
    """
    logger_.info(
        "==========Begin to save group sharded model and optimizer=========="
    )
    assert not os.path.isfile(
        output
    ), "Saving directory ({}) should be a directory, not a file".format(output)

@@ -243,4 +276,5 @@ def save_group_sharded_model(model, output, optimizer=None):
        output_opt = os.path.join(output, "model.pdopt")
        paddle.save(optimizer._optim.state_dict(), output_opt)
    logger_.info(
        "==========End to save group sharded model and optimizer=========="
    )
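For reference (not part of the diff), a sketch of the `group_sharded_parallel` / `save_group_sharded_model` flow described by the docstrings above. It must run as a distributed job with at least two processes (e.g. under `python -m paddle.distributed.launch`); the model, the 'os_g' level, and the checkpoint path are illustrative choices.

import paddle
from paddle.distributed import init_parallel_env
from paddle.distributed.sharding import (
    group_sharded_parallel,
    save_group_sharded_model,
)

init_parallel_env()

model = paddle.nn.Linear(1000, 1000)
optimizer = paddle.optimizer.AdamW(
    learning_rate=0.001, parameters=model.parameters()
)
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

# 'os' shards optimizer state, 'os_g' additionally shards gradients,
# 'p_g_os' also shards the parameters themselves.
model, optimizer, scaler = group_sharded_parallel(
    model, optimizer, "os_g", scaler=scaler
)

img = paddle.randn([4, 1000], 'float32')
loss = paddle.mean(model(img))
loss.backward()
optimizer.step()
optimizer.clear_grad()

# Persist model.pdmodel / model.pdopt under the output directory.
save_group_sharded_model(model, output="./checkpoint", optimizer=optimizer)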
...@@ -28,35 +28,56 @@ import numpy as np ...@@ -28,35 +28,56 @@ import numpy as np
import paddle import paddle
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.data_feeder import (check_dtype, check_type, from paddle.fluid.data_feeder import (
check_variable_and_dtype, convert_dtype) check_dtype,
from paddle.fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph check_type,
from paddle.fluid.layers import (control_flow, elementwise_add, elementwise_div, check_variable_and_dtype,
elementwise_mul, elementwise_sub, nn, ops, convert_dtype,
tensor) )
from paddle.fluid.framework import (
_non_static_mode,
in_dygraph_mode,
_in_legacy_dygraph,
)
from paddle.fluid.layers import (
control_flow,
elementwise_add,
elementwise_div,
elementwise_mul,
elementwise_sub,
nn,
ops,
tensor,
)
from paddle.tensor import arange, concat, gather_nd, multinomial from paddle.tensor import arange, concat, gather_nd, multinomial
class Distribution(object): class Distribution(object):
""" """
The abstract base class for probability distributions. Functions are The abstract base class for probability distributions. Functions are
implemented in specific distributions. implemented in specific distributions.
Args: Args:
batch_shape(Sequence[int], optional): independent, not identically batch_shape(Sequence[int], optional): independent, not identically
distributed draws, aka a "collection" or "bunch" of distributions. distributed draws, aka a "collection" or "bunch" of distributions.
event_shape(Sequence[int], optional): the shape of a single event_shape(Sequence[int], optional): the shape of a single
draw from the distribution; it may be dependent across dimensions. draw from the distribution; it may be dependent across dimensions.
For scalar distributions, the event shape is []. For n-dimension For scalar distributions, the event shape is []. For n-dimension
multivariate distribution, the event shape is [n]. multivariate distribution, the event shape is [n].
""" """
def __init__(self, batch_shape=(), event_shape=()): def __init__(self, batch_shape=(), event_shape=()):
self._batch_shape = batch_shape if isinstance( self._batch_shape = (
            batch_shape
            if isinstance(batch_shape, tuple)
            else tuple(batch_shape)
        )
        self._event_shape = (
            event_shape
            if isinstance(event_shape, tuple)
            else tuple(event_shape)
        )

        super(Distribution, self).__init__()

@@ -118,16 +139,16 @@ class Distribution(object):
    def probs(self, value):
        """Probability density/mass function.

        .. note::
            This method will be deprecated in the future, please use `prob`
            instead.
        """
        raise NotImplementedError

    def _extend_shape(self, sample_shape):
        """compute shape of the sample

        Args:
            sample_shape (Tensor): sample shape

@@ -155,7 +176,8 @@ class Distribution(object):
        if is_variable and is_number:
            raise ValueError(
                'if one argument is Tensor, all arguments should be Tensor'
            )

        return is_variable

@@ -170,15 +192,17 @@ class Distribution(object):
        """
        numpy_args = []
        variable_args = []
        tmp = 0.0

        for arg in args:
            if isinstance(arg, float):
                arg = [arg]
            if not isinstance(arg, (list, tuple, np.ndarray, tensor.Variable)):
                raise TypeError(
                    "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}".format(
                        type(arg)
                    )
                )

            arg_np = np.array(arg)
            arg_dtype = arg_np.dtype

@@ -216,20 +240,24 @@ class Distribution(object):
            value (Tensor): Change value's dtype if value's dtype is different from param.
        """
        if _non_static_mode():
            if value.dtype != param.dtype and convert_dtype(value.dtype) in [
                'float32',
                'float64',
            ]:
                warnings.warn(
                    "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."
                )
                if in_dygraph_mode():
                    return _C_ops.cast(value, param.dtype)
                if _in_legacy_dygraph():
                    return _legacy_C_ops.cast(
                        value, 'in_dtype', value.dtype, 'out_dtype', param.dtype
                    )
            return value

        check_variable_and_dtype(
            value, 'value', ['float32', 'float64'], 'log_prob'
        )
        if value.dtype != param.dtype:
            warnings.warn(
                "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."

@@ -239,19 +267,25 @@ class Distribution(object):
    def _probs_to_logits(self, probs, is_binary=False):
        r"""
        Converts probabilities into logits. For the binary, probs denotes the
        probability of occurrence of the event indexed by `1`. For the
        multi-dimensional, values of last axis denote the probabilities of
        occurrence of each of the events.
        """
        return (
            (paddle.log(probs) - paddle.log1p(-probs))
            if is_binary
            else paddle.log(probs)
        )

    def _logits_to_probs(self, logits, is_binary=False):
        r"""
        Converts logits into probabilities. For the binary, each value denotes
        log odds, whereas for the multi-dimensional case, the values along the
        last dimension denote the log probabilities of the events.
        """
        return (
            paddle.nn.functional.sigmoid(logits)
            if is_binary
            else paddle.nn.functional.softmax(logits, axis=-1)
        )
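A minimal sketch (not part of this diff) of the probs/logits round trip these two helpers perform, written with the public paddle ops they wrap:

.. code-block:: python

    import paddle

    p = paddle.to_tensor([0.1, 0.7, 0.2])

    # binary case: logits are log-odds, and sigmoid inverts them
    logits_binary = paddle.log(p) - paddle.log1p(-p)
    print(paddle.nn.functional.sigmoid(logits_binary))  # ~[0.1, 0.7, 0.2]

    # multi-class case: log-probabilities, and softmax inverts them
    logits_multi = paddle.log(p)
    print(paddle.nn.functional.softmax(logits_multi, axis=-1))  # ~[0.1, 0.7, 0.2]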
@@ -35,7 +35,7 @@ def kl_divergence(p, q):
    .. math::

        KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x

    Args:
        p (Distribution): ``Distribution`` object.

@@ -64,11 +64,11 @@ def kl_divergence(p, q):
def register_kl(cls_p, cls_q):
    """Decorator for registering a KL divergence implementation function.

    The ``kl_divergence(p, q)`` function will search the concrete implementation
    functions registered by ``register_kl``, according to the multi-dispatch pattern.
    If an implementation function is found, it will return the result; otherwise,
    it will raise a ``NotImplementedError`` exception. Users can register an
    implementation function by the decorator.

    Args:
        cls_p(Distribution): Subclass derived from ``Distribution``.

@@ -83,8 +83,9 @@ def register_kl(cls_p, cls_q):
            def kl_beta_beta():
                pass # insert implementation here
    """
    if not issubclass(cls_p, Distribution) or not issubclass(
        cls_q, Distribution
    ):
        raise TypeError('cls_p and cls_q must be subclass of Distribution')

    def decorator(f):

@@ -98,8 +99,11 @@ def _dispatch(cls_p, cls_q):
    """Multiple dispatch into a concrete implementation function"""
    # find all matched super class pairs of p and q
    matchs = [
        (super_p, super_q)
        for super_p, super_q in _REGISTER_TABLE
        if issubclass(cls_p, super_p) and issubclass(cls_q, super_q)
    ]
    if not matchs:
        raise NotImplementedError

@@ -108,16 +112,20 @@ def _dispatch(cls_p, cls_q):
    if _REGISTER_TABLE[left_p, left_q] is not _REGISTER_TABLE[right_p, right_q]:
        warnings.warn(
            'Ambiguous kl_divergence({}, {}). Please register_kl({}, {})'.format(
                cls_p.__name__,
                cls_q.__name__,
                left_p.__name__,
                right_q.__name__,
            ),
            RuntimeWarning,
        )

    return _REGISTER_TABLE[left_p, left_q]


@functools.total_ordering
class _Compare(object):
    def __init__(self, *classes):
        self.classes = classes

@@ -135,22 +143,33 @@ class _Compare(object):
@register_kl(Beta, Beta)
def _kl_beta_beta(p, q):
    return (
        (q.alpha.lgamma() + q.beta.lgamma() + (p.alpha + p.beta).lgamma())
        - (p.alpha.lgamma() + p.beta.lgamma() + (q.alpha + q.beta).lgamma())
        + ((p.alpha - q.alpha) * p.alpha.digamma())
        + ((p.beta - q.beta) * p.beta.digamma())
        + (
            ((q.alpha + q.beta) - (p.alpha + p.beta))
            * (p.alpha + p.beta).digamma()
        )
    )


@register_kl(Dirichlet, Dirichlet)
def _kl_dirichlet_dirichlet(p, q):
    return (
        (p.concentration.sum(-1).lgamma() - q.concentration.sum(-1).lgamma())
        - ((p.concentration.lgamma() - q.concentration.lgamma()).sum(-1))
        + (
            (
                (p.concentration - q.concentration)
                * (
                    p.concentration.digamma()
                    - p.concentration.sum(-1).digamma().unsqueeze(-1)
                )
            ).sum(-1)
        )
    )
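For context, a small usage sketch (not part of this diff) showing how ``kl_divergence`` reaches a rule registered above, here the Beta/Beta implementation:

.. code-block:: python

    import paddle
    from paddle.distribution import Beta, kl_divergence

    p = Beta(alpha=0.5, beta=0.5)
    q = Beta(alpha=0.3, beta=0.7)
    # dispatched through _dispatch to _kl_beta_beta
    print(kl_divergence(p, q))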
@register_kl(Categorical, Categorical)

@@ -170,8 +189,7 @@ def _kl_uniform_uniform(p, q):
@register_kl(ExponentialFamily, ExponentialFamily)
def _kl_expfamily_expfamily(p, q):
    """Compute kl-divergence using `Bregman divergences <https://www.lix.polytechnique.fr/~nielsen/EntropyEF-ICIP2010.pdf>`_"""
    if not type(p) == type(q):
        raise NotImplementedError

@@ -187,19 +205,22 @@ def _kl_expfamily_expfamily(p, q):
    try:
        if _non_static_mode():
            p_grads = paddle.grad(
                p_log_norm, p_natural_params, create_graph=True
            )
        else:
            p_grads = paddle.static.gradients(p_log_norm, p_natural_params)
    except RuntimeError as e:
        raise TypeError(
            "Can't compute kl_divergence({cls_p}, {cls_q}) using Bregman divergence. Please register_kl({cls_p}, {cls_q}).".format(
                cls_p=type(p).__name__, cls_q=type(q).__name__
            )
        ) from e

    kl = q._log_normalizer(*q_natural_params) - p_log_norm
    for p_param, q_param, p_grad in zip(
        p_natural_params, q_natural_params, p_grads
    ):
        term = (q_param - p_param) * p_grad
        kl -= _sum_rightmost(term, len(q.event_shape))
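For reference, writing :math:`A` for the log-normalizer and :math:`\eta_p`, :math:`\eta_q` for the natural parameters (symbols introduced here only to explain the code), the loop above accumulates the Bregman divergence of :math:`A`:

.. math::

    KL(p||q) = A(\eta_q) - A(\eta_p) - \sum_i (\eta_{q,i} - \eta_{p,i}) \frac{\partial A}{\partial \eta_i}\Big|_{\eta=\eta_p}

which is exactly ``q._log_normalizer(*q_natural_params) - p_log_norm`` minus the summed ``(q_param - p_param) * p_grad`` terms.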
...
@@ -19,12 +19,23 @@ import numpy as np
from paddle import _C_ops, _legacy_C_ops
from paddle.distribution import distribution
from paddle.fluid import core
from paddle.fluid.data_feeder import (
    check_dtype,
    check_type,
    check_variable_and_dtype,
    convert_dtype,
)
from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
from paddle.fluid.layers import (
    control_flow,
    elementwise_add,
    elementwise_div,
    elementwise_mul,
    elementwise_sub,
    nn,
    ops,
    tensor,
)


class Normal(distribution.Distribution):

@@ -55,7 +66,7 @@ class Normal(distribution.Distribution):
    Examples:
        .. code-block:: python

            import paddle
            from paddle.distribution import Normal

@@ -90,12 +101,18 @@ class Normal(distribution.Distribution):
    def __init__(self, loc, scale, name=None):
        if not _non_static_mode():
            check_type(
                loc,
                'loc',
                (int, float, np.ndarray, tensor.Variable, list, tuple),
                'Normal',
            )
            check_type(
                scale,
                'scale',
                (int, float, np.ndarray, tensor.Variable, list, tuple),
                'Normal',
            )

        self.batch_size_unknown = False
        self.all_arg_is_float = False

@@ -115,11 +132,15 @@ class Normal(distribution.Distribution):
        else:
            if isinstance(loc, float) and isinstance(scale, float):
                self.all_arg_is_float = True
            if isinstance(loc, np.ndarray) and str(loc.dtype) in [
                'float32',
                'float64',
            ]:
                self.dtype = loc.dtype
            elif isinstance(scale, np.ndarray) and str(scale.dtype) in [
                'float32',
                'float64',
            ]:
                self.dtype = scale.dtype
            # pylint: disable=unbalanced-tuple-unpacking
            self.loc, self.scale = self._to_tensor(loc, scale)

@@ -149,21 +170,21 @@ class Normal(distribution.Distribution):
        if self.batch_size_unknown:
            output_shape = shape + batch_shape
            zero_tmp = tensor.fill_constant_batch_size_like(
                self.loc + self.scale, batch_shape + shape, self.dtype, 0.0
            )
            zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
            zero_tmp_shape = nn.shape(zero_tmp_reshape)
            normal_random_tmp = nn.gaussian_random(
                zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
            )
            output = normal_random_tmp * (zero_tmp_reshape + self.scale)
            output = elementwise_add(output, self.loc, name=name)
            return output
        else:
            output_shape = shape + batch_shape
            output = nn.gaussian_random(
                output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype
            ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale)
            output = elementwise_add(output, self.loc, name=name)
            if self.all_arg_is_float:
                return nn.reshape(output, shape, name=name)

@@ -189,13 +210,14 @@ class Normal(distribution.Distribution):
        """
        name = self.name + '_entropy'
        batch_shape = list((self.loc + self.scale).shape)
        zero_tmp = tensor.fill_constant_batch_size_like(
            self.loc + self.scale, batch_shape, self.dtype, 0.0
        )
        return elementwise_add(
            0.5 + zero_tmp,
            0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)),
            name=name,
        )

    def log_prob(self, value):
        """Log probability density/mass function.

@@ -212,10 +234,11 @@ class Normal(distribution.Distribution):
        var = self.scale * self.scale
        log_scale = nn.log(self.scale)
        return elementwise_sub(
            -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var),
            log_scale + math.log(math.sqrt(2.0 * math.pi)),
            name=name,
        )

    def probs(self, value):
        """Probability density/mass function.

@@ -231,10 +254,13 @@ class Normal(distribution.Distribution):
        value = self._check_values_dtype_in_probs(self.loc, value)
        var = self.scale * self.scale
        return elementwise_div(
            ops.exp(
                -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var)
            ),
            (math.sqrt(2 * math.pi) * self.scale),
            name=name,
        )

    def kl_divergence(self, other):
        r"""The KL-divergence between two normal distributions.

@@ -248,7 +274,7 @@ class Normal(distribution.Distribution):
        .. math::

            ratio = \\frac{\sigma_0}{\sigma_1}

        .. math::

            diff = \mu_1 - \mu_0
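Combining these two quantities (a restatement added for readability, not a line from the diff), the value computed by the method body in the next hunk is

.. math::

    KL(p || q) = 0.5\,ratio^2 + 0.5\left(\frac{diff^2}{\sigma_1^2} - 1 - 2\ln ratio\right)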
@@ -274,9 +300,9 @@ class Normal(distribution.Distribution):
        name = self.name + '_kl_divergence'
        var_ratio = self.scale / other.scale
        var_ratio = var_ratio * var_ratio
        t1 = (self.loc - other.loc) / other.scale
        t1 = t1 * t1
        return elementwise_add(
            0.5 * var_ratio, 0.5 * (t1 - 1.0 - nn.log(var_ratio)), name=name
        )
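A minimal dygraph usage sketch of the ``Normal`` methods touched above (an illustration, not code from this PR):

.. code-block:: python

    import paddle
    from paddle.distribution import Normal

    p = Normal(loc=0.0, scale=1.0)
    q = Normal(loc=1.0, scale=2.0)

    print(p.sample([3]))                        # three draws from N(0, 1)
    print(p.log_prob(paddle.to_tensor([0.5])))  # log pdf at 0.5
    print(p.entropy())                          # 0.5 * log(2 * pi * e * sigma^2)
    print(p.kl_divergence(q))                   # closed-form KL from the hunk above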
@@ -19,12 +19,27 @@ import numpy as np
from paddle import _C_ops, _legacy_C_ops
from paddle.distribution import distribution
from paddle.fluid import core
from paddle.fluid.data_feeder import (
    check_dtype,
    check_type,
    check_variable_and_dtype,
    convert_dtype,
)
from paddle.fluid.framework import (
    _non_static_mode,
    in_dygraph_mode,
    _in_legacy_dygraph,
)
from paddle.fluid.layers import (
    control_flow,
    elementwise_add,
    elementwise_div,
    elementwise_mul,
    elementwise_sub,
    nn,
    ops,
    tensor,
)
from paddle.tensor import arange, concat, gather_nd, multinomial

@@ -91,12 +106,18 @@ class Uniform(distribution.Distribution):
    def __init__(self, low, high, name=None):
        if not _non_static_mode():
            check_type(
                low,
                'low',
                (int, float, np.ndarray, tensor.Variable, list, tuple),
                'Uniform',
            )
            check_type(
                high,
                'high',
                (int, float, np.ndarray, tensor.Variable, list, tuple),
                'Uniform',
            )

        self.all_arg_is_float = False
        self.batch_size_unknown = False

@@ -116,11 +137,15 @@ class Uniform(distribution.Distribution):
        else:
            if isinstance(low, float) and isinstance(high, float):
                self.all_arg_is_float = True
            if isinstance(low, np.ndarray) and str(low.dtype) in [
                'float32',
                'float64',
            ]:
                self.dtype = low.dtype
            elif isinstance(high, np.ndarray) and str(high.dtype) in [
                'float32',
                'float64',
            ]:
                self.dtype = high.dtype
            # pylint: disable=unbalanced-tuple-unpacking
            self.low, self.high = self._to_tensor(low, high)

@@ -148,27 +173,33 @@ class Uniform(distribution.Distribution):
        if self.batch_size_unknown:
            output_shape = shape + batch_shape
            zero_tmp = tensor.fill_constant_batch_size_like(
                self.low + self.high, batch_shape + shape, self.dtype, 0.0
            )
            uniform_random_tmp = nn.uniform_random_batch_size_like(
                zero_tmp,
                zero_tmp.shape,
                dtype=self.dtype,
                min=0.0,
                max=1.0,
                seed=seed,
            )
            zero_tmp_reshape = nn.reshape(zero_tmp, output_shape)
            uniform_random_tmp_reshape = nn.reshape(
                uniform_random_tmp, output_shape
            )
            output = uniform_random_tmp_reshape * (
                zero_tmp_reshape + self.high - self.low
            )
            output = elementwise_add(output, self.low, name=name)
            return output
        else:
            output_shape = shape + batch_shape
            output = nn.uniform_random(
                output_shape, dtype=self.dtype, min=0.0, max=1.0, seed=seed
            ) * (
                tensor.zeros(output_shape, dtype=self.dtype)
                + (self.high - self.low)
            )
            output = elementwise_add(output, self.low, name=name)
            if self.all_arg_is_float:
                return nn.reshape(output, shape, name=name)

@@ -197,10 +228,12 @@ class Uniform(distribution.Distribution):
                return nn.log(lb * ub) - nn.log(self.high - self.low)

            if _in_legacy_dygraph():
                lb = _legacy_C_ops.cast(
                    lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', value.dtype
                )
                ub = _legacy_C_ops.cast(
                    ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', value.dtype
                )
                return nn.log(lb * ub) - nn.log(self.high - self.low)

        name = self.name + '_log_prob'

@@ -208,9 +241,9 @@ class Uniform(distribution.Distribution):
        ub_bool = value < self.high
        lb = tensor.cast(lb_bool, dtype=value.dtype)
        ub = tensor.cast(ub_bool, dtype=value.dtype)
        return elementwise_sub(
            nn.log(lb * ub), nn.log(self.high - self.low), name=name
        )

    def probs(self, value):
        """Probability density/mass function.

@@ -233,10 +266,12 @@ class Uniform(distribution.Distribution):
                return (lb * ub) / (self.high - self.low)

            if _in_legacy_dygraph():
                lb = _legacy_C_ops.cast(
                    lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', value.dtype
                )
                ub = _legacy_C_ops.cast(
                    ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', value.dtype
                )
                return (lb * ub) / (self.high - self.low)

        name = self.name + '_probs'
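Similarly, a short usage sketch for ``Uniform`` (not part of this diff; dynamic graph mode assumed):

.. code-block:: python

    import paddle
    from paddle.distribution import Uniform

    u = Uniform(low=0.0, high=2.0)
    value = paddle.to_tensor([0.5])

    print(u.sample([3]))      # three draws from U(0, 2)
    print(u.log_prob(value))  # log(1/2) inside the support
    print(u.probs(value))     # 1/2 inside the support
    print(u.entropy())        # log(high - low) = log(2)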
...

(This diff has been collapsed.)
@@ -46,11 +46,16 @@ def set_default_dtype(d):
        else:
            raise TypeError(
                "set_default_dtype only supports [float16, float32, float64] "
                ", but received %s" % d.__name__
            )
    else:
        if d in [
            'float16',
            'float32',
            'float64',
            u'float16',
            u'float32',
            u'float64',
        ]:
            # this code is a little bit dangerous, since error could happen
            # when casting no-ascii code to str in python2.

@@ -61,7 +66,8 @@ def set_default_dtype(d):
        else:
            raise TypeError(
                "set_default_dtype only supports [float16, float32, float64] "
                ", but received %s" % str(d)
            )

    LayerHelperBase.set_default_dtype(d)
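A small sketch (not from this diff) of the two accepted argument forms handled above:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.set_default_dtype('float64')     # string form
    print(paddle.get_default_dtype())       # 'float64'
    print(paddle.ones([2]).dtype)           # paddle.float64

    paddle.set_default_dtype(np.float32)    # numpy dtype class form
    print(paddle.get_default_dtype())       # 'float32'

    # paddle.set_default_dtype('int32')     # would raise the TypeError above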
@@ -94,7 +100,7 @@ def set_grad_enabled(mode):
    Examples:
        .. code-block:: python

            import paddle
            x = paddle.ones([3, 2])
            x.stop_gradient = False

@@ -127,9 +133,9 @@ def is_grad_enabled():
    Examples:
        .. code-block:: python

            import paddle

            # Dygraph gradient calculation mode is enabled by default.
            paddle.is_grad_enabled() # True
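A brief sketch combining the two APIs documented above (dynamic graph mode assumed; ``set_grad_enabled`` used as a context manager as in its full docstring):

.. code-block:: python

    import paddle

    print(paddle.is_grad_enabled())        # True: autograd is on by default

    with paddle.set_grad_enabled(False):   # temporarily disable gradient tracking
        print(paddle.is_grad_enabled())    # False inside the block

    print(paddle.is_grad_enabled())        # True again afterwards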
...

(This diff has been collapsed.)
@@ -28,8 +28,8 @@ def forward_grad(outputs, inputs, grad_inputs=None):
    Args:
        outputs(Tensor|Sequence[Tensor]): The output tensor or tensors.
        inputs(Tensor|Sequence[Tensor]): The input tensor or tensors.
        grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or
            Tensors of inputs which have the same shape as inputs. Defaults to
            None, in which case it is equivalent to all ones.

    Returns:

@@ -50,7 +50,7 @@ def forward_grad(outputs, inputs, grad_inputs=None):
            with paddle.static.program_guard(main_program, startup_program):
                x = paddle.static.data('x', shape=[1], dtype='float32')
                y = x * x
                y_grad = paddle.incubate.autograd.forward_grad(y, x)
                paddle.incubate.autograd.prim2orig()

@@ -64,25 +64,35 @@ def forward_grad(outputs, inputs, grad_inputs=None):
            paddle.disable_static()
    """
    if not utils.prim_enabled():
        raise RuntimeError(
            'forward_grad must be running on primitive '
            'operators, use enable_prim to turn it on.'
        )

    if not isinstance(outputs, (framework.Variable, typing.Sequence)):
        raise TypeError(
            f'Expected outputs is Tensor|Sequence[Tensor], '
            f'but got {type(outputs)}.'
        )

    if not isinstance(inputs, (framework.Variable, typing.Sequence)):
        raise TypeError(
            f'Expected inputs is Tensor|Sequence[Tensor], '
            f'but got {type(inputs)}.'
        )

    ys, xs, xs_dot = (
        utils.as_tensors(outputs),
        utils.as_tensors(inputs),
        utils.as_tensors(grad_inputs),
    )
    block = framework.default_main_program().current_block()
    if any(x.block != block for x in xs + ys):
        raise RuntimeError(
            'Variable in inputs and targets should exist in current block of '
            'main program.'
        )

    primx.orig2prim(block)
    ad = primx.Transform(ys[0].block)

@@ -101,12 +111,12 @@ def grad(outputs, inputs, grad_outputs=None):
    Args:
        outputs(Tensor|Sequence[Tensor]): The output Tensor or Tensors.
        inputs(Tensor|Sequence[Tensor]): The input Tensor or Tensors.
        grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or
            Tensors of outputs which have the same shape as outputs. Defaults
            to None, in which case it is equivalent to all ones.

    Returns:
        grad_inputs(Tensor|Tensors): The gradients for inputs.

    Examples:

@@ -123,7 +133,7 @@ def grad(outputs, inputs, grad_outputs=None):
            with paddle.static.program_guard(main_program, startup_program):
                x = paddle.static.data('x', shape=[1], dtype='float32')
                x.stop_gradient = False
                y = x * x
                x_grad = paddle.incubate.autograd.grad(y, x)
                paddle.incubate.autograd.prim2orig()

@@ -132,7 +142,7 @@ def grad(outputs, inputs, grad_outputs=None):
            x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad])
            print(x_grad)
            # [array([4.], dtype=float32)]

            paddle.incubate.autograd.disable_prim()
            paddle.disable_static()
    """

@@ -141,22 +151,32 @@ def grad(outputs, inputs, grad_outputs=None):
        # backward.gradients returns a list even when the inputs is a single Tensor.
        # The following code snippet fixes the problem by returning the first element
        # of grad_inputs when the inputs is a single Tensor.
        if (
            isinstance(inputs, framework.Variable)
            and isinstance(grad_inputs, typing.Sequence)
            and len(grad_inputs) > 0
        ):
            return grad_inputs[0]
        else:
            return grad_inputs

    if not isinstance(outputs, (framework.Variable, typing.Sequence)):
        raise TypeError(
            f'Expected outputs is Tensor|Sequence[Tensor], '
            f'but got {type(outputs)}.'
        )

    if not isinstance(inputs, (framework.Variable, typing.Sequence)):
        raise TypeError(
            f'Expected inputs is Tensor|Sequence[Tensor], '
            f'but got {type(inputs)}.'
        )

    ys, xs, ys_bar = (
        utils.as_tensors(outputs),
        utils.as_tensors(inputs),
        utils.as_tensors(grad_outputs),
    )
    block = framework.default_main_program().current_block()
    if any((x is not None and x.block != block) for x in xs + ys):
        raise RuntimeError(
...
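A hedged sketch (not from this PR) of the non-primitive fallback path described by the comments above: without ``enable_prim``, ``grad`` falls back to ``backward.gradients`` and returns a single Tensor, not a one-element list, when ``inputs`` is a single Tensor.

.. code-block:: python

    import numpy as np
    import paddle

    paddle.enable_static()
    main_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        x = paddle.static.data('x', shape=[1], dtype='float32')
        x.stop_gradient = False
        y = x * x
        x_grad = paddle.incubate.autograd.grad(y, x)  # a Tensor, not [Tensor]

    exe = paddle.static.Executor()
    exe.run(startup_program)
    (result,) = exe.run(
        main_program,
        feed={'x': np.array([3.0], dtype='float32')},
        fetch_list=[x_grad],
    )
    print(result)  # expected: [6.]
    paddle.disable_static()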
(The remaining file diffs have been collapsed.)