Unverified commit 94c6ec86, authored by ustiniankw, committed by GitHub

[Docs] Fix math API EN docs issues (#47448)

* fix_docx_stanh

* fix einsum api en docs issue

* fix model api en docs issue

* for codestyle

* fix_einsum.py_einsum, test=document_fix

* fix_model.py_Model, test=ducument_fix

* fix_creation.py_meshgrid, test=document_fix

* fix_linalg.py_slogdet, test=document_fix

* fix_loss.py_SoftMarginLoss_CrossEntropyLoss_NLLLoss_BCELoss, test=document_fix

* norm.py_SyncBatchNorm, test=document-fix

* norm.py_SyncBatchNorm, test=document_fix

* norm.py_SyncBatchNorm, test=document_fix

* list18-30, test=document_fix

* refix_list1-15, test=document_fix

* deletefiles, test=document_fix

* fixedapi_pre-commit, test=document_fix

* fix_list31-45, test=document_fix

* list111, test=document_fix

* some_fix, test=document_fix

* some_fix, test=document_fix

* somefix, test=document_fix

* somefix, test=document_fix

* refix, test=document_fix

* refix, test=document_fix

* refix, test=document_fix

* refix, test=document_fix

* rerfix, test=document_fix
Co-authored-by: Ligoml <limengliu@tiaozhan.com>
Parent 51b08123
@@ -112,6 +112,7 @@ class DistributedStrategy:
    def __init__(self):
        """

        DistributedStrategy is the main configuration entry for distributed training of Paddle.
        All of the distributed training configurations can be configured in DistributedStrategy,
        such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
@@ -153,33 +154,35 @@ class DistributedStrategy:
    def save_to_prototxt(self, output):
        """
        Serialize current DistributedStrategy to string and save to output file

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.dgc = True
                strategy.recompute = True
                strategy.recompute_configs = {"checkpoints": ["x"]}
                strategy.save_to_prototxt("dist_strategy.prototxt")

        """
        with open(output, "w") as fout:
            fout.write(str(self.strategy))
    def load_from_prototxt(self, pb_file):
        """
        Load from prototxt file for DistributedStrategy initialization

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.load_from_prototxt("dist_strategy.prototxt")

        """
        with open(pb_file, 'r') as f:
            self.strategy = google.protobuf.text_format.Merge(
@@ -192,17 +195,17 @@ class DistributedStrategy:
        Configure ExecutionStrategy for DistributedStrategy

        Examples:
            .. code-block:: python

                import paddle
                exe_strategy = paddle.static.ExecutionStrategy()
                exe_strategy.num_threads = 10
                exe_strategy.num_iteration_per_drop_scope = 10
                exe_strategy.num_iteration_per_run = 10

                strategy = paddle.distributed.fleet.DistributedStrategy()
                strategy.execution_strategy = exe_strategy

        """
        execution_strategy = paddle.fluid.ExecutionStrategy()
        fields = self.strategy.execution_strategy.DESCRIPTOR.fields
@@ -228,27 +231,28 @@ class DistributedStrategy:
    @property
    def build_strategy(self):
        """

        Configure BuildStrategy for DistributedStrategy
        Note that the properties of BuildStrategy are valid in DistributedStrategy
        only if the property is a non-distributed strategy.

        Examples:
            .. code-block:: python

                import paddle
                build_strategy = paddle.static.BuildStrategy()
                build_strategy.enable_sequential_execution = True
                build_strategy.fuse_elewise_add_act_ops = True
                build_strategy.fuse_bn_act_ops = True
                build_strategy.enable_auto_fusion = True
                build_strategy.fuse_relu_depthwise_conv = True
                build_strategy.fuse_broadcast_ops = True
                build_strategy.fuse_all_optimizer_ops = True
                build_strategy.enable_inplace = True

                strategy = paddle.distributed.fleet.DistributedStrategy()
                strategy.build_strategy = build_strategy

        """
        build_strategy = paddle.fluid.BuildStrategy()
@@ -278,15 +282,18 @@ class DistributedStrategy:
    @property
    def gradient_scale_configs(self):
        """

        Set the strategy of gradient scale

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.gradient_scale_configs = {'scale_strategy': 'avg'}

        Note that the strategy must be 'avg', 'sum' or 'customized'

        """
        return get_msg_dict(self.strategy.gradient_scale_configs)
@@ -303,24 +310,25 @@ class DistributedStrategy:
    @property
    def a_sync(self):
        """

        Indicating whether we are using asynchronous stochastic gradient descent updates
        for training. This property is valid when we are using parameter server training,
        which is implied by setting an appropriate RoleMaker
        Default value: True

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                role_maker = fleet.PaddleCloudRoleMaker()
                fleet.init(role_maker)

                strategy = fleet.DistributedStrategy()
                strategy.a_sync = True  # by default this is True

                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.a_sync
@@ -340,6 +348,7 @@ class DistributedStrategy:
    @property
    def a_sync_configs(self):
        """

        Set a_sync update configurations. In general, asynchronous parameter server
        training has several configurable settings that can be configured through
        a dict.
@@ -360,20 +369,19 @@ class DistributedStrategy:
            runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                role_maker = fleet.PaddleCloudRoleMaker()
                fleet.init(role_maker)

                strategy = fleet.DistributedStrategy()
                strategy.a_sync = True  # by default this is True
                configs = {"k_steps": 1024, "send_queue_size": 32}
                strategy.a_sync_configs = configs

                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return get_msg_dict(self.strategy.a_sync_configs)
@@ -389,6 +397,7 @@ class DistributedStrategy:
    @property
    def trainer_desc_configs(self):
        """

        Set trainer desc configurations.

        **Notes**:
@@ -401,19 +410,18 @@ class DistributedStrategy:
            stat_var_names(list(str)):

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                role_maker = fleet.PaddleCloudRoleMaker()
                fleet.init(role_maker)

                strategy = fleet.DistributedStrategy()
                configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]}
                strategy.trainer_desc_configs = configs

                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return get_msg_dict(self.strategy.trainer_desc_configs)
@@ -421,22 +429,23 @@ class DistributedStrategy:
    @property
    def adam_d2sum(self):
        """

        Set adam_d2sum
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                role_maker = fleet.PaddleCloudRoleMaker()
                fleet.init(role_maker)

                strategy = fleet.DistributedStrategy()
                strategy.adam_d2sum = True  # by default this is False

                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.adam_d2sum
@@ -463,22 +472,30 @@ class DistributedStrategy:
    @property
    def fs_client_param(self):
        """

        Set fs client configurations.

        Note:
            uri(str): the uri of fs client
            user(str): the user_name of fs client
            passwd(str): the passwd of fs client
            hadoop_bin(str):

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                role_maker = fleet.PaddleCloudRoleMaker()
                fleet.init(role_maker)
                strategy = fleet.DistributedStrategy()
                configs = {"uri": "xxx", "user": "xxx", "passwd": "xxx"}
                strategy.fs_client_param = configs
                # code block for defining loss and local optimizer
                # sgd = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.fs_client_param
@@ -858,6 +875,7 @@ class DistributedStrategy:
    @property
    def amp_configs(self):
        """

        Set automatic mixed precision training configurations. In general, amp has several configurable
        settings that can be configured through a dict.
@@ -886,28 +904,27 @@ class DistributedStrategy:
                Default True. Only takes effect when `use_pure_fp16` is turned on.

        Examples 1:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.amp = True
                strategy.amp_configs = {
                    "init_loss_scaling": 32768,
                    "custom_white_list": ['conv2d']}

        Examples 2:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.amp = True
                # pure fp16
                strategy.amp_configs = {
                    "init_loss_scaling": 32768,
                    "use_pure_fp16": True
                }

        """
        return get_msg_dict(self.strategy.amp_configs)
@@ -920,16 +937,16 @@ class DistributedStrategy:
    @property
    def asp(self):
        """

        Indicating whether we are using automatic sparsity training
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.asp = True  # by default this is false

        """
        return self.strategy.asp
@@ -949,30 +966,31 @@ class DistributedStrategy:
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.recompute = True
                # suppose x and y are names of checkpoint tensors for recomputation
                strategy.recompute_configs = {"checkpoints": ["x", "y"]}

        """
        return self.strategy.recompute
    @property
    def sync_nccl_allreduce(self):
        """

        Indicating whether we are using synchronized all reduce in each communication thread
        We note that system overhead is usually lower when sync_nccl_allreduce = True

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.sync_nccl_allreduce = True

        """
        return self.strategy.sync_nccl_allreduce
@@ -987,17 +1005,18 @@ class DistributedStrategy:
    @property
    def use_hierarchical_allreduce(self):
        """

        Indicating whether we are using hierarchical allreduce in collective communication
        Hierarchical allreduce often does allreduce within a certain node group and then does
        allreduce among the leaders of each group

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.use_hierarchical_allreduce = True

        """
        return self.strategy.use_hierarchical_allreduce
@@ -1014,16 +1033,17 @@ class DistributedStrategy:
    @property
    def hierarchical_allreduce_inter_nranks(self):
        """

        Number of ranks for low level node groups in hierarchical allreduce
        Default value: number of GPU cards on each single GPU machine

        Example:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.hierarchical_allreduce_inter_nranks = 8

        """
        return self.strategy.hierarchical_allreduce_inter_nranks
@@ -1040,17 +1060,18 @@ class DistributedStrategy:
    @property
    def sync_batch_norm(self):
        """

        Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes.
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.sync_batch_norm = True

        """
        return self.strategy.sync_batch_norm
@@ -1066,16 +1087,17 @@ class DistributedStrategy:
    @property
    def fuse_all_reduce_ops(self):
        """

        Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training
        Default value: True

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.fuse_all_reduce_ops = False

        """
        return self.strategy.fuse_all_reduce_ops
@@ -1090,17 +1112,18 @@ class DistributedStrategy:
    @property
    def fuse_grad_size_in_MB(self):
        """

        Specifying the size of gradient to fuse in Mega-Bytes
        Default value: 32

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.fuse_grad_size_in_MB = 50

        """
        return self.strategy.fuse_grad_size_in_MB
@@ -1115,6 +1138,7 @@ class DistributedStrategy:
    @property
    def last_comm_group_size_MB(self):
        """

        Specifying the size of gradient to fuse in Mega-Bytes when
        the last group of each batch communicates. Making the last group
        small is useful to improve performance.
@@ -1122,11 +1146,12 @@ class DistributedStrategy:
        Default value: 1

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.last_comm_group_size_MB = 2

        """
        return self.strategy.last_comm_group_size_MB
@@ -1141,18 +1166,19 @@ class DistributedStrategy:
    @property
    def find_unused_parameters(self):
        """

        Indicating whether we are using find_unused_parameters to
        find unused parameters in DataParallel.
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.find_unused_parameters = True

        """
        return self.strategy.find_unused_parameters
@@ -1184,17 +1210,18 @@ class DistributedStrategy:
    @property
    def nccl_comm_num(self):
        """

        Specifying the number of NCCL communicators
        Default value: 1

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.nccl_comm_num = 2

        """
        return self.strategy.nccl_comm_num
@@ -1218,6 +1245,7 @@ class DistributedStrategy:
    @property
    def recompute_configs(self):
        """

        Set recompute configurations.

        **Note**:
@@ -1234,16 +1262,15 @@ class DistributedStrategy:
            specific here should be determined ("-1" is not allowed).

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.recompute = True
                strategy.recompute_configs = {
                    "checkpoints": ["x", "y"],
                    "enable_offload": True,
                    "checkpoint_shape": [100, 512, 1024]}

        """
        return get_msg_dict(self.strategy.recompute_configs)
@@ -1259,6 +1286,7 @@ class DistributedStrategy:
    @property
    def sharding(self):
        """

        Indicating whether we are using sharding Optimizer for memory
        optimization. We implement the sharding optimizer following the ZeRO-DP
        idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054).
@@ -1269,12 +1297,12 @@ class DistributedStrategy:
        Default value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.sharding = True

        """
        return self.strategy.sharding
@@ -1289,6 +1317,7 @@ class DistributedStrategy:
    @property
    def sharding_configs(self):
        """

        Set sharding configurations.

        **Note**:
@@ -1326,20 +1355,20 @@ class DistributedStrategy:

        Examples:
            .. code-block:: python

                # sharding-DP, 2 nodes with 8 gpus per node
                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.sharding = True
                strategy.sharding_configs = {
                    "sharding_segment_strategy": "segment_broadcast_MB",
                    "segment_broadcast_MB": 32,
                    "sharding_degree": 8,
                    "dp_degree": 2,
                    "gradient_merge_acc_step": 4,
                    }

        """
        return get_msg_dict(self.strategy.sharding_configs)
@@ -1354,15 +1383,15 @@ class DistributedStrategy:
    @property
    def without_graph_optimization(self):
        """

        Run program using Executor other than ParallelExecutor.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.without_graph_optimization = True

        """
        return self.strategy.without_graph_optimization
@@ -1380,14 +1409,18 @@ class DistributedStrategy:
    @property
    def _calc_comm_same_stream(self):
        """

        This is based on the raw_program_optimizer program.
        Set whether to use the same stream for calculation and communication when fusing allreduce.
        The default value for calc_comm_same_stream is False.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.calc_comm_same_stream = True

        """
        return self.strategy.calc_comm_same_stream
@@ -1404,14 +1437,18 @@ class DistributedStrategy:
    @property
    def fuse_grad_merge(self):
        """

        Set whether to fuse the gradients for gradient merge.
        Note: this flag only affects gradient merge under pipeline mode.
        The default value for fuse_grad_merge is False.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.fuse_grad_merge = True

        """
        return self.strategy.fuse_grad_merge
@@ -1426,12 +1463,17 @@ class DistributedStrategy:
    @property
    def fuse_grad_size_in_num(self):
        """

        This is based on the raw_program_optimizer program and specifies the number of gradients fused for each allreduce.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.fuse_grad_size_in_num = 2

        """
        return self.strategy.fuse_grad_size_in_num
@@ -1448,18 +1490,18 @@ class DistributedStrategy:
    @property
    def pipeline(self):
        """

        Indicating whether we are using pipeline parallelism for distributed training.
        The current implementation mainly focuses on single-machine pipeline parallelism and
        data parallelism across GPU machines. The pipeline information is indicated through
        device_guard information in the user-defined program.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.pipeline = True

        """
        return self.strategy.pipeline
@@ -1499,6 +1541,7 @@ class DistributedStrategy:
    @property
    def pipeline_configs(self):
        """

        Set pipeline parallelism configurations. In pipeline parallelism,
        different parts of neural networks are running on different GPUs.
        There are Tensor queue buffers between each pair of neighboring GPUs
@@ -1514,13 +1557,12 @@ class DistributedStrategy:
            **micro_batch_size**: the number of small batches in each user defined batch

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.pipeline = True
                strategy.pipeline_configs = {"micro_batch_size": 12}

        """
@@ -1537,15 +1579,15 @@ class DistributedStrategy:
    @property
    def tensor_parallel(self):
        """

        Indicating whether we are using tensor parallel for distributed training.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.tensor_parallel = True

        """
        return self.strategy.tensor_parallel
@@ -1561,23 +1603,25 @@ class DistributedStrategy:
    @property
    def tensor_parallel_configs(self):
        """

        Set tensor_parallel configurations.

        **Notes**:
            **Detailed arguments for tensor_parallel_configs**

            **tensor_parallel_degree**: degree of tensor parallel

            **tensor_init_seed**: parameter initialization random seed

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.tensor_parallel = True
                strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4,
                                                    "tensor_init_seed": 123}

        """
        return get_msg_dict(self.strategy.tensor_parallel_configs)
@@ -1595,28 +1639,32 @@ class DistributedStrategy:
    @property
    def hybrid_configs(self):
        """

        Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism
        needs to meet the following relationship

        total_number_GPUs = dp_degree * mp_degree * pp_degree

        **Note**:
            **dp_degree(int)**: set number of GPUs in a data parallel group. Default -1.
                                This value should be an integer greater than 0.
                                If it is not set, or set to -1, its value will be inferred
                                based on the total number of cards.

            **mp_degree(int)**: set number of GPUs in a model parallel group. Default 1

            **pp_degree(int)**: set number of GPUs in a pipeline parallel group. Default 1

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.hybrid_configs = {
                    "dp_degree": 1,
                    "mp_degree": 2,
                    "pp_degree": 1}

        """
        return get_msg_dict(self.strategy.hybrid_configs)
@@ -1630,18 +1678,18 @@ class DistributedStrategy:
    @property
    def localsgd(self):
        """

        Indicating whether we are using Local SGD training. Default Value: False
        For more details, please refer to
        `Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.localsgd = True  # by default this is false

        """
        return self.strategy.localsgd
@@ -1657,6 +1705,7 @@ class DistributedStrategy:
    @property
    def localsgd_configs(self):
        """

        Set LocalSGD training configurations. LocalSGD has a configurable
        setting that can be configured through a dict.
@@ -1665,14 +1714,14 @@ class DistributedStrategy:
            begin_step(int) The step of beginning training by localsgd. Default 1.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.localsgd = True
                strategy.localsgd_configs = {"k_steps": 4,
                                             "begin_step": 30}

        """
        return get_msg_dict(self.strategy.localsgd_configs)
@@ -1688,18 +1737,17 @@ class DistributedStrategy:
    @property
    def adaptive_localsgd(self):
        """

        Indicating whether we are using Adaptive Local SGD training. Default Value: False
        For more details, please refer to `Adaptive Communication Strategies to Achieve
        the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.adaptive_localsgd = True  # by default this is false

        """
        return self.strategy.adaptive_localsgd
@@ -1715,6 +1763,7 @@ class DistributedStrategy:
    @property
    def adaptive_localsgd_configs(self):
        """

        Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable
        setting that can be configured through a dict.
@@ -1722,17 +1771,18 @@ class DistributedStrategy:
            init_k_steps(int) The initial steps for training before adaptive localsgd.
                              Then, the adaptive localsgd method will modify init_k_steps automatically.
                              Default 1.

            begin_step(int) The step of beginning training by adaptive localsgd. Default 1.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.adaptive_localsgd = True
                strategy.adaptive_localsgd_configs = {"init_k_steps": 1,
                                                      "begin_step": 30}

        """
        return get_msg_dict(self.strategy.adaptive_localsgd_configs)
@@ -1750,18 +1800,18 @@ class DistributedStrategy:
    @property
    def dgc(self):
        """

        Indicating whether we are using Deep Gradient Compression training. For more details, please refer to
        [Deep Gradient Compression](https://arxiv.org/abs/1712.01887).
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.dgc = True  # by default this is false

        """
        return self.strategy.dgc
@@ -1777,6 +1827,7 @@ class DistributedStrategy:
    @property
    def dgc_configs(self):
        r"""

        Set Deep Gradient Compression training configurations. In general, dgc has several configurable
        settings that can be configured through a dict.
@@ -1793,13 +1844,13 @@ class DistributedStrategy:
                element will be transmitted.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.dgc = True
                strategy.dgc_configs = {"rampup_begin_step": 1252}

        """
        return get_msg_dict(self.strategy.dgc_configs)
@@ -1812,16 +1863,17 @@ class DistributedStrategy:
    @property
    def fp16_allreduce(self):
        """

        Indicating whether we are using fp16 gradient allreduce training
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.fp16_allreduce = True  # by default this is false

        """
        return self.strategy.fp16_allreduce
@@ -1836,6 +1888,7 @@ class DistributedStrategy:
    @property
    def gradient_merge(self):
        """

        Gradient Merge, also known as Gradient Accumulation,
        is a strategy for large batch training. With this strategy,
        model parameters will not be updated until user-defined steps.
@@ -1846,13 +1899,13 @@ class DistributedStrategy:
        to model parameters.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.gradient_merge = True
                strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}

        """
        return self.strategy.gradient_merge
@@ -1867,6 +1920,7 @@ class DistributedStrategy:
    @property
    def gradient_merge_configs(self):
        """

        The key-value configs of distribute_strategy

        **Note**:
@@ -1875,13 +1929,13 @@ class DistributedStrategy:
            avg(bool): whether to average the gradients of each mini-batch, the default value is `True`

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.gradient_merge = True
                strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}

        """
        return get_msg_dict(self.strategy.gradient_merge_configs)
@@ -1896,6 +1950,7 @@ class DistributedStrategy:
    @property
    def lars(self):
        """

        Set lars configurations. lars is used to deal with the convergence problems when the global
        batch size is larger than 8k. For more details, please refer to
        [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
@@ -1903,12 +1958,12 @@ class DistributedStrategy:
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.lars = True  # by default this is false

        """
        return self.strategy.lars
@@ -1923,6 +1978,7 @@ class DistributedStrategy:
    @property
    def lars_configs(self):
        """

        Set Lars training configurations.

        **Notes**:
@@ -1934,18 +1990,18 @@ class DistributedStrategy:
            will be excluded from weight decay in lars formula.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.lars = True
                strategy.lars_configs = {
                    "lars_coeff": 0.01,
                    "lars_weight_decay": 0.0005,
                    "epsilon": 0,
                    "exclude_from_weight_decay": ['batch_norm', '.b_0']
                }

        """
        return get_msg_dict(self.strategy.lars_configs)
@@ -1958,6 +2014,7 @@ class DistributedStrategy:
    @property
    def lamb(self):
        """

        Set lamb configurations. lamb is used to deal with the convergence problems for large
        batch size training, especially for attention-related models like BERT. For more details,
        please refer to
@@ -1966,12 +2023,12 @@ class DistributedStrategy:
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.lamb = True  # by default this is false

        """
        return self.strategy.lamb
@@ -1987,6 +2044,7 @@ class DistributedStrategy:
    @property
    def lamb_configs(self):
        """

        Set Lamb training configurations.

        **Notes**:
@@ -1995,16 +2053,16 @@ class DistributedStrategy:
            will be excluded from weight decay in lamb formula.

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet
                strategy = fleet.DistributedStrategy()
                strategy.lamb = True
                strategy.lamb_configs = {
                    'lamb_weight_decay': 0.01,
                    'exclude_from_weight_decay': [],
                }

        """
        return get_msg_dict(self.strategy.lamb_configs)
@@ -2017,8 +2075,10 @@ class DistributedStrategy:
    @property
    def elastic(self):
        """

        Indicating whether we want to do current distributed training on clusters with elastic resources.
        Currently, this configuration is not valid.
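
        A minimal sketch of toggling the flag, following the same pattern as the
        other switches in this class (as noted above, the setting currently has no
        effect):

        Examples:
            .. code-block:: python

                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.elastic = True  # currently has no effect, per the note above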
""" """
return self.strategy.elastic return self.strategy.elastic
@@ -2033,6 +2093,7 @@ class DistributedStrategy:
    @property
    def auto(self):
        """

        Indicating whether we are using auto-parallel configuration
        This feature is currently an experimental feature. Currently,
        auto-parallelism can be used only when a user does not set any other
@@ -2041,20 +2102,20 @@ class DistributedStrategy:
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle
                paddle.enable_static()
                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.auto = True
                # if set other strategy at the same time, auto will not apply
                # strategy.amp = True

                optimizer = paddle.optimizer.SGD(learning_rate=0.01)
                optimizer = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.auto
@@ -2068,6 +2129,7 @@ class DistributedStrategy:
    @property
    def semi_auto(self):
        """

        Indicating whether we are using semi-auto parallel function
        This feature is currently an experimental feature. Currently,
        auto-parallelism can be used only when a user does not set any other
@@ -2076,20 +2138,20 @@ class DistributedStrategy:
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle
                paddle.enable_static()
                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.semi_auto = True
                # if set other strategy at the same time, auto will not apply
                # strategy.amp = True

                optimizer = paddle.optimizer.SGD(learning_rate=0.01)
                optimizer = fleet.distributed_optimizer(optimizer, strategy)

        """
        return self.strategy.semi_auto
@@ -2103,16 +2165,21 @@ class DistributedStrategy:
    @property
    def auto_search(self):
        """

        Indicating whether we are using auto-search parallel function
        For details, please refer to the following code example
        Default Value: False

        Examples:
            .. code-block:: python

                import paddle

                paddle.enable_static()
                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.auto_search = True

        """
        return self.strategy.auto_search
@@ -2126,15 +2193,20 @@ class DistributedStrategy:
    @property
    def split_data(self):
        """

        Indicating whether we split the data. If True, we split the data.
        Default Value: True

        Examples:
            .. code-block:: python

                import paddle

                paddle.enable_static()
                import paddle.distributed.fleet as fleet

                strategy = fleet.DistributedStrategy()
                strategy.split_data = True

        """
        return self.strategy.split_data
...@@ -2148,8 +2220,10 @@ class DistributedStrategy: ...@@ -2148,8 +2220,10 @@ class DistributedStrategy:
@property @property
def qat(self): def qat(self):
""" """
Indicating whether we are using quantization training Indicating whether we are using quantization training
Default Value: False Default Value: False
""" """
return self.strategy.qat return self.strategy.qat
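Unlike the neighbouring properties, `qat` carries no usage example in this hunk; a minimal sketch of how it might be enabled together with a distributed optimizer (the SGD settings are illustrative assumptions, mirroring the other examples in this file):

.. code-block:: python

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    strategy = fleet.DistributedStrategy()
    strategy.qat = True  # turn on quantization training in the distributed strategy

    optimizer = paddle.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)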
...@@ -2163,6 +2237,7 @@ class DistributedStrategy: ...@@ -2163,6 +2237,7 @@ class DistributedStrategy:
@property @property
def qat_configs(self): def qat_configs(self):
""" """
Set quantization training configurations. In general, qat has several configurable Set quantization training configurations. In general, qat has several configurable
settings that can be configured through a dict. settings that can be configured through a dict.
...@@ -2179,17 +2254,17 @@ class DistributedStrategy: ...@@ -2179,17 +2254,17 @@ class DistributedStrategy:
algo(str): Other quantization training algorithm. algo(str): Other quantization training algorithm.
Examples: Examples:
.. code-block:: python
.. code-block:: python import paddle.distributed.fleet as fleet
import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy()
strategy = fleet.DistributedStrategy() strategy.qat = True
strategy.qat = True strategy.qat_configs = {
strategy.qat_configs = { "channel_wise_abs_max": True,
"channel_wise_abs_max": True, "weight_bits": 8,
"weight_bits": 8, "activation_bits: 8,
"activation_bits: 8, "not_quant_pattern": ['skip_quant']}
"not_quant_pattern": ['skip_quant']}
""" """
return get_msg_dict(self.strategy.qat_configs) return get_msg_dict(self.strategy.qat_configs)
...@@ -2202,24 +2277,25 @@ class DistributedStrategy: ...@@ -2202,24 +2277,25 @@ class DistributedStrategy:
@property @property
def heter_ccl_mode(self): def heter_ccl_mode(self):
""" """
Indicating whether we are using heter_ccl_mode for model training. Indicating whether we are using heter_ccl_mode for model training.
This feature is currently an experimental feature. Currently, This feature is currently an experimental feature. Currently,
heter_ccl_mode can be used only for data parallel training in dygraph mode. heter_ccl_mode can be used only for data parallel training in dygraph mode.
Default Value: False Default Value: False
Examples: Examples:
.. code-block:: python
.. code-block:: python import paddle
import paddle.distributed.fleet as fleet
import paddle strategy = fleet.DistributedStrategy()
import paddle.distributed.fleet as fleet strategy.heter_ccl_mode = True
strategy = fleet.DistributedStrategy() # to initialize the parallel env, you only need to call
strategy.heter_ccl_mode = True paddle.distributed.init_parallel_env()
# then the heterogeneous context will be created.
# to initialize the parallel env, you only need to call
paddle.distributed.init_parallel_env()
# then the heterogeneous context will be created.
""" """
return self.strategy.heter_ccl_mode return self.strategy.heter_ccl_mode
...@@ -2233,6 +2309,7 @@ class DistributedStrategy: ...@@ -2233,6 +2309,7 @@ class DistributedStrategy:
@property @property
def cudnn_exhaustive_search(self): def cudnn_exhaustive_search(self):
""" """
Indicating whether to use the exhaustive search method to choose convolution algorithms. Indicating whether to use the exhaustive search method to choose convolution algorithms.
Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm. Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm.
This method is time-consuming; the chosen algorithm will be cached for the given layer specifications. This method is time-consuming; the chosen algorithm will be cached for the given layer specifications.
...@@ -2240,17 +2317,18 @@ class DistributedStrategy: ...@@ -2240,17 +2317,18 @@ class DistributedStrategy:
Default Value: True Default Value: True
Examples: Examples:
.. code-block:: python
.. code-block:: python import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
import paddle strategy = fleet.DistributedStrategy()
paddle.enable_static() strategy.cudnn_exhaustive_search = False
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy() optimizer = paddle.optimizer.SGD(learning_rate=0.01)
strategy.cudnn_exhaustive_search = False optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
""" """
return self.strategy.cudnn_exhaustive_search return self.strategy.cudnn_exhaustive_search
...@@ -2267,6 +2345,7 @@ class DistributedStrategy: ...@@ -2267,6 +2345,7 @@ class DistributedStrategy:
@property @property
def conv_workspace_size_limit(self): def conv_workspace_size_limit(self):
""" """
The workspace limit size, in MB, for choosing cuDNN convolution algorithms. The workspace limit size, in MB, for choosing cuDNN convolution algorithms.
The inner function of cuDNN obtains the fastest suitable algorithm that fits within this memory limit. The inner function of cuDNN obtains the fastest suitable algorithm that fits within this memory limit.
Usually, a larger workspace size allows faster algorithms to be chosen, Usually, a larger workspace size allows faster algorithms to be chosen,
...@@ -2274,17 +2353,17 @@ class DistributedStrategy: ...@@ -2274,17 +2353,17 @@ class DistributedStrategy:
Default Value: 4000 Default Value: 4000
Examples: Examples:
.. code-block:: python
.. code-block:: python import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
import paddle strategy = fleet.DistributedStrategy()
paddle.enable_static() strategy.conv_workspace_size_limit = 1024
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.conv_workspace_size_limit = 1024
optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
""" """
return self.strategy.conv_workspace_size_limit return self.strategy.conv_workspace_size_limit
...@@ -2302,22 +2381,23 @@ class DistributedStrategy: ...@@ -2302,22 +2381,23 @@ class DistributedStrategy:
@property @property
def cudnn_batchnorm_spatial_persistent(self): def cudnn_batchnorm_spatial_persistent(self):
""" """
Indicates whether to use the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode in batchnorm. Indicates whether to use the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode in batchnorm.
This is only useful in cudnn. This is only useful in cudnn.
Default Value: True Default Value: True
Examples: Examples:
.. code-block:: python
.. code-block:: python import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
import paddle strategy = fleet.DistributedStrategy()
paddle.enable_static() strategy.cudnn_batchnorm_spatial_persistent = True
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.cudnn_batchnorm_spatial_persistent = True
optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy)
""" """
return self.strategy.cudnn_batchnorm_spatial_persistent return self.strategy.cudnn_batchnorm_spatial_persistent
......
...@@ -25,12 +25,13 @@ _HYBRID_PARALLEL_GROUP = None ...@@ -25,12 +25,13 @@ _HYBRID_PARALLEL_GROUP = None
class ParallelMode: class ParallelMode:
""" """
These are all the parallel modes currently supported: These are all the parallel modes currently supported:
- DATA_PARALLEL: Distribute input data to different devices.
- TENSOR_PARALLEL: Shards tensors in the network to different devices. - DATA_PARALLEL: Distribute input data to different devices.
- PIPELINE_PARALLEL: Place different layers of the network on different devices. - TENSOR_PARALLEL: Shards tensors in the network to different devices.
- SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states - PIPELINE_PARALLEL: Place different layers of the network on different devices.
corresponding to the parameters to each device. - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device.
Examples: Examples:
.. code-block:: python .. code-block:: python
......
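The example in this hunk is truncated; for reference, ParallelMode is used as a plain enumeration of constants. A minimal sketch, assuming it is exposed as `paddle.distributed.ParallelMode`:

.. code-block:: python

    import paddle

    # ParallelMode is an enumeration of the supported parallel modes
    parallel_mode = paddle.distributed.ParallelMode
    print(parallel_mode.DATA_PARALLEL)  # expected to print 0, the first mode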
...@@ -97,6 +97,7 @@ def _check_var_exists(var_name): ...@@ -97,6 +97,7 @@ def _check_var_exists(var_name):
def init_parallel_env(): def init_parallel_env():
""" """
Initialize parallel training environment in dynamic graph mode. Initialize parallel training environment in dynamic graph mode.
Note: Note:
...@@ -112,6 +113,7 @@ def init_parallel_env(): ...@@ -112,6 +113,7 @@ def init_parallel_env():
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
...@@ -152,6 +154,7 @@ def init_parallel_env(): ...@@ -152,6 +154,7 @@ def init_parallel_env():
if __name__ == '__main__': if __name__ == '__main__':
dist.spawn(train) dist.spawn(train)
""" """
# 0. get env & check world size # 0. get env & check world size
......
...@@ -236,13 +236,13 @@ def send_ue_recv( ...@@ -236,13 +236,13 @@ def send_ue_recv(
src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64.
dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`.
The available data type is int32, int64. The available data type is int32, int64.
message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
Default value is `sum`. Default value is `sum`.
out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
out_size is smaller or equal to 0, then this input will not be used. out_size is smaller or equal to 0, then this input will not be used.
Otherwise, `out_size` should be equal with or larger than Otherwise, `out_size` should be equal with or larger than
max(dst_index) + 1. max(dst_index) + 1. Default value is `None`.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
......
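The example for this hunk is not shown; below is a minimal sketch of calling `send_ue_recv` with the documented defaults (`message_op='add'`, `reduce_op='sum'`). The tensor values and shapes are illustrative assumptions, not taken from the original docstring:

.. code-block:: python

    import paddle

    # 3 nodes with 3 features each, and 4 edges with a scalar feature each
    x = paddle.to_tensor([[0., 2., 3.], [1., 4., 5.], [2., 6., 7.]])
    e = paddle.to_tensor([[1.], [1.], [1.], [1.]])
    src_index = paddle.to_tensor([0, 1, 2, 0], dtype="int32")
    dst_index = paddle.to_tensor([1, 2, 1, 0], dtype="int32")

    # gather x[src_index], add the edge feature, then sum-reduce by dst_index
    out = paddle.geometric.send_ue_recv(x, e, src_index, dst_index,
                                        message_op="add", reduce_op="sum")
    # out has one row per destination node (max(dst_index) + 1 rows by default)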
...@@ -25,6 +25,7 @@ def reindex_graph( ...@@ -25,6 +25,7 @@ def reindex_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex Graph API. Reindex Graph API.
This API is mainly used in the Graph Learning domain, which should be used This API is mainly used in the Graph Learning domain, which should be used
...@@ -48,12 +49,12 @@ def reindex_graph( ...@@ -48,12 +49,12 @@ def reindex_graph(
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -68,6 +69,7 @@ def reindex_graph( ...@@ -68,6 +69,7 @@ def reindex_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors = [8, 9, 0, 4, 7, 6, 7] neighbors = [8, 9, 0, 4, 7, 6, 7]
count = [2, 3, 2] count = [2, 3, 2]
...@@ -137,6 +139,7 @@ def reindex_heter_graph( ...@@ -137,6 +139,7 @@ def reindex_heter_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex HeterGraph API. Reindex HeterGraph API.
This API is mainly used in the Graph Learning domain, which should be used This API is mainly used in the Graph Learning domain, which should be used
...@@ -160,12 +163,12 @@ def reindex_heter_graph( ...@@ -160,12 +163,12 @@ def reindex_heter_graph(
The data type should be the same with `x`. The data type should be the same with `x`.
count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. count (list|tuple): The neighbor counts of the input nodes `x` from different graphs.
And the data type should be int32. And the data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -182,6 +185,7 @@ def reindex_heter_graph( ...@@ -182,6 +185,7 @@ def reindex_heter_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors_a = [8, 9, 0, 4, 7, 6, 7] neighbors_a = [8, 9, 0, 4, 7, 6, 7]
count_a = [2, 3, 2] count_a = [2, 3, 2]
......
...@@ -31,6 +31,7 @@ def sample_neighbors( ...@@ -31,6 +31,7 @@ def sample_neighbors(
name=None, name=None,
): ):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in the Graph Learning domain, and the main purpose is to This API is mainly used in the Graph Learning domain, and the main purpose is to
...@@ -51,16 +52,16 @@ def sample_neighbors( ...@@ -51,16 +52,16 @@ def sample_neighbors(
The data type should be the same with `row`. The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`. data type should be the same with `row`.
sample_size (int): The number of neighbors we need to sample. Default value is -1, sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
which means returning all the neighbors of the input nodes. which means returning all the neighbors of the input nodes.
eids (Tensor): The eid information of the input graph. If return_eids is True, eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
then `eids` should not be None. The data type should be the then `eids` should not be None. The data type should be the
same with `row`. Default is None. same with `row`. Default is None.
return_eids (bool): Whether to return eid information of sample edges. Default is False. return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
is True, then `perm_buffer` should not be None. The data type should is True, then `perm_buffer` should not be None. The data type should
be the same with `row`. If not None, we will use fisher-yates sampling be the same with `row`. If not None, we will use fisher-yates sampling
to speed up. Only useful for gpu version. to speed up. Only useful for gpu version. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -68,15 +69,16 @@ def sample_neighbors( ...@@ -68,15 +69,16 @@ def sample_neighbors(
- out_neighbors (Tensor), the sample neighbors of the input nodes. - out_neighbors (Tensor), the sample neighbors of the input nodes.
- out_count (Tensor), the number of sampling neighbors of each input node, and the shape - out_count (Tensor), the number of sampling neighbors of each input node, and the shape
should be the same with `input_nodes`. should be the same with `input_nodes`.
- out_eids (Tensor), if `return_eids` is True, we will return the eid information of the - out_eids (Tensor), if `return_eids` is True, we will return the eid information of the
sample edges. sample edges.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
......
...@@ -258,7 +258,9 @@ def _update_input_info(inputs): ...@@ -258,7 +258,9 @@ def _update_input_info(inputs):
class StaticGraphAdapter: class StaticGraphAdapter:
""" """
Model training/inference with a static graph. Model training/inference with a static graph.
""" """
def __init__(self, model): def __init__(self, model):
...@@ -1005,6 +1007,7 @@ class DynamicGraphAdapter: ...@@ -1005,6 +1007,7 @@ class DynamicGraphAdapter:
class Model: class Model:
""" """
A Model object is a network with training and inference features. A Model object is a network with training and inference features.
Dynamic graph and static graph are supported at the same time, Dynamic graph and static graph are supported at the same time,
switched by `paddle.enable_static()`. The usage is as follows. switched by `paddle.enable_static()`. The usage is as follows.
...@@ -1145,6 +1148,7 @@ class Model: ...@@ -1145,6 +1148,7 @@ class Model:
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
""" """
Run one training step on one batch of data. The `update` flag indicates Run one training step on one batch of data. The `update` flag indicates
whether the optimizer applies the gradients computed from this batch. whether the optimizer applies the gradients computed from this batch.
...@@ -1190,6 +1194,7 @@ class Model: ...@@ -1190,6 +1194,7 @@ class Model:
loss = model.train_batch([data], [label]) loss = model.train_batch([data], [label])
print(loss) print(loss)
# [array([2.192784], dtype=float32)] # [array([2.192784], dtype=float32)]
""" """
loss = self._adapter.train_batch(inputs, labels, update) loss = self._adapter.train_batch(inputs, labels, update)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1199,6 +1204,7 @@ class Model: ...@@ -1199,6 +1204,7 @@ class Model:
@no_grad() @no_grad()
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
""" """
Run one evaluating step on a batch of data. Run one evaluating step on a batch of data.
Args: Args:
...@@ -1242,6 +1248,7 @@ class Model: ...@@ -1242,6 +1248,7 @@ class Model:
loss, acc = model.eval_batch([data], [label]) loss, acc = model.eval_batch([data], [label])
print(loss, acc) print(loss, acc)
# [array([2.8825705], dtype=float32)] [0.0] # [array([2.8825705], dtype=float32)] [0.0]
""" """
loss = self._adapter.eval_batch(inputs, labels) loss = self._adapter.eval_batch(inputs, labels)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1251,6 +1258,7 @@ class Model: ...@@ -1251,6 +1258,7 @@ class Model:
@no_grad() @no_grad()
def predict_batch(self, inputs): def predict_batch(self, inputs):
""" """
Run one predicting step on a batch of data. Run one predicting step on a batch of data.
Args: Args:
...@@ -1289,6 +1297,7 @@ class Model: ...@@ -1289,6 +1297,7 @@ class Model:
# [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759, # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759,
# 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]], # 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]],
# dtype=float32)] # dtype=float32)]
""" """
loss = self._adapter.predict_batch(inputs) loss = self._adapter.predict_batch(inputs)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1297,6 +1306,7 @@ class Model: ...@@ -1297,6 +1306,7 @@ class Model:
def save(self, path, training=True): def save(self, path, training=True):
""" """
This function saves parameters, optimizer information or model and This function saves parameters, optimizer information or model and
paramters only for inference to path. It depends on the parameter paramters only for inference to path. It depends on the parameter
`training`. `training`.
...@@ -1364,6 +1374,7 @@ class Model: ...@@ -1364,6 +1374,7 @@ class Model:
model.fit(data, epochs=1, batch_size=32, verbose=0) model.fit(data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference model.save('inference_model', False) # save for inference
""" """
if ParallelEnv().local_rank == 0: if ParallelEnv().local_rank == 0:
...@@ -1374,6 +1385,7 @@ class Model: ...@@ -1374,6 +1385,7 @@ class Model:
def load(self, path, skip_mismatch=False, reset_optimizer=False): def load(self, path, skip_mismatch=False, reset_optimizer=False):
""" """
Load from files storing the model states and optimizer states. The file Load from files storing the model states and optimizer states. The file
for optimizer states is not necessary if no need to restore the optimizer. for optimizer states is not necessary if no need to restore the optimizer.
...@@ -1421,6 +1433,7 @@ class Model: ...@@ -1421,6 +1433,7 @@ class Model:
model.save('checkpoint/test') model.save('checkpoint/test')
model.load('checkpoint/test') model.load('checkpoint/test')
""" """
def _load_state_from_path(path): def _load_state_from_path(path):
...@@ -1491,6 +1504,7 @@ class Model: ...@@ -1491,6 +1504,7 @@ class Model:
def parameters(self, *args, **kwargs): def parameters(self, *args, **kwargs):
""" """
Returns a list of parameters of the model. Returns a list of parameters of the model.
Returns: Returns:
...@@ -1513,6 +1527,7 @@ class Model: ...@@ -1513,6 +1527,7 @@ class Model:
nn.Linear(200, 10)), input) nn.Linear(200, 10)), input)
params = model.parameters() params = model.parameters()
""" """
return self._adapter.parameters() return self._adapter.parameters()
...@@ -1609,6 +1624,7 @@ class Model: ...@@ -1609,6 +1624,7 @@ class Model:
self, optimizer=None, loss=None, metrics=None, amp_configs=None self, optimizer=None, loss=None, metrics=None, amp_configs=None
): ):
""" """
Configures the model before running. Configures the model before running.
Args: Args:
...@@ -1640,6 +1656,7 @@ class Model: ...@@ -1640,6 +1656,7 @@ class Model:
Returns: Returns:
None None
""" """
self._place = _get_device() self._place = _get_device()
if isinstance(self._place, fluid.CUDAPlace): if isinstance(self._place, fluid.CUDAPlace):
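A minimal sketch of how `prepare` is typically called before training; the network, optimizer and metric below are illustrative assumptions rather than part of this diff:

.. code-block:: python

    import paddle
    from paddle.metric import Accuracy

    # wrap a simple classifier and configure it for training
    net = paddle.nn.Sequential(paddle.nn.Flatten(), paddle.nn.Linear(784, 10))
    model = paddle.Model(net)
    optim = paddle.optimizer.Adam(learning_rate=1e-3, parameters=model.parameters())
    model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=Accuracy())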
...@@ -1699,6 +1716,7 @@ class Model: ...@@ -1699,6 +1716,7 @@ class Model:
num_iters=None, num_iters=None,
): ):
""" """
Trains the model for a fixed number of epochs. If `eval_data` is set, Trains the model for a fixed number of epochs. If `eval_data` is set,
evaluation will be done at the end of each epoch. evaluation will be done at the end of each epoch.
...@@ -1753,7 +1771,7 @@ class Model: ...@@ -1753,7 +1771,7 @@ class Model:
How to make a batch is done internally. How to make a batch is done internally.
.. code-block:: python .. code-block:: python
:name: code-example1 :name: code-example3
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1793,7 +1811,7 @@ class Model: ...@@ -1793,7 +1811,7 @@ class Model:
DataLoader. DataLoader.
.. code-block:: python .. code-block:: python
:name: code-example2 :name: code-example4
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1830,6 +1848,7 @@ class Model: ...@@ -1830,6 +1848,7 @@ class Model:
val_loader, val_loader,
epochs=2, epochs=2,
save_dir='mnist_checkpoint') save_dir='mnist_checkpoint')
""" """
assert train_data is not None, "train_data must be given!" assert train_data is not None, "train_data must be given!"
......
...@@ -37,6 +37,7 @@ def graph_sample_neighbors( ...@@ -37,6 +37,7 @@ def graph_sample_neighbors(
name=None, name=None,
): ):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in the Graph Learning domain, and the main purpose is to This API is mainly used in the Graph Learning domain, and the main purpose is to
...@@ -72,27 +73,26 @@ def graph_sample_neighbors( ...@@ -72,27 +73,26 @@ def graph_sample_neighbors(
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
out_neighbors (Tensor): The sample neighbors of the input nodes. - out_neighbors (Tensor): The sample neighbors of the input nodes.
out_count (Tensor): The number of sampling neighbors of each input node, and the shape - out_count (Tensor): The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`.
should be the same with `input_nodes`. - out_eids (Tensor): If `return_eids` is True, we will return the eid information of the sample edges.
out_eids (Tensor): If `return_eids` is True, we will return the eid information of the
sample edges.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), import paddle
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
nodes = [0, 8, 1, 2] colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
sample_size = 2 nodes = [0, 8, 1, 2]
row = paddle.to_tensor(row, dtype="int64") sample_size = 2
colptr = paddle.to_tensor(colptr, dtype="int64") row = paddle.to_tensor(row, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64") colptr = paddle.to_tensor(colptr, dtype="int64")
out_neighbors, out_count = \ nodes = paddle.to_tensor(nodes, dtype="int64")
paddle.incubate.graph_sample_neighbors(row, colptr, nodes, out_neighbors, out_count = \
sample_size=sample_size) paddle.incubate.graph_sample_neighbors(row, colptr, nodes,
sample_size=sample_size)
""" """
......
...@@ -710,6 +710,7 @@ def upsample( ...@@ -710,6 +710,7 @@ def upsample(
name=None, name=None,
): ):
""" """
This API resizes a batch of images. This API resizes a batch of images.
The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
...@@ -720,11 +721,12 @@ def upsample( ...@@ -720,11 +721,12 @@ def upsample(
and the resizing only applies to the three dimensions (depth, height and width). and the resizing only applies to the three dimensions (depth, height and width).
Supporting resample methods: Supporting resample methods:
'linear' : Linear interpolation - 'linear' : Linear interpolation
'bilinear' : Bilinear interpolation - 'bilinear' : Bilinear interpolation
'trilinear' : Trilinear interpolation - 'trilinear' : Trilinear interpolation
'nearest' : Nearest neighbor interpolation - 'nearest' : Nearest neighbor interpolation
'bicubic' : Bicubic interpolation - 'bicubic' : Bicubic interpolation
Linear interpolation is the method of using a line connecting two known quantities Linear interpolation is the method of using a line connecting two known quantities
to determine the value of an unknown quantity between the two known quantities. to determine the value of an unknown quantity between the two known quantities.
...@@ -757,77 +759,78 @@ def upsample( ...@@ -757,77 +759,78 @@ def upsample(
`paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`.
Example: Example:
.. code-block:: text .. code-block:: text
For scale_factor: For scale_factor:
if align_corners = True && out_size > 1 : if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0) scale_factor = (in_size-1.0)/(out_size-1.0)
        else:
            scale_factor = float(in_size/out_size)

    Linear interpolation:
        if:
            align_corners = False , align_mode = 0
            input : (N,C,W_in)
            output: (N,C,W_out) where:
                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
        else:
            input : (N,C,W_in)
            output: (N,C,W_out) where:
                W_out = W_{in} * scale_{factor}

    Nearest neighbor interpolation:
        if:
            align_corners = False
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = floor (H_{in} * scale_{factor})
                W_out = floor (W_{in} * scale_{factor})
        else:
            align_corners = True
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = round(H_{in} * scale_{factor})
                W_out = round(W_{in} * scale_{factor})

    Bilinear interpolation:
        if:
            align_corners = False , align_mode = 0
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = (H_{in}+0.5) * scale_{factor} - 0.5
                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
        else:
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = H_{in} * scale_{factor}
                W_out = W_{in} * scale_{factor}

    Bicubic interpolation:
        if:
            align_corners = False
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = (H_{in}+0.5) * scale_{factor} - 0.5
                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
        else:
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = H_{in} * scale_{factor}
                W_out = W_{in} * scale_{factor}

    Trilinear interpolation:
        if:
            align_corners = False , align_mode = 0
            input : (N,C,D_in,H_in,W_in)
            output: (N,C,D_out,H_out,W_out) where:
                D_out = (D_{in}+0.5) * scale_{factor} - 0.5
                H_out = (H_{in}+0.5) * scale_{factor} - 0.5
                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
        else:
            input : (N,C,D_in,H_in,W_in)
            output: (N,C,D_out,H_out,W_out) where:
                D_out = D_{in} * scale_{factor}
                H_out = H_{in} * scale_{factor}
                W_out = W_{in} * scale_{factor}

    For details of linear interpolation, please refer to Wikipedia:
    https://en.wikipedia.org/wiki/Linear_interpolation.
For details of nearest neighbor interpolation, please refer to Wikipedia: For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
...@@ -871,23 +874,24 @@ def upsample( ...@@ -871,23 +874,24 @@ def upsample(
name(str, optional): The default value is None. name(str, optional): The default value is None.
Normally there is no need for user to set this property. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` For more information, please refer to :ref:`api_guide_Name`
Returns: Returns:
A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32)
upsample_out = paddle.nn.Upsample(size=[12,12]) upsample_out = paddle.nn.Upsample(size=[12,12])
output = upsample_out(x=input_data) output = upsample_out(x=input_data)
print(output.shape) print(output.shape)
# [2L, 3L, 12L, 12L] # [2L, 3L, 12L, 12L]
""" """
return interpolate( return interpolate(
......
...@@ -23,6 +23,7 @@ __all__ = [] ...@@ -23,6 +23,7 @@ __all__ = []
def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None): def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None):
r""" r"""
It computes the pairwise distance between two vectors. The It computes the pairwise distance between two vectors. The
distance is calculated by p-order norm: distance is calculated by p-order norm:
...@@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None): ...@@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None):
Returns: Returns:
Tensor, the dtype is same as input tensor. Tensor, the dtype is same as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
Examples: Examples:
.. code-block:: python .. code-block:: python
......
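The example body is truncated here; a minimal sketch of the functional form (input values are illustrative assumptions):

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[1., 3.], [3., 3.]], dtype="float64")
    y = paddle.to_tensor([[5., 6.], [6., 7.]], dtype="float64")
    # p defaults to 2, so this is the per-row Euclidean distance
    distance = paddle.nn.functional.pairwise_distance(x, y)
    print(distance)  # both rows differ by a 3-4-5 offset, so roughly [5., 5.]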
...@@ -1305,6 +1305,7 @@ def margin_ranking_loss( ...@@ -1305,6 +1305,7 @@ def margin_ranking_loss(
def l1_loss(input, label, reduction='mean', name=None): def l1_loss(input, label, reduction='mean', name=None):
r""" r"""
Computes the L1 Loss of Tensor ``input`` and ``label`` as follows. Computes the L1 Loss of Tensor ``input`` and ``label`` as follows.
If `reduction` set to ``'none'``, the loss is: If `reduction` set to ``'none'``, the loss is:
...@@ -1336,7 +1337,7 @@ def l1_loss(input, label, reduction='mean', name=None): ...@@ -1336,7 +1337,7 @@ def l1_loss(input, label, reduction='mean', name=None):
Returns: Returns:
Tensor, the L1 Loss of Tensor ``input`` and ``label``. Tensor, the L1 Loss of Tensor ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
Examples: Examples:
...@@ -1359,6 +1360,7 @@ def l1_loss(input, label, reduction='mean', name=None): ...@@ -1359,6 +1360,7 @@ def l1_loss(input, label, reduction='mean', name=None):
l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum') l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
print(l1_loss.numpy()) print(l1_loss.numpy())
# [1.4] # [1.4]
""" """
if reduction not in ['sum', 'mean', 'none']: if reduction not in ['sum', 'mean', 'none']:
raise ValueError( raise ValueError(
...@@ -2281,6 +2283,7 @@ def cross_entropy( ...@@ -2281,6 +2283,7 @@ def cross_entropy(
name=None, name=None,
): ):
r""" r"""
By default, this operator implements the cross entropy loss function with softmax. This function By default, this operator implements the cross entropy loss function with softmax. This function
combines the calculation of the softmax operation and the cross entropy loss function combines the calculation of the softmax operation and the cross entropy loss function
to provide a more numerically stable computation. to provide a more numerically stable computation.
...@@ -2394,21 +2397,13 @@ def cross_entropy( ...@@ -2394,21 +2397,13 @@ def cross_entropy(
Parameters: Parameters:
input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` .
- **input** (Tensor)
Input tensor, the data type is float32, float64. Shape is
:math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` .
Note: Note:
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results.
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
output of softmax operator, which will produce incorrect results.
2. when use_softmax=False, it expects the output of softmax operator. 2. when use_softmax=False, it expects the output of softmax operator.
- **label** (Tensor) label (Tensor):
1. If soft_label=False, the shape is 1. If soft_label=False, the shape is
:math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
the data type is int32, int64, float32, float64, where each value is [0, C-1]. the data type is int32, int64, float32, float64, where each value is [0, C-1].
...@@ -2416,48 +2411,27 @@ def cross_entropy( ...@@ -2416,48 +2411,27 @@ def cross_entropy(
2. If soft_label=True, the shape and data type should be same with ``input`` , 2. If soft_label=True, the shape and data type should be same with ``input`` ,
and the sum of the labels for each sample should be 1. and the sum of the labels for each sample should be 1.
- **weight** (Tensor, optional) weight (Tensor, optional): a manual rescaling weight given to each class.
a manual rescaling weight given to each class.
If given, has to be a Tensor of size C and the data type is float32, float64. If given, has to be a Tensor of size C and the data type is float32, float64.
Default is ``'None'`` . Default is ``'None'`` .
ignore_index (int64, optional): Specifies a target value that is ignored
- **ignore_index** (int64, optional)
Specifies a target value that is ignored
and does not contribute to the loss. A negative value means that no label and does not contribute to the loss. A negative value means that no label
value needs to be ignored. Only valid when soft_label = False. value needs to be ignored. Only valid when soft_label = False.
Default is ``-100`` . Default is ``-100`` .
reduction (str, optional): Indicate how to average the loss by batch_size,
- **reduction** (str, optional)
Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``. Default is ``'mean'``.
soft_label (bool, optional): Indicate whether label is soft. Default is ``False``.
- **soft_label** (bool, optional) axis (int, optional): The index of dimension to perform softmax calculations.
Indicate whether label is soft.
Default is ``False``.
- **axis** (int, optional)
The index of dimension to perform softmax calculations.
It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
number of dimensions of input :attr:`input`. number of dimensions of input :attr:`input`.
Default is ``-1`` . Default is ``-1`` .
use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
- **use_softmax** (bool, optional)
Indicate whether compute softmax before cross_entropy.
Default is ``True``. Default is ``True``.
name (str, optional): The name of the operator. Default is ``None`` .
- **name** (str, optional)
The name of the operator. Default is ``None`` .
For more information, please refer to :ref:`api_guide_Name` . For more information, please refer to :ref:`api_guide_Name` .
Returns: Returns:
...@@ -2473,9 +2447,7 @@ def cross_entropy( ...@@ -2473,9 +2447,7 @@ def cross_entropy(
2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .
Examples: Examples:
.. code-block:: python .. code-block:: python
# hard labels # hard labels
...@@ -3958,6 +3930,7 @@ def multi_margin_loss( ...@@ -3958,6 +3930,7 @@ def multi_margin_loss(
def soft_margin_loss(input, label, reduction='mean', name=None): def soft_margin_loss(input, label, reduction='mean', name=None):
""" """
The API measures the soft margin loss between input predictions ``input`` The API measures the soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as: and target labels ``label`` . It can be described as:
...@@ -3966,9 +3939,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None): ...@@ -3966,9 +3939,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Parameters: Parameters:
input (Tensor): The input predictions tensor with shape: [N, *], input (Tensor): The input predictions tensor with shape: ``[N, *]``,
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf. N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf.
Available dtype is float32, float64. Available dtype is float32, float64.
label (Tensor): The target labels tensor with the same shape as label (Tensor): The target labels tensor with the same shape as
``input``. The target labels, whose values should be -1 or 1. ``input``. The target labels, whose values should be -1 or 1.
...@@ -3986,8 +3959,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): ...@@ -3986,8 +3959,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Returns: Returns:
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].
same as ``input`` , else the shape of output is [1].
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -4013,6 +3985,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): ...@@ -4013,6 +3985,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
# [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678], # [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678],
# [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790], # [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790],
# [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]]) # [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]])
""" """
if reduction not in ['sum', 'mean', 'none']: if reduction not in ['sum', 'mean', 'none']:
raise ValueError( raise ValueError(
......
...@@ -1735,15 +1735,17 @@ def adaptive_avg_pool1d(x, output_size, name=None): ...@@ -1735,15 +1735,17 @@ def adaptive_avg_pool1d(x, output_size, name=None):
def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
r""" r"""
Applies 2D adaptive avg pooling on input tensor. The h and w dimensions Applies 2D adaptive avg pooling on input tensor. The h and w dimensions
of the output tensor are determined by the parameter output_size. of the output tensor are determined by the parameter output_size.
For avg adaptive pool2d: For avg adaptive pool2d:
.. math:: .. math::
hstart &= floor(i * H_{in} / H_{out}) hstart &= floor(i * H_{in} / H_{out}) \\
hend &= ceil((i + 1) * H_{in} / H_{out}) hend &= ceil((i + 1) * H_{in} / H_{out}) \\
wstart &= floor(j * W_{in} / W_{out}) wstart &= floor(j * W_{in} / W_{out}) \\
wend &= ceil((j + 1) * W_{in} / W_{out}) wend &= ceil((j + 1) * W_{in} / W_{out}) \\
Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)} Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)}
Args: Args:
...@@ -1752,14 +1754,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): ...@@ -1752,14 +1754,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
it must contain two elements, (H, W). H and W can be either an int, or None which means it must contain two elements, (H, W). H and W can be either an int, or None which means
the size will be the same as that of the input. the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string data_format (str, optional): The data format of the input and output data. An optional string
from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in
the order of: [batch_size, input_channels, input_height, input_width]. the order of: [batch_size, input_channels, input_height, input_width].
name(str, optional): For detailed information, please refer name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and to :ref:`api_guide_Name`. Usually name is no need to set and
None by default. None by default.
Returns: Returns:
Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor. Tensor, The output tensor of avg adaptive pool2d result. The data type is same as input tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1787,6 +1790,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): ...@@ -1787,6 +1790,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
x = x, x = x,
output_size=[3, 3]) output_size=[3, 3])
# out.shape is [2, 3, 3, 3] # out.shape is [2, 3, 3, 3]
""" """
if not in_dynamic_mode(): if not in_dynamic_mode():
check_variable_and_dtype( check_variable_and_dtype(
...@@ -1879,34 +1883,36 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): ...@@ -1879,34 +1883,36 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None):
def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
r""" r"""
This operation applies 3D adaptive avg pooling on input tensor. The d, h and w dimensions This operation applies 3D adaptive avg pooling on input tensor. The d, h and w dimensions
of the output tensor are determined by the parameter output_size. of the output tensor are determined by the parameter output_size.
For avg adaptive pool3d: For avg adaptive pool3d:
.. math:: .. math::
dstart &= floor(i * D_{in} / D_{out}) dstart &= floor(i * D_{in} / D_{out}) \\
dend &= ceil((i + 1) * D_{in} / D_{out}) dend &= ceil((i + 1) * D_{in} / D_{out}) \\
hstart &= floor(j * H_{in} / H_{out}) hstart &= floor(j * H_{in} / H_{out}) \\
hend &= ceil((j + 1) * H_{in} / H_{out}) hend &= ceil((j + 1) * H_{in} / H_{out}) \\
wstart &= floor(k * W_{in} / W_{out}) wstart &= floor(k * W_{in} / W_{out}) \\
wend &= ceil((k + 1) * W_{in} / W_{out}) wend &= ceil((k + 1) * W_{in} / W_{out}) \\
Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]} Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]}
{(dend - dstart) * (hend - hstart) * (wend - wstart)} {(dend - dstart) * (hend - hstart) * (wend - wstart)}
Args: Args:
x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor.
The data type can be float32, float64. The data type can be float32, float64.
output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or
it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means list, it must contain three elements, (D, H, W). D, H and W can be either a int,
the size will be the same as that of the input. or None which means the size will be the same as that of the input.
data_format (str): The data format of the input and output data. An optional string data_format (str, optional): The data format of the input and output data. An optional string
from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in
the order of: [batch_size, input_channels, input_depth, input_height, input_width]. the order of: [batch_size, input_channels, input_depth, input_height, input_width].
name(str, optional): For detailed information, please refer name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`.
to :ref:`api_guide_Name`. Usually name is no need to set and Usually name is no need to set and None by default.
None by default.
Returns: Returns:
Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. Tensor, The output tensor of avg adaptive pool3d result. The data type is same as input tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1936,6 +1942,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): ...@@ -1936,6 +1942,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None):
x = input_data, x = input_data,
output_size=[3, 3, 3]) output_size=[3, 3, 3])
# out.shape is [2, 3, 3, 3, 3] # out.shape is [2, 3, 3, 3, 3]
""" """
if not in_dynamic_mode(): if not in_dynamic_mode():
check_variable_and_dtype( check_variable_and_dtype(
......
...@@ -1449,15 +1449,16 @@ class Maxout(Layer): ...@@ -1449,15 +1449,16 @@ class Maxout(Layer):
class Softmax2D(Layer): class Softmax2D(Layer):
r""" r"""
Softmax2D Activation. Softmax2D Activation.
Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j). Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j).
The sum of the result at each location (C, h_i, w_j) will be one. The sum of the result at each location (C, h_i, w_j) will be one.
Shape: Shape:
- Input: :math:`(B, C, H, W)` or :math:`(C, H, W)` - Input: :math:`(B, C, H, W)` or :math:`(C, H, W)`
- Output: :math:`(B, C, H, W)` or :math:`(C, H, W)`(same as input) - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)` (same as input)
Return: Returns:
A Tensor of the same shape and dtype as input with value in range [0, 1]. A Tensor of the same shape and dtype as input with value in range [0, 1].
Examples: Examples:
...@@ -1482,6 +1483,7 @@ class Softmax2D(Layer): ...@@ -1482,6 +1483,7 @@ class Softmax2D(Layer):
# [[0.42368975 0.51082766 0.47752273 0.5258871 ] # [[0.42368975 0.51082766 0.47752273 0.5258871 ]
# [0.66754097 0.47182566 0.5187628 0.5402329 ] # [0.66754097 0.47182566 0.5187628 0.5402329 ]
# [0.49014282 0.46369177 0.50340754 0.5289428 ]]]] # [0.49014282 0.46369177 0.50340754 0.5289428 ]]]]
""" """
def __init__(self, name=None): def __init__(self, name=None):
......
...@@ -20,6 +20,7 @@ __all__ = [] ...@@ -20,6 +20,7 @@ __all__ = []
class PairwiseDistance(Layer): class PairwiseDistance(Layer):
r""" r"""
It computes the pairwise distance between two vectors. The It computes the pairwise distance between two vectors. The
distance is calculated by p-order norm: distance is calculated by p-order norm:
...@@ -38,14 +39,14 @@ class PairwiseDistance(Layer): ...@@ -38,14 +39,14 @@ class PairwiseDistance(Layer):
Generally, no setting is required. Default: None. Generally, no setting is required. Default: None.
Shape: Shape:
x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D`
is the dimension of the data. Available data type is float32, float64. is the dimension of the data. Available data type is float32, float64.
y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x.
output: The same dtype as input tensor. - output: The same dtype as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
Examples: Examples:
.. code-block:: python .. code-block:: python
......
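Since the example above is truncated in this hunk, here is a minimal sketch assuming the ``paddle.nn.PairwiseDistance`` layer; the tensor values are illustrative.

.. code-block:: python

    import paddle
    import paddle.nn as nn

    x = paddle.to_tensor([[1., 3.], [3., 3.]], dtype='float64')
    y = paddle.to_tensor([[5., 6.], [6., 7.]], dtype='float64')
    dist = nn.PairwiseDistance(p=2.)
    out = dist(x, y)                   # p-order norm of (x - y) per row
    print(out.shape)                   # [2]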
...@@ -26,7 +26,8 @@ __all__ = [] ...@@ -26,7 +26,8 @@ __all__ = []
class BCEWithLogitsLoss(Layer): class BCEWithLogitsLoss(Layer):
r""" r"""
This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer.
This operator combines the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer.
Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits``
layer and some reduce operations. layer and some reduce operations.
...@@ -49,7 +50,7 @@ class BCEWithLogitsLoss(Layer): ...@@ -49,7 +50,7 @@ class BCEWithLogitsLoss(Layer):
For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0,
we reformulate the loss as follows: we reformulate the loss as follows:
.. math:: .. math::
Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|})
Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the
...@@ -81,21 +82,21 @@ class BCEWithLogitsLoss(Layer): ...@@ -81,21 +82,21 @@ class BCEWithLogitsLoss(Layer):
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Shapes: Shapes:
logit (Tensor): The input predictions tensor. 2-D tensor with shape: [N, *], - logit (Tensor): The input predictions tensor. 2-D tensor with shape: [N, `*`],
N is batch_size, `*` means number of additional dimensions. The ``logit`` N is batch_size, `*` means number of additional dimensions. The ``logit``
is usually the output of Linear layer. Available dtype is float32, float64. is usually the output of Linear layer. Available dtype is float32, float64.
label (Tensor): The target labels tensor. 2-D tensor with the same shape as - label (Tensor): The target labels tensor. 2-D tensor with the same shape as
``logit``. The target labels whose values should be numbers between 0 and 1. ``logit``. The target labels whose values should be numbers between 0 and 1.
Available dtype is float32, float64. Available dtype is float32, float64.
output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``logit`` , else the shape of output is scalar. same as ``logit`` , else the shape of output is scalar.
Returns: Returns:
A callable object of BCEWithLogitsLoss. A callable object of BCEWithLogitsLoss.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32")
label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
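# Hedged continuation of the truncated example above, assuming the
# paddle.nn.BCEWithLogitsLoss layer with its default reduction='mean'.
bce_logit_loss = paddle.nn.BCEWithLogitsLoss()
output = bce_logit_loss(logit, label)
print(output)   # a scalar, roughly 0.45 for these inputs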
...@@ -134,6 +135,7 @@ class BCEWithLogitsLoss(Layer): ...@@ -134,6 +135,7 @@ class BCEWithLogitsLoss(Layer):
class CrossEntropyLoss(Layer): class CrossEntropyLoss(Layer):
r""" r"""
By default, this operator implements the cross entropy loss function with softmax. This function By default, this operator implements the cross entropy loss function with softmax. This function
combines the calculation of the softmax operation and the cross entropy loss function combines the calculation of the softmax operation and the cross entropy loss function
to provide a more numerically stable computing. to provide a more numerically stable computing.
...@@ -246,60 +248,35 @@ class CrossEntropyLoss(Layer): ...@@ -246,60 +248,35 @@ class CrossEntropyLoss(Layer):
Parameters: Parameters:
weight (Tensor, optional): a manual rescaling weight given to each class.
- **weight** (Tensor, optional)
a manual rescaling weight given to each class.
If given, has to be a Tensor of size C and the data type is float32, float64. If given, has to be a Tensor of size C and the data type is float32, float64.
Default is ``'None'`` . Default is ``'None'`` .
ignore_index (int64, optional): Specifies a target value that is ignored
- **ignore_index** (int64, optional)
Specifies a target value that is ignored
and does not contribute to the loss. A negative value means that no label and does not contribute to the loss. A negative value means that no label
value needs to be ignored. Only valid when soft_label = False. value needs to be ignored. Only valid when soft_label = False.
Default is ``-100`` . Default is ``-100`` .
reduction (str, optional): Indicate how to average the loss by batch_size,
- **reduction** (str, optional)
Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``. Default is ``'mean'``.
soft_label (bool, optional): Indicate whether label is soft.
- **soft_label** (bool, optional)
Indicate whether label is soft.
If soft_label=False, the label is hard. If soft_label=True, the label is soft. If soft_label=False, the label is hard. If soft_label=True, the label is soft.
Default is ``False``. Default is ``False``.
axis (int, optional): The index of dimension to perform softmax calculations.
- **axis** (int, optional)
The index of dimension to perform softmax calculations.
It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number
of dimensions of input :attr:`input`. of dimensions of input :attr:`input`.
Default is ``-1`` . Default is ``-1`` .
use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
- **use_softmax** (bool, optional)
Indicate whether compute softmax before cross_entropy.
Default is ``True``. Default is ``True``.
name (str, optional): The name of the operator. Default is ``None`` .
- **name** (str, optional)
The name of the operator. Default is ``None`` .
For more information, please refer to :ref:`api_guide_Name` . For more information, please refer to :ref:`api_guide_Name` .
Shape: Shape:
- **input** (Tensor), the data type is float32, float64. Shape is
- **input** (Tensor) :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` .
Input tensor, the data type is float32, float64. Shape is
:math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` .
Note: Note:
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
...@@ -307,7 +284,6 @@ class CrossEntropyLoss(Layer): ...@@ -307,7 +284,6 @@ class CrossEntropyLoss(Layer):
2. when use_softmax=False, it expects the output of softmax operator. 2. when use_softmax=False, it expects the output of softmax operator.
- **label** (Tensor) - **label** (Tensor)
1. If soft_label=False, the shape is 1. If soft_label=False, the shape is
...@@ -317,15 +293,10 @@ class CrossEntropyLoss(Layer): ...@@ -317,15 +293,10 @@ class CrossEntropyLoss(Layer):
2. If soft_label=True, the shape and data type should be same with ``input`` , 2. If soft_label=True, the shape and data type should be same with ``input`` ,
and the sum of the labels for each sample should be 1. and the sum of the labels for each sample should be 1.
- **output** (Tensor) - **output** (Tensor), Return the softmax cross_entropy loss of ``input`` and ``label``.
The data type is the same as input.
Return the softmax cross_entropy loss of ``input`` and ``label``. If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
If :attr:`reduction` is ``'none'``:
The data type is the same as input.
If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``.
If :attr:`reduction` is ``'none'``:
1. If soft_label = False, the dimension of return value is the same with ``label`` . 1. If soft_label = False, the dimension of return value is the same with ``label`` .
...@@ -629,6 +600,7 @@ class MSELoss(Layer): ...@@ -629,6 +600,7 @@ class MSELoss(Layer):
class L1Loss(Layer): class L1Loss(Layer):
r""" r"""
Construct a callable object of the ``L1Loss`` class. Construct a callable object of the ``L1Loss`` class.
The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows. The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows.
...@@ -658,11 +630,11 @@ class L1Loss(Layer): ...@@ -658,11 +630,11 @@ class L1Loss(Layer):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Shape: Shape:
input (Tensor): The input tensor. The shape is [N, *], where N is batch size and `*` means any number of additional dimensions. Its data type should be float32, float64, int32, int64. - input (Tensor): The input tensor. The shape is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. Its data type should be float32, float64, int32, int64.
label (Tensor): label. The shape is [N, *], same shape as ``input`` . Its data type should be float32, float64, int32, int64. - label (Tensor): label. The shape is ``[N, *]``, same shape as ``input`` . Its data type should be float32, float64, int32, int64.
output (Tensor): The L1 Loss of ``input`` and ``label``. - output (Tensor): The L1 Loss of ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -687,6 +659,7 @@ class L1Loss(Layer): ...@@ -687,6 +659,7 @@ class L1Loss(Layer):
print(output) print(output)
# [[0.20000005 0.19999999] # [[0.20000005 0.19999999]
# [0.2 0.79999995]] # [0.2 0.79999995]]
""" """
def __init__(self, reduction='mean', name=None): def __init__(self, reduction='mean', name=None):
...@@ -707,6 +680,7 @@ class L1Loss(Layer): ...@@ -707,6 +680,7 @@ class L1Loss(Layer):
class BCELoss(Layer): class BCELoss(Layer):
""" """
This interface is used to construct a callable object of the ``BCELoss`` class. This interface is used to construct a callable object of the ``BCELoss`` class.
The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input`` The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input``
and target labels ``label`` . The binary_cross_entropy loss can be described as: and target labels ``label`` . The binary_cross_entropy loss can be described as:
...@@ -750,14 +724,14 @@ class BCELoss(Layer): ...@@ -750,14 +724,14 @@ class BCELoss(Layer):
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Shape: Shape:
input (Tensor): 2-D tensor with shape: [N, *], N is batch_size, `*` means - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means
number of additional dimensions. The input ``input`` should always number of additional dimensions. The input ``input`` should always
be the output of sigmoid. Available dtype is float32, float64. be the output of sigmoid. Available dtype is float32, float64.
label (Tensor): 2-D tensor with the same shape as ``input``. The target - label (Tensor): 2-D tensor with the same shape as ``input``. The target
labels whose values should be numbers between 0 and 1. Available labels whose values should be numbers between 0 and 1. Available
dtype is float32, float64. dtype is float32, float64.
output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is scalar. same as ``input`` , else the shape of output is scalar.
Returns: Returns:
A callable object of BCELoss. A callable object of BCELoss.
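A minimal sketch of the ``BCELoss`` usage described above, assuming the ``paddle.nn.BCELoss`` layer; the probabilities are illustrative stand-ins for sigmoid outputs.

.. code-block:: python

    import paddle
    import paddle.nn as nn

    input = paddle.to_tensor([0.5, 0.6, 0.7], dtype="float32")  # sigmoid outputs
    label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32")
    bce_loss = nn.BCELoss()                   # reduction='mean' by default
    output = bce_loss(input, label)
    print(output)                             # scalar, roughly 0.65 for these inputs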
...@@ -850,7 +824,7 @@ class NLLLoss(Layer): ...@@ -850,7 +824,7 @@ class NLLLoss(Layer):
if `reduction` is ``'sum'``, the reduced sum loss is returned; if `reduction` is ``'sum'``, the reduced sum loss is returned;
if `reduction` is ``'none'``, no reduction will be applied. if `reduction` is ``'none'``, no reduction will be applied.
Default is ``'mean'``. Default is ``'mean'``.
name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
Shape: Shape:
- input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes. - input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes.
...@@ -909,6 +883,7 @@ class NLLLoss(Layer): ...@@ -909,6 +883,7 @@ class NLLLoss(Layer):
class KLDivLoss(Layer): class KLDivLoss(Layer):
r""" r"""
Generate a callable object of 'KLDivLoss' to calculate the Generate a callable object of 'KLDivLoss' to calculate the
Kullback-Leibler divergence loss between Input(X) and Kullback-Leibler divergence loss between Input(X) and
Input(Target). Notes that Input(X) is the log-probability Input(Target). Notes that Input(X) is the log-probability
...@@ -928,14 +903,10 @@ class KLDivLoss(Layer): ...@@ -928,14 +903,10 @@ class KLDivLoss(Layer):
Default is ``'mean'``. Default is ``'mean'``.
Shape: Shape:
- input (Tensor): ``(N, *)``, where ``*`` means, any number of additional dimensions.
- input (Tensor): (N, *), where * means, any number of additional dimensions. - label (Tensor): ``(N, *)``, same shape as input.
- label (Tensor): (N, *), same shape as input.
- output (Tensor): tensor with shape: [1] by default. - output (Tensor): tensor with shape: [1] by default.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -965,6 +936,7 @@ class KLDivLoss(Layer): ...@@ -965,6 +936,7 @@ class KLDivLoss(Layer):
kldiv_criterion = nn.KLDivLoss(reduction='none') kldiv_criterion = nn.KLDivLoss(reduction='none')
pred_loss = kldiv_criterion(x, target) pred_loss = kldiv_criterion(x, target)
# shape=[5, 20] # shape=[5, 20]
""" """
def __init__(self, reduction='mean'): def __init__(self, reduction='mean'):
...@@ -1817,6 +1789,7 @@ class MultiMarginLoss(Layer): ...@@ -1817,6 +1789,7 @@ class MultiMarginLoss(Layer):
class SoftMarginLoss(Layer): class SoftMarginLoss(Layer):
r""" r"""
Creates a criterion that measures a two-class soft margin loss between input predictions ``input`` Creates a criterion that measures a two-class soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as: and target labels ``label`` . It can be described as:
...@@ -1835,17 +1808,14 @@ class SoftMarginLoss(Layer): ...@@ -1835,17 +1808,14 @@ class SoftMarginLoss(Layer):
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Shapes: Shapes:
- Input (Tensor): The input tensor with shape: ``[N, *]``,
Input (Tensor): The input tensor with shape: [N, *], N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf Available dtype is float32, float64.
Available dtype is float32, float64. - Label (Tensor): The target labels tensor with the same shape as
``input``. The target labels which values should be numbers -1 or 1.
Label (Tensor): The target labels tensor with the same shape as Available dtype is int32, int64, float32, float64.
``input``. The target labels which values should be numbers -1 or 1. - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
Available dtype is int32, int64, float32, float64. same as ``input`` , else the shape of output is [1].
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is
same as ``input`` , else the shape of output is [1].
Returns: Returns:
A callable object of SoftMarginLoss. A callable object of SoftMarginLoss.
...@@ -1877,6 +1847,7 @@ class SoftMarginLoss(Layer): ...@@ -1877,6 +1847,7 @@ class SoftMarginLoss(Layer):
# [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511], # [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511],
# [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399], # [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399],
# [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]]) # [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]])
""" """
def __init__(self, reduction='mean', name=None): def __init__(self, reduction='mean', name=None):
......
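Because the input side of the example above is cut off by the diff, a small sketch assuming the ``paddle.nn.SoftMarginLoss`` layer; the values are illustrative.

.. code-block:: python

    import paddle
    import paddle.nn as nn

    input = paddle.to_tensor([[0.5, -0.3], [1.2, 0.7]], dtype='float32')
    label = paddle.to_tensor([[1.0, -1.0], [-1.0, 1.0]], dtype='float32')  # -1 or 1
    soft_margin_loss = nn.SoftMarginLoss(reduction='none')
    output = soft_margin_loss(input, label)   # element-wise log(1 + exp(-label * input))
    print(output.shape)                       # [2, 2]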
...@@ -318,6 +318,7 @@ Where `H` means height of feature map, `W` means width of feature map. ...@@ -318,6 +318,7 @@ Where `H` means height of feature map, `W` means width of feature map.
class GroupNorm(Layer): class GroupNorm(Layer):
""" """
This interface is used to construct a callable object of the ``GroupNorm`` class. This interface is used to construct a callable object of the ``GroupNorm`` class.
For more details, refer to code examples. For more details, refer to code examples.
It implements the function of the Group Normalization Layer. It implements the function of the Group Normalization Layer.
...@@ -338,7 +339,7 @@ class GroupNorm(Layer): ...@@ -338,7 +339,7 @@ class GroupNorm(Layer):
name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`..
Shape: Shape:
- x: Tensor with shape: (batch, num_features, *). - x: Tensor with shape: :attr:`(batch, num_features, *)`.
- output: The same shape as input x. - output: The same shape as input x.
Returns: Returns:
...@@ -1041,6 +1042,7 @@ class BatchNorm3D(_BatchNormBase): ...@@ -1041,6 +1042,7 @@ class BatchNorm3D(_BatchNormBase):
class SyncBatchNorm(_BatchNormBase): class SyncBatchNorm(_BatchNormBase):
r""" r"""
This interface is used to construct a callable object of the ``SyncBatchNorm`` class. This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can
be used as a normalizer function for other operations, such as conv2d and fully connected be used as a normalizer function for other operations, such as conv2d and fully connected
...@@ -1086,9 +1088,9 @@ class SyncBatchNorm(_BatchNormBase): ...@@ -1086,9 +1088,9 @@ class SyncBatchNorm(_BatchNormBase):
- :math:`\beta` : trainable shift parameter vector - :math:`\beta` : trainable shift parameter vector
Note: Note:
If you want to use container to pack your model and has ``SyncBatchNorm`` in the If you want to use container to pack your model and has :ref:`api_paddle_nn_SyncBatchNorm` in the
evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of evaluation phase, please use :ref:`api_paddle_nn_LayerList` or :ref:`api_paddle_nn_Sequential` instead of
``list`` to pack the model. :ref:`api_paddle_hub_list` to pack the model.
Parameters: Parameters:
num_features(int): Indicate the number of channels of the input ``Tensor``. num_features(int): Indicate the number of channels of the input ``Tensor``.
...@@ -1106,29 +1108,30 @@ class SyncBatchNorm(_BatchNormBase): ...@@ -1106,29 +1108,30 @@ class SyncBatchNorm(_BatchNormBase):
have trainable bias parameter. Default: None. have trainable bias parameter. Default: None.
Shapes: Shapes:
input: Tensor that the dimension from 2 to 5. - input: Tensor that the dimension from 2 to 5.
output: Tensor with the same shape as input. - output: Tensor with the same shape as input.
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
if paddle.is_compiled_with_cuda(): if paddle.is_compiled_with_cuda():
sync_batch_norm = nn.SyncBatchNorm(2) sync_batch_norm = nn.SyncBatchNorm(2)
hidden1 = sync_batch_norm(x) hidden1 = sync_batch_norm(x)
print(hidden1) print(hidden1)
# Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, # Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False,
# [[[[ 0.26824948, 1.09363246], # [[[[ 0.26824948, 1.09363246],
# [ 0.26824948, -1.63013160]], # [ 0.26824948, -1.63013160]],
# [[ 0.80956620, -0.66528702],
# [-1.27446556, 1.13018656]]]])
# [[ 0.80956620, -0.66528702],
# [-1.27446556, 1.13018656]]]])
""" """
def __init__( def __init__(
...@@ -1277,8 +1280,8 @@ class SyncBatchNorm(_BatchNormBase): ...@@ -1277,8 +1280,8 @@ class SyncBatchNorm(_BatchNormBase):
The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead. The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
......
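A short sketch of the ``convert_sync_batchnorm`` helper whose docstring is edited above, assuming the documented class method; the conversion itself does not require a GPU.

.. code-block:: python

    import paddle
    import paddle.nn as nn

    model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm2D(5))
    sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    print(sync_model)   # the BatchNorm2D layer is replaced by SyncBatchNorm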
...@@ -223,6 +223,7 @@ class AvgPool2D(Layer): ...@@ -223,6 +223,7 @@ class AvgPool2D(Layer):
class AvgPool3D(Layer): class AvgPool3D(Layer):
""" """
This operation applies 3D max pooling over input features based on the input, This operation applies 3D max pooling over input features based on the input,
and kernel_size, stride, padding parameters. Input(X) and Output(Out) are and kernel_size, stride, padding parameters. Input(X) and Output(Out) are
in NCDHW format, where N is batch size, C is the number of channels, in NCDHW format, where N is batch size, C is the number of channels,
...@@ -263,6 +264,7 @@ class AvgPool3D(Layer): ...@@ -263,6 +264,7 @@ class AvgPool3D(Layer):
The data type can be float32, float64. The data type can be float32, float64.
- output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor. - output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor.
The data type is same as input x. The data type is same as input x.
Examples: Examples:
.. code-block:: python .. code-block:: python
......
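A minimal sketch of the ``AvgPool3D`` layer documented above, assuming the ``paddle.nn.AvgPool3D`` API; the NCDHW input shape is illustrative.

.. code-block:: python

    import paddle
    import paddle.nn as nn

    x = paddle.rand([1, 2, 3, 32, 32])                       # NCDHW
    avg_pool = nn.AvgPool3D(kernel_size=2, stride=2, padding=0)
    out = avg_pool(x)
    print(out.shape)                                         # [1, 2, 1, 16, 16]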
...@@ -613,14 +613,17 @@ class QuantizedConv2D(Layer): ...@@ -613,14 +613,17 @@ class QuantizedConv2D(Layer):
class QuantizedConv2DTranspose(Layer): class QuantizedConv2DTranspose(Layer):
""" """
The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose.
The only difference is that its inputs are all fake quantized. The only difference is that its inputs are all fake quantized.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose
x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
conv = nn.Conv2DTranspose(4, 6, (3, 3)) conv = nn.Conv2DTranspose(4, 6, (3, 3))
conv_quantized = QuantizedConv2DTranspose(conv) conv_quantized = QuantizedConv2DTranspose(conv)
...@@ -630,6 +633,7 @@ class QuantizedConv2DTranspose(Layer): ...@@ -630,6 +633,7 @@ class QuantizedConv2DTranspose(Layer):
y_np = y_var.numpy() y_np = y_var.numpy()
print(y_np.shape, y_quantized_np.shape) print(y_np.shape, y_quantized_np.shape)
# (2, 6, 10, 10), (2, 6, 10, 10) # (2, 6, 10, 10), (2, 6, 10, 10)
""" """
def __init__( def __init__(
......
...@@ -1647,6 +1647,7 @@ class MultiplicativeDecay(LRScheduler): ...@@ -1647,6 +1647,7 @@ class MultiplicativeDecay(LRScheduler):
class OneCycleLR(LRScheduler): class OneCycleLR(LRScheduler):
r""" r"""
Sets the learning rate according to the one cycle learning rate scheduler. Sets the learning rate according to the one cycle learning rate scheduler.
The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then
from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate. from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate.
...@@ -1660,22 +1661,25 @@ class OneCycleLR(LRScheduler): ...@@ -1660,22 +1661,25 @@ class OneCycleLR(LRScheduler):
Also note that you should update learning rate each step. Also note that you should update learning rate each step.
Args: Args:
max_learning_rate (float): The maximum learning rate. It is a python float number. max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` .
Functionally, it defines the initial learning rate by ``divide_factor`` .
total_steps (int): Number of total training steps. total_steps (int): Number of total training steps.
divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25.
end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate. end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate.
phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3. phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3.
anneal_strategy (str, optional): Strategy of adjusting learning rate. 'cos' for cosine annealing, anneal_strategy (str, optional): Strategy of adjusting learning rate. 'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'.
'linear' for linear annealing. Default: 'cos'.
three_phase (bool, optional): Whether to use three phase. three_phase (bool, optional): Whether to use three phase.
If ``True``: If ``True``:
1. The learning rate will first increase from initial learning rate to maximum learning rate. 1. The learning rate will first increase from initial learning rate to maximum learning rate.
2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase. 2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase.
3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate. 3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate.
If ``False``: If ``False``:
1. The learning rate will increase to maximum learning rate. 1. The learning rate will increase to maximum learning rate.
2. Then it will directly decrease to minimum learning rate. 2. Then it will directly decrease to minimum learning rate.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` .
...@@ -1727,6 +1731,7 @@ class OneCycleLR(LRScheduler): ...@@ -1727,6 +1731,7 @@ class OneCycleLR(LRScheduler):
}, },
fetch_list=loss.name) fetch_list=loss.name)
scheduler.step() # You should update learning rate each step scheduler.step() # You should update learning rate each step
""" """
def __init__( def __init__(
......
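The example in this hunk shows only the static-graph fragment; a hedged dynamic-graph sketch, assuming ``paddle.optimizer.lr.OneCycleLR`` with illustrative hyperparameters.

.. code-block:: python

    import paddle

    scheduler = paddle.optimizer.lr.OneCycleLR(max_learning_rate=1.0, total_steps=100)
    linear = paddle.nn.Linear(10, 10)
    sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters())
    for step in range(100):
        x = paddle.rand([4, 10])
        loss = linear(x).mean()
        loss.backward()
        sgd.step()
        sgd.clear_grad()
        scheduler.step()   # the learning rate must be updated every step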
...@@ -1194,6 +1194,7 @@ def triu(x, diagonal=0, name=None): ...@@ -1194,6 +1194,7 @@ def triu(x, diagonal=0, name=None):
def meshgrid(*args, **kwargs): def meshgrid(*args, **kwargs):
""" """
Takes a list of N tensors as input :attr:`*args`, each of which is 1-dimensional vector, and creates N-dimensional grids. Takes a list of N tensors as input :attr:`*args`, each of which is 1-dimensional vector, and creates N-dimensional grids.
Args: Args:
......
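A minimal sketch of the ``meshgrid`` behaviour described above, assuming the ``paddle.meshgrid`` API; the two 1-D inputs are illustrative.

.. code-block:: python

    import paddle

    x = paddle.arange(0, 3, dtype='int64')      # shape [3]
    y = paddle.arange(0, 4, dtype='int64')      # shape [4]
    grid_x, grid_y = paddle.meshgrid(x, y)
    print(grid_x.shape, grid_y.shape)           # [3, 4] [3, 4]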
...@@ -732,6 +732,7 @@ def preprocess(equation, *operands): ...@@ -732,6 +732,7 @@ def preprocess(equation, *operands):
def parse_fake_shape(equation, operands, labels): def parse_fake_shape(equation, operands, labels):
""" """
This shape is only used for operand planning and may differ from the original shape. This shape is only used for operand planning and may differ from the original shape.
for example: for example:
... is replaced by 1 ... is replaced by 1
...@@ -739,6 +740,7 @@ def parse_fake_shape(equation, operands, labels): ...@@ -739,6 +740,7 @@ def parse_fake_shape(equation, operands, labels):
Results Results
------- -------
list of shape list of shape
""" """
shaped = collections.namedtuple('shaped', ['shape']) shaped = collections.namedtuple('shaped', ['shape'])
...@@ -862,6 +864,7 @@ def gen_einsum_op(equation, *operands): ...@@ -862,6 +864,7 @@ def gen_einsum_op(equation, *operands):
def einsum(equation, *operands): def einsum(equation, *operands):
r""" r"""
einsum(equation, *operands) einsum(equation, *operands)
The current version of this API should be used in dygraph only mode. The current version of this API should be used in dygraph only mode.
...@@ -890,35 +893,35 @@ def einsum(equation, *operands): ...@@ -890,35 +893,35 @@ def einsum(equation, *operands):
**The summation notation** **The summation notation**
- The tensor dimensions are labeled using uncased English letters. E.g., `ijk` - The tensor dimensions are labeled using uncased English letters. E.g., `ijk`
relates to a three dimensional tensor whose dimensions are labeled i, j, and k. relates to a three dimensional tensor whose dimensions are labeled i, j, and k.
- The equation is `,` separated into terms, each being a distinct input's - The equation is `,` separated into terms, each being a distinct input's
dimension label string. dimension label string.
- Ellipsis `...` enables broadcasting by automatically converting the unlabeled - Ellipsis `...` enables broadcasting by automatically converting the unlabeled
dimensions into broadcasting dimensions. dimensions into broadcasting dimensions.
- Singular labels are called free labels, duplicate are dummy labels. Dummy labeled - Singular labels are called free labels, duplicate are dummy labels. Dummy labeled
dimensions will be reduced and removed in the output. dimensions will be reduced and removed in the output.
- Output labels can be explicitly specified on the right hand side of `->` or omitted. - Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels.
In the latter case, the output labels will be inferred from the input labels.
- Inference of output labels - Inference of output labels
- Broadcasting label `...`, if present, is put on the leftmost position. - Broadcasting label `...`, if present, is put on the leftmost position.
- Free labels are reordered alphabetically and put after `...`. - Free labels are reordered alphabetically and put after `...`.
- On explicit output labels - On explicit output labels
- If broadcasting is enabled, then `...` must be present. - If broadcasting is enabled, then `...` must be present.
- The output labels can be empty, an indication to output as a scalar - The output labels can be empty, an indication to output as a scalar
the sum over the original output. the sum over the original output.
- Non-input labels are invalid. - Non-input labels are invalid.
- Duplicate labels are invalid. - Duplicate labels are invalid.
- For any dummy label which is present for the output, it's promoted to - For any dummy label which is present for the output, it's promoted to
a free label. a free label.
- For any free label which is not present for the output, it's lowered to - For any free label which is not present for the output, it's lowered to
a dummy label. a dummy label.
- Examples - Examples
- '...ij, ...jk', where i and k are free labels, j is dummy. The output label - '...ij, ...jk', where i and k are free labels, j is dummy. The output label
string is '...ik' string is '...ik'
- 'ij -> i', where i is a free label and j is a dummy label. - 'ij -> i', where i is a free label and j is a dummy label.
- '...ij, ...jk -> ...ijk', where i, j and k are all free labels. - '...ij, ...jk -> ...ijk', where i, j and k are all free labels.
- '...ij, ...jk -> ij', an invalid equation since `...` is not present for - '...ij, ...jk -> ij', an invalid equation since `...` is not present for
the output. the output.
**The summation rule** **The summation rule**
...@@ -926,8 +929,8 @@ def einsum(equation, *operands): ...@@ -926,8 +929,8 @@ def einsum(equation, *operands):
may vary significantly due to implementation specific optimization. may vary significantly due to implementation specific optimization.
- Step 1: preparation for broadcasting, that is, transposing and unsqueezing - Step 1: preparation for broadcasting, that is, transposing and unsqueezing
the input operands to have each resulting dimension identically labeled across the input operands to have each resulting dimension identically labeled across
all the input operands. all the input operands.
- Step 2: broadcasting multiply all the resulting operands from step 1. - Step 2: broadcasting multiply all the resulting operands from step 1.
- Step 3: reducing dummy labeled dimensions. - Step 3: reducing dummy labeled dimensions.
- Step 4: transposing the result tensor to match the output labels. - Step 4: transposing the result tensor to match the output labels.
...@@ -944,78 +947,79 @@ def einsum(equation, *operands): ...@@ -944,78 +947,79 @@ def einsum(equation, *operands):
operands should equal the number of input terms in the equation. operands should equal the number of input terms in the equation.
Returns: Returns:
result (`Tensor`): the result tensor. result (`Tensor`), the result tensor.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.seed(102) paddle.seed(102)
x = paddle.rand([4]) x = paddle.rand([4])
y = paddle.rand([5]) y = paddle.rand([5])
# sum # sum
print(paddle.einsum('i->', x)) print(paddle.einsum('i->', x))
# Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# 1.95791852) # 1.95791852)
# dot # dot
print(paddle.einsum('i,i->', x, x)) print(paddle.einsum('i,i->', x, x))
# Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [1.45936954]) # [1.45936954])
# outer # outer
print(paddle.einsum("i,j->ij", x, y)) print(paddle.einsum("i,j->ij", x, y))
# Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194], # [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194],
# [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545],
# [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654],
# [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]])
A = paddle.rand([2, 3, 2]) A = paddle.rand([2, 3, 2])
B = paddle.rand([2, 2, 3]) B = paddle.rand([2, 2, 3])
# transpose # transpose
print(paddle.einsum('ijk->kji', A)) print(paddle.einsum('ijk->kji', A))
# Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [[[0.95649719, 0.49684682], # [[[0.95649719, 0.49684682],
# [0.80071914, 0.46258664], # [0.80071914, 0.46258664],
# [0.49814570, 0.33383518]], # [0.49814570, 0.33383518]],
# #
# [[0.07637714, 0.29374704], # [[0.07637714, 0.29374704],
# [0.51470858, 0.51907635], # [0.51470858, 0.51907635],
# [0.99066722, 0.55802226]]]) # [0.99066722, 0.55802226]]])
# batch matrix multiplication # batch matrix multiplication
print(paddle.einsum('ijk, ikl->ijl', A,B)) print(paddle.einsum('ijk, ikl->ijl', A,B))
# Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [[[0.32172769, 0.50617385, 0.41394392], # [[[0.32172769, 0.50617385, 0.41394392],
# [0.51736701, 0.49921003, 0.38730967], # [0.51736701, 0.49921003, 0.38730967],
# [0.69078457, 0.42282537, 0.30161136]], # [0.69078457, 0.42282537, 0.30161136]],
# #
# [[0.32043904, 0.18164253, 0.27810261], # [[0.32043904, 0.18164253, 0.27810261],
# [0.50226176, 0.24512935, 0.39881429], # [0.50226176, 0.24512935, 0.39881429],
# [0.51476848, 0.23367381, 0.39229113]]]) # [0.51476848, 0.23367381, 0.39229113]]])
# Ellipsis transpose # Ellipsis transpose
print(paddle.einsum('...jk->...kj', A)) print(paddle.einsum('...jk->...kj', A))
# Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [[[0.95649719, 0.80071914, 0.49814570], # [[[0.95649719, 0.80071914, 0.49814570],
# [0.07637714, 0.51470858, 0.99066722]], # [0.07637714, 0.51470858, 0.99066722]],
# #
# [[0.49684682, 0.46258664, 0.33383518], # [[0.49684682, 0.46258664, 0.33383518],
# [0.29374704, 0.51907635, 0.55802226]]]) # [0.29374704, 0.51907635, 0.55802226]]])
# Ellipsis batch matrix multiplication # Ellipsis batch matrix multiplication
print(paddle.einsum('...jk, ...kl->...jl', A,B)) print(paddle.einsum('...jk, ...kl->...jl', A,B))
# Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
# [[[0.32172769, 0.50617385, 0.41394392], # [[[0.32172769, 0.50617385, 0.41394392],
# [0.51736701, 0.49921003, 0.38730967], # [0.51736701, 0.49921003, 0.38730967],
# [0.69078457, 0.42282537, 0.30161136]], # [0.69078457, 0.42282537, 0.30161136]],
# #
# [[0.32043904, 0.18164253, 0.27810261], # [[0.32043904, 0.18164253, 0.27810261],
# [0.50226176, 0.24512935, 0.39881429], # [0.50226176, 0.24512935, 0.39881429],
# [0.51476848, 0.23367381, 0.39229113]]]) # [0.51476848, 0.23367381, 0.39229113]]])
""" """
import os import os
......
...@@ -1905,12 +1905,15 @@ def mv(x, vec, name=None): ...@@ -1905,12 +1905,15 @@ def mv(x, vec, name=None):
def det(x, name=None): def det(x, name=None):
""" """
Calculates determinant value of a square matrix or batches of square matrices. Calculates determinant value of a square matrix or batches of square matrices.
Args: Args:
x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the x (Tensor): the input matrix of size `(n, n)` or the
batch of matrices of size `(*, n, n)` where `*` is one or more batch of matrices of size `(*, n, n)` where `*` is one or more
batch dimensions. batch dimensions.
name(str, optional): Name of the output. Default is None. It's used
to print debug info for developers. Details: :ref:`api_guide_Name`
Returns: Returns:
Tensor, the determinant value of a square matrix or batches of square matrices. Tensor, the determinant value of a square matrix or batches of square matrices.
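A one-line sketch of ``det`` as documented above, assuming ``paddle.linalg.det``; the matrix is illustrative.

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[2., 0.], [0., 3.]])
    print(paddle.linalg.det(x))   # 6.0 for this diagonal matrix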
...@@ -1961,18 +1964,20 @@ def det(x, name=None): ...@@ -1961,18 +1964,20 @@ def det(x, name=None):
def slogdet(x, name=None): def slogdet(x, name=None):
""" """
Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant. Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant.
The determinant can be computed with ``sign * exp(logabsdet)``. The determinant can be computed with ``sign * exp(logabsdet)``.
Supports input of float, double Supports input of float, double
Note that for matrices that have zero determinant, this returns ``(0, -inf)`` Note that for matrices that have zero determinant, this returns ``(0, -inf)``
Args: Args:
x (Tensor): the batch of matrices of size :math:`(*, n, n)` x (Tensor): the batch of matrices of size :math:`(*, n, n)`
where :math:`*` is one or more batch dimensions. where :math:`*` is one or more batch dimensions.
Returns: Returns:
y (Tensor): A tensor containing the sign of the determinant and the natural logarithm y (Tensor), A tensor containing the sign of the determinant and the natural logarithm
of the absolute value of determinant, respectively. of the absolute value of determinant, respectively.
Examples: Examples:
...@@ -2090,6 +2095,7 @@ def svd(x, full_matrices=False, name=None): ...@@ -2090,6 +2095,7 @@ def svd(x, full_matrices=False, name=None):
def matrix_power(x, n, name=None): def matrix_power(x, n, name=None):
r""" r"""
Computes the n-th power of a square matrix or a batch of square matrices. Computes the n-th power of a square matrix or a batch of square matrices.
Let :math:`X` be a square matrix or a batch of square matrices, :math:`n` be Let :math:`X` be a square matrix or a batch of square matrices, :math:`n` be
...@@ -2115,8 +2121,8 @@ def matrix_power(x, n, name=None): ...@@ -2115,8 +2121,8 @@ def matrix_power(x, n, name=None):
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: The n-th power of the matrix (or the batch of matrices) `x`. Its - Tensor, The n-th power of the matrix (or the batch of matrices) `x`. Its
data type should be the same as that of `x`. data type should be the same as that of `x`.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -3054,8 +3060,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): ...@@ -3054,8 +3060,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None):
def solve(x, y, name=None): def solve(x, y, name=None):
r""" r"""
Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'. Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'.
Let :math:`X` be a square matrix or a batch of square matrices, :math:`Y` be Let :math:`X` be a square matrix or a batch of square matrices, :math:`Y` be
a vector/matrix or a batch of vectors/matrices, the equation should be: a vector/matrix or a batch of vectors/matrices, the equation should be:
.. math:: .. math::
...@@ -3064,9 +3071,9 @@ def solve(x, y, name=None): ...@@ -3064,9 +3071,9 @@ def solve(x, y, name=None):
Specifically, this system of linear equations has one solution if and only if input 'X' is invertible. Specifically, this system of linear equations has one solution if and only if input 'X' is invertible.
Args: Args:
x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or x (Tensor): A square matrix or a batch of square matrices. Its shape should be ``[*, M, M]``, where ``*`` is zero or
more batch dimensions. Its data type should be float32 or float64. more batch dimensions. Its data type should be float32 or float64.
y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or
more batch dimensions. Its data type should be float32 or float64. more batch dimensions. Its data type should be float32 or float64.
name(str, optional): Name for the operation (optional, default is None). name(str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
......
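A minimal sketch of the linear solve described above, assuming ``paddle.linalg.solve``; the system is illustrative.

.. code-block:: python

    import paddle

    # solve X @ out = Y for out
    x = paddle.to_tensor([[3., 1.], [1., 2.]], dtype='float64')
    y = paddle.to_tensor([[9.], [8.]], dtype='float64')
    out = paddle.linalg.solve(x, y)
    print(out)   # [[2.], [3.]], since 3*2 + 1*3 = 9 and 1*2 + 2*3 = 8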
...@@ -272,7 +272,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): ...@@ -272,7 +272,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
""" r"""
stanh activation. stanh activation.
.. math:: .. math::
...@@ -283,8 +284,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): ...@@ -283,8 +284,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None):
x (Tensor): The input Tensor with data type float32, float64. x (Tensor): The input Tensor with data type float32, float64.
scale_a (float, optional): The scale factor a of the input. Default is 0.67. scale_a (float, optional): The scale factor a of the input. Default is 0.67.
scale_b (float, optional): The scale factor b of the output. Default is 1.7159. scale_b (float, optional): The scale factor b of the output. Default is 1.7159.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
A Tensor with the same data type and shape as ``x`` . A Tensor with the same data type and shape as ``x`` .
......
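A minimal sketch of the ``stanh`` activation documented above, assuming the ``paddle.stanh`` API with its default scale factors; the input is illustrative.

.. code-block:: python

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
    out = paddle.stanh(x, scale_a=0.67, scale_b=1.7159)
    print(out)   # element-wise 1.7159 * tanh(0.67 * x)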
...@@ -1296,15 +1296,17 @@ def distribute_fpn_proposals( ...@@ -1296,15 +1296,17 @@ def distribute_fpn_proposals(
name=None, name=None,
): ):
r""" r"""
In Feature Pyramid Networks (FPN) models, it is needed to distribute
In Feature Pyramid Networks (FPN) models, it is needed to distribute
all proposals into different FPN levels, with respect to the scale of the proposals, all proposals into different FPN levels, with respect to the scale of the proposals,
the referring scale and the referring level. Besides, to restore the order of the referring scale and the referring level. Besides, to restore the order of
proposals, we return an array which indicates the original index of rois proposals, we return an array which indicates the original index of rois
in current proposals. To compute FPN level for each roi, the formula is given as follows: in current proposals. To compute FPN level for each roi, the formula is given as follows:
.. math:: .. math::
roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\
level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level)
where BBoxArea is a function to compute the area of each roi. where BBoxArea is a function to compute the area of each roi.
Args: Args:
...@@ -1328,13 +1330,13 @@ def distribute_fpn_proposals( ...@@ -1328,13 +1330,13 @@ def distribute_fpn_proposals(
None by default. None by default.
Returns: Returns:
multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is - multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is
and data type is same as `fpn_rois` . The length is max_level-min_level+1. and data type is same as `fpn_rois` . The length is max_level-min_level+1.
restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] - restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1]
, where N is the number of total rois. The data type is int32. , where N is the number of total rois. The data type is int32.
rois_num_per_level (List): A list of 1-D Tensor and each Tensor is - rois_num_per_level (List), A list of 1-D Tensor and each Tensor is
the RoIs' number in each image on the corresponding level. The shape the RoIs' number in each image on the corresponding level. The shape
is [B] and data type of int32, where B is the number of images. is [B] and data type of int32, where B is the number of images.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1351,6 +1353,7 @@ def distribute_fpn_proposals( ...@@ -1351,6 +1353,7 @@ def distribute_fpn_proposals(
refer_level=4, refer_level=4,
refer_scale=224, refer_scale=224,
rois_num=rois_num) rois_num=rois_num)
""" """
num_lvl = max_level - min_level + 1 num_lvl = max_level - min_level + 1
...@@ -2438,6 +2441,7 @@ def matrix_nms( ...@@ -2438,6 +2441,7 @@ def matrix_nms(
name=None, name=None,
): ):
""" """
This operator does matrix non maximum suppression (NMS). This operator does matrix non maximum suppression (NMS).
First selects a subset of candidate bounding boxes that have higher scores First selects a subset of candidate bounding boxes that have higher scores
than score_threshold (if provided), then the top k candidate is selected if than score_threshold (if provided), then the top k candidate is selected if
...@@ -2445,6 +2449,7 @@ def matrix_nms( ...@@ -2445,6 +2449,7 @@ def matrix_nms(
decayed according to the Matrix NMS scheme. decayed according to the Matrix NMS scheme.
After the NMS step, at most keep_top_k number of total bboxes are to be kept After the NMS step, at most keep_top_k number of total bboxes are to be kept
per image if keep_top_k is larger than -1. per image if keep_top_k is larger than -1.
Args: Args:
bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
predicted locations of M bounding bboxes, predicted locations of M bounding bboxes,
...@@ -2468,29 +2473,32 @@ def matrix_nms( ...@@ -2468,29 +2473,32 @@ def matrix_nms(
on score_threshold. on score_threshold.
keep_top_k (int): Number of total bboxes to be kept per image after NMS keep_top_k (int): Number of total bboxes to be kept per image after NMS
step. -1 means keeping all bboxes after NMS step. step. -1 means keeping all bboxes after NMS step.
use_gaussian (bool): Use Gaussian as the decay function. Default: False use_gaussian (bool, optional): Use Gaussian as the decay function. Default: False
gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 gaussian_sigma (float, optional): Sigma for Gaussian decay function. Default: 2.0
background_label (int): The index of background label, the background background_label (int, optional): The index of background label, the background
label will be ignored. If set to -1, then all label will be ignored. If set to -1, then all
categories will be considered. Default: 0 categories will be considered. Default: 0
normalized (bool): Whether detections are normalized. Default: True normalized (bool, optional): Whether detections are normalized. Default: True
return_index(bool): Whether return selected index. Default: False return_index(bool, optional): Whether return selected index. Default: False
return_rois_num(bool): whether return rois_num. Default: True return_rois_num(bool, optional): whether return rois_num. Default: True
name(str): Name of the matrix nms op. Default: None. name(str, optional): Name of the matrix nms op. Default: None.
Returns: Returns:
A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, - A tuple with three Tensor, (Out, Index, RoisNum) if return_index is True,
otherwise, a tuple with two Tensor (Out, RoisNum) is returned. otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
Out (Tensor): A 2-D Tensor with shape [No, 6] containing the - Out (Tensor), A 2-D Tensor with shape [No, 6] containing the
detection results. detection results.
Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] Each row has 6 values, [label, confidence, xmin, ymin, xmax, ymax]
Index (Tensor): A 2-D Tensor with shape [No, 1] containing the - Index (Tensor), A 2-D Tensor with shape [No, 1] containing the
selected indices, which are absolute values cross batches. selected indices, which are absolute values cross batches.
rois_num (Tensor): A 1-D Tensor with shape [N] containing - rois_num (Tensor), A 1-D Tensor with shape [N] containing
the number of detected boxes in each image. the number of detected boxes in each image.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
from paddle.vision.ops import matrix_nms from paddle.vision.ops import matrix_nms
boxes = paddle.rand([4, 1, 4]) boxes = paddle.rand([4, 1, 4])
boxes[..., 2] = boxes[..., 0] + boxes[..., 2] boxes[..., 2] = boxes[..., 0] + boxes[..., 2]
boxes[..., 3] = boxes[..., 1] + boxes[..., 3] boxes[..., 3] = boxes[..., 1] + boxes[..., 3]
...@@ -2498,6 +2506,7 @@ def matrix_nms( ...@@ -2498,6 +2506,7 @@ def matrix_nms(
out = matrix_nms(bboxes=boxes, scores=scores, background_label=0, out = matrix_nms(bboxes=boxes, scores=scores, background_label=0,
score_threshold=0.5, post_threshold=0.1, score_threshold=0.5, post_threshold=0.1,
nms_top_k=400, keep_top_k=200, normalized=False) nms_top_k=400, keep_top_k=200, normalized=False)
""" """
check_variable_and_dtype( check_variable_and_dtype(
bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms' bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms'
......