diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index a337fc41f292521c5e90daffd71b5bd4ff4e0553..bc6ce8c5e1c3f750318eb105729b88617af5d578 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -750,6 +750,20 @@ class DistributedStrategy(object):
 
     @property
     def lars(self):
+        """
+        Set lars configurations. lars is used to deal with convergence problems when the
+        global batch size is larger than 8k. For more details, please refer to
+        [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True  # by default this is False
+        """
         return self.strategy.lars
 
     @lars.setter
@@ -761,6 +775,30 @@
 
     @property
     def lars_configs(self):
+        """
+        Set lars training configurations.
+
+        **Notes**:
+        **lars_coeff** (float): trust ratio in the lars formula.
+        **lars_weight_decay** (float): weight decay coefficient in the lars formula.
+        **epsilon** (float): used to avoid potential division-by-zero
+        when computing the local lr.
+        **exclude_from_weight_decay** (list[str]): a list of name strings of layers
+        which will be excluded from weight decay in the lars formula.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True
+            strategy.lars_configs = {
+                "lars_coeff": 0.01,
+                "lars_weight_decay": 0.0005,
+                "epsilon": 0,
+                "exclude_from_weight_decay": ['batch_norm', '.b_0']
+            }
+        """
         return get_msg_dict(self.strategy.lars_configs)
 
     @lars_configs.setter
@@ -770,6 +808,21 @@
 
     @property
     def lamb(self):
+        """
+        Set lamb configurations. lamb is used to deal with convergence problems in
+        large-batch training, especially for attention-related models like BERT. For more
+        details, please refer to
+        [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True  # by default this is False
+        """
         return self.strategy.lamb
 
     @lamb.setter
@@ -781,6 +834,25 @@
 
     @property
     def lamb_configs(self):
+        """
+        Set lamb training configurations.
+
+        **Notes**:
+        **lamb_weight_decay** (float): weight decay coefficient in the lamb formula.
+        **exclude_from_weight_decay** (list[str]): a list of name strings of layers
+        which will be excluded from weight decay in the lamb formula.
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True
+            strategy.lamb_configs = {
+                'lamb_weight_decay': 0.01,
+                'exclude_from_weight_decay': [],
+            }
+        """
         return get_msg_dict(self.strategy.lamb_configs)
 
     @lamb_configs.setter
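
For context (not part of the patch): a strategy configured through these new properties is typically handed to `fleet.distributed_optimizer`, which applies lars/lamb inside the wrapped optimizer. Below is a minimal end-to-end sketch, assuming Paddle 2.x collective mode; the tiny model and hyper-parameter values are illustrative only.

```python
import paddle
import paddle.distributed.fleet as fleet

# Initialize fleet in collective mode (typically launched via
# `python -m paddle.distributed.launch ...`).
fleet.init(is_collective=True)

# Enable lars and set its configs, exactly as the new docstrings describe.
strategy = fleet.DistributedStrategy()
strategy.lars = True
strategy.lars_configs = {
    "lars_coeff": 0.001,
    "lars_weight_decay": 0.0005,
    "epsilon": 0,
    "exclude_from_weight_decay": ["batch_norm", ".b_0"],
}

# Hypothetical tiny model; any momentum-style optimizer can be wrapped.
model = paddle.nn.Linear(10, 10)
optimizer = paddle.optimizer.Momentum(
    learning_rate=0.01, parameters=model.parameters())
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)

# For lamb, set strategy.lamb = True and strategy.lamb_configs analogously.
```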