From 958d7212c70c4ea0f59fa848b22f55b697eda685 Mon Sep 17 00:00:00 2001
From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com>
Date: Wed, 26 Aug 2020 19:47:45 +0800
Subject: [PATCH] 【paddle.fleet】Document refine lars & lamb (#26533)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../fleet/base/distributed_strategy.py       | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index a337fc41f2..bc6ce8c5e1 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -750,6 +750,20 @@ class DistributedStrategy(object):
 
     @property
     def lars(self):
+        """
+        Set lars configurations. lars is used to deal with the convergence problems when the global
+        batch size is larger than 8k. For more details, please refer to
+        [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True # by default this is false
+        """
         return self.strategy.lars
 
     @lars.setter
@@ -761,6 +775,29 @@ class DistributedStrategy(object):
 
     @property
     def lars_configs(self):
+        """
+        Set Lars training configurations.
+
+        **Notes**:
+        **lars_coeff (float)**: trust ratio in the lars formula.
+        **lars_weight_decay (float)**: weight decay coefficient in the lars formula.
+        **epsilon (float)**: used to avoid a potential division-by-zero
+        when computing the local learning rate;
+        **exclude_from_weight_decay ([string])**: a list of name strings of layers which
+        will be excluded from weight decay in the lars formula.
+
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True
+            strategy.lars_configs = {
+                        "lars_coeff": 0.01,
+                        "lars_weight_decay": 0.0005,
+                        "epsilon": 0,
+                        "exclude_from_weight_decay": ['batch_norm', '.b_0']
+                    }
+        """
         return get_msg_dict(self.strategy.lars_configs)
 
     @lars_configs.setter
@@ -770,6 +807,22 @@ class DistributedStrategy(object):
 
     @property
     def lamb(self):
+        """
+        Set lamb configurations. lamb is used to deal with the convergence problems for large
+        batch size training, especially for attention-related models such as BERT. For more details,
+        please refer to
+        [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).
+
+        Default Value: False
+
+        Examples:
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True # by default this is false
+        """
+
         return self.strategy.lamb
 
     @lamb.setter
@@ -781,6 +834,24 @@ class DistributedStrategy(object):
 
     @property
     def lamb_configs(self):
+        """
+        Set Lamb training configurations.
+
+        **Notes**:
+        **lamb_weight_decay (float)**: weight decay coefficient in the lamb formula.
+        **exclude_from_weight_decay ([string])**: a list of name strings of layers which
+        will be excluded from weight decay in the lamb formula.
+
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True
+            strategy.lamb_configs = {
+                'lamb_weight_decay': 0.01,
+                'exclude_from_weight_decay': [],
+            }
+        """
         return get_msg_dict(self.strategy.lamb_configs)
 
     @lamb_configs.setter
--
GitLab
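
Editor's note: the docstrings added by this patch only show how the lars/lamb flags and their config dicts are set on a DistributedStrategy. The sketch below illustrates, under assumptions, how those settings would typically be wired into a fleet training job: the fleet.init call, the Momentum optimizer, and the fleet.distributed_optimizer wrapping are not part of this patch and are assumed from the rest of the fleet API; the config values mirror the docstring examples.

```python
# Hypothetical usage sketch (not part of the patch): enable the LARS strategy
# documented above and wrap a base optimizer with fleet. Model/loss definition
# and optimizer.minimize(...) are omitted.
import paddle
import paddle.distributed.fleet as fleet

fleet.init(is_collective=True)  # assumed collective (multi-GPU) mode

strategy = fleet.DistributedStrategy()
strategy.lars = True  # enable LARS for very large global batch sizes
strategy.lars_configs = {
    "lars_coeff": 0.01,
    "lars_weight_decay": 0.0005,
    "epsilon": 0,
    "exclude_from_weight_decay": ['batch_norm', '.b_0'],
}

# Alternatively, for attention-based models such as BERT, the lamb flag from
# the same patch could be enabled instead:
# strategy.lamb = True
# strategy.lamb_configs = {
#     'lamb_weight_decay': 0.01,
#     'exclude_from_weight_decay': [],
# }

# A plain Momentum optimizer is assumed here; fleet.distributed_optimizer
# applies the strategy when building the distributed training program.
optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
```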