【paddle.fleet】Document refine lars & lamb (#26533)

958d7212 · JZ-LIANG · GitHub · ada1e129 · 958d7212
隐藏空白更改
内联并排

Showing with 71 addition and 0 deletion

python/paddle/distributed/fleet/base/distributed_strategy.py python/paddle/distributed/fleet/base/distributed_strategy.py +71 -0

未找到文件。
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -750,6 +750,20 @@ class DistributedStrategy(object):
    @property
    def lars(self):
+        """
+        Set lars configurations. lars is used to deal with the convergence problems when the global 
+        batch size is larger than 8k.  For more details, please refer to 
+        [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888).
+        Default Value: False
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True # by default this is false
+        """
        return self.strategy.lars
    @lars.setter
@@ -761,6 +775,29 @@ class DistributedStrategy(object):
    @property
    def lars_configs(self):
+        """
+        Set Lars training configurations.
+        **Notes**:
+        **lars_coeff (float)**: trust ratio in lars formula.
+        **lars_weight_decay (float)**: weight decay coefficient in lars formula.
+        **epsilon (float)**: argument used to avoid potential division-by-zero
+        when computing the local lr.
+        **exclude_from_weight_decay ([string])**: a list of name strings of layers which
+        will be excluded from weight decay in lars formula.
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lars = True
+            strategy.lars_configs = {
+                        "lars_coeff": 0.01,
+                        "lars_weight_decay": 0.0005,
+                        "epsilon": 0,
+                        "exclude_from_weight_decay": ['batch_norm', '.b_0']
+                    }
+        """
        return get_msg_dict(self.strategy.lars_configs)
    @lars_configs.setter
@@ -770,6 +807,22 @@ class DistributedStrategy(object):
    @property
    def lamb(self):
+        """
+        Set lamb configurations. lamb is used to deal with the convergence problems for large
+        batch size training, especially for attention-related models like BERT. For more details,
+        please refer to
+        [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).
+        Default Value: False
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True # by default this is false
+        """
        return self.strategy.lamb
    @lamb.setter
@@ -781,6 +834,24 @@ class DistributedStrategy(object):
    @property
    def lamb_configs(self):
+        """
+        Set Lamb training configurations.
+        **Notes**:
+        **lamb_weight_decay (float)**: weight decay coefficient in lamb formula.
+        **exclude_from_weight_decay ([string])**: a list of name strings of layers which
+        will be excluded from weight decay in lamb formula.
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.lamb = True
+            strategy.lamb_configs = {
+                    'lamb_weight_decay': 0.01,
+                    'exclude_from_weight_decay': [],
+                }
+        """
        return get_msg_dict(self.strategy.lamb_configs)
    @lamb_configs.setter