fix strategy example (#26856)

* fix doc * test=develop * test=develop * test=develop * test=develop * test=develop * test=develop * test=develop * test=develop * fix doc, test=develop * update localsgd doc test=develop * update localsgd doc test=develop * fix fleet dgc amp doc, test=develop * fix, test=develop * fix async configs Co-authored-by: N liuyi05 <gavin1332@gmail.com> Co-authored-by: N WangXi <wangxi16@baidu.com> Co-authored-by: N seiriosPlus <tangwei12@baidu.com>

fix strategy example (#26856)
* fix doc * test=develop * test=develop * test=develop * test=develop * test=develop * test=develop * test=develop * test=develop * fix doc, test=develop * update localsgd doc test=develop * update localsgd doc test=develop * fix fleet dgc amp doc, test=develop * fix, test=develop * fix async configs Co-authored-by: N liuyi05 <gavin1332@gmail.com> Co-authored-by: N WangXi <wangxi16@baidu.com> Co-authored-by: N seiriosPlus <tangwei12@baidu.com>
9e4fe923 · mapingshuo · GitHub · 6b4ca0d7 · 9e4fe923
显示空白变更内容
内联并排

Showing with 68 additions and 44 deletions

python/paddle/distributed/fleet/base/distributed_strategy.py python/paddle/distributed/fleet/base/distributed_strategy.py +68 -44

未找到文件。
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -118,7 +118,7 @@ class DistributedStrategy(object):
            strategy = fleet.DistributedStrategy()
            strategy.dgc = True
            strategy.recompute = True
-            strategy.recompute_configs = {"checkpoint": ["x"]}
+            strategy.recompute_configs = {"checkpoints": ["x"]}
            strategy.save_to_prototxt("dist_strategy.prototxt")
        """
        with open(output, "w") as fout:
@@ -133,7 +133,7 @@ class DistributedStrategy(object):

            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
-            strategy.load_from_prototxt("dist_strategy.protoxt")
+            strategy.load_from_prototxt("dist_strategy.prototxt")
        """
        with open(pb_file, 'r') as f:
            self.strategy = google.protobuf.text_format.Merge(
@@ -147,6 +147,7 @@ class DistributedStrategy(object):
        Examples:
          .. code-block:: python

+            import paddle
            exe_strategy = paddle.fluid.ExecutionStrategy()
            exe_strategy.num_threads = 10
            exe_strategy.num_iteration_per_drop_scope = 10
@@ -179,6 +180,7 @@ class DistributedStrategy(object):
        Examples:
          .. code-block:: python

+            import paddle
            build_strategy = paddle.fluid.BuildStrategy()
            build_strategy.enable_sequential_execution = True
            build_strategy.fuse_elewise_add_act_ops = True
@@ -252,14 +254,19 @@ class DistributedStrategy(object):
        a dict.

        **Notes**:
-            **Detailed arguments for a_sync_configs**
-            **k_step**: number of local optimization updates before communication
-            **max_merge_var_num**: maximum number of merged gradients before communication
-            **send_queue_size**: a buffer size of worker communication
-            **independent_recv_thread**: if we are using independent recv thread for communication
-            **thread_pool_size**: number of thread pool
-            **send_wait_times**: waiting time for sending gradients
-            **runtime_split_send_recv**: if we are using Tensor split for send and recv during runtime
+            k_steps(int): number of local optimization updates before communication
+
+            max_merge_var_num(int): maximum number of merged gradients before communication
+
+            send_queue_size(int): a buffer size of worker communication
+
+            independent_recv_thread(bool): if we are using independent recv thread for communication
+
+            thread_pool_size(int): number of threads in the thread pool
+
+            send_wait_times(int): waiting time for sending gradients
+
+            runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime

        Examples:
          .. code-block:: python
@@ -270,11 +277,12 @@ class DistributedStrategy(object):

            strategy = fleet.DistributedStrategy()
            strategy.a_sync = True  # by default this is True
-            configs = {"k_step": 10000, "send_queue_size": 32}
+            configs = {"k_steps": 1024, "send_queue_size": 32}
            strategy.a_sync_configs = configs

            # code block for defining loss and local optimizer
            # sgd = fleet.distributed_optimizer(optimizer, strategy)
+
        """
        return get_msg_dict(self.strategy.a_sync_configs)

@@ -314,14 +322,21 @@ class DistributedStrategy(object):
        settings that can be configured through a dict.

        **Notes**:
-            **init_loss_scaling(float)**: The initial loss scaling factor. Default 32768.
-            **use_dynamic_loss_scaling(bool)**: Whether to use dynamic loss scaling. Default True.
-            **incr_every_n_steps(int)**: Increases loss scaling every n consecutive steps with finite gradients. Default 1000.
-            **decr_every_n_nan_or_inf(int)**: Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2.
-            **incr_ratio(float)**: The multiplier to use when increasing the loss scaling. Default 2.0.
-            **decr_ratio(float)**: The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5.
-            **custom_white_list(list[str])**: Users' custom white list which always execution fp16.
-            **custom_black_list(list[str])**: Users' custom black list which forbidden execution fp16.
+            init_loss_scaling(float): The initial loss scaling factor. Default 32768.
+
+            use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. Default True.
+
+            incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. Default 1000.
+
+            decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2.
+
+            incr_ratio(float): The multiplier to use when increasing the loss scaling. Default 2.0.
+
+            decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5.
+
+            custom_white_list(list[str]): Users' custom white list, whose entries are always executed in fp16.
+
+            custom_black_list(list[str]): Users' custom black list, whose entries are forbidden from being executed in fp16.

        Examples:
          .. code-block:: python
@@ -553,7 +568,7 @@ class DistributedStrategy(object):
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.recompute = True
-            strategy.recompute_configs = {"checkpionts": ["x", "y"]}
+            strategy.recompute_configs = {"checkpoints": ["x", "y"]}

        """
        return get_msg_dict(self.strategy.recompute_configs)
@@ -603,6 +618,7 @@ class DistributedStrategy(object):

        **Notes**:
            **Detailed arguments for pipeline_configs**
+
            **micro_batch**: the number of small batches in each user defined batch

        Examples:
@@ -626,10 +642,10 @@ class DistributedStrategy(object):
    @property
    def localsgd(self):
        """
-        Indicating whether we are using Local SGD training. For more details, please refer to
-        [Don't Use Large Mini-Batches, Use Local SGD](https://arxiv.org/pdf/1808.07217.pdf),
+        Indicating whether we are using Local SGD training. Default Value: False
+        For more details, please refer to
+        `Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_.

-        Default Value: False

        Examples:
          .. code-block:: python
@@ -655,13 +671,12 @@ class DistributedStrategy(object):
        setting that can be configured through a dict.

        **Notes**:
-            **k_steps(int)**: The local steps for training before parameter
-                synchronization. Default 1. If strategy.auto is set True, the
-                local steps will be calculated automatically during training.
+            k_steps(int): The local steps for training before parameter synchronization. Default 1.
+
+            If strategy.auto is set to True, the local steps will be calculated automatically during training.
            The algorithm is referenced in this paper: 
-                [Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD](https://arxiv.org/pdf/1810.08313.pdf).
-                In this case, k_steps indicates the first local steps which
-                is suggested setting to 1.
+            `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD <https://arxiv.org/pdf/1810.08313.pdf>`_.
+            In this case, k_steps indicates the initial local steps, and setting it to 1 is suggested.

        Examples:
          .. code-block:: python
@@ -712,13 +727,15 @@ class DistributedStrategy(object):
        settings that can be configured through a dict.

        **Notes**:
-            **rampup_begin_step(int)**: The beginning step from which gradient compression is implemented. Default 0.
-            **rampup_step(int)**: Time steps used in sparsity warm-up periods. Default is 1.
-                For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100,
-                it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array
+            rampup_begin_step(int): The beginning step from which gradient compression is implemented. Default 0.
+
+            rampup_step(int): Time steps used in sparsity warm-up periods. Default is 1. \
+                    For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \
+                    it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array \
                    ends, it will use 0.999 then and after.
-            **sparsity(list[float])**: Get top important element from gradient tensor, the ratio is (1 - sparsity).
-                Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important
+
+            sparsity(list[float]): Get top important element from gradient tensor, the ratio is (1 - sparsity). \
+                    Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important \
                    element will be transmitted.

        Examples:
@@ -750,6 +767,7 @@ class DistributedStrategy(object):

        Examples:
          .. code-block:: python
+
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.gradient_merge = True
@@ -768,11 +786,15 @@ class DistributedStrategy(object):
    def gradient_merge_configs(self):
        """
        the key-value configs of distribute_strategy
-        Keys: 
-            k_steps (int): the update period of the parameters
-            avg (bool): whether to average the gradients of each mini-batch,
-                the default value is `True`
-        Example:
+
+        **Notes**:
+            k_steps(int): the update period of the parameters.
+
+            avg(bool): whether to average the gradients of each mini-batch, the default value is `True`
+
+        Examples:
+          .. code-block:: python
+
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.gradient_merge = True
@@ -826,6 +848,7 @@ class DistributedStrategy(object):

        Examples:
          .. code-block:: python
+
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.lars = True
@@ -882,6 +905,7 @@ class DistributedStrategy(object):

        Examples:
          .. code-block:: python
+
            import paddle.distributed.fleet as fleet
            strategy = fleet.DistributedStrategy()
            strategy.lamb = True