Commit cc9c6196 (unverified)
机器未来 / Paddle, forked from PaddlePaddle / Paddle

Authored Dec 01, 2020 by 123malin; committed by GitHub, Dec 01, 2020
Parent commit: c0a991c8

test=develop, fix doc (#29200)

* fix fleet api doc

Showing 2 changed files, with 131 additions and 76 deletions (+131, -76):

    python/paddle/distributed/fleet/base/distributed_strategy.py  (+61, -17)
    python/paddle/distributed/fleet/base/fleet_base.py  (+70, -59)
python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -107,7 +107,7 @@ class DistributedStrategy(object):
All of the distributed training configurations can be configured in DistributedStrategy,
such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS),
asynchronous update parameter server(ASGD), etc.
DistributedStrategy can be serialized into protobuf file or deserialized from protobuf file
Users who run local training usually configure BuildStrategy and ExecutionStrategy, and
@@ -128,8 +128,9 @@ class DistributedStrategy(object):
Serialize current DistributedStrategy to string and save to output file
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.dgc = True
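For context, the serialize/deserialize pair documented in this hunk and the next can be exercised as a round trip. A minimal sketch, assuming the 2.0-era fleet API shown in this commit and a writable working directory (the file name is illustrative):

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.dgc = True
    strategy.save_to_prototxt("dist_strategy.prototxt")    # serialize to a protobuf text file

    restored = fleet.DistributedStrategy()
    restored.load_from_prototxt("dist_strategy.prototxt")  # deserialize into a fresh instance
    assert restored.dgc is True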
@@ -145,6 +146,7 @@ class DistributedStrategy(object):
Load from prototxt file for DistributedStrategy initialization
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -161,10 +163,11 @@ class DistributedStrategy(object):
Configure ExecutionStrategy for DistributedStrategy
Examples:
.. code-block:: python
import paddle
-exe_strategy = paddle.fluid.ExecutionStrategy()
+exe_strategy = paddle.static.ExecutionStrategy()
exe_strategy.num_threads = 10
exe_strategy.num_iteration_per_drop_scope = 10
exe_strategy.num_iteration_per_run = 10
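The hunk above migrates the example from paddle.fluid to paddle.static; the configured ExecutionStrategy is then attached to the DistributedStrategy through its execution_strategy property. A minimal sketch under that 2.0 API:

    import paddle
    import paddle.distributed.fleet as fleet

    exe_strategy = paddle.static.ExecutionStrategy()
    exe_strategy.num_threads = 10
    exe_strategy.num_iteration_per_drop_scope = 10

    strategy = fleet.DistributedStrategy()
    strategy.execution_strategy = exe_strategy  # attach execution settings to the distributed strategy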
@@ -195,10 +198,11 @@ class DistributedStrategy(object):
only if the property is non-distributed strategy.
Examples:
.. code-block:: python
import paddle
-build_strategy = paddle.fluid.BuildStrategy()
+build_strategy = paddle.static.BuildStrategy()
build_strategy.enable_sequential_execution = True
build_strategy.fuse_elewise_add_act_ops = True
build_strategy.fuse_bn_act_ops = True
@@ -207,7 +211,7 @@ class DistributedStrategy(object):
build_strategy.fuse_broadcast_ops = True
build_strategy.fuse_all_optimizer_ops = True
build_strategy.enable_inplace = True
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.build_strategy = build_strategy
"""
@@ -240,6 +244,7 @@ class DistributedStrategy(object):
Default value: True
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -248,7 +253,7 @@ class DistributedStrategy(object):
strategy = fleet.DistributedStrategy()
strategy.a_sync = True # by default this is True
# code block for defining loss and local optimizer
# sgd = fleet.distributed_optimizer(optimizer, strategy)
"""
@@ -288,6 +293,7 @@ class DistributedStrategy(object):
runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -319,6 +325,7 @@ class DistributedStrategy(object):
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -360,6 +367,7 @@ class DistributedStrategy(object):
custom_black_list(list[str]): Users' custom black list which forbidden execution fp16.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
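The custom black list documented in this hunk is supplied through an amp_configs dict alongside the amp switch. A sketch, assuming the config keys named in this docstring (init_loss_scaling, custom_black_list); the op name is an illustrative choice:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.amp = True
    strategy.amp_configs = {
        "init_loss_scaling": 32768,       # initial loss-scaling factor
        "custom_black_list": ["pool2d"],  # ops forced to stay in fp32; 'pool2d' is illustrative
    }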
@@ -384,6 +392,7 @@ class DistributedStrategy(object):
Default value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -401,6 +410,7 @@ class DistributedStrategy(object):
We note that system overhead is usually lower when sync_nccl_allreduce = True
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -425,6 +435,7 @@ class DistributedStrategy(object):
allreduce among the leaders of each group
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -450,6 +461,7 @@ class DistributedStrategy(object):
Default value: number of GPU cards on each single GPU machine
Example:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -472,10 +484,11 @@ class DistributedStrategy(object):
def sync_batch_norm(self):
    """
Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes.
Default value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -500,6 +513,7 @@ class DistributedStrategy(object):
Default value: True
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -524,8 +538,9 @@ class DistributedStrategy(object):
Default value: 32
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.fuse_grad_size_in_MB = 50
@@ -562,8 +577,9 @@ class DistributedStrategy(object):
Default value: 1
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.nccl_comm_num = 2
@@ -594,8 +610,9 @@ class DistributedStrategy(object):
implementation should have some manually assign checkpoints
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.recompute = True
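Recompute needs manually assigned checkpoints, as the hunk notes; they are passed through the companion recompute_configs dict. A sketch with hypothetical activation names for the checkpoints:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.recompute = True
    # "fc_0.tmp_0"/"fc_1.tmp_0" are placeholder names; use tensor names from your own program
    strategy.recompute_configs = {"checkpoints": ["fc_0.tmp_0", "fc_1.tmp_0"]}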
@@ -622,8 +639,9 @@ class DistributedStrategy(object):
Default value: False
Examples:
.. code-block:: python
import paddle.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.sharding = True
@@ -649,8 +667,9 @@ class DistributedStrategy(object):
and should be an empirical value decided by your model size and network topology.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.sharding = True
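The empirical fused-broadcast size discussed above is supplied via sharding_configs. A sketch, assuming the fuse_broadcast_MB key described in this generation of the docstring:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.sharding = True
    strategy.sharding_configs = {"fuse_broadcast_MB": 32}  # empirical; tune to model size and network topology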
@@ -674,8 +693,9 @@ class DistributedStrategy(object):
device_guard information in user-defined program.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.pipeline = True
@@ -709,8 +729,9 @@ class DistributedStrategy(object):
**micro_batch**: the number of small batches in each user defined batch
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.pipeline = True
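The micro_batch knob described above lives in pipeline_configs. A minimal sketch under the same assumption:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.pipeline = True
    strategy.pipeline_configs = {"micro_batch": 12}  # number of small batches per user-defined batch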
@@ -736,6 +757,7 @@ class DistributedStrategy(object):
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -764,6 +786,7 @@ class DistributedStrategy(object):
begin_step(int) The step of begining training by localsgd. Default 1.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
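LocalSGD's begin_step, together with the synchronization interval, goes through localsgd_configs. A sketch assuming the k_steps/begin_step keys of this API generation:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.localsgd = True
    strategy.localsgd_configs = {"k_steps": 4, "begin_step": 30}  # sync every 4 steps, starting at step 30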
@@ -791,6 +814,7 @@ class DistributedStrategy(object):
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -821,6 +845,7 @@ class DistributedStrategy(object):
begin_step(int) The step of begining training by adaptive localsgd. Default 1.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -848,6 +873,7 @@ class DistributedStrategy(object):
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -884,6 +910,7 @@ class DistributedStrategy(object):
element will be transmitted.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -906,6 +933,7 @@ class DistributedStrategy(object):
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -935,6 +963,7 @@ class DistributedStrategy(object):
to model parameters.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -963,6 +992,7 @@ class DistributedStrategy(object):
avg(bool): whether to average the gradients of each mini-batch, the default value is `True`
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
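The avg flag documented here belongs to gradient_merge_configs. A sketch assuming the k_steps/avg keys:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.gradient_merge = True
    strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}  # accumulate 4 mini-batches, apply the averaged gradient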
@@ -989,6 +1019,7 @@ class DistributedStrategy(object):
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -1019,6 +1050,7 @@ class DistributedStrategy(object):
will be exclude from weight decay in lars formula.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
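The weight-decay exclusion list for LARS is passed via lars_configs. A sketch assuming the keys named in this docstring; the name fragments are illustrative:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.lars = True
    strategy.lars_configs = {
        "lars_coeff": 0.001,
        "lars_weight_decay": 0.0005,
        "exclude_from_weight_decay": ["batch_norm", ".b"],  # parameter-name fragments exempt from weight decay
    }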
@@ -1048,8 +1080,9 @@ class DistributedStrategy(object):
[Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962).
Default Value: False
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -1078,6 +1111,7 @@ class DistributedStrategy(object):
will be exclude from weight decay in lamb formula.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
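LAMB takes the analogous lamb_configs dict. A sketch under the same assumption:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.lamb = True
    strategy.lamb_configs = {"lamb_weight_decay": 0.01, "exclude_from_weight_decay": []}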
@@ -1123,11 +1157,12 @@ class DistributedStrategy(object):
Default Value: False
Examples:
.. code-block:: python
import paddle
-import paddle.distributed.fleet as fleet
+paddle.enable_static()
+import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.auto = True
@@ -1156,8 +1191,11 @@ class DistributedStrategy(object):
Default Value: True
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.cudnn_exhaustive_search = False
@@ -1187,15 +1225,18 @@ class DistributedStrategy(object):
Default Value: 4000
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.conv_workspace_size_limit = 1024
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy)
"""
return self.strategy.conv_workspace_size_limit
@@ -1217,8 +1258,11 @@ class DistributedStrategy(object):
Default Value: True
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
strategy = fleet.DistributedStrategy()
strategy.cudnn_batchnorm_spatial_persistent = True
python/paddle/distributed/fleet/base/fleet_base.py
@@ -69,8 +69,11 @@ class Fleet(object):
Fleet: A Fleet instance
Example for collective training:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
fleet.init(is_collective=True)
@@ -86,6 +89,8 @@ class Fleet(object):
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
fleet.init()
@@ -159,7 +164,7 @@ class Fleet(object):
.. code-block:: python
import paddle.distributed.fleet as fleet
-role = fleet.PaddleCloudRoleMaker
+role = fleet.PaddleCloudRoleMaker()
fleet.init(role)
"""
@@ -233,6 +238,7 @@ class Fleet(object):
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.worker_index()
@@ -246,8 +252,9 @@ class Fleet(object):
Returns:
int: worker numbers
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -266,6 +273,7 @@ class Fleet(object):
False if not.
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -283,6 +291,7 @@ class Fleet(object):
list/string: server endpoints
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -303,10 +312,12 @@ class Fleet(object):
int: server number
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.server_num()
"""
return len(self._role_maker._get_pserver_endpoints())
@@ -318,6 +329,7 @@ class Fleet(object):
int: node id
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -335,6 +347,7 @@ class Fleet(object):
list/string: server endpoints
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
@@ -359,6 +372,7 @@ class Fleet(object):
Examples:
.. code-block:: python
import paddle.distributed.fleet as fleet
fleet.init()
fleet.is_server()
@@ -510,21 +524,21 @@ class Fleet(object):
def save_persistables(self, executor, dirname, main_program=None, mode=1):
"""
-saves all persistable variables from :code:`main_program` to
+saves all persistable tensors from :code:`main_program` to
the folder :code:`dirname`. You can refer to
-The :code:`dirname` is used to specify the folder where persistable variables
-are going to be saved. If you would like to save variables in separate
+The :code:`dirname` is used to specify the folder where persistable tensors
+are going to be saved. If you would like to save tensors in separate
files, set :code:`filename` None.
Args:
-executor(Executor): The executor to run for saving persistable variables.
+executor(Executor): The executor to run for saving persistable tensors.
You can refer to :ref:`api_guide_executor_en` for
more details.
dirname(str, optional): The saving directory path.
When you need to save the parameter to the memory, set it to None.
-main_program(Program, optional): The program whose persistbale variables will
+main_program(Program, optional): The program whose persistbale tensors will
be saved. Default: None.
@@ -535,16 +549,17 @@ class Fleet(object):
.. code-block:: text
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
import paddle.fluid as fluid
fleet.init()
# build net
# fleet.distributed_optimizer(...)
-exe = fluid.Executor(fluid.CPUPlace())
-fleet.save_persistables(exe, "dirname", fluid.default_main_program())
+exe = paddle.static.Executor(paddle.CPUPlace())
+fleet.save_persistables(exe, "dirname", paddle.static.default_main_program())
"""
@@ -569,9 +584,9 @@ class Fleet(object):
.. code-block:: python
import paddle
import paddle.distributed.fleet as fleet
-role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
-fleet.init(role)
+fleet.init(is_collective=True)
strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
@@ -621,23 +636,20 @@ class Fleet(object):
def forward(self, x):
return self._linear2(self._linear1(x))
-# 1. enable dynamic mode
-paddle.disable_static()
-# 2. initialize fleet environment
+# 1. initialize fleet environment
fleet.init(is_collective=True)
-# 3. create layer & optimizer
+# 2. create layer & optimizer
layer = LinearNet()
loss_fn = nn.MSELoss()
adam = paddle.optimizer.Adam(
learning_rate=0.001, parameters=layer.parameters())
-# 4. get data_parallel model using fleet
+# 3. get data_parallel model using fleet
adam = fleet.distributed_optimizer(adam)
dp_layer = fleet.distributed_model(layer)
-# 5. run layer
+# 4. run layer
inputs = paddle.randn([10, 10], 'float32')
outputs = dp_layer(inputs)
labels = paddle.randn([10, 1], 'float32')
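The example is cut off here by the collapsed view; a training-loop epilogue in the usual dygraph style would follow. A sketch of those remaining steps, not part of the diff itself:

    loss = loss_fn(outputs, labels)  # compute the loss on the data-parallel layer's outputs
    loss.backward()                  # gradient all-reduce happens during backward in data-parallel mode
    adam.step()
    adam.clear_grad()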
@@ -675,11 +687,10 @@ class Fleet(object):
import paddle
from paddle.distributed import fleet
-paddle.disable_static()
fleet.init(is_collective=True)
value = np.arange(26).reshape(2, 13).astype("float32")
-a = paddle.fluid.dygraph.to_variable(value)
+a = paddle.to_tensor(value)
layer = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
@@ -710,11 +721,10 @@ class Fleet(object):
import paddle
from paddle.distributed import fleet
-paddle.disable_static()
fleet.init(is_collective=True)
value = np.arange(26).reshape(2, 13).astype("float32")
-a = paddle.fluid.dygraph.to_variable(value)
+a = paddle.to_tensor(value)
layer = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
@@ -722,9 +732,9 @@ class Fleet(object):
adam = fleet.distributed_optimizer(adam)
dp_layer = fleet.distributed_model(layer)
state_dict = adam.state_dict()
-paddle.framework.save(state_dict, "paddle_dy")
-para_state_dict, opti_state_dict = paddle.framework.load("paddle_dy")
-adam.set_state_dict(opti_state_dict)
+paddle.save(state_dict, "paddle_dy")
+para_state_dict = paddle.load("paddle_dy")
+adam.set_state_dict(para_state_dict)
"""
# imitate target optimizer retrieval
return self.user_defined_optimizer.set_state_dict(state_dict)
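The new lines switch the example from paddle.framework.save/load to the flat paddle.save/paddle.load API. For reference, a sketch of a full save/restore round trip for both layer and optimizer state under that 2.0 API (file names are illustrative):

    import paddle

    paddle.save(layer.state_dict(), "model.pdparams")  # parameters
    paddle.save(adam.state_dict(), "opt.pdopt")        # optimizer state

    layer.set_state_dict(paddle.load("model.pdparams"))
    adam.set_state_dict(paddle.load("opt.pdopt"))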
@@ -748,11 +758,10 @@ class Fleet(object):
import paddle
from paddle.distributed import fleet
-paddle.disable_static()
fleet.init(is_collective=True)
value = np.arange(26).reshape(2, 13).astype("float32")
-a = paddle.fluid.dygraph.to_variable(value)
+a = paddle.to_tensor(value)
layer = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
@@ -785,17 +794,17 @@ class Fleet(object):
float: The learning rate of the current step.
Examples:
.. code-block:: python
import numpy as np
import paddle
from paddle.distributed import fleet
-paddle.disable_static()
fleet.init(is_collective=True)
value = np.arange(26).reshape(2, 13).astype("float32")
-a = paddle.fluid.dygraph.to_variable(value)
+a = paddle.to_tensor(value)
layer = paddle.nn.Linear(13, 5)
adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters())
@@ -819,6 +828,7 @@ class Fleet(object):
None
Examples:
.. code-block:: python
import paddle
@@ -834,23 +844,20 @@ class Fleet(object):
def forward(self, x):
return self._linear2(self._linear1(x))
-# 1. enable dynamic mode
-paddle.disable_static()
-# 2. initialize fleet environment
+# 1. initialize fleet environment
fleet.init(is_collective=True)
-# 3. create layer & optimizer
+# 2. create layer & optimizer
layer = LinearNet()
loss_fn = nn.MSELoss()
adam = paddle.optimizer.Adam(
learning_rate=0.001, parameters=layer.parameters())
-# 4. get data_parallel model using fleet
+# 3. get data_parallel model using fleet
adam = fleet.distributed_optimizer(adam)
dp_layer = fleet.distributed_model(layer)
-# 5. run layer
+# 4. run layer
inputs = paddle.randn([10, 10], 'float32')
outputs = dp_layer(inputs)
labels = paddle.randn([10, 1], 'float32')
@@ -878,6 +885,7 @@ class Fleet(object):
None
Examples:
.. code-block:: python
import paddle
@@ -893,23 +901,20 @@ class Fleet(object):
def forward(self, x):
return self._linear2(self._linear1(x))
-# 1. enable dynamic mode
-paddle.disable_static()
-# 2. initialize fleet environment
+# 1. initialize fleet environment
fleet.init(is_collective=True)
-# 3. create layer & optimizer
+# 2. create layer & optimizer
layer = LinearNet()
loss_fn = nn.MSELoss()
adam = paddle.optimizer.Adam(
learning_rate=0.001, parameters=layer.parameters())
-# 4. get data_parallel model using fleet
+# 3. get data_parallel model using fleet
adam = fleet.distributed_optimizer(adam)
dp_layer = fleet.distributed_model(layer)
-# 5. run layer
+# 4. run layer
inputs = paddle.randn([10, 10], 'float32')
outputs = dp_layer(inputs)
labels = paddle.randn([10, 1], 'float32')
@@ -962,38 +967,44 @@ class Fleet(object):
Add distributed operations to minimize ``loss`` by updating ``parameter_list``.
Args:
-loss (Variable): A ``Variable`` containing the value to minimize.
+loss (Tensor): A ``Tensor`` containing the value to minimize.
startup_program (Program, optional): :ref:`api_fluid_Program` for
initializing parameters in ``parameter_list``. The default value
is None, at this time :ref:`api_fluid_default_startup_program` will be used.
-parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update
+parameter_list (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to update
to minimize ``loss``. The default value is None, at this time all parameters
will be updated.
-no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need
+no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need
to be updated. The default value is None.
Returns:
tuple: tuple (optimize_ops, params_grads), A list of operators appended
-by minimize and a list of (param, grad) variable pairs, param is
+by minimize and a list of (param, grad) tensor pairs, param is
``Parameter``, grad is the gradient value corresponding to the parameter.
The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to
indicate program pruning. If so, the program will be pruned by ``feed`` and
``fetch_list`` before run, see details in ``Executor``.
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
import paddle.distributed.fleet as fleet
import paddle.nn.functional as F
hid_dim = 10
label_dim = 2
input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32')
input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64')
-fc_1 = paddle.fluid.layers.fc(input=input_x, size=hid_dim, act='tanh')
-fc_2 = paddle.fluid.layers.fc(input=fc_1, size=hid_dim, act='tanh')
-prediction = paddle.fluid.layers.fc(input=[fc_2], size=label_dim, act='softmax')
-cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
-avg_cost = paddle.fluid.layers.mean(x=cost)
+fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh')
+fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh')
+prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax')
+cost = F.cross_entropy(input=prediction, label=input_y)
+avg_cost = paddle.mean(x=cost)
-role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True)
-fleet.init(role)
+fleet.init(is_collective=True)
strategy = fleet.DistributedStrategy()
optimizer = paddle.optimizer.SGD(learning_rate=0.001)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
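To see where minimize fits, the returned tuple is typically followed by a standard static-graph run. A rough sketch continuing the example above, assuming a CPU place and random feed data; a real multi-node job would be started through a distributed launcher such as paddle.distributed.launch:

    import numpy as np

    optimize_ops, params_grads = optimizer.minimize(avg_cost)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())  # initialize parameters once
    x = np.random.random((4, 13)).astype("float32")
    y = np.random.randint(0, 2, size=(4, 1)).astype("int64")
    loss_val, = exe.run(feed={"x": x, "y": y}, fetch_list=[avg_cost])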