diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index cb1c28b39b69952f87488a0fe2a9c3ee3bc96387..c94b77dd8c6428909a4a2b62cc19659a8635dcff 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -107,7 +107,7 @@ class DistributedStrategy(object): All of the distributed training configurations can be configured in DistributedStrategy, such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS), asynchronous update parameter server(ASGD), etc. - + DistributedStrategy can be serialized into protobuf file or deserialized from protobuf file Users who run local training usually configure BuildStrategy and ExecutionStrategy, and @@ -128,8 +128,9 @@ class DistributedStrategy(object): Serialize current DistributedStrategy to string and save to output file Examples: + .. code-block:: python - + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.dgc = True @@ -145,6 +146,7 @@ class DistributedStrategy(object): Load from prototxt file for DistributedStrategy initialization Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -161,10 +163,11 @@ class DistributedStrategy(object): Configure ExecutionStrategy for DistributedStrategy Examples: + .. code-block:: python import paddle - exe_strategy = paddle.fluid.ExecutionStrategy() + exe_strategy = paddle.static.ExecutionStrategy() exe_strategy.num_threads = 10 exe_strategy.num_iteration_per_drop_scope = 10 exe_strategy.num_iteration_per_run = 10 @@ -195,10 +198,11 @@ class DistributedStrategy(object): only if the property is non-distributed strategy. Examples: + .. 
code-block:: python import paddle - build_strategy = paddle.fluid.BuildStrategy() + build_strategy = paddle.static.BuildStrategy() build_strategy.enable_sequential_execution = True build_strategy.fuse_elewise_add_act_ops = True build_strategy.fuse_bn_act_ops = True @@ -207,7 +211,7 @@ class DistributedStrategy(object): build_strategy.fuse_broadcast_ops = True build_strategy.fuse_all_optimizer_ops = True build_strategy.enable_inplace = True - + strategy = paddle.distributed.fleet.DistributedStrategy() strategy.build_strategy = build_strategy """ @@ -240,6 +244,7 @@ class DistributedStrategy(object): Default value: True Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -248,7 +253,7 @@ class DistributedStrategy(object): strategy = fleet.DistributedStrategy() strategy.a_sync = True # by default this is True - + # code block for defining loss and local optimizer # sgd = fleet.distributed_optimizer(optimizer, strategy) """ @@ -288,6 +293,7 @@ class DistributedStrategy(object): runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -319,6 +325,7 @@ class DistributedStrategy(object): Default Value: False Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -360,6 +367,7 @@ class DistributedStrategy(object): custom_black_list(list[str]): Users' custom black list which forbidden execution fp16. Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -384,6 +392,7 @@ class DistributedStrategy(object): Default value: False Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -401,6 +410,7 @@ class DistributedStrategy(object): We note that system overhead is usually lower when sync_nccl_allreduce = True Examples: + .. 
code-block:: python import paddle.distributed.fleet as fleet @@ -425,6 +435,7 @@ class DistributedStrategy(object): allreduce among the leaders of each group Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -450,6 +461,7 @@ class DistributedStrategy(object): Default value: number of GPU cards on each single GPU machine Example: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -472,10 +484,11 @@ class DistributedStrategy(object): def sync_batch_norm(self): """ Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes. - + Default value: False Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -500,6 +513,7 @@ class DistributedStrategy(object): Default value: True Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -524,8 +538,9 @@ class DistributedStrategy(object): Default value: 32 Examples: + .. code-block:: python - + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.fuse_grad_size_in_MB = 50 @@ -562,8 +577,9 @@ class DistributedStrategy(object): Default value: 1 Examples: + .. code-block:: python - + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.nccl_comm_num = 2 @@ -594,8 +610,9 @@ class DistributedStrategy(object): implementation should have some manually assign checkpoints Examples: + .. code-block:: python - + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.recompute = True @@ -622,8 +639,9 @@ class DistributedStrategy(object): Default value: False Examples: + .. code-block:: python - + import paddle.fleet as fleet strategy = fleet.DistributedStrategy() strategy.sharding = True @@ -649,8 +667,9 @@ class DistributedStrategy(object): and should be an empirical value decided by your model size and network topology. Examples: + .. 
code-block:: python - + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.sharding = True @@ -674,8 +693,9 @@ class DistributedStrategy(object): device_guard information in user-defined program. Examples: + .. code-block:: python - + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.pipeline = True @@ -709,8 +729,9 @@ class DistributedStrategy(object): **micro_batch**: the number of small batches in each user defined batch Examples: + .. code-block:: python - + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.pipeline = True @@ -736,6 +757,7 @@ class DistributedStrategy(object): Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -764,6 +786,7 @@ class DistributedStrategy(object): begin_step(int) The step of begining training by localsgd. Default 1. Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -791,6 +814,7 @@ class DistributedStrategy(object): Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -821,6 +845,7 @@ class DistributedStrategy(object): begin_step(int) The step of begining training by adaptive localsgd. Default 1. Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -848,6 +873,7 @@ class DistributedStrategy(object): Default Value: False Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -884,6 +910,7 @@ class DistributedStrategy(object): element will be transmitted. Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -906,6 +933,7 @@ class DistributedStrategy(object): Default Value: False Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -935,6 +963,7 @@ class DistributedStrategy(object): to model parameters. Examples: + .. 
code-block:: python import paddle.distributed.fleet as fleet @@ -963,6 +992,7 @@ class DistributedStrategy(object): avg(bool): whether to average the gradients of each mini-batch, the default value is `True` Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -989,6 +1019,7 @@ class DistributedStrategy(object): Default Value: False Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -1019,6 +1050,7 @@ class DistributedStrategy(object): will be exclude from weight decay in lars formula. Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -1048,8 +1080,9 @@ class DistributedStrategy(object): [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962). Default Value: False - + Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -1078,6 +1111,7 @@ class DistributedStrategy(object): will be exclude from weight decay in lamb formula. Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -1123,11 +1157,12 @@ class DistributedStrategy(object): Default Value: False Examples: + .. code-block:: python import paddle - import paddle.distributed.fleet as fleet paddle.enable_static() + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.auto = True @@ -1156,8 +1191,11 @@ class DistributedStrategy(object): Default Value: True Examples: + .. code-block:: python + import paddle + paddle.enable_static() import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.cudnn_exhaustive_search = False @@ -1187,15 +1225,18 @@ class DistributedStrategy(object): Default Value: 4000 Examples: + .. 
code-block:: python + import paddle + paddle.enable_static() import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.conv_workspace_size_limit = 1024 optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy) - + """ return self.strategy.conv_workspace_size_limit @@ -1217,8 +1258,11 @@ class DistributedStrategy(object): Default Value: True Examples: + .. code-block:: python + import paddle + paddle.enable_static() import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.cudnn_batchnorm_spatial_persistent = True diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 4db7f70e3cf5c07557259b9286866925c8e631f2..c5be6a7a8bb142ef4fe25a906785560f6b22e93c 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -69,8 +69,11 @@ class Fleet(object): Fleet: A Fleet instance Example for collective training: + .. code-block:: python + import paddle + paddle.enable_static() import paddle.distributed.fleet as fleet fleet.init(is_collective=True) @@ -86,6 +89,8 @@ class Fleet(object): .. code-block:: python + import paddle + paddle.enable_static() import paddle.distributed.fleet as fleet fleet.init() @@ -159,7 +164,7 @@ class Fleet(object): .. code-block:: python import paddle.distributed.fleet as fleet - role = fleet.PaddleCloudRoleMaker + role = fleet.PaddleCloudRoleMaker() fleet.init(role) """ @@ -233,6 +238,7 @@ class Fleet(object): Examples: .. code-block:: python + import paddle.distributed.fleet as fleet fleet.init() fleet.worker_index() @@ -246,8 +252,9 @@ class Fleet(object): Returns: int: worker numbers - + Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -266,6 +273,7 @@ class Fleet(object): False if not. Examples: + .. 
code-block:: python import paddle.distributed.fleet as fleet @@ -283,6 +291,7 @@ class Fleet(object): list/string: server endpoints Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -303,10 +312,12 @@ class Fleet(object): int: server number Examples: + .. code-block:: python - import paddle.distributed.fleet as fleet - fleet.init() - fleet.server_num() + + import paddle.distributed.fleet as fleet + fleet.init() + fleet.server_num() """ return len(self._role_maker._get_pserver_endpoints()) @@ -318,6 +329,7 @@ class Fleet(object): int: node id Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -335,6 +347,7 @@ class Fleet(object): list/string: server endpoints Examples: + .. code-block:: python import paddle.distributed.fleet as fleet @@ -359,6 +372,7 @@ class Fleet(object): Examples: .. code-block:: python + import paddle.distributed.fleet as fleet fleet.init() fleet.is_server() @@ -510,21 +524,21 @@ class Fleet(object): def save_persistables(self, executor, dirname, main_program=None, mode=1): """ - saves all persistable variables from :code:`main_program` to + saves all persistable tensors from :code:`main_program` to the folder :code:`dirname`. You can refer to - The :code:`dirname` is used to specify the folder where persistable variables - are going to be saved. If you would like to save variables in separate + The :code:`dirname` is used to specify the folder where persistable tensors + are going to be saved. If you would like to save tensors in separate files, set :code:`filename` None. Args: - executor(Executor): The executor to run for saving persistable variables. + executor(Executor): The executor to run for saving persistable tensors. You can refer to :ref:`api_guide_executor_en` for more details. dirname(str, optional): The saving directory path. When you need to save the parameter to the memory, set it to None. 
- main_program(Program, optional): The program whose persistbale variables will + main_program(Program, optional): The program whose persistable tensors will be saved. Default: None. @@ -535,16 +549,17 @@ class Fleet(object): .. code-block:: text + import paddle + paddle.enable_static() import paddle.distributed.fleet as fleet - import paddle.fluid as fluid fleet.init() # build net # fleet.distributed_optimizer(...) - exe = fluid.Executor(fluid.CPUPlace()) - fleet.save_persistables(exe, "dirname", fluid.default_main_program()) + exe = paddle.static.Executor(paddle.CPUPlace()) + fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) """ @@ -569,9 +584,9 @@ class Fleet(object): .. code-block:: python + import paddle import paddle.distributed.fleet as fleet - role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) + fleet.init(is_collective=True) strategy = fleet.DistributedStrategy() optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) @@ -621,23 +636,20 @@ class Fleet(object): def forward(self, x): return self._linear2(self._linear1(x)) - # 1. enable dynamic mode - paddle.disable_static() - - # 2. initialize fleet environment + # 1. initialize fleet environment fleet.init(is_collective=True) - # 3. create layer & optimizer + # 2. create layer & optimizer layer = LinearNet() loss_fn = nn.MSELoss() adam = paddle.optimizer.Adam( learning_rate=0.001, parameters=layer.parameters()) - # 4. get data_parallel model using fleet + # 3. get data_parallel model using fleet adam = fleet.distributed_optimizer(adam) dp_layer = fleet.distributed_model(layer) - # 5. run layer + # 4.
run layer inputs = paddle.randn([10, 10], 'float32') outputs = dp_layer(inputs) labels = paddle.randn([10, 1], 'float32') @@ -675,11 +687,10 @@ class Fleet(object): import paddle from paddle.distributed import fleet - paddle.disable_static() fleet.init(is_collective=True) value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.fluid.dygraph.to_variable(value) + a = paddle.to_tensor(value) layer = paddle.nn.Linear(13, 5) adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) @@ -710,11 +721,10 @@ class Fleet(object): import paddle from paddle.distributed import fleet - paddle.disable_static() fleet.init(is_collective=True) value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.fluid.dygraph.to_variable(value) + a = paddle.to_tensor(value) layer = paddle.nn.Linear(13, 5) adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) @@ -722,9 +732,9 @@ class Fleet(object): adam = fleet.distributed_optimizer(adam) dp_layer = fleet.distributed_model(layer) state_dict = adam.state_dict() - paddle.framework.save(state_dict, "paddle_dy") - para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy") - adam.set_state_dict(opti_state_dict) + paddle.save(state_dict, "paddle_dy") + opti_state_dict = paddle.load("paddle_dy") + adam.set_state_dict(opti_state_dict) """ # imitate target optimizer retrieval return self.user_defined_optimizer.set_state_dict(state_dict) @@ -748,11 +758,10 @@ class Fleet(object): import paddle from paddle.distributed import fleet - paddle.disable_static() fleet.init(is_collective=True) value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.fluid.dygraph.to_variable(value) + a = paddle.to_tensor(value) layer = paddle.nn.Linear(13, 5) adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) @@ -785,17 +794,17 @@ class Fleet(object): float: The learning rate of the current step. Examples: + ..
code-block:: python import numpy as np import paddle from paddle.distributed import fleet - paddle.disable_static() fleet.init(is_collective=True) value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.fluid.dygraph.to_variable(value) + a = paddle.to_tensor(value) layer = paddle.nn.Linear(13, 5) adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) @@ -819,6 +828,7 @@ class Fleet(object): None Examples: + .. code-block:: python import paddle @@ -834,23 +844,20 @@ class Fleet(object): def forward(self, x): return self._linear2(self._linear1(x)) - # 1. enable dynamic mode - paddle.disable_static() - - # 2. initialize fleet environment + # 1. initialize fleet environment fleet.init(is_collective=True) - # 3. create layer & optimizer + # 2. create layer & optimizer layer = LinearNet() loss_fn = nn.MSELoss() adam = paddle.optimizer.Adam( learning_rate=0.001, parameters=layer.parameters()) - # 4. get data_parallel model using fleet + # 3. get data_parallel model using fleet adam = fleet.distributed_optimizer(adam) dp_layer = fleet.distributed_model(layer) - # 5. run layer + # 4. run layer inputs = paddle.randn([10, 10], 'float32') outputs = dp_layer(inputs) labels = paddle.randn([10, 1], 'float32') @@ -878,6 +885,7 @@ class Fleet(object): None Examples: + .. code-block:: python import paddle @@ -893,23 +901,20 @@ class Fleet(object): def forward(self, x): return self._linear2(self._linear1(x)) - # 1. enable dynamic mode - paddle.disable_static() - - # 2. initialize fleet environment + # 1. initialize fleet environment fleet.init(is_collective=True) - # 3. create layer & optimizer + # 2. create layer & optimizer layer = LinearNet() loss_fn = nn.MSELoss() adam = paddle.optimizer.Adam( learning_rate=0.001, parameters=layer.parameters()) - # 4. get data_parallel model using fleet + # 3. get data_parallel model using fleet adam = fleet.distributed_optimizer(adam) dp_layer = fleet.distributed_model(layer) - # 5. run layer + # 4. 
run layer inputs = paddle.randn([10, 10], 'float32') outputs = dp_layer(inputs) labels = paddle.randn([10, 1], 'float32') @@ -962,38 +967,44 @@ class Fleet(object): Add distributed operations to minimize ``loss`` by updating ``parameter_list``. Args: - loss (Variable): A ``Variable`` containing the value to minimize. + loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_fluid_Program` for initializing parameters in ``parameter_list``. The default value is None, at this time :ref:`api_fluid_default_startup_program` will be used. - parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update + parameter_list (Iterable, optional): Iterable of ``Tensor`` or ``Tensor.name`` to update to minimize ``loss``. The default value is None, at this time all parameters will be updated. - no_grad_set (set, optional): Set of ``Variable`` or ``Variable.name`` that don't need + no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need to be updated. The default value is None. Returns: tuple: tuple (optimize_ops, params_grads), A list of operators appended - by minimize and a list of (param, grad) variable pairs, param is + by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: + .. 
code-block:: python import paddle + paddle.enable_static() import paddle.distributed.fleet as fleet + import paddle.nn.functional as F + + hid_dim = 10 + label_dim = 2 + input_x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') + input_y = paddle.static.data(name='y', shape=[None, 1], dtype='int64') + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh') + fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh') + prediction = paddle.static.nn.fc(x=[fc_2], size=label_dim, activation='softmax') + cost = F.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) - fc_1 = paddle.fluid.layers.fc(input=input_x, size=hid_dim, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=hid_dim, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=label_dim, act='softmax') - cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - role = fleet.role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) + fleet.init(is_collective=True) strategy = fleet.DistributedStrategy() optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)