diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index 5960be4800de83086b9229f8b19070c360104d7b..f656cb076cb2f66e1949abdd3aa6429a2e7d737d 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -110,6 +110,10 @@ class Group():
     def process_group(self):
         return self.pg
 
+    @property
+    def world_size(self):
+        return self.nranks if self.rank >= 0 else -1
+
     def __repr__(self):
         debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format(
             self.rank, self.nranks, self.id)
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index cb8baa220c7650e90f0d4a4528cea1d8b66927fb..6cda451a266e8cae9886653bacc88498b3d1da69 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -359,47 +359,65 @@ def init_parallel_env():
     return group
 
 
-def get_rank():
+def get_rank(group=None):
     """
-    Returns the rank of current trainer.
+    Returns the rank of the current trainer in the given group. Ranks are consecutive integers in [0, ``world_size``).
+    If no group is given, the global group is used as default.
 
-    Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` .
-    The default value is 0.
+    Args:
+        group (Group, optional): The communication group to query the current trainer's rank in. Use the global group as default if group is None.
 
     Returns:
-        (int) The rank of current trainer.
+        (int) The rank of the current trainer in the given group. Returns -1 if the process is not part of the given group.
+
+    Warning:
+        Argument ``group`` is only supported in dygraph mode.
 
     Examples:
         .. code-block:: python
 
+            # Execute this script using distributed launch with a one-card config.
            import paddle
            import paddle.distributed as dist
 
-            # execute this command in terminal: export PADDLE_TRAINER_ID=0
+            dist.init_parallel_env()
             print("The rank is %d" % dist.get_rank())
             # The rank is 0
     """
+    if in_dygraph_mode() and group:
+        return group.rank
+
+    assert group is None, "Only support group argument in eager mode."
     return _get_global_parallel_env().rank
 
 
-def get_world_size():
+def get_world_size(group=None):
     """
-    Returns the number of trainers (number of processes participating in current job).
+    Returns the number of trainers (number of processes participating in the current job) in the given group.
+    If no group is given, the global group is used as default.
 
-    Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` .
-    The default value is 1.
+    Args:
+        group (Group, optional): The communication group to query the world size of. Use the global group as default if group is None.
 
     Returns:
-        (int) The number of trainers.
+        (int) The number of trainers in the given group. Returns -1 if the process is not part of the given group.
+
+    Warning:
+        Argument ``group`` is only supported in dygraph mode.
 
     Examples:
         .. code-block:: python
 
+            # Execute this script using distributed launch with a one-card config.
            import paddle
            import paddle.distributed as dist
 
-            # execute this command in terminal: export PADDLE_TRAINERS_NUM=4
+            dist.init_parallel_env()
             print("The world_size is %d" % dist.get_world_size())
-            # The world_size is 4
+            # The world_size is 1
     """
+    if in_dygraph_mode() and group:
+        return group.world_size
+
+    assert group is None, "Only support group argument in eager mode."
     return _get_global_parallel_env().world_size
diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt
index 4431f16d7b6e527e3b7fc61ea63e9b53f497a8bd..5a1a6df2dd7ec20e3b954b084ab94606c1077780 100644
--- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt
@@ -304,5 +304,16 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
   set_tests_properties(test_communication_stream_allreduce_api
                        PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST")
 endif()
+if((WITH_ROCM OR WITH_GPU) AND (LINUX))
+  bash_test_modules(
+    test_world_size_and_rank
+    START_BASH
+    test_world_size_and_rank.sh
+    LABELS
+    "RUN_TYPE=DIST"
+    ENVS
+    "PADDLE_DIST_UT_PORT=21532;http_proxy=;https_proxy=")
+  set_tests_properties(test_world_size_and_rank PROPERTIES TIMEOUT "120")
+endif()
 add_subdirectory(fleet)
 add_subdirectory(multinode)
diff --git a/python/paddle/fluid/tests/unittests/collective/test_world_size_and_rank.sh b/python/paddle/fluid/tests/unittests/collective/test_world_size_and_rank.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c559c4bd26cffc33d52cd825af629fd9bcfe9a39
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective/test_world_size_and_rank.sh
@@ -0,0 +1,17 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 world_size_and_rank.py
diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv
index fc08f861e907749139a3a3675a41c5d8618b8e84..16eb200565f735a494a5367afacef39da1d2b5bf 100644
--- a/python/paddle/fluid/tests/unittests/collective/testslist.csv
+++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv
@@ -36,3 +36,4 @@ test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_
 test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
 test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=..,
 test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=,
+test_world_size_and_rank,linux,rocm;gpu,120,DIST,test_world_size_and_rank.sh,2,,http_proxy=;https_proxy=,
diff --git a/python/paddle/fluid/tests/unittests/collective/world_size_and_rank.py b/python/paddle/fluid/tests/unittests/collective/world_size_and_rank.py
new file mode 100644
index 0000000000000000000000000000000000000000..69620238a94377966e983b9d78f2031d3375d42a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective/world_size_and_rank.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import paddle.distributed as dist
+
+
+class TestWorldSizeAndRankAPI(unittest.TestCase):
+
+    def setUp(self):
+        self._num_of_ranks = 2
+        self._subgroup_ranks = [0, 1]
+        dist.init_parallel_env()
+        self._subgroup = dist.new_group(self._subgroup_ranks)
+        self._global_rank = dist.get_rank()
+
+    def test_default_env_world_size(self):
+        self.assertEqual(dist.get_world_size(), self._num_of_ranks)
+
+    def test_given_group_world_size(self):
+        world_size = 2 if self._global_rank in self._subgroup_ranks else -1
+        self.assertEqual(dist.get_world_size(self._subgroup), world_size)
+
+    def test_given_group_rank(self):
+        rank = self._subgroup_ranks.index(
+            self._global_rank
+        ) if self._global_rank in self._subgroup_ranks else -1
+        self.assertEqual(dist.get_rank(self._subgroup), rank)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py
index 85c3bf321a3b1b100f593ba527698605a85570af..c1cdd0b7336e30f4c50a2b29ce4e6d5edd665739 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py
@@ -21,6 +21,8 @@ from paddle.distributed.passes import new_pass, PassManager
 import unittest
 from dist_pass_test_base import DistPassTestBase
 
+paddle.enable_static()
+
 
 class DemoNet(nn.Layer):
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
index a0090f6d8c310ba4ae07ebbe04b830fb60054653..12aeae57b09aad8c05b6d052516a41600704e9f2 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py
@@ -21,6 +21,8 @@ from paddle.distributed.passes import new_pass, PassManager
 import unittest
 from dist_pass_test_base import DistPassTestBase
 
+paddle.enable_static()
+
 
 class BatchNormActNet(nn.Layer):
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py
index eb9a901a40a9ffda23ca0bd8f9b0ecc3a8a88bb8..72dfd60d4a0bd9766da17d8bd5cde9e584fd6ab8 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py
@@ -21,6 +21,8 @@ from paddle.distributed.passes import new_pass, PassManager
 import unittest
 from dist_pass_test_base import DistPassTestBase
 
+paddle.enable_static()
+
 
 class BatchNormAddActNet(nn.Layer):
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py
index 11bd4f5d2b13a662209b65f1c06de1be9a66a8b1..c725a9b91569c2282f6d9849345945d458eb9ebb 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py
@@ -21,6 +21,8 @@ from paddle.distributed.passes import new_pass, PassManager
 import unittest
 from dist_pass_test_base import DistPassTestBase
 
+paddle.enable_static()
+
 
 class DemoNet(nn.Layer):
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py
index 0a7442a18d77651bba30a4404e7649db7457bf02..61897b37ea7c58603cf85acceb91dd417d2dd731 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py
@@ -21,6 +21,8 @@ from paddle.distributed.passes import new_pass, PassManager
 import unittest
 from dist_pass_test_base import DistPassTestBase
 
+paddle.enable_static()
+
 
 class ReluDepthwiseConvNet(nn.Layer):
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py
index 3e96e9d3440e0e2a14ace5707c848c6146f87de2..ab752c60ed34294cfaa52ea9c95e752f52d9c82a 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py
@@ -21,6 +21,8 @@ from paddle.distributed.passes import new_pass, PassManager
 import unittest
 from dist_pass_test_base import DistPassTestBase
 
+paddle.enable_static()
+
 
 class DemoNet(nn.Layer):
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py
index 32bb1ca83a9b36c2a5c8f2562029f14da78b94e5..0431c53c11c27c64a79034e379ca4bc00dc4cea6 100644
--- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py
+++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py
@@ -21,6 +21,8 @@ from paddle.distributed.passes import new_pass, PassManager
 import unittest
 from dist_pass_test_base import DistPassTestBase
 
+paddle.enable_static()
+
 
 class DemoNet(nn.Layer):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index cf3dcd00a5d4185293e9140599fd92982bd3931e..bf9f73e80d16f78b8a0e64ba9bea573f128c3499 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -627,7 +627,8 @@ class TestParallelDyGraphRunnerBase(object):
         np.random.seed(seed)
         random.seed(seed)
         # get trainer id
-        args.trainer_id = paddle.distributed.get_rank()
+        paddle.distributed.parallel._get_global_parallel_env()
+        args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
 
         # 3. init parallel env
         if args.update_method in ["nccl2", "gloo"]:
@@ -666,7 +667,8 @@ class TestParallelDyGraphRunnerBase(object):
         np.random.seed(seed)
         random.seed(seed)
         # get trainer id
-        args.trainer_id = paddle.distributed.get_rank()
+        paddle.distributed.parallel._get_global_parallel_env()
+        args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
 
         # set strategy
         strategy = fleet.DistributedStrategy()
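
Note: below is a minimal usage sketch of the group-aware ``dist.get_rank`` / ``dist.get_world_size`` introduced by this patch, mirroring ``world_size_and_rank.py``. The script name, the single-member subgroup, and the expected outputs are illustrative assumptions for a two-card launch, not part of the patch.

```python
# demo_world_size_and_rank.py -- hypothetical file name, not part of this patch.
# Assumed launch: python -m paddle.distributed.launch --gpus=0,1 demo_world_size_and_rank.py
import paddle.distributed as dist

dist.init_parallel_env()

# Global group: every launched process is a member.
print(dist.get_rank(), dist.get_world_size())  # rank 0 prints "0 2", rank 1 prints "1 2"

# Subgroup containing only rank 0 (every process still calls new_group collectively).
subgroup = dist.new_group([0])

# Members get their rank within the subgroup and its size; per the new
# docstrings, non-member processes get -1 from both calls.
print(dist.get_rank(subgroup), dist.get_world_size(subgroup))
# rank 0 prints "0 1", rank 1 prints "-1 -1"
```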