diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py
index 970a932fcc0bce0088b1f0536cfbbad866fc9a28..51537b9306ffcd0db6c804efe03b577c7f3eb30a 100644
--- a/python/paddle/distributed/fleet/__init__.py
+++ b/python/paddle/distributed/fleet/__init__.py
@@ -44,7 +44,13 @@ node_num=fleet.node_num
 rank=fleet.worker_index
 nranks=fleet.worker_num
 world_size=fleet.worker_num
-rank_in_node=fleet.rank_in_node
+# device ids of the current trainer
+local_device_ids=fleet.local_device_ids
+# device ids of all trainers in the world
+world_device_ids=fleet.world_device_ids
+# rank of the trainer within its node
+local_rank=fleet.local_rank
+rank_in_node=local_rank
 is_worker = fleet.is_worker
 worker_endpoints = fleet.worker_endpoints
 server_num = fleet.server_num
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index 547fe76063cdc46355028743bb47105dc52228c1..5ae4475ecce6ca2bdbd22b3d860fd3bc641a47dd 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -291,8 +291,14 @@ class Fleet(object):
     def node_num(self):
         return self._role_maker._get_node_num()
 
-    def rank_in_node(self):
-        return self._role_maker._get_rank_in_node()
+    def local_rank(self):
+        return self._role_maker._get_local_rank()
+
+    def local_device_ids(self):
+        return self._role_maker._get_local_device_ids()
+
+    def world_device_ids(self):
+        return self._role_maker._get_world_device_ids()
 
     def is_worker(self):
         """
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 3da3e546352162b11d3ef35d7fd000ec736369d1..d17741decfbcd81cc5a80fcc21711d85cd4793ad 100644
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -622,10 +622,20 @@ class PaddleCloudRoleMaker(RoleMakerBase):
             self._generate_role()
         return self._nodes_num
 
-    def _get_rank_in_node(self):
+    def _get_local_rank(self):
         if not self._role_is_generated:
             self._generate_role()
-        return self._rank_in_node
+        return self._local_rank
+
+    def _get_local_device_ids(self):
+        if not self._role_is_generated:
+            self._generate_role()
+        return self._local_device_ids
+
+    def _get_world_device_ids(self):
+        if not self._role_is_generated:
+            self._generate_role()
+        return self._world_device_ids
 
     def _get_trainer_endpoints(self):
         """
@@ -787,7 +797,9 @@
         self._trainers_num = len(self._worker_endpoints)
         self._nodes_num = len(
             set([x.split(':')[0] for x in self._worker_endpoints]))
-        self._rank_in_node = os.getenv("PADDLE_RANK_IN_NODE")
+        self._local_rank = os.getenv("PADDLE_RANK_IN_NODE")
+        self._local_device_ids = os.getenv("PADDLE_LOCAL_DEVICE_IDS")
+        self._world_device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS")
 
     def _gloo_init(self):
         # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 93e3924905932322c4c05eb8ead9b905fe4c649d..f39e2284a5805f589e7a37cca90771ec875ce17c 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -98,6 +98,13 @@ class Cluster(object):
                 r.append(t.endpoint)
         return r
 
+    def world_device_ids(self):
+        r = []
+        for pod in self.pods:
+            for t in pod.trainers:
+                r.append(t.accelerators)
+        return r
+
     def pods_endpoints(self):
         r = []
         for pod in self.pods:
@@ -452,6 +459,8 @@ def start_local_trainers(cluster,
current_env.pop("http_proxy", None) current_env.pop("https_proxy", None) + ids=cluster.world_device_ids() + res = [':'.join(ele) for ele in ids] procs = [] for idx, t in enumerate(pod.trainers): proc_env = { @@ -459,7 +468,9 @@ def start_local_trainers(cluster, "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), - "PADDLE_RANK_IN_NODE": str(idx) + "PADDLE_RANK_IN_NODE": str(idx), + "PADDLE_LOCAL_DEVICE_IDS":",".join(t.accelerators), + "PADDLE_WORLD_DEVICE_IDS":",".join(res), } if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU: diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py index d6ad1b2f2d0cd7da0cc714754b62bc57405b487b..8e8447ad7eab0a2bc887b0aad157d56500660bc8 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -150,7 +150,7 @@ class AscendOptimizer(Optimizer): # Config about Graph Engine can be found in https://support.huaweicloud.com/ config = { - "ge.exec.deviceId": str(fleet.rank_in_node()), + "ge.exec.deviceId": str(fleet.local_device_ids()), "ge.graphRunMode": "1", "ge.exec.precision_mode": "must_keep_origin_dtype", # if multi mode diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py index 435af572973329a524a76acab0fdddb710213356..3f12ba91b227e4b5d8efc96297d5ef05b6af1701 100644 --- a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py +++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py @@ -23,9 +23,11 @@ def train(prefix): current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env trainers_num = len(worker_endpoints.split(',')) + device_ids=os.getenv("PADDLE_WORLD_DEVICE_IDS") + current_device_id=os.getenv("PADDLE_LOCAL_DEVICE_IDS") - details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ - .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id) + details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\ + .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id) print(details) with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), "w") as f: diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh index 233fe7f7f25b3be27e6a5d27f4a60858e18c48fe..0960083abf28ec7bd34445cf22bd62284c102452 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -30,8 +30,8 @@ export TRAINER_PORTS_NUM=2 distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend -str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0" -str2="selected_accelerators:1 
worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1" +str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0" +str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1" file_0="multi_process_fleetlaunchascend.check_0.log" file_1="multi_process_fleetlaunchascend.check_1.log"
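A minimal sketch of how a trainer process started by `paddle.distributed.fleet.launch` could consume the three environment variables introduced above. The parsing is illustrative rather than part of the patch; it assumes the encoding used in `start_local_trainers`, where the ids of one trainer are joined with `':'` and the per-trainer groups are joined with `','`.

```python
import os

# Rank of this trainer within its node, set per trainer by
# start_local_trainers() in launch_utils.py.
local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", "0"))

# Accelerator ids assigned to this trainer, e.g. "0" or "0,1".
local_device_ids = os.getenv("PADDLE_LOCAL_DEVICE_IDS", "").split(",")

# Accelerator ids of every trainer in the job: ids within one trainer
# are joined by ':', trainers are joined by ','. With four single-device
# trainers this is "0,1,0,1" -> [['0'], ['1'], ['0'], ['1']].
world_device_ids = [
    group.split(":")
    for group in os.getenv("PADDLE_WORLD_DEVICE_IDS", "").split(",")
]

print("local_rank:", local_rank)
print("local_device_ids:", local_device_ids)
print("world_device_ids:", world_device_ids)
```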
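A similarly hedged sketch of the new `Fleet` accessors added in `fleet_base.py`, assuming the script runs under `fleet.launch` so the role environment variables are populated:

```python
import paddle.distributed.fleet as fleet

fleet.init(is_collective=True)

# Each accessor triggers role generation on first use, then returns the
# raw environment string read by PaddleCloudRoleMaker.
print(fleet.local_rank())        # PADDLE_RANK_IN_NODE, e.g. "0"
print(fleet.local_device_ids())  # PADDLE_LOCAL_DEVICE_IDS, e.g. "0"
print(fleet.world_device_ids())  # PADDLE_WORLD_DEVICE_IDS, e.g. "0,1,0,1"
```

Note that the values come back as strings straight from `os.getenv`, which is why callers such as `AscendOptimizer` wrap them in `str()` or split them as needed.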