diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index bbca8118839cedeb1bc8e052034048880d6bc0ad..b7661be25e66fbab43b967416259226adb14d72b 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -476,8 +476,7 @@ def start_local_trainers(cluster,
         if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU:
             proc_env["FLAGS_selected_gpus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])
-
-        if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.ASCEND_NPU:
+        elif len(t.accelerators) > 0 and pod.device_mode==DeviceMode.ASCEND_NPU:
             proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])
 
diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
index 3f12ba91b227e4b5d8efc96297d5ef05b6af1701..fce477f5c5e51e27a89c22b44cbebfec5aa7412f 100644
--- a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
+++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
@@ -18,6 +18,7 @@ import time
 
 def train(prefix):
     selected_accelerators = os.getenv("FLAGS_selected_accelerators")
+    selected_npus = os.getenv("FLAGS_selected_npus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
     current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
@@ -26,8 +27,8 @@ def train(prefix):
     device_ids=os.getenv("PADDLE_WORLD_DEVICE_IDS")
     current_device_id=os.getenv("PADDLE_LOCAL_DEVICE_IDS")
 
-    details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
-        .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id)
+    details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
+        .format(selected_accelerators, selected_npus, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id)
 
     print(details)
     with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), "w") as f:
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py b/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2de3844acda56150ec4d5ccd002286bbf0b800a1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import sys
+import os
+import time
+import six
+import copy
+import json
+import unittest
+import paddle.fluid as fluid
+
+import paddle.distributed.fleet.ascend_utils as ascend_utils
+
+RANK_TABLE_JSON = {
+    "status": "completed",
+    "version": "1.0",
+    "server_count": "1",
+    "server_list": [
+        {
+            "server_id": "127.0.0.1",
+            "device": [
+                {
+                    "device_id": "0",
+                    "device_ip": "192.1.184.23",
+                    "rank_id": "0"
+                },
+                {
+                    "device_id": "1",
+                    "device_ip": "192.2.21.93",
+                    "rank_id": "1"
+                }
+            ]
+        }
+    ]
+}
+
+class TestAscendUtil(unittest.TestCase):
+    def test_get_cloud_cluster(self):
+        cluster, pod = ascend_utils.get_cloud_cluster()
+        self.assertTrue(cluster)
+        self.assertTrue(pod)
+
+        with open('rank_table_file.json', 'w') as f:
+            json.dump(RANK_TABLE_JSON, f)
+        rank_table_file = "./rank_table_file.json"
+        cluster, pod = ascend_utils.get_cloud_cluster(rank_table_file=rank_table_file)
+        self.assertTrue(cluster)
+        self.assertTrue(pod)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
index 7310af7d64ce544e099cd4a63d282dbbbfaa63d9..a54334692214c8ac3ced731c450a51a54478104f 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
@@ -51,8 +51,8 @@ echo "begin test use ascend npu"
 distributed_args="--run_mode=collective --log_dir=testlog"
 python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend
 
-str1="selected_accelerators:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
-str2="selected_accelerators:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
+str1="selected_accelerators:0 selected_npus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
+str2="selected_accelerators:1 selected_npus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
 
 file_0="multi_process_fleetlaunchascend.check_0.log"
 file_1="multi_process_fleetlaunchascend.check_1.log"