add ascend unittest (#31249)

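Add Ascend unit tests: add test_fleet_ascend_utils.py covering ascend_utils.get_cloud_cluster (with and without a rank table file), check FLAGS_selected_npus in the Ascend fleet-launch test, and turn the ASCEND_NPU check in start_local_trainers into an elif of the GPU check.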

add ascend unittest (#31249)
add ascend unittest
821c2f4e · xiayanming · GitHub · d45f5d78 · 821c2f4e · 821c2f4e
4 changed files
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -476,8 +476,7 @@ def start_local_trainers(cluster,
        if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU:
            proc_env["FLAGS_selected_gpus"] = "%s" % ",".join(
                [str(g) for g in t.accelerators])
-        
-        if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.ASCEND_NPU:
+        elif len(t.accelerators) > 0 and pod.device_mode==DeviceMode.ASCEND_NPU:
            proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
                [str(g) for g in t.accelerators])


--- a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
+++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
@@ -18,6 +18,7 @@ import time

 def train(prefix):
    selected_accelerators = os.getenv("FLAGS_selected_accelerators")
+    selected_npus = os.getenv("FLAGS_selected_npus")
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
    worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
@@ -26,8 +27,8 @@ def train(prefix):
    device_ids=os.getenv("PADDLE_WORLD_DEVICE_IDS")
    current_device_id=os.getenv("PADDLE_LOCAL_DEVICE_IDS")

-    details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
-            .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id)
+    details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
+            .format(selected_accelerators, selected_npus, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id)

    print(details)
    with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), "w") as f:

--- a/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import sys
+import os
+import time
+import six
+import copy
+import json
+import unittest
+import paddle.fluid as fluid
+
+import paddle.distributed.fleet.ascend_utils as ascend_utils
+
+RANK_TABLE_JSON = {
+    "status": "completed",
+    "version": "1.0",
+    "server_count": "1",
+    "server_list": [
+        {
+            "server_id": "127.0.0.1",
+            "device": [
+                {
+                    "device_id": "0",
+                    "device_ip": "192.1.184.23",
+                    "rank_id": "0"
+                },
+                {
+                    "device_id": "1",
+                    "device_ip": "192.2.21.93",
+                    "rank_id": "1"
+                }
+            ]
+        }
+    ]
+}
+
+class TestAscendUtil(unittest.TestCase):
+    def test_get_cloud_cluster(self):
+        cluster, pod = ascend_utils.get_cloud_cluster()
+        self.assertTrue(cluster)
+        self.assertTrue(pod)
+
+        with open('rank_table_file.json', 'w') as f:
+            json.dump(RANK_TABLE_JSON, f)
+        rank_table_file = "./rank_table_file.json"
+        cluster, pod = ascend_utils.get_cloud_cluster(rank_table_file=rank_table_file)
+        self.assertTrue(cluster)
+        self.assertTrue(pod)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
@@ -51,8 +51,8 @@ echo "begin test use ascend npu"
 distributed_args="--run_mode=collective --log_dir=testlog"
 python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend

-str1="selected_accelerators:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
-str2="selected_accelerators:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
+str1="selected_accelerators:0 selected_npus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
+str2="selected_accelerators:1 selected_npus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
 file_0="multi_process_fleetlaunchascend.check_0.log"
 file_1="multi_process_fleetlaunchascend.check_1.log"