Unverified commit 821c2f4e, authored by xiayanming, committed by GitHub

add ascend unittest (#31249)

add ascend unittest
Parent d45f5d78
......@@ -476,8 +476,7 @@ def start_local_trainers(cluster,
         if len(t.accelerators) > 0 and pod.device_mode == DeviceMode.GPU:
             proc_env["FLAGS_selected_gpus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])
-        if len(t.accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU:
+        elif len(t.accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU:
             proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])
......
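For context: start_local_trainers builds each trainer's environment from its assigned accelerators, and the fix turns the second check into an elif so a GPU pod can no longer fall through and also set the NPU flag. A minimal runnable sketch of that dispatch, with DeviceMode and the accelerator list reduced to stand-ins (not Paddle's actual classes):

from enum import Enum

class DeviceMode(Enum):
    GPU = 0
    ASCEND_NPU = 1

def build_proc_env(accelerators, device_mode):
    # Simplified stand-in for the launch_utils logic above.
    proc_env = {}
    if len(accelerators) > 0 and device_mode == DeviceMode.GPU:
        proc_env["FLAGS_selected_gpus"] = ",".join(str(g) for g in accelerators)
    elif len(accelerators) > 0 and device_mode == DeviceMode.ASCEND_NPU:
        # elif, not a second if: each pod sets exactly one device flag.
        proc_env["FLAGS_selected_npus"] = ",".join(str(g) for g in accelerators)
    return proc_env

assert build_proc_env([0, 1], DeviceMode.ASCEND_NPU) == {"FLAGS_selected_npus": "0,1"}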
......@@ -18,6 +18,7 @@ import time
 def train(prefix):
     selected_accelerators = os.getenv("FLAGS_selected_accelerators")
+    selected_npus = os.getenv("FLAGS_selected_npus")
     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
     worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
     current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
......@@ -26,8 +27,8 @@ def train(prefix):
     device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS")
     current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS")
-    details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
-        .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint, trainer_id, device_ids, current_device_id)
+    details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
+        .format(selected_accelerators, selected_npus, worker_endpoints, trainers_num, current_endpoint, trainer_id, device_ids, current_device_id)
     print(details)
     with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), "w") as f:
......
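To make the new log line concrete, here is a self-contained reproduction of the details formatting with illustrative environment values for trainer 0 (the same values the shell test further below expects); only the string assembly is reproduced, not the rest of train():

# Illustrative env values for trainer 0, matching the shell test's str1.
env = {
    "FLAGS_selected_accelerators": "0",
    "FLAGS_selected_npus": "0",
    "PADDLE_TRAINER_ID": "0",
    "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:6170,127.0.0.1:6171",
    "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6170",
    "PADDLE_WORLD_DEVICE_IDS": "0,1",
    "PADDLE_LOCAL_DEVICE_IDS": "0",
}

worker_endpoints = env["PADDLE_TRAINER_ENDPOINTS"]
trainers_num = len(worker_endpoints.split(","))

details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}".format(
    env["FLAGS_selected_accelerators"], env["FLAGS_selected_npus"],
    worker_endpoints, trainers_num, env["PADDLE_CURRENT_ENDPOINT"],
    env["PADDLE_TRAINER_ID"], env["PADDLE_WORLD_DEVICE_IDS"],
    env["PADDLE_LOCAL_DEVICE_IDS"])
print(details)
# -> selected_accelerators:0 selected_npus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0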
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import sys
import os
import time
import six
import copy
import json
import unittest
import paddle.fluid as fluid
import paddle.distributed.fleet.ascend_utils as ascend_utils

# Single-server rank table with two Ascend NPUs, used as the test fixture.
RANK_TABLE_JSON = {
    "status": "completed",
    "version": "1.0",
    "server_count": "1",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        }
    ]
}


class TestAscendUtil(unittest.TestCase):
    def test_get_cloud_cluster(self):
        # Default path: derive the cluster and pod from the environment.
        cluster, pod = ascend_utils.get_cloud_cluster()
        self.assertTrue(cluster)
        self.assertTrue(pod)

        # Explicit path: build them from a rank table file on disk.
        with open('rank_table_file.json', 'w') as f:
            json.dump(RANK_TABLE_JSON, f)
        rank_table_file = "./rank_table_file.json"
        cluster, pod = ascend_utils.get_cloud_cluster(
            rank_table_file=rank_table_file)
        self.assertTrue(cluster)
        self.assertTrue(pod)


if __name__ == '__main__':
    unittest.main()
\ No newline at end of file
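The unit test exercises get_cloud_cluster twice: once from the ambient environment and once from a rank table written to disk. Purely as an illustration of the fixture's shape (this is not how ascend_utils parses it), the entries can be read back like so, assuming the test above has already written rank_table_file.json:

import json

with open("rank_table_file.json") as f:
    table = json.load(f)

for server in table["server_list"]:
    for dev in server["device"]:
        # One NPU rank per device entry: (server, rank_id, device_id, device_ip).
        print(server["server_id"], dev["rank_id"], dev["device_id"], dev["device_ip"])
# 127.0.0.1 0 0 192.1.184.23
# 127.0.0.1 1 1 192.2.21.93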
......@@ -51,8 +51,8 @@ echo "begin test use ascend npu"
 distributed_args="--run_mode=collective --log_dir=testlog"
 python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend
-str1="selected_accelerators:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
-str2="selected_accelerators:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
+str1="selected_accelerators:0 selected_npus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
+str2="selected_accelerators:1 selected_npus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
 file_0="multi_process_fleetlaunchascend.check_0.log"
 file_1="multi_process_fleetlaunchascend.check_1.log"
......
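The tail of the shell script is elided here; presumably it checks that each trainer's log file contains its expected line. A hedged Python sketch of what that verification amounts to (an assumption about the elided part, not the script's actual code):

# Assumed verification step; the shell script's actual tail is not shown above.
expected = {
    "multi_process_fleetlaunchascend.check_0.log":
        "selected_accelerators:0 selected_npus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0",
    "multi_process_fleetlaunchascend.check_1.log":
        "selected_accelerators:1 selected_npus:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1",
}

for path, line in expected.items():
    with open(path) as f:
        assert line in f.read(), "unexpected output in " + path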