Unverified commit a6edbc47, authored by xiayanming, committed by GitHub

support parsing ascend rank table file (#31000)

support parsing ascend rank table file
Parent 1201cd2e
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import paddle
from paddle.distributed.fleet.launch_utils import get_cluster, logger, get_host_name_ip, DeviceMode

def _get_ascend_rankfile(rank_table_file_path):
"""
Args:
rank_table_file_path: path to the Ascend NPU rank table file (JSON), for example:
{
"status": "completed",
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "192.168.24.217",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
},
{
"server_id": "192.168.26.177",
"device": [
{
"device_id": "0",
"device_ip": "192.1.94.132",
"rank_id": "2"
},
{
"device_id": "1",
"device_ip": "192.2.94.30",
"rank_id": "3"
}
]
}
]
}
Returns:
node_ips: list of server IPs parsed from server_list
device_count: number of NPUs per machine
"""
json_data = None
with open(rank_table_file_path) as json_file:
json_data = json.load(json_file)
node_ips = []
device_count = 0
server_list = json_data['server_list']
for server in server_list:
node_ips.append(server['server_id'])
device_list = server['device']
device_count = len(device_list)
return node_ips, device_count

def get_cloud_cluster(rank_table_file=None,
device_mode=DeviceMode.ASCEND_NPU,
devices_per_proc=None,
start_port=6070):
"""
Args:
rank_table_file: string, path to the Ascend NPU rank table file
device_mode: DeviceMode (int)
devices_per_proc: list, devices assigned to each local process
start_port: the starting port of the current runtime environment
"""
if rank_table_file:
# multi trainers
node_ips, device_count = _get_ascend_rankfile(rank_table_file)
node_index = os.environ.get("PADDLE_TRAINER_ID")
node_ip = None
if node_index is None:
_, node_ip = get_host_name_ip()
else:
node_ip = node_ips[int(node_index)]
assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
% (node_ip, node_ips)
else:
# single trainer (single ascend card)
node_ips = ["127.0.0.1"]
node_ip = node_ips[0]
device_count = 1
devices_per_proc = None
if devices_per_proc is None:
devices_per_proc = [str(x) for x in range(device_count)]
free_ports = [
x for x in range(start_port, start_port + len(devices_per_proc))
]
trainer_endpoints = []
for ip in node_ips:
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
return get_cluster(node_ips, node_ip, trainer_endpoints,
device_mode, devices_per_proc)
\ No newline at end of file
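To make the parsing contract above concrete, here is a minimal usage sketch. It is an illustration, not part of the patch: it assumes a Paddle build that contains this commit (so that paddle.distributed.fleet.ascend_utils is importable) and calls the internal helper _get_ascend_rankfile directly.

import json
import tempfile

from paddle.distributed.fleet.ascend_utils import _get_ascend_rankfile

# Two servers with two NPUs each, matching the sample rank table in the docstring.
rank_table = {
    "status": "completed",
    "version": "1.0",
    "server_count": "2",
    "server_list": [
        {"server_id": "192.168.24.217",
         "device": [{"device_id": "0", "device_ip": "192.1.184.23", "rank_id": "0"},
                    {"device_id": "1", "device_ip": "192.2.21.93", "rank_id": "1"}]},
        {"server_id": "192.168.26.177",
         "device": [{"device_id": "0", "device_ip": "192.1.94.132", "rank_id": "2"},
                    {"device_id": "1", "device_ip": "192.2.94.30", "rank_id": "3"}]},
    ],
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(rank_table, f)

node_ips, device_count = _get_ascend_rankfile(f.name)
assert node_ips == ["192.168.24.217", "192.168.26.177"]
assert device_count == 2  # NPUs per machine, i.e. the length of each server's device list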
@@ -73,6 +73,7 @@ from paddle.distributed.fleet import launch_utils
# TODO(danleifeng): Don't import * from a module
from paddle.distributed.fleet.launch_utils import *
import paddle.distributed.fleet.cloud_utils as cloud_utils
import paddle.distributed.fleet.ascend_utils as ascend_utils

def _print_arguments(args):
@@ -129,7 +130,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
default=None,
help="It's for ascend npu training."
"For example:"
"--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one gpu."
"--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu."
)
base_group.add_argument("--selected_gpus", dest="gpus")
@@ -227,6 +228,13 @@ def launch_collective(args):
cluster, pod = cloud_utils.get_cloud_cluster(
args.ips, device_mode, devices_per_proc, start_port)
logger.debug("get cluster from cloud:{}".format(cluster))
elif device_mode == DeviceMode.ASCEND_NPU:
# for ascend
cluster, pod = ascend_utils.get_cloud_cluster(
rank_table_file=os.getenv("RANK_TABLE_FILE", None),
device_mode=device_mode,
devices_per_proc=devices_per_proc,
start_port=start_port)
else:
# trainers_num = 1 or not use paddlecloud ips="a,b"
cluster, pod = get_cluster_from_args(args, device_mode,
......
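The new ASCEND_NPU branch above takes the rank table path from the RANK_TABLE_FILE environment variable. The sketch below is a hypothetical standalone driver, not part of the patch; it shows the call the launcher reduces to. Note that get_cloud_cluster asserts that the local IP appears in the rank table unless PADDLE_TRAINER_ID is set.

import os

import paddle.distributed.fleet.ascend_utils as ascend_utils
from paddle.distributed.fleet.launch_utils import DeviceMode

# Placeholder path: point this at a real rank table, such as the one the
# test script further below generates.
os.environ["RANK_TABLE_FILE"] = "/path/to/rank_table_file.json"

# One process per selected NPU; endpoints are allocated from start_port upwards.
cluster, pod = ascend_utils.get_cloud_cluster(
    rank_table_file=os.getenv("RANK_TABLE_FILE", None),
    device_mode=DeviceMode.ASCEND_NPU,
    devices_per_proc=["0", "1"],
    start_port=6070)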
@@ -459,7 +459,7 @@ def start_local_trainers(cluster,
current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None)
ids=cluster.world_device_ids()
ids = cluster.world_device_ids()
res = [':'.join(ele) for ele in ids]
procs = []
for idx, t in enumerate(pod.trainers):
@@ -582,8 +582,8 @@ def get_ascend_npus(npus):
if npus is None:
count = fluid.core.NPUDevice.get_device_count()
if count <= 0:
return ret
ret = [x for x in range(count)]
return None
ret = [str(x) for x in range(count)]
else:
ret = [x.strip() for x in npus.split(',')]
return ret
......
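The last hunk fixes get_ascend_npus so that auto-detection returns string device ids, and a machine with no NPUs yields None instead of an unbound variable. Below is a small sketch of the resulting parsing behaviour; _parse_npus is a hypothetical helper written only for illustration, and the real function obtains the detected count from fluid.core.NPUDevice.get_device_count().

def _parse_npus(npus, detected_count):
    # No --ascend_npus given: fall back to every detected device,
    # returning string ids, or None when no NPU is present.
    if npus is None:
        if detected_count <= 0:
            return None
        return [str(x) for x in range(detected_count)]
    # Explicit --ascend_npus="0,1,2,3": split on commas and strip whitespace.
    return [x.strip() for x in npus.split(',')]

assert _parse_npus("0,1,2,3", 4) == ["0", "1", "2", "3"]
assert _parse_npus(None, 2) == ["0", "1"]
assert _parse_npus(None, 0) is None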
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > ${RANK_TABLE_FILE_NAME} <<EOF
{
"status": "completed",
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "127.0.0.1",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
},
{
"server_id": "127.0.0.2",
"device": [
{
"device_id": "0",
"device_ip": "192.1.94.132",
"rank_id": "2"
},
{
"device_id": "1",
"device_ip": "192.2.94.30",
"rank_id": "3"
}
]
}
]
}
EOF
# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"
# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2
distributed_args="--run_mode=collective --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend
str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
# test async poll process
if [ -f $file_0 ]; then
rm $file_0
fi
if [ -f $file_1 ]; then
rm $file_1
fi