Unverified commit 387c1db4, authored by xiayanming, committed by GitHub

Ascendrc (#31065)

Parent: ff4654e2
@@ -79,24 +79,25 @@ def _get_ascend_rankfile(rank_table_file_path):
 def get_cloud_cluster(rank_table_file=None,
                       device_mode=DeviceMode.ASCEND_NPU,
-                      devices_per_proc=None,
                       start_port=6070):
     """
     Args:
         rank_table_file: string, ascend npu rank file path
         device_mode: DeviceMode(Int)
-        devices_per_proc: list
         start_port: the start port of current runtime env
     """
     if rank_table_file:
         # multi trainers
         node_ips, device_count = _get_ascend_rankfile(rank_table_file)
-        node_index = os.environ.get("PADDLE_TRAINER_ID")
-        node_ip = None
-        if node_index is None:
-            _, node_ip = get_host_name_ip()
-        else:
-            node_ip = node_ips[int(node_index)]
+        if len(node_ips) == 1:
+            node_ip = node_ips[0]
+        else:
+            node_index = os.environ.get("PADDLE_TRAINER_ID")
+            node_ip = None
+            if node_index:
+                node_ip = node_ips[int(node_index)]
+            else:
+                _, node_ip = get_host_name_ip()
 
         assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
             % (node_ip, node_ips)
@@ -105,11 +106,8 @@ def get_cloud_cluster(rank_table_file=None,
         node_ips = ["127.0.0.1"]
         node_ip = node_ips[0]
         device_count = 1
-        devices_per_proc = None
 
-    if devices_per_proc is None:
-        devices_per_proc = [str(x) for x in range(device_count)]
+    devices_per_proc = [str(x) for x in range(device_count)]
 
     free_ports = [
         x for x in range(start_port, start_port + len(devices_per_proc))
     ]
...
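For reference, a minimal standalone sketch of the node-ip resolution the new branch implements (a sketch, not the Paddle source: socket stands in for the get_host_name_ip() helper, and rank-table loading is elided):

import os
import socket

def resolve_node_ip(node_ips):
    if len(node_ips) == 1:
        return node_ips[0]  # single-node rank table: no env var required
    node_index = os.environ.get("PADDLE_TRAINER_ID")
    if node_index:
        return node_ips[int(node_index)]  # trainer id assigned by the scheduler
    # fall back to resolving the local hostname
    return socket.gethostbyname(socket.gethostname())

print(resolve_node_ip(["127.0.0.1"]))  # -> "127.0.0.1"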
@@ -124,15 +124,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         default="collective",
         help="run mode of job, can be:collective/ps/ps-heter")
 
-    base_group.add_argument(
-        "--ascend_npus",
-        type=str,
-        default=None,
-        help="It's for ascend npu training."
-        "For example:"
-        "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu."
-    )
-
     base_group.add_argument("--selected_gpus", dest="gpus")
     base_group.add_argument(
@@ -233,7 +224,6 @@ def launch_collective(args):
         cluster, pod = ascend_utils.get_cloud_cluster(
             rank_table_file=os.getenv("RANK_TABLE_FILE", None),
             device_mode=device_mode,
-            devices_per_proc=devices_per_proc,
             start_port=start_port)
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
...
@@ -477,6 +477,10 @@ def start_local_trainers(cluster,
             proc_env["FLAGS_selected_gpus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])
 
+        if len(t.accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU:
+            proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
+                [str(g) for g in t.accelerators])
+
         if len(t.accelerators) > 0:
             proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])
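Taken together with the existing GPU branch, an Ascend trainer bound to two chips would now see environment entries along these lines (illustrative values only, not captured launcher output):

accelerators = ["0", "1"]
proc_env = {
    "FLAGS_selected_npus": ",".join(accelerators),          # new in this commit
    "FLAGS_selected_accelerators": ",".join(accelerators),  # generic entry, unchanged
}
print(proc_env)  # {'FLAGS_selected_npus': '0,1', 'FLAGS_selected_accelerators': '0,1'}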
@@ -578,16 +582,6 @@ def watch_local_trainers(procs, nranks):
     return alive
 
-def get_ascend_npus(npus):
-    if npus is None:
-        count = fluid.core.NPUDevice.get_device_count()
-        if count <= 0:
-            return None
-        ret = [str(x) for x in range(count)]
-    else:
-        ret = [x.strip() for x in npus.split(',')]
-    return ret
-
 def get_gpus(gpus):
     if gpus is None:
         gpus_num = fluid.core.get_cuda_device_count()
@@ -650,9 +644,7 @@ def get_device_proc_info(args):
         else:
             devices_per_proc = gpus
     elif device_mode == DeviceMode.ASCEND_NPU:
-        npus = get_ascend_npus(args.ascend_npus)
-        assert args.nproc_per_node is None, "ascend_npus need't nproc_per_node arguments"
-        devices_per_proc = npus
+        devices_per_proc = None
     elif device_mode == DeviceMode.CPU:
         if args.nproc_per_node is None:
             devices_per_proc = [0]
...
@@ -69,6 +69,24 @@ def init_communicator(startup_program, main_program, current_endpoint, endpoints
                 OP_ROLE_KEY: OpRole.Forward,
             })
 
+        # add input op for test
+        fill_var_name = "tensor@Filled"
+        fill_var = block.create_var(
+            name=fill_var_name,
+            shape=[10, 10],
+            dtype='float32',
+            persistable=False,
+            stop_gradient=True)
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": fill_var_name},
+            attrs={
+                "shape": [10, 10],
+                "dtype": fill_var.dtype,
+                "value": 1.0,
+                "place_type": 1
+            })
+
     with fluid.program_guard(main_program):
         op_type = "c_allreduce_sum"
         data = fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5)
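The test builds the fill_constant op by hand via block.append_op, presumably because the place_type attribute (set to 1 above) is not exposed through the layers API; the closest high-level near-equivalent, minus place_type, would be a sketch like:

import paddle.fluid as fluid

prog = fluid.Program()
with fluid.program_guard(prog):
    # same shape/dtype/value as the hand-built op, but no place_type control
    filled = fluid.layers.fill_constant(shape=[10, 10], dtype='float32', value=1.0)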
@@ -117,10 +135,11 @@ def train(world_endpoints, world_device_ids, local_device_ids, local_rank):
     main_program = main_programs[local_rank]
     loss = Loss(Block(main_program))
     optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[])
-    optimizer.minimize(loss, startup_program, auto_dp=True)
+    optimizer.minimize(loss, startup_program, auto_dp=True,
+                       rank_table_file=os.getenv("RANK_TABLE_FILE", None))
 
     exe = paddle.static.Executor(paddle.CPUPlace())
-    #exe.run(startup_program)
+    exe.run(startup_program)
     exe.run(main_program)
...
# -*- coding:UTF-8 -*-
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""generate hccl config file script"""
import os
import sys
import json
import socket
from argparse import ArgumentParser
from typing import Dict, Any


def parse_args():
    """
    parse args.

    Args:

    Returns:
        args.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training launch "
                                        "helper utility that will generate hccl"
                                        " config file")
    parser.add_argument("--device_num", type=str, default="[0,8)",
                        help="The range of Ascend accelerators used. Please note that the accelerators "
                             "used must be contiguous: [0,4) means to use the four chips "
                             "0,1,2,3; [0,1) means to use chip 0. The first four chips form "
                             "one group and the last four form another; apart from the full "
                             "range [0,8), cross-group ranges such as [3,6) are prohibited.")
    parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                        help="will use the visible devices sequentially")
    parser.add_argument("--server_ip", type=str, default="",
                        help="server ip")
    args = parser.parse_args()
    return args


def get_host_ip():
    """
    get host ip
    """
    ip = None
    try:
        hostname = socket.gethostname()
        ip = socket.gethostbyname(hostname)
    except OSError:
        # name resolution failed; the caller falls back to --server_ip
        pass
    return ip


def main():
    print("start", __file__)
    args = parse_args()

    # visible_devices
    visible_devices = args.visible_devices.split(',')
    print('visible_devices:{}'.format(visible_devices))

    # server_id
    ip = get_host_ip()
    if args.server_ip:
        server_id = args.server_ip
    elif ip:
        server_id = ip
    else:
        raise ValueError("please input server ip!")
    print('server_id:{}'.format(server_id))

    # device_num
    first_num = int(args.device_num[1])
    last_num = int(args.device_num[3])
    if first_num < 0 or last_num > 8:
        raise ValueError("device num {} must be in range [0,8] !".format(args.device_num))
    if first_num > last_num:
        raise ValueError("First num {} of device num {} must not be greater than last num {} !".format(
            first_num, args.device_num, last_num))
    if first_num < 4 < last_num and not (first_num == 0 and last_num == 8):
        raise ValueError("device num {} must be in the same group of [0,4] or [4,8] !".format(args.device_num))

    device_num_list = list(range(first_num, last_num))
    print("device_num_list:", device_num_list)

    assert len(visible_devices) >= len(device_num_list)

    # construct hccn_table
    device_ips: Dict[Any, Any] = {}
    with open('/etc/hccn.conf', 'r') as fin:
        for hccn_item in fin.readlines():
            if hccn_item.strip().startswith('address_'):
                device_id, device_ip = hccn_item.split('=')
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip.strip()

    hccn_table = {'version': '1.0',
                  'server_count': '1',
                  'server_list': []}
    device_list = []
    rank_id = 0
    for instance_id in device_num_list:
        device_id = visible_devices[instance_id]
        device_ip = device_ips[device_id]
        device = {'device_id': device_id,
                  'device_ip': device_ip,
                  'rank_id': str(rank_id)}
        print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip))
        rank_id += 1
        device_list.append(device)
    hccn_table['server_list'].append({
        'server_id': server_id,
        'device': device_list,
        'host_nic_ip': 'reserve'
    })
    hccn_table['status'] = 'completed'

    # save hccn_table to file
    table_path = os.getcwd()
    table_fn = os.path.join(table_path,
                            'hccl_{}p_{}_{}.json'.format(len(device_num_list),
                                                         "".join(map(str, device_num_list)),
                                                         server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    sys.stdout.flush()
    print("Completed: hccl file was saved in:", table_fn)


if __name__ == "__main__":
    main()
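A quick way to sanity-check a generated table (the filename here is hypothetical, following the hccl_{n}p_{devices}_{server_id}.json pattern that main() uses):

import json

with open("hccl_4p_0123_10.0.0.1.json") as fp:  # hypothetical output of --device_num "[0,4)"
    table = json.load(fp)

assert table["status"] == "completed"
devices = table["server_list"][0]["device"]
print([(d["device_id"], d["rank_id"]) for d in devices])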
@@ -16,15 +16,14 @@
 set -e
 
-cluster_node_ips="127.0.0.1"
-export PADDLE_TRAINERS_NUM=4
-export POD_IP=127.0.0.1
-export PADDLE_TRAINERS=127.0.0.1
-export PADDLE_TRAINER_ID=0
-export PADDLE_PORT=35789
-export TRAINER_PORTS_NUM=4
+curr_host_ip=`hostname -i`
+python hccl_tools.py --device_num "[0,4)" --server_ip ${curr_host_ip}
+
+export RANK_TABLE_FILE="${PWD}/hccl_4p_0123_${curr_host_ip}.json"
 
-distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1,2,3 --log_dir=testlog"
+# use ascend
+echo "begin test use ascend npu"
+distributed_args="--run_mode=collective --log_dir=testlog"
 python -m paddle.distributed.fleet.launch ${distributed_args} \
     ascend_group.py fleetascendgroup
\ No newline at end of file
@@ -16,22 +16,43 @@
 set -e
 
-# use paddlecloud
-echo "begin test use paddlecloud"
-cluster_node_ips="127.0.0.1,127.0.0.2"
-export PADDLE_TRAINERS_NUM=2
-export POD_IP=127.0.0.1
-export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
-export PADDLE_TRAINER_ID=0
-
-export PADDLE_PORT=35789
-export TRAINER_PORTS_NUM=2
-
-distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog"
+RANK_TABLE_FILE_NAME="rank_table_file.json"
+cat > ${RANK_TABLE_FILE_NAME} <<EOF
+{
+    "status": "completed",
+    "version": "1.0",
+    "server_count": "1",
+    "server_list": [
+        {
+            "server_id": "127.0.0.1",
+            "device": [
+                {
+                    "device_id": "0",
+                    "device_ip": "192.1.184.23",
+                    "rank_id": "0"
+                },
+                {
+                    "device_id": "1",
+                    "device_ip": "192.2.21.93",
+                    "rank_id": "1"
+                }
+            ]
+        }
+    ]
+}
+EOF
+
+# set ascend rank table file env
+export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"
+
+# use ascend
+echo "begin test use ascend npu"
+distributed_args="--run_mode=collective --log_dir=testlog"
 python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend
 
-str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
-str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
+str1="selected_accelerators:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
+str2="selected_accelerators:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
 file_0="multi_process_fleetlaunchascend.check_0.log"
 file_1="multi_process_fleetlaunchascend.check_1.log"
...
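For orientation, a hedged sketch of how a parser in the spirit of _get_ascend_rankfile could turn such a rank table into the (node_ips, device_count) pair that get_cloud_cluster consumes; the real helper's body is not part of this diff, so this is an assumption about its behavior:

import json

def parse_rank_table(path):
    with open(path) as fp:
        table = json.load(fp)
    node_ips = [server["server_id"] for server in table["server_list"]]
    # devices per node, taken from the first server entry
    device_count = len(table["server_list"][0]["device"])
    return node_ips, device_count

print(parse_rank_table("rank_table_file.json"))  # -> (['127.0.0.1'], 2)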
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > ${RANK_TABLE_FILE_NAME} <<EOF
{
    "status": "completed",
    "version": "1.0",
    "server_count": "2",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        },
        {
            "server_id": "127.0.0.2",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.94.132",
                    "rank_id": "2"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.94.30",
                    "rank_id": "3"
                }
            ]
        }
    ]
}
EOF

# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2

distributed_args="--run_mode=collective --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend

str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"

echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
    echo "find trainer 0"
else
    echo "not find trainer 0"
    exit -1
fi

if grep -q "$str2" "$file_1"; then
    echo "find trainer 1"
else
    echo "not find trainer 1"
    exit -1
fi

# test async poll process
if [ -f $file_0 ]; then
    rm $file_0
fi

if [ -f $file_1 ]; then
    rm $file_1
fi