From 387c1db4f1a1e400e9f684ffd93d6a47e61b8179 Mon Sep 17 00:00:00 2001 From: xiayanming <41795079@qq.com> Date: Thu, 25 Feb 2021 20:54:05 +0800 Subject: [PATCH] Ascendrc (#31065) Ascendrc --- .../paddle/distributed/fleet/ascend_utils.py | 22 ++- python/paddle/distributed/fleet/launch.py | 10 -- .../paddle/distributed/fleet/launch_utils.py | 18 +-- .../fluid/tests/unittests/ascend_group.py | 23 ++- .../fluid/tests/unittests/hccl_tools.py | 150 ++++++++++++++++++ .../tests/unittests/test_ascend_group.sh | 15 +- .../unittests/test_fleet_launch_ascend.sh | 49 ++++-- .../unittests/test_fleet_launch_ascend2.sh | 103 ------------ 8 files changed, 228 insertions(+), 162 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hccl_tools.py delete mode 100644 python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py index c90ad6fde5a..c27ab94c30b 100644 --- a/python/paddle/distributed/fleet/ascend_utils.py +++ b/python/paddle/distributed/fleet/ascend_utils.py @@ -79,24 +79,25 @@ def _get_ascend_rankfile(rank_table_file_path): def get_cloud_cluster(rank_table_file=None, device_mode=DeviceMode.ASCEND_NPU, - devices_per_proc=None, start_port=6070): """ Args: rank_table_file: string, ascend npu rank file path device_mode: DeviceMode(Int) - devices_per_proc:list start_port: the start port of current runtime env """ if rank_table_file: # multi trainers node_ips, device_count = _get_ascend_rankfile(rank_table_file) - node_index = os.environ.get("PADDLE_TRAINER_ID") - node_ip = None - if node_index is None: - _, node_ip = get_host_name_ip() + if len(node_ips) == 1: + node_ip = node_ips[0] else: - node_ip = node_ips[int(node_index)] + node_index = os.environ.get("PADDLE_TRAINER_ID") + node_ip = None + if node_index: + node_ip = node_ips[int(node_index)] + else: + _, node_ip = get_host_name_ip() assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \ % (node_ip, node_ips) @@ -105,11 +106,8 @@ def get_cloud_cluster(rank_table_file=None, node_ips = ["127.0.0.1"] node_ip = node_ips[0] device_count = 1 - devices_per_proc = None - - if devices_per_proc is None: - devices_per_proc = [str(x) for x in range(device_count)] - + + devices_per_proc = [str(x) for x in range(device_count)] free_ports = [ x for x in range(start_port, start_port + len(devices_per_proc)) ] diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 697989dc2eb..e6026089255 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -124,15 +124,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra default="collective", help="run mode of job, can be:collective/ps/ps-heter") - base_group.add_argument( - "--ascend_npus", - type=str, - default=None, - help="It's for ascend npu training." - "For example:" - "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu." 
- ) - base_group.add_argument("--selected_gpus", dest="gpus") base_group.add_argument( @@ -233,7 +224,6 @@ def launch_collective(args): cluster, pod = ascend_utils.get_cloud_cluster( rank_table_file=os.getenv("RANK_TABLE_FILE", None), device_mode=device_mode, - devices_per_proc=devices_per_proc, start_port=start_port) else: # trainers_num = 1 or not use paddlecloud ips="a,b" diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 25d6d95291b..bbca8118839 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -476,6 +476,10 @@ def start_local_trainers(cluster, if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU: proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( [str(g) for g in t.accelerators]) + + if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.ASCEND_NPU: + proc_env["FLAGS_selected_npus"] = "%s" % ",".join( + [str(g) for g in t.accelerators]) if len(t.accelerators) > 0: proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join( @@ -578,16 +582,6 @@ def watch_local_trainers(procs, nranks): return alive -def get_ascend_npus(npus): - if npus is None: - count = fluid.core.NPUDevice.get_device_count() - if count <= 0: - return None - ret = [str(x) for x in range(count)] - else: - ret = [x.strip() for x in npus.split(',')] - return ret - def get_gpus(gpus): if gpus is None: gpus_num = fluid.core.get_cuda_device_count() @@ -650,9 +644,7 @@ def get_device_proc_info(args): else: devices_per_proc = gpus elif device_mode == DeviceMode.ASCEND_NPU: - npus = get_ascend_npus(args.ascend_npus) - assert args.nproc_per_node is None, "ascend_npus need't nproc_per_node arguments" - devices_per_proc=npus + devices_per_proc = None elif device_mode == DeviceMode.CPU: if args.nproc_per_node is None: devices_per_proc = [0] diff --git a/python/paddle/fluid/tests/unittests/ascend_group.py b/python/paddle/fluid/tests/unittests/ascend_group.py index 0bc810373c9..5b76a1ecd4b 100644 --- a/python/paddle/fluid/tests/unittests/ascend_group.py +++ b/python/paddle/fluid/tests/unittests/ascend_group.py @@ -69,6 +69,24 @@ def init_communicator(startup_program, main_program, current_endpoint, endpoints OP_ROLE_KEY: OpRole.Forward, }) + # add input op for test + fill_var_name = "tensor@Filled" + fill_var = block.create_var( + name=fill_var_name, + shape=[10, 10], + dtype='float32', + persistable=False, + stop_gradient=True) + block.append_op( + type="fill_constant", + outputs={"Out": fill_var_name}, + attrs={ + "shape": [10, 10], + "dtype": fill_var.dtype, + "value": 1.0, + "place_type": 1 + }) + with fluid.program_guard(main_program): op_type="c_allreduce_sum" data=fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5) @@ -117,10 +135,11 @@ def train(world_endpoints, world_device_ids, local_device_ids,local_rank): main_program = main_programs[local_rank] loss = Loss(Block(main_program)) optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[]) - optimizer.minimize(loss, startup_program, auto_dp=True) + optimizer.minimize(loss, startup_program, auto_dp=True, + rank_table_file=os.getenv("RANK_TABLE_FILE", None)) exe = paddle.static.Executor(paddle.CPUPlace()) - #exe.run(startup_program) + exe.run(startup_program) exe.run(main_program) diff --git a/python/paddle/fluid/tests/unittests/hccl_tools.py b/python/paddle/fluid/tests/unittests/hccl_tools.py new file mode 100644 index 00000000000..32bcd114b06 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/hccl_tools.py @@ -0,0 +1,150 @@ +# -*- coding:UTF-8 -*- +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""generate hccl config file script""" +import os +import sys +import json +import socket +from argparse import ArgumentParser +from typing import Dict, Any + + +def parse_args(): + """ + parse args . + + Args: + + Returns: + args. + + Examples: + >>> parse_args() + """ + parser = ArgumentParser(description="mindspore distributed training launch " + "helper utilty that will generate hccl" + " config file") + parser.add_argument("--device_num", type=str, default="[0,8)", + help="The number of the Ascend accelerators used. please note that the Ascend accelerators" + "used must be continuous, such [0,4) means to use four chips " + "0,1,2,3; [0,1) means to use chip 0; The first four chips are" + "a group, and the last four chips are a group. In addition to" + "the [0,8) chips are allowed, other cross-group such as [3,6)" + "are prohibited.") + parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", + help="will use the visible devices sequentially") + parser.add_argument("--server_ip", type=str, default="", + help="server ip") + args = parser.parse_args() + return args + + +def get_host_ip(): + """ + get host ip + """ + ip = None + + try: + hostname = socket.gethostname() + ip = socket.gethostbyname(hostname) + except EOFError: + pass + + return ip + + +def main(): + print("start", __file__) + args = parse_args() + + # visible_devices + visible_devices = args.visible_devices.split(',') + print('visible_devices:{}'.format(visible_devices)) + + # server_id + ip = get_host_ip() + if args.server_ip: + server_id = args.server_ip + elif ip: + server_id = ip + else: + raise ValueError("please input server ip!") + print('server_id:{}'.format(server_id)) + + # device_num + first_num = int(args.device_num[1]) + last_num = int(args.device_num[3]) + if first_num < 0 or last_num > 8: + raise ValueError("device num {} must be in range [0,8] !".format(args.device_num)) + if first_num > last_num: + raise ValueError("First num {} of device num {} must less than last num {} !".format(first_num, args.device_num, + last_num)) + if first_num < 4: + if last_num > 4: + if first_num == 0 and last_num == 8: + pass + else: + raise ValueError("device num {} must be in the same group of [0,4] or [4,8] !".format(args.device_num)) + + device_num_list = list(range(first_num, last_num)) + print("device_num_list:", device_num_list) + + assert len(visible_devices) >= len(device_num_list) + + # construct hccn_table + device_ips: Dict[Any, Any] = {} + with open('/etc/hccn.conf', 'r') as fin: + for hccn_item in fin.readlines(): + if hccn_item.strip().startswith('address_'): + device_id, device_ip = hccn_item.split('=') + device_id = device_id.split('_')[1] + device_ips[device_id] = device_ip.strip() + + hccn_table = {'version': '1.0', + 
'server_count': '1', + 'server_list': []} + device_list = [] + rank_id = 0 + for instance_id in device_num_list: + device_id = visible_devices[instance_id] + device_ip = device_ips[device_id] + device = {'device_id': device_id, + 'device_ip': device_ip, + 'rank_id': str(rank_id)} + print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip)) + rank_id += 1 + device_list.append(device) + hccn_table['server_list'].append({ + 'server_id': server_id, + 'device': device_list, + 'host_nic_ip': 'reserve' + }) + hccn_table['status'] = 'completed' + + # save hccn_table to file + table_path = os.getcwd() + table_fn = os.path.join(table_path, + 'hccl_{}p_{}_{}.json'.format(len(device_num_list), "".join(map(str, device_num_list)), + server_id)) + with open(table_fn, 'w') as table_fp: + json.dump(hccn_table, table_fp, indent=4) + sys.stdout.flush() + print("Completed: hccl file was save in :", table_fn) + + +if __name__ == "__main__": + main() diff --git a/python/paddle/fluid/tests/unittests/test_ascend_group.sh b/python/paddle/fluid/tests/unittests/test_ascend_group.sh index 5f901d59ad4..07ad3a696db 100644 --- a/python/paddle/fluid/tests/unittests/test_ascend_group.sh +++ b/python/paddle/fluid/tests/unittests/test_ascend_group.sh @@ -16,15 +16,14 @@ set -e -cluster_node_ips="127.0.0.1" -export PADDLE_TRAINERS_NUM=4 -export POD_IP=127.0.0.1 -export PADDLE_TRAINERS=127.0.0.1 -export PADDLE_TRAINER_ID=0 +curr_host_ip=`hostname -i` +python hccl_tools.py --device_num "[0,4)" --server_ip ${curr_host_ip} -export PADDLE_PORT=35789 -export TRAINER_PORTS_NUM=4 +export RANK_TABLE_FILE="${PWD}/hccl_4p_0123_${curr_host_ip}.json" -distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1,2,3 --log_dir=testlog" +# use ascend +echo "begin test use ascend npu" + +distributed_args="--run_mode=collective --log_dir=testlog" python -m paddle.distributed.fleet.launch ${distributed_args} \ ascend_group.py fleetascendgroup \ No newline at end of file diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh index 0960083abf2..7310af7d64c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -16,22 +16,43 @@ set -e -# use paddlecloud -echo "begin test use paddlecloud" -cluster_node_ips="127.0.0.1,127.0.0.2" -export PADDLE_TRAINERS_NUM=2 -export POD_IP=127.0.0.1 -export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 -export PADDLE_TRAINER_ID=0 - -export PADDLE_PORT=35789 -export TRAINER_PORTS_NUM=2 - -distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" +RANK_TABLE_FILE_NAME="rank_table_file.json" +cat > ${RANK_TABLE_FILE_NAME} < ${RANK_TABLE_FILE_NAME} <
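
Note on the generated rank table: hccl_tools.py above assembles a JSON file of the form
{"version", "server_count", "server_list": [{"server_id", "device": [...], "host_nic_ip"}], "status"}
and names it hccl_<N>p_<devices>_<server_ip>.json. The following minimal Python sketch shows how such
a file can be inspected; the file name and addresses are illustrative placeholders, and the sketch
mirrors only the JSON layout built in main() above, not Paddle's internal _get_ascend_rankfile parser.

import json

# Hypothetical output name; hccl_tools.py writes hccl_<N>p_<devices>_<server_ip>.json
rank_table_path = "hccl_4p_0123_127.0.0.1.json"

with open(rank_table_path, "r") as f:
    rank_table = json.load(f)

# Each server entry carries its host id and the NPU devices assigned to it,
# matching the structure assembled by hccl_tools.py.
for server in rank_table["server_list"]:
    device_ids = [dev["device_id"] for dev in server["device"]]
    device_ips = [dev["device_ip"] for dev in server["device"]]
    print("server:", server["server_id"])
    print("  devices:", device_ids)
    print("  device ips:", device_ips)

Exporting RANK_TABLE_FILE to point at this JSON (as test_ascend_group.sh does) lets
paddle.distributed.fleet.launch derive the node IPs and per-node device count from it.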