diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c90ad6fde5a8c0713f8e49fbf92d03c0d2441427
--- /dev/null
+++ b/python/paddle/distributed/fleet/ascend_utils.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import json
+import paddle
+from paddle.distributed.fleet.launch_utils import get_cluster, logger, get_host_name_ip, DeviceMode
+
+def _get_ascend_rankfile(rank_table_file_path):
+    """
+    Args:
+        rank_table_file_path: ascend npu rank file json
+        {
+            "status": "completed",
+            "version": "1.0",
+            "server_count": "2",
+            "server_list": [
+                {
+                    "server_id": "192.168.24.217",
+                    "device": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.1.184.23",
+                            "rank_id": "0"
+                        },
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.2.21.93",
+                            "rank_id": "1"
+                        }
+                    ]
+                },
+                {
+                    "server_id": "192.168.26.177",
+                    "device": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.1.94.132",
+                            "rank_id": "2"
+                        },
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.2.94.30",
+                            "rank_id": "3"
+                        }
+                    ]
+                }
+            ]
+        }
+
+    Returns:
+        node_ips: node ip list
+        device_count: number of npus per machine
+    """
+    json_data = None
+    with open(rank_table_file_path) as json_file:
+        json_data = json.load(json_file)
+
+    node_ips = []
+    device_count = 0
+    server_list = json_data['server_list']
+    for server in server_list:
+        node_ips.append(server['server_id'])
+        device_list = server['device']
+        device_count = len(device_list)
+
+    return node_ips, device_count
+
+def get_cloud_cluster(rank_table_file=None,
+                      device_mode=DeviceMode.ASCEND_NPU,
+                      devices_per_proc=None,
+                      start_port=6070):
+    """
+    Args:
+        rank_table_file: string, ascend npu rank file path
+        device_mode: DeviceMode(Int)
+        devices_per_proc: list
+        start_port: the start port of current runtime env
+    """
+    if rank_table_file:
+        # multi trainers
+        node_ips, device_count = _get_ascend_rankfile(rank_table_file)
+        node_index = os.environ.get("PADDLE_TRAINER_ID")
+        node_ip = None
+        if node_index is None:
+            _, node_ip = get_host_name_ip()
+        else:
+            node_ip = node_ips[int(node_index)]
+
+        assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
+            % (node_ip, node_ips)
+    else:
+        # single trainer (single ascend card)
+        node_ips = ["127.0.0.1"]
+        node_ip = node_ips[0]
+        device_count = 1
+        devices_per_proc = None
+
+    if devices_per_proc is None:
+        devices_per_proc = [str(x) for x in range(device_count)]
+
+    free_ports = [
+        x for x in range(start_port, start_port + len(devices_per_proc))
+    ]
+
+    trainer_endpoints = []
+    for ip in node_ips:
+        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
+
+    return get_cluster(node_ips, node_ip, trainer_endpoints,
+                       device_mode, devices_per_proc)
\ No newline at end of file
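To make the endpoint construction above concrete, here is a minimal, self-contained sketch (not part of the patch) that replays the derivation `get_cloud_cluster` performs for the two-server sample rank table in the docstring; the JSON is trimmed to the fields `_get_ascend_rankfile` actually reads:

```python
import json

# Two-server sample from the docstring above, trimmed to the fields that
# _get_ascend_rankfile actually reads: server_id and the device list.
rank_table = json.loads("""
{
    "server_list": [
        {"server_id": "192.168.24.217",
         "device": [{"device_id": "0"}, {"device_id": "1"}]},
        {"server_id": "192.168.26.177",
         "device": [{"device_id": "0"}, {"device_id": "1"}]}
    ]
}
""")

node_ips = [s["server_id"] for s in rank_table["server_list"]]
device_count = len(rank_table["server_list"][-1]["device"])  # NPUs per machine
devices_per_proc = [str(x) for x in range(device_count)]     # ['0', '1']
free_ports = range(6070, 6070 + len(devices_per_proc))       # default start_port=6070

trainer_endpoints = [["%s:%d" % (ip, p) for p in free_ports]
                     for ip in node_ips]
print(trainer_endpoints)
# [['192.168.24.217:6070', '192.168.24.217:6071'],
#  ['192.168.26.177:6070', '192.168.26.177:6071']]
```

Note that the loop in `_get_ascend_rankfile` overwrites `device_count` on every iteration, so the value used is the device count of the last server in the list; the helper implicitly assumes every machine hosts the same number of NPUs.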
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 4bf1edf16363e13ad43fa5252635b741c7a64256..697989dc2eb2f5d3d64250409fcacac7dda1d1fd 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -73,6 +73,7 @@ from paddle.distributed.fleet import launch_utils
 # TODO(danleifeng): Don't import * from a module
 from paddle.distributed.fleet.launch_utils import *
 import paddle.distributed.fleet.cloud_utils as cloud_utils
+import paddle.distributed.fleet.ascend_utils as ascend_utils
 
 
 def _print_arguments(args):
@@ -129,7 +130,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         default=None,
         help="It's for ascend npu training."
         "For example:"
-        "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one gpu."
+        "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu."
     )
 
     base_group.add_argument("--selected_gpus", dest="gpus")
@@ -227,6 +228,13 @@ def launch_collective(args):
         cluster, pod = cloud_utils.get_cloud_cluster(
             args.ips, device_mode, devices_per_proc, start_port)
         logger.debug("get cluster from cloud:{}".format(cluster))
+    elif device_mode == DeviceMode.ASCEND_NPU:
+        # for ascend
+        cluster, pod = ascend_utils.get_cloud_cluster(
+            rank_table_file=os.getenv("RANK_TABLE_FILE", None),
+            device_mode=device_mode,
+            devices_per_proc=devices_per_proc,
+            start_port=start_port)
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
         cluster, pod = get_cluster_from_args(args, device_mode,
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index f39e2284a5805f589e7a37cca90771ec875ce17c..25d6d95291b56f411ad70d6f6615527433b7de94 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -459,7 +459,7 @@ def start_local_trainers(cluster,
     current_env.pop("http_proxy", None)
     current_env.pop("https_proxy", None)
 
-    ids=cluster.world_device_ids()
+    ids = cluster.world_device_ids()
     res = [':'.join(ele) for ele in ids]
     procs = []
     for idx, t in enumerate(pod.trainers):
@@ -582,8 +582,8 @@ def get_ascend_npus(npus):
     if npus is None:
         count = fluid.core.NPUDevice.get_device_count()
         if count <= 0:
-            return ret
-        ret = [x for x in range(count)]
+            return None
+        ret = [str(x) for x in range(count)]
     else:
         ret = [x.strip() for x in npus.split(',')]
     return ret
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2e9c1e6995399e94e43c54168bec8d86533ab2ff
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+RANK_TABLE_FILE_NAME="rank_table_file.json"
+cat > ${RANK_TABLE_FILE_NAME} <<EOF
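For completeness, a hypothetical direct exercise of the new code path, mirroring what `launch_collective` does when `RANK_TABLE_FILE` is set, might look like the sketch below; the rank-table path is a placeholder:

```python
# Hypothetical driver for the new Ascend path (not part of the patch);
# "rank_table_file.json" is a placeholder path.
import os

import paddle.distributed.fleet.ascend_utils as ascend_utils
from paddle.distributed.fleet.launch_utils import DeviceMode

# Index of this node in the rank table's server_list; if unset, the helper
# falls back to resolving the local hostname/IP via get_host_name_ip().
os.environ["PADDLE_TRAINER_ID"] = "0"

cluster, pod = ascend_utils.get_cloud_cluster(
    rank_table_file="rank_table_file.json",
    device_mode=DeviceMode.ASCEND_NPU,
    devices_per_proc=None,  # None: one process per NPU listed in the table
    start_port=6070)
print(cluster)
```

In the launcher itself this call is made from `launch_collective` with `rank_table_file=os.getenv("RANK_TABLE_FILE", None)`, so end users only need to export `RANK_TABLE_FILE` (and optionally `PADDLE_TRAINER_ID`) before starting a distributed job.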