机器未来 / Paddle (a fork of PaddlePaddle / Paddle, currently in sync with the upstream project)
Commit 387c1db4 (unverified)
Ascendrc (#31065)

Authored by xiayanming on Feb 25, 2021; committed via GitHub on Feb 25, 2021.
Parent commit: ff4654e2

Showing 8 changed files with 228 additions and 162 deletions (+228 -162).
python/paddle/distributed/fleet/ascend_utils.py                     +10   -12
python/paddle/distributed/fleet/launch.py                            +0   -10
python/paddle/distributed/fleet/launch_utils.py                      +5   -13
python/paddle/fluid/tests/unittests/ascend_group.py                 +21    -2
python/paddle/fluid/tests/unittests/hccl_tools.py                  +150    -0
python/paddle/fluid/tests/unittests/test_ascend_group.sh             +7    -8
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh     +35   -14
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh     +0  -103
python/paddle/distributed/fleet/ascend_utils.py

@@ -79,24 +79,25 @@ def _get_ascend_rankfile(rank_table_file_path):
 def get_cloud_cluster(rank_table_file=None,
                       device_mode=DeviceMode.ASCEND_NPU,
-                      devices_per_proc=None,
                       start_port=6070):
     """
     Args:
         rank_table_file: string, ascend npu rank file path
         device_mode: DeviceMode(Int)
-        devices_per_proc:list
         start_port: the start port of current runtime env
     """
     if rank_table_file:
         # multi trainers
         node_ips, device_count = _get_ascend_rankfile(rank_table_file)
-        node_index = os.environ.get("PADDLE_TRAINER_ID")
-        node_ip = None
-        if node_index is None:
-            _, node_ip = get_host_name_ip()
+        if len(node_ips) == 1:
+            node_ip = node_ips[0]
         else:
-            node_ip = node_ips[int(node_index)]
+            node_index = os.environ.get("PADDLE_TRAINER_ID")
+            node_ip = None
+            if node_index:
+                node_ip = node_ips[int(node_index)]
+            else:
+                _, node_ip = get_host_name_ip()

         assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
             % (node_ip, node_ips)

@@ -105,11 +106,8 @@ def get_cloud_cluster(rank_table_file=None,
         node_ips = ["127.0.0.1"]
         node_ip = node_ips[0]
         device_count = 1
-        devices_per_proc = None

-    if devices_per_proc is None:
-        devices_per_proc = [str(x) for x in range(device_count)]
+    devices_per_proc = [str(x) for x in range(device_count)]
     free_ports = [
         x for x in range(start_port, start_port + len(devices_per_proc))
     ]
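The node-IP resolution on the "+" side of the first hunk can be read in isolation. Below is a minimal, self-contained sketch of that logic; the helper name and the injected get_host_name_ip callable are illustrative stand-ins, not Paddle API.

import os

def resolve_node_ip(node_ips, get_host_name_ip):
    # Single-node rank table: only one candidate IP.
    if len(node_ips) == 1:
        return node_ips[0]
    # Multi-node: prefer the index published by the launcher, otherwise
    # fall back to resolving the local hostname.
    node_index = os.environ.get("PADDLE_TRAINER_ID")
    if node_index:
        return node_ips[int(node_index)]
    _, node_ip = get_host_name_ip()
    return node_ip

# Example: trainer 1 of a two-node rank table picks the second IP.
os.environ["PADDLE_TRAINER_ID"] = "1"
print(resolve_node_ip(["127.0.0.1", "127.0.0.2"],
                      lambda: ("localhost", "127.0.0.1")))  # -> 127.0.0.2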
python/paddle/distributed/fleet/launch.py

@@ -124,15 +124,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         default="collective",
         help="run mode of job, can be:collective/ps/ps-heter")

-    base_group.add_argument(
-        "--ascend_npus",
-        type=str,
-        default=None,
-        help="It's for ascend npu training."
-        "For example:"
-        "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu."
-    )
-
     base_group.add_argument("--selected_gpus", dest="gpus")

     base_group.add_argument(

@@ -233,7 +224,6 @@ def launch_collective(args):
         cluster, pod = ascend_utils.get_cloud_cluster(
             rank_table_file=os.getenv("RANK_TABLE_FILE", None),
             device_mode=device_mode,
-            devices_per_proc=devices_per_proc,
             start_port=start_port)
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
python/paddle/distributed/fleet/launch_utils.py

@@ -477,6 +477,10 @@ def start_local_trainers(cluster,
             proc_env["FLAGS_selected_gpus"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])

+        if len(t.accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU:
+            proc_env["FLAGS_selected_npus"] = "%s" % ",".join(
+                [str(g) for g in t.accelerators])
+
         if len(t.accelerators) > 0:
             proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join(
                 [str(g) for g in t.accelerators])

@@ -578,16 +582,6 @@ def watch_local_trainers(procs, nranks):
     return alive


-def get_ascend_npus(npus):
-    if npus is None:
-        count = fluid.core.NPUDevice.get_device_count()
-        if count <= 0:
-            return None
-        ret = [str(x) for x in range(count)]
-    else:
-        ret = [x.strip() for x in npus.split(',')]
-    return ret
-
 def get_gpus(gpus):
     if gpus is None:
         gpus_num = fluid.core.get_cuda_device_count()

@@ -650,9 +644,7 @@ def get_device_proc_info(args):
         else:
             devices_per_proc = gpus
     elif device_mode == DeviceMode.ASCEND_NPU:
-        npus = get_ascend_npus(args.ascend_npus)
-        assert args.nproc_per_node is None, "ascend_npus need't nproc_per_node arguments"
-        devices_per_proc = npus
+        devices_per_proc = None
     elif device_mode == DeviceMode.CPU:
         if args.nproc_per_node is None:
             devices_per_proc = [0]
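For orientation, here is a small self-contained sketch of the environment-variable selection that the first hunk adds to start_local_trainers. DeviceMode here is a stand-in enum and the GPU condition is assumed for symmetry with the surrounding code; only the ASCEND_NPU branch is quoted from the diff above.

from enum import IntEnum

class DeviceMode(IntEnum):      # stand-in, not Paddle's DeviceMode
    GPU = 1
    ASCEND_NPU = 2

def selected_device_env(accelerators, device_mode):
    env = {}
    joined = ",".join(str(g) for g in accelerators)
    if accelerators and device_mode == DeviceMode.GPU:
        env["FLAGS_selected_gpus"] = joined
    if accelerators and device_mode == DeviceMode.ASCEND_NPU:
        env["FLAGS_selected_npus"] = joined          # new in this commit
    if accelerators:
        env["FLAGS_selected_accelerators"] = joined
    return env

print(selected_device_env([0, 1], DeviceMode.ASCEND_NPU))
# {'FLAGS_selected_npus': '0,1', 'FLAGS_selected_accelerators': '0,1'}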
python/paddle/fluid/tests/unittests/ascend_group.py

@@ -69,6 +69,24 @@ def init_communicator(startup_program, main_program, current_endpoint, endpoints
             OP_ROLE_KEY: OpRole.Forward,
         })

+    # add input op for test
+    fill_var_name = "tensor@Filled"
+    fill_var = block.create_var(
+        name=fill_var_name,
+        shape=[10, 10],
+        dtype='float32',
+        persistable=False,
+        stop_gradient=True)
+    block.append_op(
+        type="fill_constant",
+        outputs={"Out": fill_var_name},
+        attrs={
+            "shape": [10, 10],
+            "dtype": fill_var.dtype,
+            "value": 1.0,
+            "place_type": 1
+        })
+
     with fluid.program_guard(main_program):
         op_type = "c_allreduce_sum"
         data = fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5)

@@ -117,10 +135,11 @@ def train(world_endpoints, world_device_ids, local_device_ids,local_rank):
     main_program = main_programs[local_rank]
     loss = Loss(Block(main_program))
     optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[])
-    optimizer.minimize(loss, startup_program, auto_dp=True)
+    optimizer.minimize(loss, startup_program, auto_dp=True,
+                       rank_table_file=os.getenv("RANK_TABLE_FILE", None))

     exe = paddle.static.Executor(paddle.CPUPlace())
-    # exe.run(startup_program)
+    exe.run(startup_program)
     exe.run(main_program)
python/paddle/fluid/tests/unittests/hccl_tools.py  (new file, mode 100644)

# -*- coding:UTF-8 -*-
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""generate hccl config file script"""
import os
import sys
import json
import socket
from argparse import ArgumentParser
from typing import Dict, Any


def parse_args():
    """
    parse args .

    Args:

    Returns:
        args.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training launch "
                            "helper utilty that will generate hccl"
                            " config file")
    parser.add_argument(
        "--device_num",
        type=str,
        default="[0,8)",
        help="The number of the Ascend accelerators used. please note that the Ascend accelerators"
        "used must be continuous, such [0,4) means to use four chips "
        "0,1,2,3; [0,1) means to use chip 0; The first four chips are"
        "a group, and the last four chips are a group. In addition to"
        "the [0,8) chips are allowed, other cross-group such as [3,6)"
        "are prohibited.")
    parser.add_argument(
        "--visible_devices",
        type=str,
        default="0,1,2,3,4,5,6,7",
        help="will use the visible devices sequentially")
    parser.add_argument("--server_ip", type=str, default="", help="server ip")
    args = parser.parse_args()
    return args


def get_host_ip():
    """
    get host ip
    """
    ip = None

    try:
        hostname = socket.gethostname()
        ip = socket.gethostbyname(hostname)
    except EOFError:
        pass

    return ip


def main():
    print("start", __file__)
    args = parse_args()

    # visible_devices
    visible_devices = args.visible_devices.split(',')
    print('visible_devices:{}'.format(visible_devices))

    # server_id
    ip = get_host_ip()
    if args.server_ip:
        server_id = args.server_ip
    elif ip:
        server_id = ip
    else:
        raise ValueError("please input server ip!")
    print('server_id:{}'.format(server_id))

    # device_num
    first_num = int(args.device_num[1])
    last_num = int(args.device_num[3])
    if first_num < 0 or last_num > 8:
        raise ValueError("device num {} must be in range [0,8] !".format(
            args.device_num))
    if first_num > last_num:
        raise ValueError(
            "First num {} of device num {} must less than last num {} !".format(
                first_num, args.device_num, last_num))
    if first_num < 4:
        if last_num > 4:
            if first_num == 0 and last_num == 8:
                pass
            else:
                raise ValueError(
                    "device num {} must be in the same group of [0,4] or [4,8] !".format(
                        args.device_num))

    device_num_list = list(range(first_num, last_num))
    print("device_num_list:", device_num_list)

    assert len(visible_devices) >= len(device_num_list)

    # construct hccn_table
    device_ips: Dict[Any, Any] = {}
    with open('/etc/hccn.conf', 'r') as fin:
        for hccn_item in fin.readlines():
            if hccn_item.strip().startswith('address_'):
                device_id, device_ip = hccn_item.split('=')
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip.strip()

    hccn_table = {'version': '1.0', 'server_count': '1', 'server_list': []}
    device_list = []
    rank_id = 0
    for instance_id in device_num_list:
        device_id = visible_devices[instance_id]
        device_ip = device_ips[device_id]
        device = {
            'device_id': device_id,
            'device_ip': device_ip,
            'rank_id': str(rank_id)
        }
        print('rank_id:{}, device_id:{}, device_ip:{}'.format(
            rank_id, device_id, device_ip))
        rank_id += 1
        device_list.append(device)
    hccn_table['server_list'].append({
        'server_id': server_id,
        'device': device_list,
        'host_nic_ip': 'reserve'
    })
    hccn_table['status'] = 'completed'

    # save hccn_table to file
    table_path = os.getcwd()
    table_fn = os.path.join(
        table_path, 'hccl_{}p_{}_{}.json'.format(
            len(device_num_list), "".join(map(str, device_num_list)),
            server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    sys.stdout.flush()
    print("Completed: hccl file was save in :", table_fn)


if __name__ == "__main__":
    main()
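A short sketch (illustrative values only) of what hccl_tools.py derives from its --device_num argument; it also explains the hccl_4p_0123_<ip>.json name that the updated test_ascend_group.sh exports below. Note the parsing is positional, so only single-digit bounds such as "[0,4)" are handled.

device_num = "[0,4)"            # as passed by test_ascend_group.sh
server_id = "127.0.0.1"         # normally the detected host IP or --server_ip
first_num, last_num = int(device_num[1]), int(device_num[3])
device_num_list = list(range(first_num, last_num))   # [0, 1, 2, 3]
table_fn = 'hccl_{}p_{}_{}.json'.format(
    len(device_num_list), "".join(map(str, device_num_list)), server_id)
print(table_fn)                 # hccl_4p_0123_127.0.0.1.json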
python/paddle/fluid/tests/unittests/test_ascend_group.sh

@@ -16,15 +16,14 @@
 set -e

-cluster_node_ips="127.0.0.1"
-export PADDLE_TRAINERS_NUM=4
-export POD_IP=127.0.0.1
-export PADDLE_TRAINERS=127.0.0.1
-export PADDLE_TRAINER_ID=0
-export PADDLE_PORT=35789
-export TRAINER_PORTS_NUM=4
+curr_host_ip=`hostname -i`
+python hccl_tools.py --device_num "[0,4)" --server_ip ${curr_host_ip}

-distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1,2,3 --log_dir=testlog"
+export RANK_TABLE_FILE="${PWD}/hccl_4p_0123_${curr_host_ip}.json"
+
+# use ascend
+echo "begin test use ascend npu"
+
+distributed_args="--run_mode=collective --log_dir=testlog"
 python -m paddle.distributed.fleet.launch ${distributed_args} \
     ascend_group.py fleetascendgroup
\ No newline at end of file
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh

@@ -16,22 +16,43 @@
 set -e

-# use paddlecloud
-echo "begin test use paddlecloud"
-cluster_node_ips="127.0.0.1,127.0.0.2"
-export PADDLE_TRAINERS_NUM=2
-export POD_IP=127.0.0.1
-export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
-export PADDLE_TRAINER_ID=0
-export PADDLE_PORT=35789
-export TRAINER_PORTS_NUM=2
-
-distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog"
+RANK_TABLE_FILE_NAME="rank_table_file.json"
+cat > ${RANK_TABLE_FILE_NAME} <<EOF
+{
+    "status": "completed",
+    "version": "1.0",
+    "server_count": "1",
+    "server_list": [
+        {
+            "server_id": "127.0.0.1",
+            "device": [
+                {
+                    "device_id": "0",
+                    "device_ip": "192.1.184.23",
+                    "rank_id": "0"
+                },
+                {
+                    "device_id": "1",
+                    "device_ip": "192.2.21.93",
+                    "rank_id": "1"
+                }
+            ]
+        }
+    ]
+}
+EOF
+
+# set ascend rank table file env
+export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"
+
+# use ascend
+echo "begin test use ascend npu"
+
+distributed_args="--run_mode=collective --log_dir=testlog"
 python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend

-str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
+str1="selected_accelerators:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:0,1 device_id:0"
-str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
+str2="selected_accelerators:1 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:0,1 device_id:1"
 file_0="multi_process_fleetlaunchascend.check_0.log"
 file_1="multi_process_fleetlaunchascend.check_1.log"
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh  (deleted, mode 100644 → 0; content as of parent ff4654e2)

#!/bin/bash

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > ${RANK_TABLE_FILE_NAME} <<EOF
{
    "status": "completed",
    "version": "1.0",
    "server_count": "2",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        },
        {
            "server_id": "127.0.0.2",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.94.132",
                    "rank_id": "2"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.94.30",
                    "rank_id": "3"
                }
            ]
        }
    ]
}
EOF

# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2

distributed_args="--run_mode=collective --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend

str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"

echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
    echo "find trainer 0"
else
    echo "not find trainer 0"
    exit -1
fi

if grep -q "$str2" "$file_1"; then
    echo "find trainer 1"
else
    echo "not find trainer 1"
    exit -1
fi

# test async poll process
if [ -f $file_0 ]; then
    rm $file_0
fi

if [ -f $file_1 ]; then
    rm $file_1
fi