From 387c1db4f1a1e400e9f684ffd93d6a47e61b8179 Mon Sep 17 00:00:00 2001 From: xiayanming <41795079@qq.com> Date: Thu, 25 Feb 2021 20:54:05 +0800 Subject: [PATCH] Ascendrc (#31065) Ascendrc --- .../paddle/distributed/fleet/ascend_utils.py | 22 ++- python/paddle/distributed/fleet/launch.py | 10 -- .../paddle/distributed/fleet/launch_utils.py | 18 +-- .../fluid/tests/unittests/ascend_group.py | 23 ++- .../fluid/tests/unittests/hccl_tools.py | 150 ++++++++++++++++++ .../tests/unittests/test_ascend_group.sh | 15 +- .../unittests/test_fleet_launch_ascend.sh | 49 ++++-- .../unittests/test_fleet_launch_ascend2.sh | 103 ------------ 8 files changed, 228 insertions(+), 162 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hccl_tools.py delete mode 100644 python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py index c90ad6fde5a..c27ab94c30b 100644 --- a/python/paddle/distributed/fleet/ascend_utils.py +++ b/python/paddle/distributed/fleet/ascend_utils.py @@ -79,24 +79,25 @@ def _get_ascend_rankfile(rank_table_file_path): def get_cloud_cluster(rank_table_file=None, device_mode=DeviceMode.ASCEND_NPU, - devices_per_proc=None, start_port=6070): """ Args: rank_table_file: string, ascend npu rank file path device_mode: DeviceMode(Int) - devices_per_proc:list start_port: the start port of current runtime env """ if rank_table_file: # multi trainers node_ips, device_count = _get_ascend_rankfile(rank_table_file) - node_index = os.environ.get("PADDLE_TRAINER_ID") - node_ip = None - if node_index is None: - _, node_ip = get_host_name_ip() + if len(node_ips) == 1: + node_ip = node_ips[0] else: - node_ip = node_ips[int(node_index)] + node_index = os.environ.get("PADDLE_TRAINER_ID") + node_ip = None + if node_index: + node_ip = node_ips[int(node_index)] + else: + _, node_ip = get_host_name_ip() assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \ % (node_ip, node_ips) @@ -105,11 +106,8 @@ def get_cloud_cluster(rank_table_file=None, node_ips = ["127.0.0.1"] node_ip = node_ips[0] device_count = 1 - devices_per_proc = None - - if devices_per_proc is None: - devices_per_proc = [str(x) for x in range(device_count)] - + + devices_per_proc = [str(x) for x in range(device_count)] free_ports = [ x for x in range(start_port, start_port + len(devices_per_proc)) ] diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 697989dc2eb..e6026089255 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -124,15 +124,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra default="collective", help="run mode of job, can be:collective/ps/ps-heter") - base_group.add_argument( - "--ascend_npus", - type=str, - default=None, - help="It's for ascend npu training." - "For example:" - "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu." 
- ) - base_group.add_argument("--selected_gpus", dest="gpus") base_group.add_argument( @@ -233,7 +224,6 @@ def launch_collective(args): cluster, pod = ascend_utils.get_cloud_cluster( rank_table_file=os.getenv("RANK_TABLE_FILE", None), device_mode=device_mode, - devices_per_proc=devices_per_proc, start_port=start_port) else: # trainers_num = 1 or not use paddlecloud ips="a,b" diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 25d6d95291b..bbca8118839 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -476,6 +476,10 @@ def start_local_trainers(cluster, if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU: proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( [str(g) for g in t.accelerators]) + + if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.ASCEND_NPU: + proc_env["FLAGS_selected_npus"] = "%s" % ",".join( + [str(g) for g in t.accelerators]) if len(t.accelerators) > 0: proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join( @@ -578,16 +582,6 @@ def watch_local_trainers(procs, nranks): return alive -def get_ascend_npus(npus): - if npus is None: - count = fluid.core.NPUDevice.get_device_count() - if count <= 0: - return None - ret = [str(x) for x in range(count)] - else: - ret = [x.strip() for x in npus.split(',')] - return ret - def get_gpus(gpus): if gpus is None: gpus_num = fluid.core.get_cuda_device_count() @@ -650,9 +644,7 @@ def get_device_proc_info(args): else: devices_per_proc = gpus elif device_mode == DeviceMode.ASCEND_NPU: - npus = get_ascend_npus(args.ascend_npus) - assert args.nproc_per_node is None, "ascend_npus need't nproc_per_node arguments" - devices_per_proc=npus + devices_per_proc = None elif device_mode == DeviceMode.CPU: if args.nproc_per_node is None: devices_per_proc = [0] diff --git a/python/paddle/fluid/tests/unittests/ascend_group.py b/python/paddle/fluid/tests/unittests/ascend_group.py index 0bc810373c9..5b76a1ecd4b 100644 --- a/python/paddle/fluid/tests/unittests/ascend_group.py +++ b/python/paddle/fluid/tests/unittests/ascend_group.py @@ -69,6 +69,24 @@ def init_communicator(startup_program, main_program, current_endpoint, endpoints OP_ROLE_KEY: OpRole.Forward, }) + # add input op for test + fill_var_name = "tensor@Filled" + fill_var = block.create_var( + name=fill_var_name, + shape=[10, 10], + dtype='float32', + persistable=False, + stop_gradient=True) + block.append_op( + type="fill_constant", + outputs={"Out": fill_var_name}, + attrs={ + "shape": [10, 10], + "dtype": fill_var.dtype, + "value": 1.0, + "place_type": 1 + }) + with fluid.program_guard(main_program): op_type="c_allreduce_sum" data=fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5) @@ -117,10 +135,11 @@ def train(world_endpoints, world_device_ids, local_device_ids,local_rank): main_program = main_programs[local_rank] loss = Loss(Block(main_program)) optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[]) - optimizer.minimize(loss, startup_program, auto_dp=True) + optimizer.minimize(loss, startup_program, auto_dp=True, + rank_table_file=os.getenv("RANK_TABLE_FILE", None)) exe = paddle.static.Executor(paddle.CPUPlace()) - #exe.run(startup_program) + exe.run(startup_program) exe.run(main_program) diff --git a/python/paddle/fluid/tests/unittests/hccl_tools.py b/python/paddle/fluid/tests/unittests/hccl_tools.py new file mode 100644 index 00000000000..32bcd114b06 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/hccl_tools.py @@ -0,0 +1,150 @@ +# -*- coding:UTF-8 -*- +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""generate hccl config file script""" +import os +import sys +import json +import socket +from argparse import ArgumentParser +from typing import Dict, Any + + +def parse_args(): + """ + parse args . + + Args: + + Returns: + args. + + Examples: + >>> parse_args() + """ + parser = ArgumentParser(description="mindspore distributed training launch " + "helper utilty that will generate hccl" + " config file") + parser.add_argument("--device_num", type=str, default="[0,8)", + help="The number of the Ascend accelerators used. please note that the Ascend accelerators" + "used must be continuous, such [0,4) means to use four chips " + "0,1,2,3; [0,1) means to use chip 0; The first four chips are" + "a group, and the last four chips are a group. In addition to" + "the [0,8) chips are allowed, other cross-group such as [3,6)" + "are prohibited.") + parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", + help="will use the visible devices sequentially") + parser.add_argument("--server_ip", type=str, default="", + help="server ip") + args = parser.parse_args() + return args + + +def get_host_ip(): + """ + get host ip + """ + ip = None + + try: + hostname = socket.gethostname() + ip = socket.gethostbyname(hostname) + except EOFError: + pass + + return ip + + +def main(): + print("start", __file__) + args = parse_args() + + # visible_devices + visible_devices = args.visible_devices.split(',') + print('visible_devices:{}'.format(visible_devices)) + + # server_id + ip = get_host_ip() + if args.server_ip: + server_id = args.server_ip + elif ip: + server_id = ip + else: + raise ValueError("please input server ip!") + print('server_id:{}'.format(server_id)) + + # device_num + first_num = int(args.device_num[1]) + last_num = int(args.device_num[3]) + if first_num < 0 or last_num > 8: + raise ValueError("device num {} must be in range [0,8] !".format(args.device_num)) + if first_num > last_num: + raise ValueError("First num {} of device num {} must less than last num {} !".format(first_num, args.device_num, + last_num)) + if first_num < 4: + if last_num > 4: + if first_num == 0 and last_num == 8: + pass + else: + raise ValueError("device num {} must be in the same group of [0,4] or [4,8] !".format(args.device_num)) + + device_num_list = list(range(first_num, last_num)) + print("device_num_list:", device_num_list) + + assert len(visible_devices) >= len(device_num_list) + + # construct hccn_table + device_ips: Dict[Any, Any] = {} + with open('/etc/hccn.conf', 'r') as fin: + for hccn_item in fin.readlines(): + if hccn_item.strip().startswith('address_'): + device_id, device_ip = hccn_item.split('=') + device_id = device_id.split('_')[1] + device_ips[device_id] = device_ip.strip() + + hccn_table = {'version': '1.0', + 
'server_count': '1', + 'server_list': []} + device_list = [] + rank_id = 0 + for instance_id in device_num_list: + device_id = visible_devices[instance_id] + device_ip = device_ips[device_id] + device = {'device_id': device_id, + 'device_ip': device_ip, + 'rank_id': str(rank_id)} + print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip)) + rank_id += 1 + device_list.append(device) + hccn_table['server_list'].append({ + 'server_id': server_id, + 'device': device_list, + 'host_nic_ip': 'reserve' + }) + hccn_table['status'] = 'completed' + + # save hccn_table to file + table_path = os.getcwd() + table_fn = os.path.join(table_path, + 'hccl_{}p_{}_{}.json'.format(len(device_num_list), "".join(map(str, device_num_list)), + server_id)) + with open(table_fn, 'w') as table_fp: + json.dump(hccn_table, table_fp, indent=4) + sys.stdout.flush() + print("Completed: hccl file was save in :", table_fn) + + +if __name__ == "__main__": + main() diff --git a/python/paddle/fluid/tests/unittests/test_ascend_group.sh b/python/paddle/fluid/tests/unittests/test_ascend_group.sh index 5f901d59ad4..07ad3a696db 100644 --- a/python/paddle/fluid/tests/unittests/test_ascend_group.sh +++ b/python/paddle/fluid/tests/unittests/test_ascend_group.sh @@ -16,15 +16,14 @@ set -e -cluster_node_ips="127.0.0.1" -export PADDLE_TRAINERS_NUM=4 -export POD_IP=127.0.0.1 -export PADDLE_TRAINERS=127.0.0.1 -export PADDLE_TRAINER_ID=0 +curr_host_ip=`hostname -i` +python hccl_tools.py --device_num "[0,4)" --server_ip ${curr_host_ip} -export PADDLE_PORT=35789 -export TRAINER_PORTS_NUM=4 +export RANK_TABLE_FILE="${PWD}/hccl_4p_0123_${curr_host_ip}.json" -distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1,2,3 --log_dir=testlog" +# use ascend +echo "begin test use ascend npu" + +distributed_args="--run_mode=collective --log_dir=testlog" python -m paddle.distributed.fleet.launch ${distributed_args} \ ascend_group.py fleetascendgroup \ No newline at end of file diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh index 0960083abf2..7310af7d64c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -16,22 +16,43 @@ set -e -# use paddlecloud -echo "begin test use paddlecloud" -cluster_node_ips="127.0.0.1,127.0.0.2" -export PADDLE_TRAINERS_NUM=2 -export POD_IP=127.0.0.1 -export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 -export PADDLE_TRAINER_ID=0 - -export PADDLE_PORT=35789 -export TRAINER_PORTS_NUM=2 - -distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" +RANK_TABLE_FILE_NAME="rank_table_file.json" +cat > ${RANK_TABLE_FILE_NAME} < ${RANK_TABLE_FILE_NAME} <
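
Note on the generated rank table: hccl_tools.py above assembles a JSON file of the form
{"version", "server_count", "server_list": [{"server_id", "device": [...], "host_nic_ip"}], "status"}
and names it hccl_<N>p_<devices>_<server_ip>.json. The following minimal Python sketch shows how such
a file can be inspected; the file name and addresses are illustrative placeholders, and the sketch
mirrors only the JSON layout built in main() above, not Paddle's internal _get_ascend_rankfile parser.

import json

# Hypothetical output name; hccl_tools.py writes hccl_<N>p_<devices>_<server_ip>.json
rank_table_path = "hccl_4p_0123_127.0.0.1.json"

with open(rank_table_path, "r") as f:
    rank_table = json.load(f)

# Each server entry carries its host id and the NPU devices assigned to it,
# matching the structure assembled by hccl_tools.py.
for server in rank_table["server_list"]:
    device_ids = [dev["device_id"] for dev in server["device"]]
    device_ips = [dev["device_ip"] for dev in server["device"]]
    print("server:", server["server_id"])
    print("  devices:", device_ids)
    print("  device ips:", device_ips)

Exporting RANK_TABLE_FILE to point at this JSON (as test_ascend_group.sh does) lets
paddle.distributed.fleet.launch derive the node IPs and per-node device count from it.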