Unverified commit a6edbc47, authored by xiayanming, committed by GitHub

support parsing ascend rank table file (#31000)

support parsing ascend rank table file
Parent 1201cd2e
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import paddle
from paddle.distributed.fleet.launch_utils import get_cluster, logger, get_host_name_ip, DeviceMode

def _get_ascend_rankfile(rank_table_file_path):
"""
Args:
rank_table_file_path: path to the Ascend NPU rank table file (JSON), for example:
{
"status": "completed",
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "192.168.24.217",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
},
{
"server_id": "192.168.26.177",
"device": [
{
"device_id": "0",
"device_ip": "192.1.94.132",
"rank_id": "2"
},
{
"device_id": "1",
"device_ip": "192.2.94.30",
"rank_id": "3"
}
]
}
]
}
Returns:
node_ips: list of server IPs parsed from server_list
device_count: number of NPUs per machine
"""
json_data = None
with open(rank_table_file_path) as json_file:
json_data = json.load(json_file)
node_ips = []
device_count = 0
server_list = json_data['server_list']
for server in server_list:
node_ips.append(server['server_id'])
device_list = server['device']
device_count = len(device_list)
return node_ips, device_count

def get_cloud_cluster(rank_table_file=None,
device_mode=DeviceMode.ASCEND_NPU,
devices_per_proc=None,
start_port=6070):
"""
Args:
rank_table_file: string, path to the Ascend NPU rank table file
device_mode: DeviceMode (int)
devices_per_proc: list, devices assigned to each local process
start_port: the starting port of the current runtime environment
"""
if rank_table_file:
# multi trainers
node_ips, device_count = _get_ascend_rankfile(rank_table_file)
node_index = os.environ.get("PADDLE_TRAINER_ID")
node_ip = None
if node_index is None:
_, node_ip = get_host_name_ip()
else:
node_ip = node_ips[int(node_index)]
assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \
% (node_ip, node_ips)
else:
# single trainer (single ascend card)
node_ips = ["127.0.0.1"]
node_ip = node_ips[0]
device_count = 1
devices_per_proc = None
if devices_per_proc is None:
devices_per_proc = [str(x) for x in range(device_count)]
free_ports = [
x for x in range(start_port, start_port + len(devices_per_proc))
]
trainer_endpoints = []
for ip in node_ips:
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
return get_cluster(node_ips, node_ip, trainer_endpoints,
device_mode, devices_per_proc)
\ No newline at end of file
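To make the parsing contract above concrete, here is a minimal usage sketch. It is an illustration, not part of the patch: it assumes a Paddle build that contains this commit (so that paddle.distributed.fleet.ascend_utils is importable) and calls the internal helper _get_ascend_rankfile directly.

import json
import tempfile

from paddle.distributed.fleet.ascend_utils import _get_ascend_rankfile

# Two servers with two NPUs each, matching the sample rank table in the docstring.
rank_table = {
    "status": "completed",
    "version": "1.0",
    "server_count": "2",
    "server_list": [
        {"server_id": "192.168.24.217",
         "device": [{"device_id": "0", "device_ip": "192.1.184.23", "rank_id": "0"},
                    {"device_id": "1", "device_ip": "192.2.21.93", "rank_id": "1"}]},
        {"server_id": "192.168.26.177",
         "device": [{"device_id": "0", "device_ip": "192.1.94.132", "rank_id": "2"},
                    {"device_id": "1", "device_ip": "192.2.94.30", "rank_id": "3"}]},
    ],
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(rank_table, f)

node_ips, device_count = _get_ascend_rankfile(f.name)
assert node_ips == ["192.168.24.217", "192.168.26.177"]
assert device_count == 2  # NPUs per machine, i.e. the length of each server's device list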
@@ -73,6 +73,7 @@ from paddle.distributed.fleet import launch_utils
# TODO(danleifeng): Don't import * from a module
from paddle.distributed.fleet.launch_utils import *
import paddle.distributed.fleet.cloud_utils as cloud_utils
import paddle.distributed.fleet.ascend_utils as ascend_utils

def _print_arguments(args):
@@ -129,7 +130,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
default=None,
help="It's for ascend npu training."
"For example:"
"--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one gpu."
"--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu."
)
base_group.add_argument("--selected_gpus", dest="gpus")
@@ -227,6 +228,13 @@ def launch_collective(args):
cluster, pod = cloud_utils.get_cloud_cluster(
args.ips, device_mode, devices_per_proc, start_port)
logger.debug("get cluster from cloud:{}".format(cluster))
elif device_mode == DeviceMode.ASCEND_NPU:
# for ascend
cluster, pod = ascend_utils.get_cloud_cluster(
rank_table_file=os.getenv("RANK_TABLE_FILE", None),
device_mode=device_mode,
devices_per_proc=devices_per_proc,
start_port=start_port)
else:
# trainers_num = 1 or not use paddlecloud ips="a,b"
cluster, pod = get_cluster_from_args(args, device_mode,
......
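The new ASCEND_NPU branch above takes the rank table path from the RANK_TABLE_FILE environment variable. The sketch below is a hypothetical standalone driver, not part of the patch; it shows the call the launcher reduces to. Note that get_cloud_cluster asserts that the local IP appears in the rank table unless PADDLE_TRAINER_ID is set.

import os

import paddle.distributed.fleet.ascend_utils as ascend_utils
from paddle.distributed.fleet.launch_utils import DeviceMode

# Placeholder path: point this at a real rank table, such as the one the
# test script further below generates.
os.environ["RANK_TABLE_FILE"] = "/path/to/rank_table_file.json"

# One process per selected NPU; endpoints are allocated from start_port upwards.
cluster, pod = ascend_utils.get_cloud_cluster(
    rank_table_file=os.getenv("RANK_TABLE_FILE", None),
    device_mode=DeviceMode.ASCEND_NPU,
    devices_per_proc=["0", "1"],
    start_port=6070)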
@@ -459,7 +459,7 @@ def start_local_trainers(cluster,
current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None)
ids=cluster.world_device_ids()
ids = cluster.world_device_ids()
res = [':'.join(ele) for ele in ids]
procs = []
for idx, t in enumerate(pod.trainers):
@@ -582,8 +582,8 @@ def get_ascend_npus(npus):
if npus is None:
count = fluid.core.NPUDevice.get_device_count()
if count <= 0:
return ret
ret = [x for x in range(count)]
return None
ret = [str(x) for x in range(count)]
else:
ret = [x.strip() for x in npus.split(',')]
return ret
......
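The last hunk fixes get_ascend_npus so that auto-detection returns string device ids, and a machine with no NPUs yields None instead of an unbound variable. Below is a small sketch of the resulting parsing behaviour; _parse_npus is a hypothetical helper written only for illustration, and the real function obtains the detected count from fluid.core.NPUDevice.get_device_count().

def _parse_npus(npus, detected_count):
    # No --ascend_npus given: fall back to every detected device,
    # returning string ids, or None when no NPU is present.
    if npus is None:
        if detected_count <= 0:
            return None
        return [str(x) for x in range(detected_count)]
    # Explicit --ascend_npus="0,1,2,3": split on commas and strip whitespace.
    return [x.strip() for x in npus.split(',')]

assert _parse_npus("0,1,2,3", 4) == ["0", "1", "2", "3"]
assert _parse_npus(None, 2) == ["0", "1"]
assert _parse_npus(None, 0) is None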
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > ${RANK_TABLE_FILE_NAME} <<EOF
{
"status": "completed",
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "127.0.0.1",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
},
{
"server_id": "127.0.0.2",
"device": [
{
"device_id": "0",
"device_ip": "192.1.94.132",
"rank_id": "2"
},
{
"device_id": "1",
"device_ip": "192.2.94.30",
"rank_id": "3"
}
]
}
]
}
EOF
# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"
# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2
distributed_args="--run_mode=collective --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend
str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
echo "find trainer 0"
else
echo "not find trainer 0"
exit -1
fi
if grep -q "$str2" "$file_1"; then
echo "find trainer 1"
else
echo "not find trainer 1"
exit -1
fi
# test async poll process
if [ -f $file_0 ]; then
rm $file_0
fi
if [ -f $file_1 ]; then
rm $file_1
fi