Commit 549d5611 authored by mindspore-ci-bot, committed by Gitee

!2944 add hccl config

Merge pull request !2944 from wandongdong/master
@@ -60,14 +60,14 @@ Dataset used: [imagenet](http://www.image-net.org/)
 ### Usage
-- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]
+- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
 - GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
 ### Launch
 ```
 # training example
-Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/ mobilenet_199.ckpt
+Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ mobilenet_199.ckpt
 GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/
 ```
...
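With this change the HCCL config (rank table) file is supplied by the user instead of being generated by launch.py. As a minimal sketch only, assuming the same single-server table layout that the removed launch.py code further below used to emit (the server IP, device count, and output file name here are placeholders), the file could be produced like this:

```python
# Sketch: hand-build a single-server rank table (the file passed as
# [MINDSPORE_HCCL_CONFIG_PATH]) with the same fields that launch.py used to
# generate automatically. Server IP, device count and output name are
# placeholders for this example.
import json

server_ip = '192.168.0.1'        # assumed address of this training server
nproc = 2                        # assumed number of devices to use
visible_devices = ['0', '1']     # assumed device ids, in rank order

# Read device NIC IPs from /etc/hccn.conf (lines of the form address_<id>=<ip>),
# as the removed launch.py code did.
device_ips = {}
with open('/etc/hccn.conf', 'r') as f:
    for line in f:
        line = line.strip()
        if line.startswith('address_'):
            key, ip = line.split('=')
            device_ips[key.split('_')[1]] = ip

instance_list = []
for rank_id in range(nproc):
    dev_id = visible_devices[rank_id]
    instance_list.append({
        'devices': [{'device_id': dev_id, 'device_ip': device_ips[dev_id]}],
        'rank_id': str(rank_id),
        'server_id': server_ip,
    })

hccn_table = {
    'board_id': '0x0000', 'chip_info': '910', 'deploy_mode': 'lab',
    'group_count': '1',
    'group_list': [{'device_num': str(nproc), 'server_num': '1', 'group_name': '',
                    'instance_count': str(nproc), 'instance_list': instance_list}],
    'para_plane_nic_location': 'device',
    'para_plane_nic_name': ['eth{}'.format(visible_devices[i]) for i in range(nproc)],
    'para_plane_nic_num': str(nproc),
    'status': 'completed',
}

with open('hccl_config.json', 'w') as fp:   # pass this path to run_train.sh
    json.dump(hccn_table, fp, indent=4)
```

The resulting path is what run_train.sh now receives as its fourth argument and exports as MINDSPORE_HCCL_CONFIG_PATH and RANK_TABLE_FILE (see the run_train.sh hunks that follow).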
@@ -22,14 +22,16 @@ run_ascend()
         exit 1
     fi
-    if [ ! -d $5 ]
+    if [ ! -d $5 ] && [ ! -f $5 ]
     then
-        echo "error: DATASET_PATH=$5 is not a directory"
+        echo "error: DATASET_PATH=$5 is not a directory or file"
         exit 1
     fi
     BASEPATH=$(cd "`dirname $0`" || exit; pwd)
     export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+    export MINDSPORE_HCCL_CONFIG_PATH=$4
+    export RANK_TABLE_FILE=$4
     if [ -d "../train" ];
     then
         rm -rf ../train
@@ -38,8 +40,7 @@ run_ascend()
     cd ../train || exit
     python ${BASEPATH}/../src/launch.py \
         --nproc_per_node=$2 \
-        --visible_devices=$4 \
-        --server_id=$3 \
+        --visible_devices=$3 \
         --training_script=${BASEPATH}/../train.py \
         --dataset_path=$5 \
         --pre_trained=$6 \
@@ -80,7 +81,7 @@ run_gpu()
 if [ $# -gt 6 ] || [ $# -lt 4 ]
 then
     echo "Usage:\n \
-          Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
+          Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
           GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
          "
     exit 1
...
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
"""launch train script""" """launch train script"""
import os import os
import sys import sys
import json
import subprocess import subprocess
import shutil import shutil
from argparse import ArgumentParser from argparse import ArgumentParser
@@ -42,8 +41,6 @@ def parse_args():
                              "each process can be bound to a single D.")
     parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                         help="will use the visible devices sequentially")
-    parser.add_argument("--server_id", type=str, default="",
-                        help="server ip")
     parser.add_argument("--training_script", type=str,
                         help="The full path to the single D training "
                              "program/script to be launched in parallel, "
@@ -63,66 +60,6 @@ def main():
     assert os.path.isfile(args.training_script)
     assert len(visible_devices) >= args.nproc_per_node
     print('visible_devices:{}'.format(visible_devices))
-    if not args.server_id:
-        print('pleaser input server ip!!!')
-        exit(0)
-    print('server_id:{}'.format(args.server_id))
-
-    # construct hccn_table
-    hccn_configs = open('/etc/hccn.conf', 'r').readlines()
-    device_ips = {}
-    for hccn_item in hccn_configs:
-        hccn_item = hccn_item.strip()
-        if hccn_item.startswith('address_'):
-            device_id, device_ip = hccn_item.split('=')
-            device_id = device_id.split('_')[1]
-            device_ips[device_id] = device_ip
-            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
-    hccn_table = {}
-    hccn_table['board_id'] = '0x0000'
-    hccn_table['chip_info'] = '910'
-    hccn_table['deploy_mode'] = 'lab'
-    hccn_table['group_count'] = '1'
-    hccn_table['group_list'] = []
-    instance_list = []
-    usable_dev = ''
-    for instance_id in range(args.nproc_per_node):
-        instance = {}
-        instance['devices'] = []
-        device_id = visible_devices[instance_id]
-        device_ip = device_ips[device_id]
-        usable_dev += str(device_id)
-        instance['devices'].append({
-            'device_id': device_id,
-            'device_ip': device_ip,
-        })
-        instance['rank_id'] = str(instance_id)
-        instance['server_id'] = args.server_id
-        instance_list.append(instance)
-    hccn_table['group_list'].append({
-        'device_num': str(args.nproc_per_node),
-        'server_num': '1',
-        'group_name': '',
-        'instance_count': str(args.nproc_per_node),
-        'instance_list': instance_list,
-    })
-    hccn_table['para_plane_nic_location'] = 'device'
-    hccn_table['para_plane_nic_name'] = []
-    for instance_id in range(args.nproc_per_node):
-        eth_id = visible_devices[instance_id]
-        hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
-    hccn_table['para_plane_nic_num'] = str(args.nproc_per_node)
-    hccn_table['status'] = 'completed'
-
-    # save hccn_table to file
-    table_path = os.getcwd()
-    if not os.path.exists(table_path):
-        os.mkdir(table_path)
-    table_fn = os.path.join(table_path,
-                            'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id))
-    with open(table_fn, 'w') as table_fp:
-        json.dump(hccn_table, table_fp, indent=4)
-    sys.stdout.flush()
 
     # spawn the processes
     processes = []
@@ -137,9 +74,6 @@ def main():
         device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
-        if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
-            env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)
         os.mkdir(device_dir)
...
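The loop that actually spawns the per-device processes is collapsed in this diff. As a rough, hypothetical sketch of that surrounding logic (the helper name and arguments below are assumptions, not the repository's exact code), the effect of the change is that each child process now inherits MINDSPORE_HCCL_CONFIG_PATH and RANK_TABLE_FILE from run_train.sh rather than having launch.py generate and inject the table itself:

```python
# Illustrative sketch only: per-rank environment set-up and process spawn,
# approximating the collapsed section of launch.py. `spawn` and its
# parameters are hypothetical.
import os
import shutil
import subprocess

def spawn(nproc_per_node, visible_devices, training_script, script_args):
    cur_path = os.getcwd()
    processes = []
    for rank_id in range(nproc_per_node):
        device_id = visible_devices[rank_id]
        device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
        env = os.environ.copy()   # MINDSPORE_HCCL_CONFIG_PATH / RANK_TABLE_FILE come from run_train.sh
        env['RANK_ID'] = str(rank_id)
        env['DEVICE_ID'] = str(device_id)
        if os.path.exists(device_dir):
            shutil.rmtree(device_dir)
        os.mkdir(device_dir)
        cmd = ['python', training_script] + list(script_args)
        processes.append(subprocess.Popen(cmd, env=env, cwd=device_dir))
    for proc in processes:
        proc.wait()
```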