Commit 3e82ae7f authored by panbingao

remove old MINDSPORE_HCCL_CONFIG_PATH in model zoo

Parent 1b699234
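The change itself is mechanical across the model zoo: launch scripts and helpers that used to export the deprecated `MINDSPORE_HCCL_CONFIG_PATH` alongside `RANK_TABLE_FILE` now set only the latter. A minimal before/after sketch of the affected export (the JSON path is a placeholder):

```bash
# Before this commit: both names pointed at the rank table JSON.
export MINDSPORE_HCCL_CONFIG_PATH=/path/hccl.json   # deprecated alias, removed by this commit
export RANK_TABLE_FILE=/path/hccl.json

# After this commit: only RANK_TABLE_FILE is exported.
export RANK_TABLE_FILE=/path/hccl.json
```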
@@ -21,7 +21,7 @@ This is an example of training bert by second-order optimizer THOR. THOR is a no
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
 ``` bash
-sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE
 ```
 ## Usage
...
@@ -16,7 +16,7 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE"
 echo "for example: bash run_distribute_pretrain.sh 8 1 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
@@ -26,7 +26,6 @@ DATA_DIR=$3
 SCHEMA_DIR=$4
 ulimit -u unlimited
-export MINDSPORE_HCCL_CONFIG_PATH=$5
 export RANK_TABLE_FILE=$5
 export RANK_SIZE=$1
 export HCCL_CONNECT_TIMEOUT=300
...
@@ -60,7 +60,7 @@ Dataset used: [imagenet](http://www.image-net.org/)
 ### Usage
-- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
+- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]
 - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
 ### Launch
...
@@ -30,7 +30,6 @@ run_ascend()
 BASEPATH=$(cd "`dirname $0`" || exit; pwd)
 export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 export RANK_TABLE_FILE=$4
 if [ -d "../train" ];
 then
@@ -81,7 +80,7 @@ run_gpu()
 if [ $# -gt 6 ] || [ $# -lt 4 ]
 then
 echo "Usage:\n \
-Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
+Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]\n \
 GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
 "
 exit 1
...
@@ -141,7 +141,6 @@ def main():
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
         if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
             env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)
...
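The Python launchers touched here hand each worker its rank, device id, and the rank table path through environment variables. A hedged shell equivalent of that pattern (the 1:1 rank-to-device mapping, script name, and log paths are assumptions, not the repo's launch.py):

```bash
# Hedged sketch of the per-worker environment the launcher builds; not the actual launch.py.
export RANK_TABLE_FILE=/path/hccl.json   # replaces the removed MINDSPORE_HCCL_CONFIG_PATH
export RANK_SIZE=8
for ((rank_id = 0; rank_id < RANK_SIZE; rank_id++))
do
    RANK_ID=$rank_id DEVICE_ID=$rank_id python train.py > log_rank$rank_id.txt 2>&1 &
done
wait
```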
@@ -138,7 +138,7 @@ def main():
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
         if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
+            env['RANK_TABLE_FILE'] = table_fn
             env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)
...
@@ -25,7 +25,7 @@ This is an example of training DeepLabV3 with PASCAL VOC 2012 dataset in MindSpo
 ```
 - Run `run_distribute_train.sh` for distributed training.
 ``` bash
-sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+sh scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 ```
 ### Evaluation
 Set options in evaluation_config.py. Make sure the 'data_file' and 'finetune_ckpt' are set to your own path.
...
@@ -16,14 +16,13 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH"
-echo "for example: bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH [PRETRAINED_CKPT_PATH](option)"
+echo "bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH"
+echo "for example: bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH [PRETRAINED_CKPT_PATH](option)"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
 DATA_DIR=$2
-export MINDSPORE_HCCL_CONFIG_PATH=$1
 export RANK_TABLE_FILE=$1
 export RANK_SIZE=8
 PATH_CHECKPOINT=""
...
@@ -87,13 +87,13 @@ FasterRcnn is a two-stage target detection network,This network uses a region pr
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]
 ```
-> Rank_table.json which is specified by MINDSPORE_HCCL_CONFIG_PATH is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+> Rank_table.json which is specified by RANK_TABLE_FILE is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
 > As for PRETRAINED_MODEL,if not set, the model will be trained from the very beginning.Ready-made pretrained_models are not available now. Stay tuned.
 #### Result
...
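Putting the rank-table note above into practice, an end-to-end sketch for an 8-device job might look as follows; the hccl_tools flags and the generated file name vary between MindSpore versions, so treat every path and option here as a placeholder:

```bash
# Hedged sketch: generate a rank table with hccl_tools, then launch distributed training.
# The --device_num flag and the output file name are assumptions; check the hccl_tools README.
python hccl_tools.py --device_num "[0,8)"
export RANK_TABLE_FILE=/path/hccl.json
sh run_distribute_train.sh $RANK_TABLE_FILE /path/pretrained.ckpt
```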
@@ -16,7 +16,7 @@
 if [ $# -lt 1 ] || [ $# -gt 2 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
 exit 1
 fi
@@ -33,7 +33,7 @@ echo $PATH1
 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -51,7 +51,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 for((i=0; i<${DEVICE_NUM}; i++))
...
@@ -16,22 +16,22 @@
 if [ $# != 1 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE]"
 exit 1
 fi
 if [ ! -f $1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-MINDSPORE_HCCL_CONFIG_PATH=$(realpath $1)
-export MINDSPORE_HCCL_CONFIG_PATH
-echo "MINDSPORE_HCCL_CONFIG_PATH=${MINDSPORE_HCCL_CONFIG_PATH}"
+RANK_TABLE_FILE=$(realpath $1)
+export RANK_TABLE_FILE
+echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
 export SERVER_ID=0
 rank_start=$((DEVICE_NUM * SERVER_ID))
...
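The `SERVER_ID` and `rank_start` lines in the hunk above keep global ranks unique when the same script is launched on several hosts; the loop that follows them (not shown in this hunk) typically combines the server offset with the local device index, roughly as in this hedged sketch (directory layout and train.py flags are assumptions):

```bash
# Hedged sketch of how rank_start is typically consumed; not the exact script body.
export DEVICE_NUM=8
export SERVER_ID=0
rank_start=$((DEVICE_NUM * SERVER_ID))
for ((i = 0; i < DEVICE_NUM; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$((rank_start + i))   # global rank = server offset + local device index
    mkdir -p ./train_parallel$i
    cd ./train_parallel$i || exit
    python ../train.py > log 2>&1 &
    cd ..
done
```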
@@ -88,7 +88,7 @@ MaskRcnn is a two-stage target detection network,This network uses a region prop
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]
...
@@ -16,7 +16,7 @@
 if [ $# != 2 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
 exit 1
 fi
@@ -35,7 +35,7 @@ echo $PATH2
 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -48,7 +48,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 echo 3 > /proc/sys/vm/drop_caches
...
@@ -132,7 +132,7 @@ Parameters for both training and evaluation can be set in config.py.
 ```
 # distributed training
-Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH]
 [PRETRAINED_CKPT_PATH](optional)
 # standalone training
...
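For concreteness, a hypothetical 8-device ResNet-50 run matching the new usage string (all paths are placeholders):

```bash
# Hypothetical invocation; the rank table and dataset paths are placeholders.
sh run_distribute_train.sh resnet50 imagenet2012 /path/hccl.json /path/imagenet2012/train
```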
@@ -16,7 +16,7 @@
 if [ $# != 4 ] && [ $# != 5 ]
 then
-echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
 exit 1
 fi
@@ -57,7 +57,7 @@ fi
 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -76,7 +76,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 for((i=0; i<${DEVICE_NUM}; i++))
...
@@ -82,7 +82,7 @@ Parameters for both training and evaluating can be set in config.py
 ```
 # distribute training example(8p)
-sh run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+sh run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 # standalone training
 sh run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
@@ -91,7 +91,7 @@ sh run_standalone_train.sh DEVICE_ID DATA_PATH
 ```bash
 # distributed training example(8p) for Ascend
-sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH /dataset/train
+sh scripts/run_distribute_train.sh RANK_TABLE_FILE /dataset/train
 # standalone training example for Ascend
 sh scripts/run_standalone_train.sh 0 /dataset/train
...
@@ -16,7 +16,7 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
 echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
 echo "It is better to use absolute path."
 echo "================================================================================================================="
@@ -24,7 +24,7 @@ echo "==========================================================================
 if [ $# != 5 ] && [ $# != 7 ]
 then
 echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
-[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+[RANK_TABLE_FILE] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
 exit 1
 fi
@@ -41,7 +41,7 @@ LR=$3
 DATASET=$4
 PRE_TRAINED=$6
 PRE_TRAINED_EPOCH_SIZE=$7
-export MINDSPORE_HCCL_CONFIG_PATH=$5
+export RANK_TABLE_FILE=$5
 for((i=0;i<RANK_SIZE;i++))
 do
...
@@ -101,9 +101,9 @@ parameters/options:
 ### Distribute Training
 ```
-Usage: sh script/run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]
+Usage: sh script/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]
 parameters/options:
-MINDSPORE_HCCL_CONFIG_PATH HCCL configuration file path.
+RANK_TABLE_FILE HCCL configuration file path.
 DATA_PATH the storage path of dataset.
 ```
@@ -16,13 +16,13 @@
 if [ $# != 2 ]
 then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]"
 exit 1
 fi
 if [ ! -f $1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
@@ -34,7 +34,7 @@ fi
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export RANK_TABLE_FILE=$1
 for((i=0;i<RANK_SIZE;i++))
 do
...
@@ -77,7 +77,7 @@ Parameters for both training and evaluation can be set in config.py.
 ```
 # distributed training in Ascend
-Usage: bash run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]
 # distributed training in GPU
 Usage: bash run_distribute_train_for_gpu.sh [RANK_SIZE] [DATASET_PATH]
...
@@ -15,7 +15,7 @@
 # ============================================================================
 if [ $# != 2 ]; then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]"
 exit 1
 fi
@@ -31,7 +31,7 @@ PATH1=$(get_real_path $1)
 PATH2=$(get_real_path $2)
 if [ ! -f $PATH1 ]; then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -43,7 +43,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 for ((i = 0; i < ${DEVICE_NUM}; i++)); do
...
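Several of these launch scripts resolve their arguments through a small `get_real_path` helper, referenced in the hunk above and defined in the file shown further below. A typical definition looks like the sketch here; individual scripts may differ slightly:

```bash
# Sketch of the get_real_path helper used by these scripts to absolutize arguments.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}
```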
@@ -55,7 +55,7 @@ This is an example of training YOLOV3-DarkNet53 with COCO2014 dataset in MindSpo
 ```
 # distributed training
-sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]
+sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]
 # standalone training
 sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE]
...
@@ -16,7 +16,7 @@
 if [ $# != 3 ]
 then
-echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]"
+echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]"
 exit 1
 fi
@@ -30,10 +30,10 @@ get_real_path(){
 DATASET_PATH=$(get_real_path $1)
 PRETRAINED_BACKBONE=$(get_real_path $2)
-MINDSPORE_HCCL_CONFIG_PATH=$(get_real_path $3)
+RANK_TABLE_FILE=$(get_real_path $3)
 echo $DATASET_PATH
 echo $PRETRAINED_BACKBONE
-echo $MINDSPORE_HCCL_CONFIG_PATH
+echo $RANK_TABLE_FILE
 if [ ! -d $DATASET_PATH ]
 then
@@ -47,15 +47,15 @@ then
 exit 1
 fi
-if [ ! -f $MINDSPORE_HCCL_CONFIG_PATH ]
+if [ ! -f $RANK_TABLE_FILE ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH is not a file"
+echo "error: RANK_TABLE_FILE=$RANK_TABLE_FILE is not a file"
 exit 1
 fi
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH
+export RANK_TABLE_FILE=$RANK_TABLE_FILE
 for((i=0; i<${DEVICE_NUM}; i++))
 do
...
@@ -16,7 +16,7 @@
 echo "======================================================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
 echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
@@ -24,7 +24,7 @@ echo "==========================================================================
 if [ $# != 6 ] && [ $# != 8 ]
 then
-echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
+echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [RANK_TABLE_FILE] \
 [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
 exit 1
 fi
@@ -42,7 +42,7 @@ python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image
 echo "After running the scipt, the network runs in the background. The log will be generated in LOGx/log.txt"
-export MINDSPORE_HCCL_CONFIG_PATH=$6
+export RANK_TABLE_FILE=$6
 export RANK_SIZE=$1
 BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
...
@@ -21,7 +21,7 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
 ``` bash
-sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH
 ```
 ### Fine-Tuning and Evaluation
...
@@ -16,9 +16,11 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
+echo "bash run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json"
 echo "It is better to use absolute path."
+echo "For hyper parameter, please note that you should customize the scripts:
+'{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
 echo "=============================================================================================================="
 EPOCH_SIZE=$2
...
@@ -44,7 +44,6 @@ set_hccl_json()
 do
 if [[ "$1" == "-j" || "$1" == "--hccl_json" ]]
 then
-export MINDSPORE_HCCL_CONFIG_PATH=$2
 export RANK_TABLE_FILE=$2
 break
 fi
...
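The `-j/--hccl_json` check above sits inside an argument-scanning loop; reconstructed as a hedged sketch below, where the surrounding while/shift structure is assumed and only the branch shown in the hunk comes from the source:

```bash
# Hedged reconstruction of set_hccl_json; only the -j/--hccl_json branch is from the diff.
set_hccl_json()
{
    while [ -n "$1" ]
    do
        if [[ "$1" == "-j" || "$1" == "--hccl_json" ]]
        then
            export RANK_TABLE_FILE=$2
            break
        fi
        shift
    done
}
```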
@@ -74,7 +74,7 @@ This example implements training and evaluation of Transformer Model, which is i
 - Run `run_distribute_train.sh` for distributed training of Transformer model.
 ``` bash
-sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE
 ```
 ### Evaluation
...
@@ -16,7 +16,7 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH"
+echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
 echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
@@ -28,7 +28,6 @@ cd run_distribute_train || exit
 EPOCH_SIZE=$2
 DATA_PATH=$3
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 export RANK_TABLE_FILE=$4
 export RANK_SIZE=$1
 export HCCL_FLAG=1
...
@@ -21,7 +21,7 @@ echo "After running the script, the network runs in the background, The log will
 export RANK_SIZE=$1
 DATA_URL=$2
-export MINDSPORE_HCCL_CONFIG_PATH=$3
+export RANK_TABLE_FILE=$3
 for ((i=0; i<RANK_SIZE;i++))
 do
...
@@ -22,7 +22,6 @@ export RANK_SIZE=$1
 export EPOCH_SIZE=$2
 export DATASET=$3
 export RANK_TABLE_FILE=$4
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 for((i=0;i<$RANK_SIZE;i++));
 do
...
@@ -22,7 +22,6 @@ export RANK_SIZE=$1
 export EPOCH_SIZE=$2
 export DATASET=$3
 export RANK_TABLE_FILE=$4
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 for((i=0;i<$RANK_SIZE;i++));
 do
...
@@ -21,7 +21,6 @@ export RANK_SIZE=$1
 export EPOCH_SIZE=$2
 export DATASET=$3
 export RANK_TABLE_FILE=$4
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 export MS_COMM_TYPE=zmq
 export MS_SCHED_NUM=1
...
@@ -29,7 +29,7 @@ EPOCH_SIZE=$2
 VOCAB_SIZE=$3
 EMB_DIM=$4
 DATASET=$5
-MINDSPORE_HCCL_CONFIG_PATH=$6
+RANK_TABLE_FILE=$6
 ENV_SH=$7
 MODE=$8
@@ -39,7 +39,7 @@ do
 passwd=$(get_node_passwd ${cluster_config_path} ${node})
 echo "------------------${user}@${node}---------------------"
 if [ $MODE == "host_device_mix" ]; then
-ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${MINDSPORE_HCCL_CONFIG_PATH}"
+ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${RANK_TABLE_FILE}"
 else
 echo "[ERROR] mode is wrong"
 exit 1
...
@@ -140,7 +140,7 @@ def main():
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
         if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
+            env['RANK_TABLE_FILE'] = table_fn
             env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)
...
@@ -74,7 +74,7 @@ Parameters for both training and inference can be set in config.py.
 ```
 # distributed training
-Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]
+Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]
 ```
...
@@ -16,13 +16,13 @@
 if [ $# != 3 ]
 then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]"
 exit 1
 fi
 if [ ! -f $1 ]
 then
-echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: DRANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
@@ -38,7 +38,7 @@ cd $BASE_PATH/../ || exit
 ulimit -u unlimited
 export DEVICE_NUM=$3
 export RANK_SIZE=$3
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export RANK_TABLE_FILE=$1
 for((i=0; i<${DEVICE_NUM}; i++))
 do
...