提交 3e82ae7f 编写于 作者: P panbingao

remove old MINDSPORE_HCCL_CONFIG_PATH in model zoo

上级 1b699234
......@@ -21,7 +21,7 @@ This is an example of training bert by second-order optimizer THOR. THOR is a no
- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
``` bash
sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE
```
## Usage
......
......@@ -16,7 +16,7 @@
echo "=============================================================================================================="
echo "Please run the scipt as: "
echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE"
echo "for example: bash run_distribute_pretrain.sh 8 1 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
echo "It is better to use absolute path."
echo "=============================================================================================================="
......@@ -26,7 +26,6 @@ DATA_DIR=$3
SCHEMA_DIR=$4
ulimit -u unlimited
export MINDSPORE_HCCL_CONFIG_PATH=$5
export RANK_TABLE_FILE=$5
export RANK_SIZE=$1
export HCCL_CONNECT_TIMEOUT=300
......
......@@ -60,7 +60,7 @@ Dataset used: [imagenet](http://www.image-net.org/)
### Usage
- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]
- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
### Launch
......
......@@ -30,7 +30,6 @@ run_ascend()
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
export MINDSPORE_HCCL_CONFIG_PATH=$4
export RANK_TABLE_FILE=$4
if [ -d "../train" ];
then
......@@ -81,7 +80,7 @@ run_gpu()
if [ $# -gt 6 ] || [ $# -lt 4 ]
then
echo "Usage:\n \
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]\n \
GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
"
exit 1
......
......@@ -141,7 +141,6 @@ def main():
env['RANK_ID'] = str(rank_id)
env['DEVICE_ID'] = str(device_id)
if args.nproc_per_node > 1:
env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
env['RANK_TABLE_FILE'] = table_fn
if os.path.exists(device_dir):
shutil.rmtree(device_dir)
......
......@@ -138,7 +138,7 @@ def main():
env['RANK_ID'] = str(rank_id)
env['DEVICE_ID'] = str(device_id)
if args.nproc_per_node > 1:
env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
# Store the rank table path (table_fn) under RANK_TABLE_FILE in env; one assignment is enough.
env['RANK_TABLE_FILE'] = table_fn
if os.path.exists(device_dir):
shutil.rmtree(device_dir)
......
......@@ -25,7 +25,7 @@ This is an example of training DeepLabV3 with PASCAL VOC 2012 dataset in MindSpo
```
- Run `run_distribute_train.sh` for distributed training.
``` bash
sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
sh scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
```
### Evaluation
Set options in evaluation_config.py. Make sure the 'data_file' and 'finetune_ckpt' are set to your own path.
......
......@@ -16,14 +16,13 @@
echo "=============================================================================================================="
echo "Please run the scipt as: "
echo "bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH"
echo "for example: bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH [PRETRAINED_CKPT_PATH](option)"
echo "bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH"
echo "for example: bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH [PRETRAINED_CKPT_PATH](option)"
echo "It is better to use absolute path."
echo "=============================================================================================================="
DATA_DIR=$2
export MINDSPORE_HCCL_CONFIG_PATH=$1
export RANK_TABLE_FILE=$1
export RANK_SIZE=8
PATH_CHECKPOINT=""
......
......@@ -87,13 +87,13 @@ FasterRcnn is a two-stage target detection network,This network uses a region pr
```
# distributed training
sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
# standalone training
sh run_standalone_train.sh [PRETRAINED_MODEL]
```
> Rank_table.json which is specified by MINDSPORE_HCCL_CONFIG_PATH is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
> Rank_table.json, which is specified by RANK_TABLE_FILE, is needed when you are running a distributed task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
> As for PRETRAINED_MODEL, if it is not set, the model will be trained from the very beginning. Ready-made pretrained models are not available now. Stay tuned.
#### Result
......
......@@ -16,7 +16,7 @@
if [ $# -lt 1 ] || [ $# -gt 2 ]
then
echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
exit 1
fi
......@@ -33,7 +33,7 @@ echo $PATH1
if [ ! -f $PATH1 ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
exit 1
fi
......@@ -51,7 +51,6 @@ fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
export RANK_TABLE_FILE=$PATH1
for((i=0; i<${DEVICE_NUM}; i++))
......
......@@ -16,22 +16,22 @@
if [ $# != 1 ]
then
echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH]"
echo "Usage: sh run_train.sh [RANK_TABLE_FILE]"
exit 1
fi
if [ ! -f $1 ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
echo "error: RANK_TABLE_FILE=$1 is not a file"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
MINDSPORE_HCCL_CONFIG_PATH=$(realpath $1)
export MINDSPORE_HCCL_CONFIG_PATH
echo "MINDSPORE_HCCL_CONFIG_PATH=${MINDSPORE_HCCL_CONFIG_PATH}"
RANK_TABLE_FILE=$(realpath $1)
export RANK_TABLE_FILE
echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
export SERVER_ID=0
rank_start=$((DEVICE_NUM * SERVER_ID))
......
......@@ -88,7 +88,7 @@ MaskRcnn is a two-stage target detection network,This network uses a region prop
```
# distributed training
sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
# standalone training
sh run_standalone_train.sh [PRETRAINED_MODEL]
......
......@@ -16,7 +16,7 @@
if [ $# != 2 ]
then
echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
exit 1
fi
......@@ -35,7 +35,7 @@ echo $PATH2
if [ ! -f $PATH1 ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
exit 1
fi
......@@ -48,7 +48,6 @@ fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
export RANK_TABLE_FILE=$PATH1
echo 3 > /proc/sys/vm/drop_caches
......
......@@ -132,7 +132,7 @@ Parameters for both training and evaluation can be set in config.py.
```
# distributed training
Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH]
[PRETRAINED_CKPT_PATH](optional)
# standalone training
......
......@@ -16,7 +16,7 @@
if [ $# != 4 ] && [ $# != 5 ]
then
echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
......@@ -57,7 +57,7 @@ fi
if [ ! -f $PATH1 ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
exit 1
fi
......@@ -76,7 +76,6 @@ fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
export RANK_TABLE_FILE=$PATH1
for((i=0; i<${DEVICE_NUM}; i++))
......
......@@ -82,7 +82,7 @@ Parameters for both training and evaluating can be set in config.py
```
# distribute training example(8p)
sh run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
sh run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
# standalone training
sh run_standalone_train.sh DEVICE_ID DATA_PATH
```
......@@ -91,7 +91,7 @@ sh run_standalone_train.sh DEVICE_ID DATA_PATH
```bash
# distributed training example(8p) for Ascend
sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH /dataset/train
sh scripts/run_distribute_train.sh RANK_TABLE_FILE /dataset/train
# standalone training example for Ascend
sh scripts/run_standalone_train.sh 0 /dataset/train
......
......@@ -16,7 +16,7 @@
echo "=============================================================================================================="
echo "Please run the scipt as: "
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
echo "It is better to use absolute path."
echo "================================================================================================================="
......@@ -24,7 +24,7 @@ echo "==========================================================================
if [ $# != 5 ] && [ $# != 7 ]
then
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
[RANK_TABLE_FILE] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
exit 1
fi
......@@ -41,7 +41,7 @@ LR=$3
DATASET=$4
PRE_TRAINED=$6
PRE_TRAINED_EPOCH_SIZE=$7
export MINDSPORE_HCCL_CONFIG_PATH=$5
export RANK_TABLE_FILE=$5
for((i=0;i<RANK_SIZE;i++))
do
......
......@@ -101,9 +101,9 @@ parameters/options:
### Distribute Training
```
Usage: sh script/run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]
Usage: sh script/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]
parameters/options:
MINDSPORE_HCCL_CONFIG_PATH HCCL configuration file path.
RANK_TABLE_FILE HCCL configuration file path.
DATA_PATH the storage path of dataset.
```
......@@ -16,13 +16,13 @@
if [ $# != 2 ]
then
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]"
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]"
exit 1
fi
if [ ! -f $1 ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
# Report the rejected path under the variable's real name, RANK_TABLE_FILE.
echo "error: RANK_TABLE_FILE=$1 is not a file"
exit 1
fi
......@@ -34,7 +34,7 @@ fi
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$1
export RANK_TABLE_FILE=$1
for((i=0;i<RANK_SIZE;i++))
do
......
......@@ -77,7 +77,7 @@ Parameters for both training and evaluation can be set in config.py.
```
# distributed training in Ascend
Usage: bash run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]
# distributed training in GPU
Usage: bash run_distribute_train_for_gpu.sh [RANK_SIZE] [DATASET_PATH]
......
......@@ -15,7 +15,7 @@
# ============================================================================
if [ $# != 2 ]; then
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]"
exit 1
fi
......@@ -31,7 +31,7 @@ PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
if [ ! -f $PATH1 ]; then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
exit 1
fi
......@@ -43,7 +43,6 @@ fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
export RANK_TABLE_FILE=$PATH1
for ((i = 0; i < ${DEVICE_NUM}; i++)); do
......
......@@ -55,7 +55,7 @@ This is an example of training YOLOV3-DarkNet53 with COCO2014 dataset in MindSpo
```
# distributed training
sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]
sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]
# standalone training
sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE]
......
......@@ -16,7 +16,7 @@
if [ $# != 3 ]
then
echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]"
echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]"
exit 1
fi
......@@ -30,10 +30,10 @@ get_real_path(){
DATASET_PATH=$(get_real_path $1)
PRETRAINED_BACKBONE=$(get_real_path $2)
MINDSPORE_HCCL_CONFIG_PATH=$(get_real_path $3)
RANK_TABLE_FILE=$(get_real_path $3)
echo $DATASET_PATH
echo $PRETRAINED_BACKBONE
echo $MINDSPORE_HCCL_CONFIG_PATH
echo $RANK_TABLE_FILE
if [ ! -d $DATASET_PATH ]
then
......@@ -47,15 +47,15 @@ then
exit 1
fi
if [ ! -f $MINDSPORE_HCCL_CONFIG_PATH ]
if [ ! -f $RANK_TABLE_FILE ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH is not a file"
echo "error: RANK_TABLE_FILE=$RANK_TABLE_FILE is not a file"
exit 1
fi
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH
# Export the rank table path under RANK_TABLE_FILE, the name assigned and
# checked earlier in this script. The former "RANK_TABLE_FILEH" was a typo
# that exported a different variable name.
export RANK_TABLE_FILE="$RANK_TABLE_FILE"
for((i=0; i<${DEVICE_NUM}; i++))
do
......
......@@ -16,7 +16,7 @@
echo "======================================================================================================================================================="
echo "Please run the scipt as: "
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
echo "It is better to use absolute path."
echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
......@@ -24,7 +24,7 @@ echo "==========================================================================
if [ $# != 6 ] && [ $# != 8 ]
then
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [RANK_TABLE_FILE] \
[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
exit 1
fi
......@@ -42,7 +42,7 @@ python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image
echo "After running the scipt, the network runs in the background. The log will be generated in LOGx/log.txt"
export MINDSPORE_HCCL_CONFIG_PATH=$6
export RANK_TABLE_FILE=$6
export RANK_SIZE=$1
BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
......
......@@ -21,7 +21,7 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
``` bash
sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
sh scripts/run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE
```
### Fine-Tuning and Evaluation
......
......@@ -16,9 +16,11 @@
echo "=============================================================================================================="
echo "Please run the scipt as: "
echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
# Usage line: the second argument is the HCCL rank table file (e.g. /path/hccl.json).
echo "bash run_distribute_pretrain.sh DATA_DIR RANK_TABLE_FILE"
echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json"
echo "It is better to use absolute path."
echo "For hyper parameter, please note that you should customize the scripts:
'{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
echo "=============================================================================================================="
EPOCH_SIZE=$2
......
......@@ -44,7 +44,6 @@ set_hccl_json()
do
if [[ "$1" == "-j" || "$1" == "--hccl_json" ]]
then
export MINDSPORE_HCCL_CONFIG_PATH=$2
export RANK_TABLE_FILE=$2
break
fi
......
......@@ -74,7 +74,7 @@ This example implements training and evaluation of Transformer Model, which is i
- Run `run_distribute_train.sh` for distributed training of Transformer model.
``` bash
sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH
sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE
```
### Evaluation
......
......@@ -16,7 +16,7 @@
echo "=============================================================================================================="
echo "Please run the scipt as: "
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH"
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json"
echo "It is better to use absolute path."
echo "=============================================================================================================="
......@@ -28,7 +28,6 @@ cd run_distribute_train || exit
EPOCH_SIZE=$2
DATA_PATH=$3
export MINDSPORE_HCCL_CONFIG_PATH=$4
export RANK_TABLE_FILE=$4
export RANK_SIZE=$1
export HCCL_FLAG=1
......
......@@ -21,7 +21,7 @@ echo "After running the script, the network runs in the background, The log will
export RANK_SIZE=$1
DATA_URL=$2
export MINDSPORE_HCCL_CONFIG_PATH=$3
export RANK_TABLE_FILE=$3
for ((i=0; i<RANK_SIZE;i++))
do
......
......@@ -22,7 +22,6 @@ export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MINDSPORE_HCCL_CONFIG_PATH=$4
for((i=0;i<$RANK_SIZE;i++));
do
......
......@@ -22,7 +22,6 @@ export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MINDSPORE_HCCL_CONFIG_PATH=$4
for((i=0;i<$RANK_SIZE;i++));
do
......
......@@ -21,7 +21,6 @@ export RANK_SIZE=$1
export EPOCH_SIZE=$2
export DATASET=$3
export RANK_TABLE_FILE=$4
export MINDSPORE_HCCL_CONFIG_PATH=$4
export MS_COMM_TYPE=zmq
export MS_SCHED_NUM=1
......
......@@ -29,7 +29,7 @@ EPOCH_SIZE=$2
VOCAB_SIZE=$3
EMB_DIM=$4
DATASET=$5
MINDSPORE_HCCL_CONFIG_PATH=$6
RANK_TABLE_FILE=$6
ENV_SH=$7
MODE=$8
......@@ -39,7 +39,7 @@ do
passwd=$(get_node_passwd ${cluster_config_path} ${node})
echo "------------------${user}@${node}---------------------"
if [ $MODE == "host_device_mix" ]; then
ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${MINDSPORE_HCCL_CONFIG_PATH}"
ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${RANK_TABLE_FILE}"
else
echo "[ERROR] mode is wrong"
exit 1
......
......@@ -140,7 +140,7 @@ def main():
env['RANK_ID'] = str(rank_id)
env['DEVICE_ID'] = str(device_id)
if args.nproc_per_node > 1:
env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
# Store the rank table path (table_fn) under RANK_TABLE_FILE in env; one assignment is enough.
env['RANK_TABLE_FILE'] = table_fn
if os.path.exists(device_dir):
shutil.rmtree(device_dir)
......
......@@ -74,7 +74,7 @@ Parameters for both training and inference can be set in config.py.
```
# distributed training
Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]
Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]
```
......
......@@ -16,13 +16,13 @@
if [ $# != 3 ]
then
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]"
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]"
exit 1
fi
if [ ! -f $1 ]
then
echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
# Report the rejected path under the variable's real name, RANK_TABLE_FILE.
echo "error: RANK_TABLE_FILE=$1 is not a file"
exit 1
fi
......@@ -38,7 +38,7 @@ cd $BASE_PATH/../ || exit
ulimit -u unlimited
export DEVICE_NUM=$3
export RANK_SIZE=$3
export MINDSPORE_HCCL_CONFIG_PATH=$1
export RANK_TABLE_FILE=$1
for((i=0; i<${DEVICE_NUM}; i++))
do
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册