From 9aa868d14db86b7e5ec94a8d503fecfc7e18284b Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Fri, 10 Jun 2022 03:47:32 +0000
Subject: [PATCH] support distributed training

---
 examples/aishell/asr0/local/train.sh     | 13 ++++++++++---
 examples/aishell/asr0/run.sh             |  3 ++-
 examples/aishell/asr1/local/train.sh     | 14 +++++++++++---
 examples/aishell/asr1/run.sh             |  3 ++-
 examples/callcenter/asr1/local/train.sh  | 13 ++++++++++---
 examples/callcenter/asr1/run.sh          |  3 ++-
 examples/librispeech/asr0/local/train.sh | 13 ++++++++++---
 examples/librispeech/asr0/run.sh         |  3 ++-
 examples/librispeech/asr1/local/train.sh | 13 ++++++++++---
 examples/librispeech/asr1/run.sh         |  3 ++-
 examples/librispeech/asr2/local/train.sh | 13 ++++++++++---
 examples/librispeech/asr2/run.sh         |  3 ++-
 examples/mustc/st1/local/train.sh        | 20 ++++++++++++++++++--
 examples/mustc/st1/run.sh                |  5 +++--
 examples/ted_en_zh/st0/local/train.sh    | 13 ++++++++++---
 examples/ted_en_zh/st0/run.sh            |  3 ++-
 examples/ted_en_zh/st1/local/train.sh    | 15 ++++++++++++---
 examples/ted_en_zh/st1/run.sh            |  3 ++-
 examples/tiny/asr0/local/train.sh        | 13 ++++++++++---
 examples/tiny/asr0/run.sh                |  5 +++--
 examples/tiny/asr1/local/train.sh        | 13 ++++++++++---
 examples/tiny/asr1/run.sh                |  5 +++--
 22 files changed, 146 insertions(+), 46 deletions(-)

diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh
index 76b696d9..256b30d2 100755
--- a/examples/aishell/asr0/local/train.sh
+++ b/examples/aishell/asr0/local/train.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
 
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
 
 mkdir -p exp
 
@@ -26,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh
index 3bf9aa12..530c013a 100755
--- a/examples/aishell/asr0/run.sh
+++ b/examples/aishell/asr0/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=10
 audio_file=data/demo_01_03.wav
@@ -24,7 +25,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh
index 5617f7ef..f514de30 100755
--- a/examples/aishell/asr1/local/train.sh
+++ b/examples/aishell/asr1/local/train.sh
@@ -17,13 +17,21 @@ if [ ${seed} != 0 ]; then
     echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
 fi
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
 
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
+echo ${ips_config}
 
 mkdir -p exp
 
@@ -37,7 +45,7 @@ python3 -u ${BIN_DIR}/train.py \
 --benchmark-batch-size ${benchmark_batch_size} \
 --benchmark-max-step ${benchmark_max_step}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --seed ${seed} \
 --config ${config_path} \
diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh
index cb781b20..bd4f50e3 100644
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/conformer.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 audio_file=data/demo_01_03.wav
@@ -23,7 +24,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/callcenter/asr1/local/train.sh b/examples/callcenter/asr1/local/train.sh
index 03b4588e..41da89e2 100755
--- a/examples/callcenter/asr1/local/train.sh
+++ b/examples/callcenter/asr1/local/train.sh
@@ -1,7 +1,7 @@
 #! /usr/bin/env bash
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
 
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
 
 echo "using ${device}..."
 
@@ -28,7 +35,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh
index 0c7ffc1e..7e3b912a 100644
--- a/examples/callcenter/asr1/run.sh
+++ b/examples/callcenter/asr1/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/conformer.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=20
 
@@ -22,7 +23,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/librispeech/asr0/local/train.sh b/examples/librispeech/asr0/local/train.sh
index ad00653b..71659e28 100755
--- a/examples/librispeech/asr0/local/train.sh
+++ b/examples/librispeech/asr0/local/train.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
 
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
 
 mkdir -p exp
 
@@ -26,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh
index 6b1ff6c6..38112398 100755
--- a/examples/librispeech/asr0/run.sh
+++ b/examples/librispeech/asr0/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=5
 audio_file=data/demo_002_en.wav
@@ -23,7 +24,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/librispeech/asr1/local/train.sh b/examples/librispeech/asr1/local/train.sh
index 3860d85c..f729ed22 100755
--- a/examples/librispeech/asr1/local/train.sh
+++ b/examples/librispeech/asr1/local/train.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
 
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
 
 mkdir -p exp
 
@@ -29,7 +36,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh
index 116dae12..a14240ee 100755
--- a/examples/librispeech/asr1/run.sh
+++ b/examples/librispeech/asr1/run.sh
@@ -8,6 +8,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 audio_file=data/demo_002_en.wav
@@ -25,7 +26,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/librispeech/asr2/local/train.sh b/examples/librispeech/asr2/local/train.sh
index 560424ea..1f414ad4 100755
--- a/examples/librispeech/asr2/local/train.sh
+++ b/examples/librispeech/asr2/local/train.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
 
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
 
 mkdir -p exp
 
@@ -27,7 +34,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --model-name u2_kaldi \
 --config ${config_path} \
diff --git a/examples/librispeech/asr2/run.sh b/examples/librispeech/asr2/run.sh
index c9a794e3..d156159f 100755
--- a/examples/librispeech/asr2/run.sh
+++ b/examples/librispeech/asr2/run.sh
@@ -9,6 +9,7 @@ gpus=0,1,2,3,4,5,6,7
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/decode/decode_base.yaml
 dict_path=data/lang_char/train_960_unigram5000_units.txt
 avg_num=10
@@ -26,7 +27,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/mustc/st1/local/train.sh b/examples/mustc/st1/local/train.sh
index 456c9416..db2a575a 100755
--- a/examples/mustc/st1/local/train.sh
+++ b/examples/mustc/st1/local/train.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
+if [ $# -lt 3 ] || [ $# -gt 4 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path ips(optional)"
     exit -1
 fi
 
@@ -11,6 +11,13 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
 ckpt_path=$3
+ips=$4
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
 
 mkdir -p exp
 
@@ -21,12 +28,21 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
+if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --checkpoint_path "${ckpt_path}" \
 --seed ${seed}
+else
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--checkpoint_path "${ckpt_path}" \
+--seed ${seed}
+fi
 
 if [ ${seed} != 0 ]; then
     unset FLAGS_cudnn_deterministic
diff --git a/examples/mustc/st1/run.sh b/examples/mustc/st1/run.sh
index 6ceae3b8..99ee2295 100755
--- a/examples/mustc/st1/run.sh
+++ b/examples/mustc/st1/run.sh
@@ -7,6 +7,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=3
 conf_path=conf/transformer_es.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 must_c_path=
 lang=es
@@ -25,7 +26,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -36,4 +37,4 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${lang} || exit -1
-fi
\ No newline at end of file
+fi
diff --git a/examples/ted_en_zh/st0/local/train.sh b/examples/ted_en_zh/st0/local/train.sh
index ad00653b..71659e28 100755
--- a/examples/ted_en_zh/st0/local/train.sh
+++ b/examples/ted_en_zh/st0/local/train.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
 
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
 
 mkdir -p exp
 
@@ -26,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
diff --git a/examples/ted_en_zh/st0/run.sh b/examples/ted_en_zh/st0/run.sh
index 1746c025..c5a59f65 100755
--- a/examples/ted_en_zh/st0/run.sh
+++ b/examples/ted_en_zh/st0/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/transformer_mtl_noam.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=5
 data_path=./TED_EnZh # path to unzipped data
@@ -23,7 +24,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/ted_en_zh/st1/local/train.sh b/examples/ted_en_zh/st1/local/train.sh
index 5da64e99..3e9295e5 100755
--- a/examples/ted_en_zh/st1/local/train.sh
+++ b/examples/ted_en_zh/st1/local/train.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
+if [ $# -lt 3 ] || [ $# -gt 4 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path ips(optional)"
     exit -1
 fi
 
@@ -11,6 +11,15 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
 ckpt_path=$3
+ips=$4
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
+
+mkdir -p exp
 
 mkdir -p exp
 
@@ -28,7 +37,7 @@ python3 -u ${BIN_DIR}/train.py \
 --checkpoint_path "${ckpt_path}" \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh
index 1808e37b..06a407d4 100755
--- a/examples/ted_en_zh/st1/run.sh
+++ b/examples/ted_en_zh/st1/run.sh
@@ -7,6 +7,7 @@ gpus=0,1,2,3
 stage=1
 stop_stage=4
 conf_path=conf/transformer_mtl_noam.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model)
 avg_num=5
@@ -29,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         echo "Finetune from Pretrained Model" ${ckpt_path}
         ./local/download_pretrain.sh || exit -1
     fi
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/tiny/asr0/local/train.sh b/examples/tiny/asr0/local/train.sh
index 2f2cff77..8b67902f 100755
--- a/examples/tiny/asr0/local/train.sh
+++ b/examples/tiny/asr0/local/train.sh
@@ -15,13 +15,20 @@ if [ ${seed} != 0 ]; then
     echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
 fi
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
 
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
 
 mkdir -p exp
 
@@ -33,7 +40,7 @@ python3 -u ${BIN_DIR}/train.py \
 --profiler-options "${profiler_options}" \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
diff --git a/examples/tiny/asr0/run.sh b/examples/tiny/asr0/run.sh
index 16f43130..3e84d422 100755
--- a/examples/tiny/asr0/run.sh
+++ b/examples/tiny/asr0/run.sh
@@ -2,10 +2,11 @@
 set -e
 source path.sh
 
-gpus=0
+gpus=4
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@@ -21,7 +22,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/examples/tiny/asr1/local/train.sh b/examples/tiny/asr1/local/train.sh
index 5617f7ef..459f2e21 100755
--- a/examples/tiny/asr1/local/train.sh
+++ b/examples/tiny/asr1/local/train.sh
@@ -17,13 +17,20 @@ if [ ${seed} != 0 ]; then
     echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
 fi
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
 
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
 
 mkdir -p exp
 
@@ -37,7 +44,7 @@ python3 -u ${BIN_DIR}/train.py \
 --benchmark-batch-size ${benchmark_batch_size} \
 --benchmark-max-step ${benchmark_max_step}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --seed ${seed} \
 --config ${config_path} \
diff --git a/examples/tiny/asr1/run.sh b/examples/tiny/asr1/run.sh
index 1651c034..ca0a7a01 100755
--- a/examples/tiny/asr1/run.sh
+++ b/examples/tiny/asr1/run.sh
@@ -2,10 +2,11 @@
 set -e
 source path.sh
 
-gpus=0
+gpus=4
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 
@@ -22,7 +23,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--
GitLab
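Usage sketch for the new option: each local/train.sh now takes an optional trailing argument with a comma-separated list of node IPs, which it forwards to paddle.distributed.launch as --ips=...; leaving it empty keeps the previous single-node behaviour. The config path, checkpoint name, and IP addresses below are placeholders for illustration, not values taken from the patch.

# single node, 4 GPUs (ips left empty, behaviour unchanged)
CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh conf/conformer.yaml my_ckpt

# two nodes, 4 GPUs each: run the same command on every node, listing all node IPs (placeholder addresses)
CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh conf/conformer.yaml my_ckpt 192.168.1.10,192.168.1.11

# or via run.sh, assuming utils/parse_options.sh follows the usual Kaldi convention
# of mapping "--ips <value>" onto the ips= variable declared before it is sourced
bash run.sh --stage 1 --stop_stage 1 --ips 192.168.1.10,192.168.1.11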