librispeech s1 support sclite and multi process decode

20178e0e · Hui Zhang · 238e2365 · 20178e0e · 20178e0e · 20178e0e
4 changed file
--- a/examples/librispeech/s1/cmd.sh
+++ b/examples/librispeech/s1/cmd.sh
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the job range must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
+# These options are mapped to backend-specific options, and
+# the mapping is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
+cmd_backend='local'
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+    # The other usage
+    export train_cmd="run.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="run.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="run.pl"
+# "qsub" (SGE, Torque, PBS, etc.)
+elif [ "${cmd_backend}" = sge ]; then
+    # The default setting is written in conf/queue.conf.
+    # You must change "-q g.q" for the "queue" for your environment.
+    # To know the "queue" names, type "qhost -q"
+    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+    export train_cmd="queue.pl"
+    export cuda_cmd="queue.pl"
+    export decode_cmd="queue.pl"
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+    # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+    export train_cmd="slurm.pl"
+    export cuda_cmd="slurm.pl"
+    export decode_cmd="slurm.pl"
+elif [ "${cmd_backend}" = ssh ]; then
+    # You have to create ".queue/machines" to specify the host to execute jobs.
+    # e.g. .queue/machines
+    #   host1
+    #   host2
+    #   host3
+    # This assumes you can log in to them without a password, i.e. you have to set up ssh keys.
+    export train_cmd="ssh.pl"
+    export cuda_cmd="ssh.pl"
+    export decode_cmd="ssh.pl"
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+    export train_cmd="queue.pl --mem 2G"
+    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
+    export decode_cmd="queue.pl --mem 4G"
+else
+    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+    return 1
+fi
--- a/examples/librispeech/s1/local/test.sh
+++ b/examples/librispeech/s1/local/test.sh
 #!/bin/bash
-if [ $# != 2 ];then
+# Decoding/scoring settings used by the decode loop below.
+expdir=exp
-    echo "usage: ${0} config_path ckpt_path_prefix"
+datadir=data
+# Number of parallel decode jobs per (method, test set) pair; it is used for
+# the manifest split and the JOB=1:${nj} array, so it must not be empty.
+# An exported nj from the caller takes precedence over the default.
+nj=${nj:-32}
+lmtag=
+# NOTE: the second assignment overrides the first, so only test-clean is decoded.
+recog_set="test-clean test-other dev-clean dev-other"
+recog_set="test-clean"
+# bpemode (unigram or bpe)
+nbpe=5000
+bpemode=unigram
+bpeprefix="data/bpe_${bpemode}_${nbpe}"
+bpemodel=${bpeprefix}.model
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path dict_path ckpt_path_prefix"
    exit -1
 fi
@@ -9,7 +23,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 config_path=$1
-ckpt_prefix=$2
+dict=$2
+ckpt_prefix=$3
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@@ -24,44 +39,49 @@ echo "chunk mode ${chunk_mode}"
 #    exit 1
 #fi
-for type in attention ctc_greedy_search; do
+pids=() # PIDs of the per-decoding-method background subshells
-    echo "decoding ${type}"
-    if [ ${chunk_mode} == true ];then
+# Decode every (decoding method, test set) pair in parallel and score each
+# result with sclite; the script fails if any background job fails.
+for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
-        # stream decoding only support batchsize=1
+(
+    # Variables changed inside a subshell never reach the parent shell, so the
+    # per-test-set jobs are tracked in a local array and awaited before this
+    # subshell exits; otherwise the parent's wait would return too early.
+    rpids=()
+    for rtask in ${recog_set}; do
+    (
+        decode_dir=decode_${rtask}_${dmethd}_$(basename ${config_path%.*})_${lmtag}
+        feat_recog_dir=${datadir}
+        mkdir -p ${expdir}/${decode_dir}
+        mkdir -p ${feat_recog_dir}
+        # split data
+        # NOTE(review): all decoding methods split the same manifest concurrently;
+        # confirm split_json.sh tolerates that, or hoist the split out of the loops.
+        split_json.sh ${feat_recog_dir}/manifest.${rtask} ${nj}
+        #### use CPU for decoding
+        ngpu=0
+        # batch size of 1, i.e. no batched decoding
        batch_size=1
-    else
+        ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
-        batch_size=64
+            python3 -u ${BIN_DIR}/test.py \
-    fi
+            --model-name u2_kaldi \
-    python3 -u ${BIN_DIR}/test.py \
+            --run-mode test \
-    --nproc ${ngpu} \
+            --nproc ${ngpu} \
-    --config ${config_path} \
+            --dict-path ${dict} \
-    --result_file ${ckpt_prefix}.${type}.rsl \
+            --config ${config_path} \
-    --checkpoint_path ${ckpt_prefix} \
+            --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
+            --result-file ${expdir}/${decode_dir}/data.JOB.json \
-    --opts decoding.batch_size ${batch_size}
+            --opts decoding.decoding_method ${dmethd} \
+            --opts decoding.batch_size ${batch_size} \
-    if [ $? -ne 0 ]; then
+            --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} || exit 1 # do not score a failed decode
-        echo "Failed in evaluation!"
-        exit 1
+        # bpemodel already carries the ".model" suffix
+        score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer true ${expdir}/${decode_dir} ${dict}
-    fi
-done
-for type in ctc_prefix_beam_search attention_rescoring; do
+    ) &
-    echo "decoding ${type}"
+    rpids+=($!) # store background pids of this decoding method
-    batch_size=1
+    done
-    python3 -u ${BIN_DIR}/test.py \
+    # the status of the last command becomes the exit status of this subshell
+    nfail=0; for pid in "${rpids[@]}"; do wait ${pid} || ((++nfail)); done
+    [ ${nfail} -eq 0 ]
+) &
-    --nproc ${ngpu} \
+pids+=($!) # store background pids
-    --config ${config_path} \
-    --result_file ${ckpt_prefix}.${type}.rsl \
-    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
-    if [ $? -ne 0 ]; then
-        echo "Failed in evaluation!"
-        exit 1
-    fi
 done
+i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
+# exit explicitly: a bare "false" only aborts the script when "set -e" is active
+if [ ${i} -gt 0 ]; then
+    echo "$0: ${i} background jobs are failed."
+    exit 1
+fi
+echo "Finished"
 exit 0
--- a/examples/librispeech/s1/path.sh
+++ b/examples/librispeech/s1/path.sh
 export MAIN_ROOT=`realpath ${PWD}/../../../`
-export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
+# PATH entries must be directories, so add the SCTK bin directory that holds
+# the sclite binary rather than the binary itself.
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
 export LC_ALL=C
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C

--- a/examples/librispeech/s1/run.sh
+++ b/examples/librispeech/s1/run.sh
 #!/bin/bash
 set -e
-source path.sh
+. ./path.sh || exit 1;  # exports MAIN_ROOT, PATH and LC_ALL (among others)
+. ./cmd.sh || exit 1;  # exports train_cmd, cuda_cmd and decode_cmd for the selected backend
 stage=0
 stop_stage=100
 conf_path=conf/transformer.yaml
 avg_num=5
-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;  # NOTE(review): presumably lets command-line flags override the variables above - confirm
 avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')