From 54376f5d8f23a39f420029ce1f5b88ae604302ed Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 20 Apr 2022 11:24:22 +0000 Subject: [PATCH] speedup ngram build --- .../ds2_ol/aishell/local/split_data.sh | 16 ++++++---- .../ngram/zh/local/aishell_train_lms.sh | 13 ++++++-- speechx/examples/ngram/zh/local/split_data.sh | 30 +++++++++++++++++++ 3 files changed, 51 insertions(+), 8 deletions(-) create mode 100644 speechx/examples/ngram/zh/local/split_data.sh diff --git a/speechx/examples/ds2_ol/aishell/local/split_data.sh b/speechx/examples/ds2_ol/aishell/local/split_data.sh index ab61f6a2..2af6fc5a 100755 --- a/speechx/examples/ds2_ol/aishell/local/split_data.sh +++ b/speechx/examples/ds2_ol/aishell/local/split_data.sh @@ -1,10 +1,15 @@ #!/usr/bin/env bash +set -eo pipefail + data=$1 -feat_scp=$2 -split_feat_name=$3 +scp=$2 +split_name=$3 numsplit=$4 +# save in $data/split{n} +# $scp to split +# if [[ ! $numsplit -gt 0 ]]; then echo "Invalid num-split argument"; @@ -12,8 +17,8 @@ if [[ ! $numsplit -gt 0 ]]; then fi directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done) -feat_split_scp=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_feat_name}; done) -echo $feat_split_scp +scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done) + # if this mkdir fails due to argument-list being too long, iterate. if ! mkdir -p $directories >&/dev/null; then for n in `seq $numsplit`; do @@ -21,4 +26,5 @@ if ! mkdir -p $directories >&/dev/null; then done fi -utils/split_scp.pl $feat_scp $feat_split_scp +echo "utils/split_scp.pl $scp $scp_splits" +utils/split_scp.pl $scp $scp_splits diff --git a/speechx/examples/ngram/zh/local/aishell_train_lms.sh b/speechx/examples/ngram/zh/local/aishell_train_lms.sh index 76266151..9e6e7e7b 100755 --- a/speechx/examples/ngram/zh/local/aishell_train_lms.sh +++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh @@ -3,6 +3,7 @@ # To be run from one directory above this script. . ./path.sh +nj=40 text=data/local/lm/text lexicon=data/local/dict/lexicon.txt @@ -31,9 +32,15 @@ cleantext=$dir/text.no_oov # oov to # lexicon line: word char0 ... charn # text line: utt word0 ... wordn -> line: word0 ... wordn -cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ - > $cleantext || exit 1; +text_dir=$(dirname $text) +split_name=$(basename $text) +./local/split_data.sh $text_dir $text $split_name $nj + +utils/run.pl JOB=1:$nj $text_dir/split${nj}/JOB/${split_name}.no_oov.log \ + cat ${text_dir}/split${nj}/JOB/${split_name} \| awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + \> ${text_dir}/split${nj}/JOB/${split_name}.no_oov || exit 1; +cat ${text_dir}/split${nj}/*/${split_name}.no_oov > $cleantext # compute word counts, sort in descending order # line: count word diff --git a/speechx/examples/ngram/zh/local/split_data.sh b/speechx/examples/ngram/zh/local/split_data.sh new file mode 100644 index 00000000..2af6fc5a --- /dev/null +++ b/speechx/examples/ngram/zh/local/split_data.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -eo pipefail + +data=$1 +scp=$2 +split_name=$3 +numsplit=$4 + +# save in $data/split{n} +# $scp to split +# + +if [[ ! $numsplit -gt 0 ]]; then + echo "Invalid num-split argument"; + exit 1; +fi + +directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done) +scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done) + +# if this mkdir fails due to argument-list being too long, iterate. +if ! mkdir -p $directories >&/dev/null; then + for n in `seq $numsplit`; do + mkdir -p $data/split${numsplit}/$n + done +fi + +echo "utils/split_scp.pl $scp $scp_splits" +utils/split_scp.pl $scp $scp_splits -- GitLab