From eb52896c4a1b7db12072a11d481a7a0260a2492f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 16 Apr 2022 06:48:28 +0000 Subject: [PATCH] zh ngram build --- speechx/examples/README.md | 18 +++- speechx/examples/ngram/README.md | 1 - speechx/examples/ngram/en/README.md | 0 speechx/examples/ngram/path.sh | 20 ---- speechx/examples/ngram/run.sh | 61 ----------- speechx/examples/ngram/utils | 1 - speechx/examples/ngram/zh/README.md | 101 ++++++++++++++++++ .../ngram/{ => zh}/local/aishell_train_lms.sh | 7 ++ speechx/examples/ngram/zh/path.sh | 12 +++ speechx/examples/ngram/zh/run.sh | 62 +++++++++++ speechx/examples/ngram/zh/utils | 1 + speechx/examples/text_lm/.gitignore | 1 + speechx/examples/text_lm/path.sh | 0 speechx/examples/text_lm/run.sh | 0 speechx/examples/wfst/README.md | 18 ++++ speechx/examples/{build_wfst => wfst}/path.sh | 12 +-- speechx/examples/{build_wfst => wfst}/run.sh | 6 -- 17 files changed, 217 insertions(+), 104 deletions(-) delete mode 100644 speechx/examples/ngram/README.md create mode 100644 speechx/examples/ngram/en/README.md delete mode 100644 speechx/examples/ngram/path.sh delete mode 100644 speechx/examples/ngram/run.sh delete mode 120000 speechx/examples/ngram/utils create mode 100644 speechx/examples/ngram/zh/README.md rename speechx/examples/ngram/{ => zh}/local/aishell_train_lms.sh (90%) mode change 100644 => 100755 create mode 100644 speechx/examples/ngram/zh/path.sh create mode 100755 speechx/examples/ngram/zh/run.sh create mode 120000 speechx/examples/ngram/zh/utils create mode 100644 speechx/examples/text_lm/.gitignore create mode 100644 speechx/examples/text_lm/path.sh create mode 100644 speechx/examples/text_lm/run.sh create mode 100644 speechx/examples/wfst/README.md rename speechx/examples/{build_wfst => wfst}/path.sh (68%) rename speechx/examples/{build_wfst => wfst}/run.sh (93%) diff --git a/speechx/examples/README.md b/speechx/examples/README.md index 35174a0d..c3de0d3a 100644 --- a/speechx/examples/README.md +++ 
b/speechx/examples/README.md @@ -1,12 +1,10 @@ # Examples for SpeechX -* dev - for speechx developer, using for test. -* ngram - using to build NGram ARPA lm. * ds2_ol - ds2 streaming test under `aishell-1` test dataset. - The entrypoint is `ds2_ol/aishell/run.sh` + The entrypoint is `ds2_ol/aishell/run.sh` -## How to run +## How to run `run.sh` is the entry point. @@ -17,9 +15,19 @@ pushd ds2_ol/aishell bash run.sh ``` -## Display Model with [Netron](https://github.com/lutzroeder/netron) +## Display Model with [Netron](https://github.com/lutzroeder/netron) ``` pip install netron netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20 ``` + +## Build WFST + +* text_lm - process text for build lm +* ngram - using to build NGram ARPA lm. +* wfst - build wfst for TLG. + +## For Developer + +* dev - for speechx developer, using for test. diff --git a/speechx/examples/ngram/README.md b/speechx/examples/ngram/README.md deleted file mode 100644 index b120715f..00000000 --- a/speechx/examples/ngram/README.md +++ /dev/null @@ -1 +0,0 @@ -# NGram Train diff --git a/speechx/examples/ngram/en/README.md b/speechx/examples/ngram/en/README.md new file mode 100644 index 00000000..e69de29b diff --git a/speechx/examples/ngram/path.sh b/speechx/examples/ngram/path.sh deleted file mode 100644 index f926ccd2..00000000 --- a/speechx/examples/ngram/path.sh +++ /dev/null @@ -1,20 +0,0 @@ -# This contains the locations of binarys build required for running the examples. - -SPEECHX_ROOT=$PWD/../../../ -MAIN_ROOT=$SPEECHX_ROOT/../ -SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples - -SPEECHX_TOOLS=$SPEECHX_ROOT/tools -TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin - -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. 
please ensure that the project build successfully"; } - -export LC_AL=C - -export PATH=$PATH:$TOOLS_BIN - -# srilm -export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs -export SRILM=${MAIN_ROOT}/tools/srilm -export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64 \ No newline at end of file diff --git a/speechx/examples/ngram/run.sh b/speechx/examples/ngram/run.sh deleted file mode 100644 index 462a8955..00000000 --- a/speechx/examples/ngram/run.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -set -eo pipefail - -. path.sh - -stage=-1 -stop_stage=100 -corpus=aishell - -unit=data/vocab.txt # vocab -lexicon= # aishell/resource_aishell/lexicon.txt -text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt - -. parse_options.sh - -data=$PWD/data -mkdir -p $data - -if [ ! -f $unit ]; then - echo "$0: No such file $unit" - exit 1; -fi - -if [ ! which ngram-count ]; then - pushd $MAIN_ROOT/tools - make srilm.done - popd -fi - -if [ ! which fstaddselfloops ]; then - pushd $MAIN_ROOT/tools - make kaldi.done - popd -fi - -mkdir -p data/local/dict -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # 7.1 Prepare dict - cp $unit data/local/dict/units.txt - utils/fst/prepare_dict.py \ - --unit_file $unit \ - --in_lexicon ${lexicon} \ - --out_lexicon data/local/dict/lexicon.txt -fi - -lm=data/local/lm -mkdir -p data/train -mkdir -p $lm -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # 7.2 Train lm - utils/manifest_key_value.py \ - --manifest_path data/manifest.train \ - --output_path data/train - utils/filter_scp.pl data/train/text \ - $text > $lm/text - - local/aishell_train_lms.sh -fi - -echo "build LM done." 
-exit 0 diff --git a/speechx/examples/ngram/utils b/speechx/examples/ngram/utils deleted file mode 120000 index 256f914a..00000000 --- a/speechx/examples/ngram/utils +++ /dev/null @@ -1 +0,0 @@ -../../../utils/ \ No newline at end of file diff --git a/speechx/examples/ngram/zh/README.md b/speechx/examples/ngram/zh/README.md new file mode 100644 index 00000000..be2062db --- /dev/null +++ b/speechx/examples/ngram/zh/README.md @@ -0,0 +1,101 @@ +# ngram train for mandarin + +Quick run: +``` +bash run.sh --stage -1 +``` + +## input + +input files: +``` +data/ +├── lexicon.txt +├── text +└── vocab.txt +``` + +``` +==> data/text <== +BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购 +BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉 +BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后 +BAC009S0002W0125 各地 政府 便 纷纷 跟进 +BAC009S0002W0126 仅 一 个 多 月 的 时间 里 +BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外 +BAC009S0002W0128 四十六 个 限 购 城市 当中 +BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购 +BAC009S0002W0130 财政 金融 政策 紧随 其后 而来 +BAC009S0002W0131 显示 出 了 极 强 的 威力 + +==> data/lexicon.txt <== +SIL sil +<SPOKEN_NOISE> sil +啊 aa a1 +啊 aa a2 +啊 aa a4 +啊 aa a5 +啊啊啊 aa a2 aa a2 aa a2 +啊啊啊 aa a5 aa a5 aa a5 +坐地 z uo4 d i4 +坐实 z uo4 sh ix2 +坐视 z uo4 sh ix4 +坐稳 z uo4 uu un3 +坐拥 z uo4 ii iong1 +坐诊 z uo4 zh en3 +坐庄 z uo4 zh uang1 +坐姿 z uo4 z iy1 + +==> data/vocab.txt <== +<blank> +<unk> +A +B +C +D +E +龙 +龚 +龛 +<eos> +``` + +## output + +``` +data/ +├── local +│ ├── dict +│ │ ├── lexicon.txt +│ │ └── units.txt +│ └── lm +│ ├── heldout +│ ├── lm.arpa +│ ├── text +│ ├── text.no_oov +│ ├── train +│ ├── unigram.counts +│ ├── word.counts +│ └── wordlist +``` + +``` +/workspace/srilm/bin/i686-m64/ngram-count +Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt') +Ignoring words 矽, which contains oov unit +Ignoring words 傩, which contains oov unit +Ignoring words 堀, which contains oov unit +Ignoring words 莼, which contains oov unit +Ignoring words 菰, which contains oov unit +Ignoring words 
摭, which contains oov unit +Ignoring words 帙, which contains oov unit +Ignoring words 迨, which contains oov unit +Ignoring words 孥, which contains oov unit +Ignoring words 瑗, which contains oov unit +... +... +... +file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs +0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745 +build LM done. +``` \ No newline at end of file diff --git a/speechx/examples/ngram/local/aishell_train_lms.sh b/speechx/examples/ngram/zh/local/aishell_train_lms.sh old mode 100644 new mode 100755 similarity index 90% rename from speechx/examples/ngram/local/aishell_train_lms.sh rename to speechx/examples/ngram/zh/local/aishell_train_lms.sh index d9f87aca..e3cee438 --- a/speechx/examples/ngram/local/aishell_train_lms.sh +++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh @@ -28,10 +28,14 @@ mkdir -p $dir cleantext=$dir/text.no_oov +# oov to <SPOKEN_NOISE> +# line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } } {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \ > $cleantext || exit 1; +# compute word counts +# line: count word cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ sort -nr > $dir/word.counts || exit 1; @@ -42,10 +46,13 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; +# word with <s> </s> cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist +# hold out to compute ppl heldout_sent=10000 # Don't change this if you want result to be comparable with # kaldi_lm results + mkdir -p $dir cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \ head -$heldout_sent > $dir/heldout diff --git a/speechx/examples/ngram/zh/path.sh b/speechx/examples/ngram/zh/path.sh new file mode 100644 index 00000000..a3fb3d75 --- /dev/null +++ 
b/speechx/examples/ngram/zh/path.sh @@ -0,0 +1,12 @@ +# This contains the locations of binarys build required for running the examples. + +MAIN_ROOT=`realpath $PWD/../../../../` +SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx` + +export LC_AL=C + +# srilm +export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs +export SRILM=${MAIN_ROOT}/tools/srilm +export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64 diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh new file mode 100755 index 00000000..eda422b3 --- /dev/null +++ b/speechx/examples/ngram/zh/run.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -eo pipefail + +. path.sh + +stage=0 +stop_stage=100 +corpus=aishell + +unit=data/vocab.txt # line: char/spm_pice, vocab file +lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt +text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt + +. utils/parse_options.sh + +data=$PWD/data +mkdir -p $data + +if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then + if [ ! -f $data/speech.ngram.zh.tar.gz ];then + pushd $data + wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz + tar xvzf speech.ngram.zh.tar.gz + popd + fi +fi + +if [ ! -f $unit ]; then + echo "$0: No such file $unit" + exit 1; +fi + +if ! which ngram-count; then + pushd $MAIN_ROOT/tools + make srilm.done + popd +fi + +mkdir -p data/local/dict +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # 7.1 Prepare dict + # line: char/spm_pices + cp $unit data/local/dict/units.txt + + # line: word ph0 ... phn -> line: word char0 ... 
charn + utils/fst/prepare_dict.py \ + --unit_file $unit \ + --in_lexicon ${lexicon} \ + --out_lexicon data/local/dict/lexicon.txt +fi + +lm=data/local/lm +mkdir -p $lm + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # 7.2 Train lm + cp $text $lm/text + local/aishell_train_lms.sh +fi + +echo "build LM done." +exit 0 diff --git a/speechx/examples/ngram/zh/utils b/speechx/examples/ngram/zh/utils new file mode 120000 index 00000000..c2519a9d --- /dev/null +++ b/speechx/examples/ngram/zh/utils @@ -0,0 +1 @@ +../../../../utils/ \ No newline at end of file diff --git a/speechx/examples/text_lm/.gitignore b/speechx/examples/text_lm/.gitignore new file mode 100644 index 00000000..1269488f --- /dev/null +++ b/speechx/examples/text_lm/.gitignore @@ -0,0 +1 @@ +data diff --git a/speechx/examples/text_lm/path.sh b/speechx/examples/text_lm/path.sh new file mode 100644 index 00000000..e69de29b diff --git a/speechx/examples/text_lm/run.sh b/speechx/examples/text_lm/run.sh new file mode 100644 index 00000000..e69de29b diff --git a/speechx/examples/wfst/README.md b/speechx/examples/wfst/README.md new file mode 100644 index 00000000..dd9b926f --- /dev/null +++ b/speechx/examples/wfst/README.md @@ -0,0 +1,18 @@ +``` +fstaddselfloops 'echo 4234 |' 'echo 123660 |' +Lexicon and Token FSTs compiling succeeded +arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true - +LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section. +LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section. +LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section. +LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section. 
+Checking how stochastic G is (the first of these numbers should be small): +fstisstochastic data/lang_test/G.fst +0 -1.14386 +fsttablecompose data/lang_test/L.fst data/lang_test/G.fst +fstminimizeencoded +fstdeterminizestar --use-log=true +fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst +Composing decoding graph TLG.fst succeeded +Aishell build TLG done. +``` \ No newline at end of file diff --git a/speechx/examples/build_wfst/path.sh b/speechx/examples/wfst/path.sh similarity index 68% rename from speechx/examples/build_wfst/path.sh rename to speechx/examples/wfst/path.sh index e4008cd2..877f2399 100644 --- a/speechx/examples/build_wfst/path.sh +++ b/speechx/examples/wfst/path.sh @@ -1,18 +1,10 @@ # This contains the locations of binarys build required for running the examples. -SPEECHX_ROOT=$PWD/../../../ -MAIN_ROOT=$SPEECHX_ROOT/../ -SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples - -SPEECHX_TOOLS=$SPEECHX_ROOT/tools -TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin - -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +MAIN_ROOT=`realpath $PWD/../../../../` +SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx` export LC_AL=C -export PATH=$PATH:$TOOLS_BIN - # srilm export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs diff --git a/speechx/examples/build_wfst/run.sh b/speechx/examples/wfst/run.sh similarity index 93% rename from speechx/examples/build_wfst/run.sh rename to speechx/examples/wfst/run.sh index bba14c59..b53e1a5b 100644 --- a/speechx/examples/build_wfst/run.sh +++ b/speechx/examples/wfst/run.sh @@ -13,12 +13,6 @@ text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt source parse_options.sh -if [ ! which ngram-count ]; then - pushd $MAIN_ROOT/tools - make srilm.done - popd -fi - if [ ! which fstprint ]; then pushd $MAIN_ROOT/tools make kaldi.done -- GitLab