diff --git a/speechx/examples/README.md b/speechx/examples/README.md index 35174a0d756be8d2204788628fb94e21f34ca397..c3de0d3a8600ff61fb20760f717a4c9bad9d03c5 100644 --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,12 +1,10 @@ # Examples for SpeechX -* dev - for speechx developer, using for test. -* ngram - using to build NGram ARPA lm. * ds2_ol - ds2 streaming test under `aishell-1` test dataset. - The entrypoint is `ds2_ol/aishell/run.sh` + The entrypoint is `ds2_ol/aishell/run.sh` -## How to run +## How to run `run.sh` is the entry point. @@ -17,9 +15,19 @@ pushd ds2_ol/aishell bash run.sh ``` -## Display Model with [Netron](https://github.com/lutzroeder/netron) +## Display Model with [Netron](https://github.com/lutzroeder/netron) ``` pip install netron netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20 ``` + +## Build WFST + +* text_lm - process text for build lm +* ngram - using to build NGram ARPA lm. +* wfst - build wfst for TLG. + +## For Developer + +* dev - for speechx developer, using for test. diff --git a/speechx/examples/ngram/README.md b/speechx/examples/ngram/README.md deleted file mode 100644 index b120715fc381d8055eab1368487dff3c033eae2a..0000000000000000000000000000000000000000 --- a/speechx/examples/ngram/README.md +++ /dev/null @@ -1 +0,0 @@ -# NGram Train diff --git a/speechx/examples/ngram/en/README.md b/speechx/examples/ngram/en/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speechx/examples/ngram/path.sh b/speechx/examples/ngram/path.sh deleted file mode 100644 index f926ccd28943e2e1d4ddc78738279d64db657f38..0000000000000000000000000000000000000000 --- a/speechx/examples/ngram/path.sh +++ /dev/null @@ -1,20 +0,0 @@ -# This contains the locations of binarys build required for running the examples. - -SPEECHX_ROOT=$PWD/../../../ -MAIN_ROOT=$SPEECHX_ROOT/../ -SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples - -SPEECHX_TOOLS=$SPEECHX_ROOT/tools -TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin - -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } - -export LC_AL=C - -export PATH=$PATH:$TOOLS_BIN - -# srilm -export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs -export SRILM=${MAIN_ROOT}/tools/srilm -export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64 \ No newline at end of file diff --git a/speechx/examples/ngram/run.sh b/speechx/examples/ngram/run.sh deleted file mode 100644 index 462a89550f888d02e05017b19e01f0ce23a78681..0000000000000000000000000000000000000000 --- a/speechx/examples/ngram/run.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -set -eo pipefail - -. path.sh - -stage=-1 -stop_stage=100 -corpus=aishell - -unit=data/vocab.txt # vocab -lexicon= # aishell/resource_aishell/lexicon.txt -text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt - -. parse_options.sh - -data=$PWD/data -mkdir -p $data - -if [ ! -f $unit ]; then - echo "$0: No such file $unit" - exit 1; -fi - -if [ ! which ngram-count ]; then - pushd $MAIN_ROOT/tools - make srilm.done - popd -fi - -if [ ! which fstaddselfloops ]; then - pushd $MAIN_ROOT/tools - make kaldi.done - popd -fi - -mkdir -p data/local/dict -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # 7.1 Prepare dict - cp $unit data/local/dict/units.txt - utils/fst/prepare_dict.py \ - --unit_file $unit \ - --in_lexicon ${lexicon} \ - --out_lexicon data/local/dict/lexicon.txt -fi - -lm=data/local/lm -mkdir -p data/train -mkdir -p $lm -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # 7.2 Train lm - utils/manifest_key_value.py \ - --manifest_path data/manifest.train \ - --output_path data/train - utils/filter_scp.pl data/train/text \ - $text > $lm/text - - local/aishell_train_lms.sh -fi - -echo "build LM done." -exit 0 diff --git a/speechx/examples/ngram/utils b/speechx/examples/ngram/utils deleted file mode 120000 index 256f914abcaa47d966c44878b88a300437f110fb..0000000000000000000000000000000000000000 --- a/speechx/examples/ngram/utils +++ /dev/null @@ -1 +0,0 @@ -../../../utils/ \ No newline at end of file diff --git a/speechx/examples/ngram/zh/README.md b/speechx/examples/ngram/zh/README.md new file mode 100644 index 0000000000000000000000000000000000000000..be2062db37327005ff5908d257c79d1d73b7f74f --- /dev/null +++ b/speechx/examples/ngram/zh/README.md @@ -0,0 +1,101 @@ +# ngram train for mandarin + +Quick run: +``` +bash run.sh --stage -1 +``` + +## input + +input files: +``` +data/ +├── lexicon.txt +├── text +└── vocab.txt +``` + +``` +==> data/text <== +BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购 +BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉 +BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后 +BAC009S0002W0125 各地 政府 便 纷纷 跟进 +BAC009S0002W0126 仅 一 个 多 月 的 时间 里 +BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外 +BAC009S0002W0128 四十六 个 限 购 城市 当中 +BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购 +BAC009S0002W0130 财政 金融 政策 紧随 其后 而来 +BAC009S0002W0131 显示 出 了 极 强 的 威力 + +==> data/lexicon.txt <== +SIL sil + sil +啊 aa a1 +啊 aa a2 +啊 aa a4 +啊 aa a5 +啊啊啊 aa a2 aa a2 aa a2 +啊啊啊 aa a5 aa a5 aa a5 +坐地 z uo4 d i4 +坐实 z uo4 sh ix2 +坐视 z uo4 sh ix4 +坐稳 z uo4 uu un3 +坐拥 z uo4 ii iong1 +坐诊 z uo4 zh en3 +坐庄 z uo4 zh uang1 +坐姿 z uo4 z iy1 + +==> data/vocab.txt <== + + +A +B +C +D +E +龙 +龚 +龛 + +``` + +## output + +``` +data/ +├── local +│ ├── dict +│ │ ├── lexicon.txt +│ │ └── units.txt +│ └── lm +│ ├── heldout +│ ├── lm.arpa +│ ├── text +│ ├── text.no_oov +│ ├── train +│ ├── unigram.counts +│ ├── word.counts +│ └── wordlist +``` + +``` +/workspace/srilm/bin/i686-m64/ngram-count +Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt') +Ignoring words 矽, which contains oov unit +Ignoring words 傩, which contains oov unit +Ignoring words 堀, which contains oov unit +Ignoring words 莼, which contains oov unit +Ignoring words 菰, which contains oov unit +Ignoring words 摭, which contains oov unit +Ignoring words 帙, which contains oov unit +Ignoring words 迨, which contains oov unit +Ignoring words 孥, which contains oov unit +Ignoring words 瑗, which contains oov unit +... +... +... +file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs +0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745 +build LM done. +``` \ No newline at end of file diff --git a/speechx/examples/ngram/local/aishell_train_lms.sh b/speechx/examples/ngram/zh/local/aishell_train_lms.sh old mode 100644 new mode 100755 similarity index 90% rename from speechx/examples/ngram/local/aishell_train_lms.sh rename to speechx/examples/ngram/zh/local/aishell_train_lms.sh index d9f87aca920c6b7244fe1f16a8c1cd481e138457..e3cee43891c5255c2e027f25742f81f6d7642adc --- a/speechx/examples/ngram/local/aishell_train_lms.sh +++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh @@ -28,10 +28,14 @@ mkdir -p $dir cleantext=$dir/text.no_oov +# oov to +# line: utt word0 ... wordn -> line: word0 ... wordn cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ > $cleantext || exit 1; +# compute word counts +# line: count word cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ sort -nr > $dir/word.counts || exit 1; @@ -42,10 +46,13 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; +# word with cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo ""; echo "" ) > $dir/wordlist +# hold out to compute ppl heldout_sent=10000 # Don't change this if you want result to be comparable with # kaldi_lm results + mkdir -p $dir cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/heldout diff --git a/speechx/examples/ngram/zh/path.sh b/speechx/examples/ngram/zh/path.sh new file mode 100644 index 0000000000000000000000000000000000000000..a3fb3d75878a3b7a641d7f13e464aa324a9b0ea6 --- /dev/null +++ b/speechx/examples/ngram/zh/path.sh @@ -0,0 +1,12 @@ +# This contains the locations of binarys build required for running the examples. + +MAIN_ROOT=`realpath $PWD/../../../../` +SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx` + +export LC_AL=C + +# srilm +export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs +export SRILM=${MAIN_ROOT}/tools/srilm +export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64 diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..eda422b3304e10788c7c2f104c6da7f10d0ae6e7 --- /dev/null +++ b/speechx/examples/ngram/zh/run.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -eo pipefail + +. path.sh + +stage=0 +stop_stage=100 +corpus=aishell + +unit=data/vocab.txt # line: char/spm_pice, vocab file +lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt +text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt + +. utils/parse_options.sh + +data=$PWD/data +mkdir -p $data + +if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then + if [ ! -f $data/speech.ngram.zh.tar.gz ];then + pushd $data + wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz + tar xvzf speech.ngram.zh.tar.gz + popd + fi +fi + +if [ ! -f $unit ]; then + echo "$0: No such file $unit" + exit 1; +fi + +if ! which ngram-count; then + pushd $MAIN_ROOT/tools + make srilm.done + popd +fi + +mkdir -p data/local/dict +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # 7.1 Prepare dict + # line: char/spm_pices + cp $unit data/local/dict/units.txt + + # line: word ph0 ... phn -> line: word char0 ... charn + utils/fst/prepare_dict.py \ + --unit_file $unit \ + --in_lexicon ${lexicon} \ + --out_lexicon data/local/dict/lexicon.txt +fi + +lm=data/local/lm +mkdir -p $lm + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # 7.2 Train lm + cp $text $lm/text + local/aishell_train_lms.sh +fi + +echo "build LM done." +exit 0 diff --git a/speechx/examples/ngram/zh/utils b/speechx/examples/ngram/zh/utils new file mode 120000 index 0000000000000000000000000000000000000000..c2519a9dd0bc11c4ca3de4bf89e16634dadcd4c9 --- /dev/null +++ b/speechx/examples/ngram/zh/utils @@ -0,0 +1 @@ +../../../../utils/ \ No newline at end of file diff --git a/speechx/examples/text_lm/.gitignore b/speechx/examples/text_lm/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1269488f7fb1f4b56a8c0e5eb48cecbfadfa9219 --- /dev/null +++ b/speechx/examples/text_lm/.gitignore @@ -0,0 +1 @@ +data diff --git a/speechx/examples/text_lm/path.sh b/speechx/examples/text_lm/path.sh new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speechx/examples/text_lm/run.sh b/speechx/examples/text_lm/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/speechx/examples/wfst/README.md b/speechx/examples/wfst/README.md new file mode 100644 index 0000000000000000000000000000000000000000..dd9b926f4e82210ae27bf1955dc50b1b1526ba55 --- /dev/null +++ b/speechx/examples/wfst/README.md @@ -0,0 +1,18 @@ +``` +fstaddselfloops 'echo 4234 |' 'echo 123660 |' +Lexicon and Token FSTs compiling succeeded +arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true - +LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section. +LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section. +LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section. +LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section. +Checking how stochastic G is (the first of these numbers should be small): +fstisstochastic data/lang_test/G.fst +0 -1.14386 +fsttablecompose data/lang_test/L.fst data/lang_test/G.fst +fstminimizeencoded +fstdeterminizestar --use-log=true +fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst +Composing decoding graph TLG.fst succeeded +Aishell build TLG done. +``` \ No newline at end of file diff --git a/speechx/examples/build_wfst/path.sh b/speechx/examples/wfst/path.sh similarity index 68% rename from speechx/examples/build_wfst/path.sh rename to speechx/examples/wfst/path.sh index e4008cd2ce9514dbdfdfb84d601249fb32d3af51..877f23991c7c0f43b3b32f05a57c1ed7da0d9c59 100644 --- a/speechx/examples/build_wfst/path.sh +++ b/speechx/examples/wfst/path.sh @@ -1,18 +1,10 @@ # This contains the locations of binarys build required for running the examples. -SPEECHX_ROOT=$PWD/../../../ -MAIN_ROOT=$SPEECHX_ROOT/../ -SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples - -SPEECHX_TOOLS=$SPEECHX_ROOT/tools -TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin - -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } +MAIN_ROOT=`realpath $PWD/../../../../` +SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx` export LC_AL=C -export PATH=$PATH:$TOOLS_BIN - # srilm export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs diff --git a/speechx/examples/build_wfst/run.sh b/speechx/examples/wfst/run.sh similarity index 93% rename from speechx/examples/build_wfst/run.sh rename to speechx/examples/wfst/run.sh index bba14c59457fbc610cd288ca7bf72e8dbab9f835..b53e1a5b78a389bcb0efac4bd8e3a38931d83a38 100644 --- a/speechx/examples/build_wfst/run.sh +++ b/speechx/examples/wfst/run.sh @@ -13,12 +13,6 @@ text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt source parse_options.sh -if [ ! which ngram-count ]; then - pushd $MAIN_ROOT/tools - make srilm.done - popd -fi - if [ ! which fstprint ]; then pushd $MAIN_ROOT/tools make kaldi.done