Unverified commit 7220b11b authored by Hui Zhang, committed by GitHub

Merge pull request #1715 from zh794390558/spx_egs

[speechx] refactor egs and more egs for TLG wfst graph build
...@@ -33,6 +33,12 @@ tools/Miniconda3-latest-Linux-x86_64.sh
tools/activate_python.sh
tools/miniconda.sh
tools/CRF++-0.58/
tools/liblbfgs-1.10/
tools/srilm/
tools/env.sh
tools/openfst-1.8.1/
tools/libsndfile/
tools/python-soundfile/
speechx/fc_patch/
......
...@@ -27,7 +27,7 @@ arpa=$3
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    # text tn & wordseg preprocess
    echo "process text."
    python3 ${MAIN_ROOT}/utils/zh_tn.py --token_type ${type} ${text} ${text}.${type}.tn
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
......
...@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3"
TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
if [ -e $TARGET ];then
echo "already have lm"
exit 0;
fi
echo "Download language model ..." echo "Download language model ..."
download $URL $MD5 $TARGET download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
......
...@@ -65,6 +65,7 @@ base = [
"webrtcvad",
"yacs~=0.1.8",
"prettytable",
"zhon",
]
server = [
...@@ -91,7 +92,6 @@ requirements = {
"unidecode",
"yq",
"pre-commit",
"zhon",
]
}
......
# Examples for SpeechX

* ds2_ol - ds2 streaming test under the `aishell-1` test dataset.
  The entrypoint is `ds2_ol/aishell/run.sh`.

## How to run

`run.sh` is the entry point.

...@@ -17,9 +15,23 @@ pushd ds2_ol/aishell
bash run.sh
```

## Display Model with [Netron](https://github.com/lutzroeder/netron)

```
pip install netron
netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20
```

## For Developer

> Warning: developer-only examples; make sure you know what they do before running them.

* dev - for speechx developers, used for testing.

## Build WFST

> Warning: use the examples below only if you know what they do.

* text_lm - preprocess text for building the LM.
* ngram - build an n-gram ARPA LM.
* wfst - build the TLG WFST decoding graph.
...@@ -10,12 +10,18 @@ Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
## CTC Prefix Beam Search w LM
LM: zh_giga.no_cna_cmn.prune01244.klm
```
Overall -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
Mandarin -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
```
## CTC WFST
LM: aishell train
```
Overall -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1819
Mandarin -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1818
Other -> 0.00 % N=0 C=0 S=0 D=0 I=1
```
...@@ -11,4 +11,4 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export LC_ALL=C
SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
\ No newline at end of file
...@@ -5,7 +5,10 @@ set -e
. path.sh
nj=40
stage=0
stop_stage=100
. utils/parse_options.sh
# 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then
...@@ -26,102 +29,112 @@ vocb_dir=$ckpt_dir/data/lang_char/
mkdir -p exp
exp=$PWD/exp

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    aishell_wav_scp=aishell_test.scp
    if [ ! -d $data/test ]; then
        pushd $data
        wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
        unzip aishell_test.zip
        popd

        realpath $data/test/*/*.wav > $data/wavlist
        awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
        paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
    fi

    if [ ! -d $ckpt_dir ]; then
        mkdir -p $ckpt_dir
        wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
        tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir
    fi

    lm=$data/zh_giga.no_cna_cmn.prune01244.klm
    if [ ! -f $lm ]; then
        pushd $data
        wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
        popd
    fi
fi

# 3. make feature
label_file=./aishell_result
wer=./aishell_wer

export GLOG_logtostderr=1

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    # 3. gen linear feat
    cmvn=$data/cmvn.ark
    cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn

    ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj

    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \
        linear-spectrogram-wo-db-norm-ol \
        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
        --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
        --cmvn_file=$cmvn \
        --streaming_chunk=0.36
fi

text=$data/test/text

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    # recognizer
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
        ctc-prefix-beam-search-decoder-ol \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
        --dict_file=$vocb_dir/vocab.txt \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result

    cat $data/split${nj}/*/result > $exp/${label_file}
    utils/compute-wer.py --char=1 --v=1 $exp/${label_file} $text > $exp/${wer}
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
    # decode with lm
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
        ctc-prefix-beam-search-decoder-ol \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
        --dict_file=$vocb_dir/vocab.txt \
        --lm_path=$lm \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm

    cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm
    utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_lm $text > $exp/${wer}_lm
fi

wfst=$data/wfst/
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
    pushd $wfst
    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
    unzip aishell_graph.zip
    popd
fi

graph_dir=$wfst/aishell_graph

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
    # TLG decoder
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
        wfst-decoder-ol \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \
        --word_symbol_table=$graph_dir/words.txt \
        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
        --graph_path=$graph_dir/TLG.fst --max_active=7500 \
        --acoustic_scale=1.2 \
        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg

    cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg
    utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_tlg $text > $exp/${wer}_tlg
fi
# N-gram LM training for Mandarin
Quick run:
```
bash run.sh --stage -1
```
## input
input files:
```
data/
├── lexicon.txt
├── text
└── vocab.txt
```
```
==> data/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/lexicon.txt <==
SIL sil
<SPOKEN_NOISE> sil
啊 aa a1
啊 aa a2
啊 aa a4
啊 aa a5
啊啊啊 aa a2 aa a2 aa a2
啊啊啊 aa a5 aa a5 aa a5
坐地 z uo4 d i4
坐实 z uo4 sh ix2
坐视 z uo4 sh ix4
坐稳 z uo4 uu un3
坐拥 z uo4 ii iong1
坐诊 z uo4 zh en3
坐庄 z uo4 zh uang1
坐姿 z uo4 z iy1
==> data/vocab.txt <==
<blank>
<unk>
A
B
C
D
E
<eos>
```
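
The `data/` paths above are the defaults; since `run.sh` sources `utils/parse_options.sh`, they can be overridden on the command line. A minimal sketch, assuming your own corpus follows the same line formats (all paths below are placeholders):

```
# train a 3-gram ARPA LM from custom inputs; skip stage -1 (the demo-data download)
bash run.sh --stage 0 --stop_stage 1 \
    --unit /path/to/vocab.txt \
    --lexicon /path/to/lexicon.txt \
    --text /path/to/text
```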
## output
```
data/
├── local
│ ├── dict
│ │ ├── lexicon.txt
│ │ └── units.txt
│ └── lm
│ ├── heldout
│ ├── lm.arpa
│ ├── text
│ ├── text.no_oov
│ ├── train
│ ├── unigram.counts
│ ├── word.counts
│ └── wordlist
```
```
/workspace/srilm/bin/i686-m64/ngram-count
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
Ignoring words 矽, which contains oov unit
Ignoring words 傩, which contains oov unit
Ignoring words 堀, which contains oov unit
Ignoring words 莼, which contains oov unit
Ignoring words 菰, which contains oov unit
Ignoring words 摭, which contains oov unit
Ignoring words 帙, which contains oov unit
Ignoring words 迨, which contains oov unit
Ignoring words 孥, which contains oov unit
Ignoring words 瑗, which contains oov unit
...
...
...
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
build LM done.
```
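
Beyond the summary perplexity line above, the trained ARPA model can be sanity-checked with SRILM's `ngram` tool; a small sketch, assuming `tools/srilm` is on `PATH` via `path.sh`:

```
# per-sentence log-probabilities and perplexity on the held-out set
ngram -order 3 -lm data/local/lm/lm.arpa -ppl data/local/lm/heldout -debug 1 | head -n 20
```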
#!/bin/bash
# To be run from one directory above this script.
. ./path.sh
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt
for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# Check SRILM tools
if ! which ngram-count > /dev/null; then
echo "srilm tools are not found, please download it and install it from: "
echo "http://www.speech.sri.com/projects/srilm/download.html"
echo "Then add the tools to your PATH"
exit 1
fi
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/lm/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir
cleantext=$dir/text.no_oov
# oov to <SPOKEN_NOISE>
# lexicon line: word char0 ... charn
# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
# compute word counts, sort in descending order
# line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
# word with <s> </s>
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
# hold out to compute ppl
heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $dir/train
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
\ No newline at end of file
#!/usr/bin/env python3
import argparse
from collections import Counter
def main(args):
counter = Counter()
with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout:
for line in fin:
line = line.strip()
if args.has_key:
utt, text = line.split(maxsplit=1)
words = text.split()
else:
words = line.split()
counter.update(words)
for word in counter:
val = " ".join(list(word))
fout.write(f"{word}\t{val}\n")
fout.flush()
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='text(line:utt1 中国 人) to lexicon(line:中国 中 国).')
parser.add_argument(
'--has_key',
default=True,
help='whether each line of the text file starts with an utt key')
parser.add_argument(
'--text',
required=True,
help='text path. line: utt1 中国 人 or 中国 人')
parser.add_argument(
'--lexicon',
required=True,
help='lexicon path. line:中国 中 国')
args = parser.parse_args()
print(args)
main(args)
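
A usage sketch for the helper above, matching how `run.sh` in this example invokes it (paths are the example's defaults):

```
# derive a char-level lexicon from the word-segmented transcript
python3 local/text_to_lexicon.py --has_key true --text data/text --lexicon data/lexicon.txt
```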
# This contains the locations of the binaries built and required for running the examples.
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
#!/bin/bash
set -eo pipefail
. path.sh
stage=-1
stop_stage=100
corpus=aishell
unit=data/vocab.txt # vocab file, line: char/spm_piece
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
. utils/parse_options.sh
data=$PWD/data
mkdir -p $data
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
if [ ! -f $data/speech.ngram.zh.tar.gz ];then
pushd $data
wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
tar xvzf speech.ngram.zh.tar.gz
popd
fi
fi
if [ ! -f $unit ]; then
echo "$0: No such file $unit"
exit 1;
fi
if ! which ngram-count; then
pushd $MAIN_ROOT/tools
make srilm.done
popd
fi
mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# 7.1 Prepare dict
# line: char/spm_pieces
cp $unit data/local/dict/units.txt
if [ ! -f $lexicon ];then
local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
echo "Generate $lexicon from $text"
fi
# filter by vocab
# line: word ph0 ... phn -> line: word char0 ... charn
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
fi
lm=data/local/lm
mkdir -p $lm
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 7.2 Train lm
cp $text $lm/text
local/aishell_train_lms.sh
fi
echo "build LM done."
exit 0
../../../../utils/
\ No newline at end of file
# Text preprocessing for building the n-gram LM

The output `text` file looks like this:
```
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
```
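
The `run.sh` below produces this file by running text normalization and word segmentation over `data/text`; a hedged invocation sketch (the `--has_key`/`--token_type` values shown are already the defaults):

```
# normalize and segment the raw transcript into data/text.tn
bash run.sh --stage 0 --stop_stage 0 --has_key true --token_type word
```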
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
#!/bin/bash
set -eo pipefail
. path.sh
stage=0
stop_stage=100
has_key=true
token_type=word
. utils/parse_options.sh || exit -1;
text=data/text
if [ ! -f $text ]; then
echo "$0: Not find $1";
exit -1;
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
echo "text tn & wordseg preprocess"
rm -rf ${text}.tn
python3 utils/zh_tn.py --has_key $has_key --token_type $token_type ${text} ${text}.tn
fi
\ No newline at end of file
../../../utils/
\ No newline at end of file
# Build TLG WFST
## Input
```
data/local/
├── dict
│ ├── lexicon.txt
│ └── units.txt
└── lm
├── heldout
├── lm.arpa
├── text
├── text.no_oov
├── train
├── unigram.counts
├── word.counts
└── wordlist
```
```
==> data/local/dict/lexicon.txt <==
啊 啊
啊啊啊 啊 啊 啊
阿 阿
阿尔 阿 尔
阿根廷 阿 根 廷
阿九 阿 九
阿克 阿 克
阿拉伯数字 阿 拉 伯 数 字
阿拉法特 阿 拉 法 特
阿拉木图 阿 拉 木 图
==> data/local/dict/units.txt <==
<blank>
<unk>
A
B
C
D
E
F
G
H
==> data/local/lm/heldout <==
而 对 楼市 成交 抑制 作用 最 大 的 限 购
也 成为 地方 政府 的 眼中 钉
自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
各地 政府 便 纷纷 跟进
仅 一 个 多 月 的 时间 里
除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
四十六 个 限 购 城市 当中
四十一 个 已 正式 取消 或 变相 放松 了 限 购
财政 金融 政策 紧随 其后 而来
显示 出 了 极 强 的 威力
==> data/local/lm/lm.arpa <==
\data\
ngram 1=129356
ngram 2=504661
ngram 3=123455
\1-grams:
-1.531278 </s>
-3.828829 <SPOKEN_NOISE> -0.1600094
-6.157292 <UNK>
==> data/local/lm/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/local/lm/text.no_oov <==
<SPOKEN_NOISE> 而 对 楼市 成交 抑制 作用 最 大 的 限 购
<SPOKEN_NOISE> 也 成为 地方 政府 的 眼中 钉
<SPOKEN_NOISE> 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
<SPOKEN_NOISE> 各地 政府 便 纷纷 跟进
<SPOKEN_NOISE> 仅 一 个 多 月 的 时间 里
<SPOKEN_NOISE> 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
<SPOKEN_NOISE> 四十六 个 限 购 城市 当中
<SPOKEN_NOISE> 四十一 个 已 正式 取消 或 变相 放松 了 限 购
<SPOKEN_NOISE> 财政 金融 政策 紧随 其后 而来
<SPOKEN_NOISE> 显示 出 了 极 强 的 威力
==> data/local/lm/train <==
汉莎 不 得 不 通过 这样 的 方式 寻求 新 的 发展 点
并 计划 朝云 计算 方面 发展
汉莎 的 基础 设施 部门 拥有 一千四百 名 员工
媒体 就 曾 披露 这笔 交易
虽然 双方 已经 正式 签署 了 外包 协议
但是 这笔 交易 还 需要 得到 反 垄断 部门 的 批准
陈 黎明 一九八九 年 获得 美国 康乃尔 大学 硕士 学位
并 于 二零零三 年 顺利 完成 美国 哈佛 商学 院 高级 管理 课程
曾 在 多家 国际 公司 任职
拥有 业务 开发 商务 及 企业 治理
==> data/local/lm/unigram.counts <==
57487 的
13099 在
11862 一
11397 了
10998 不
9913 是
7952 有
6250 和
6152 个
5422 将
==> data/local/lm/word.counts <==
57486 的
13098 在
11861 一
11396 了
10997 不
9912 是
7951 有
6249 和
6151 个
5421 将
==> data/local/lm/wordlist <==
```
## Output
```
fstaddselfloops 'echo 4234 |' 'echo 123660 |'
Lexicon and Token FSTs compiling succeeded
arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
Checking how stochastic G is (the first of these numbers should be small):
fstisstochastic data/lang_test/G.fst
0 -1.14386
fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
fstminimizeencoded
fstdeterminizestar --use-log=true
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
Composing decoding graph TLG.fst succeeded
Aishell build TLG done.
```
```
data/
├── lang_test
│ ├── G.fst
│ ├── L.fst
│ ├── LG.fst
│ ├── T.fst
│ ├── TLG.fst
│ ├── tokens.txt
│ ├── units.txt
│ └── words.txt
└── local
├── lang
│ ├── L.fst
│ ├── T.fst
│ ├── tokens.txt
│ ├── units.txt
│ └── words.txt
└── tmp
├── disambig.list
├── lexiconp_disambig.txt
├── lexiconp.txt
└── units.list
```
\ No newline at end of file
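
Once `data/lang_test/TLG.fst` and `words.txt` exist, they are consumed by the `wfst-decoder-ol` binary; a minimal sketch mirroring stage 4 of `ds2_ol/aishell/run.sh` earlier in this diff (the feature and model paths are placeholders):

```
wfst-decoder-ol \
    --feature_rspecifier=scp:feat.scp \
    --model_path=avg_1.jit.pdmodel \
    --param_path=avg_1.jit.pdiparams \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
    --word_symbol_table=data/lang_test/words.txt \
    --graph_path=data/lang_test/TLG.fst --max_active=7500 \
    --acoustic_scale=1.2 \
    --result_wspecifier=ark,t:result_tlg
```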
# This contains the locations of the binaries built and required for running the examples.
MAIN_ROOT=`realpath $PWD/../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
#!/bin/bash
set -eo pipefail
. path.sh
stage=-1
stop_stage=100
. utils/parse_options.sh
if ! which fstprint ; then
pushd $MAIN_ROOT/tools
make kaldi.done
popd
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# build T & L
# utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>
utils/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
# build G & LG & TLG
# utils/fst/make_tlg.sh <lm_dir> <src_lang> <tgt_lang>
utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi
echo "build TLG done."
exit 0
../../../utils/
\ No newline at end of file
#!/usr/bin/env bash
current_path=`pwd`
current_dir=`basename "$current_path"`
if [ "tools" != "$current_dir" ]; then
echo "You should run this script in tools/ directory!!"
exit 1
fi
if [ ! -d liblbfgs-1.10 ]; then
echo Installing libLBFGS library to support MaxEnt LMs
bash extras/install_liblbfgs.sh || exit 1
fi
! command -v gawk > /dev/null && \
echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1;
if [ $# -ne 3 ]; then
echo "SRILM download requires some information about you"
echo
echo "Usage: $0 <name> <organization> <email>"
exit 1
fi
srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php"
post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3"
if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then
echo 'There was a problem downloading the file.'
echo 'Check your internet connection and try again.'
exit 1
fi
mkdir -p srilm
cd srilm
if [ -f ../srilm.tgz ]; then
tar -xvzf ../srilm.tgz # Old SRILM format
elif [ -f ../srilm.tar.gz ]; then
tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz
fi
major=`gawk -F. '{ print $1 }' RELEASE`
minor=`gawk -F. '{ print $2 }' RELEASE`
micro=`gawk -F. '{ print $3 }' RELEASE`
if [ $major -le 1 ] && [ $minor -le 7 ] && [ $micro -le 1 ]; then
echo "Detected version 1.7.1 or earlier. Applying patch."
patch -p0 < ../extras/srilm.patch
fi
# set the SRILM variable in the top-level Makefile to this directory.
cp Makefile tmpf
cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \
> Makefile || exit 1
rm tmpf
mtype=`sbin/machine-type`
echo HAVE_LIBLBFGS=1 >> common/Makefile.machine.$mtype
grep ADDITIONAL_INCLUDES common/Makefile.machine.$mtype | \
sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
>> common/Makefile.machine.$mtype
grep ADDITIONAL_LDFLAGS common/Makefile.machine.$mtype | \
sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
>> common/Makefile.machine.$mtype
make || exit
cd ..
(
[ ! -z "${SRILM}" ] && \
echo >&2 "SRILM variable is aleady defined. Undefining..." && \
unset SRILM
[ -f ./env.sh ] && . ./env.sh
[ ! -z "${SRILM}" ] && \
echo >&2 "SRILM config is already in env.sh" && exit
wd=`pwd`
wd=`readlink -f $wd || pwd`
echo "export SRILM=$wd/srilm"
dirs="\${PATH}"
for directory in $(cd srilm && find bin -type d ) ; do
dirs="$dirs:\${SRILM}/$directory"
done
echo "export PATH=$dirs"
) >> env.sh
echo >&2 "Installation of SRILM finished successfully"
echo >&2 "Please source the tools/env.sh in your path.sh to enable it"
...@@ -25,7 +25,7 @@ clean:
apt.done:
apt update -y
apt install -y bc flac jq vim tig tree sox pkg-config libsndfile1 libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
echo "check_certificate = off" >> ~/.wgetrc
touch apt.done
...@@ -50,7 +50,7 @@ openblas.done:
bash extras/install_openblas.sh
touch openblas.done
kaldi.done: apt.done openblas.done
bash extras/install_kaldi.sh
touch kaldi.done
...@@ -58,6 +58,11 @@ sctk.done:
./extras/install_sclite.sh
touch sctk.done
srilm.done:
./extras/install_liblbfgs.sh
extras/install_srilm.sh
touch srilm.done
######################
dev: python conda_packages.done sctk.done
...@@ -96,4 +101,4 @@ conda_packages.done: bc.done cmake.done flac.done ffmpeg.done sox.done sndfile.d
else
conda_packages.done:
endif
touch conda_packages.done
\ No newline at end of file
...@@ -7,8 +7,9 @@ set -x
# openfst
openfst=openfst-1.8.1
shared=true
WGET="wget -c --no-check-certificate"
test -e ${openfst}.tar.gz || $WGET http://www.openfst.org/twiki/pub/FST/FstDownload/${openfst}.tar.gz
test -d ${openfst} || tar -xvf ${openfst}.tar.gz && chown -R root:root ${openfst}
......
This diff is collapsed.
File mode changed from 100644 to 100755
...@@ -3,7 +3,8 @@ import argparse
def main(args):
    # load vocab file
    # line: token
    unit_table = set()
    with open(args.unit_file, 'r') as fin:
        for line in fin:
...@@ -11,27 +12,41 @@ def main(args):
            unit_table.add(unit)

    def contain_oov(units):
        """Check whether the token contains any unit outside the vocab.

        Args:
            units (str): token

        Returns:
            bool: True if any unit is not in the vocab (OOV), else False.
        """
        for unit in units:
            if unit not in unit_table:
                return True
        return False

    # load spm model, for English
    bpemode = args.bpemodel
    if bpemode:
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpemodel)

    # used to filter polyphones and invalid words
    lexicon_table = set()
    in_n = 0  # in-lexicon word count
    out_n = 0  # out-lexicon word count
    with open(args.in_lexicon, 'r') as fin, \
            open(args.out_lexicon, 'w') as fout:
        for line in fin:
            word = line.split()[0]
            in_n += 1
            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
                # filter 'SIL' for mandarin, keep it in English
                continue
            elif word == '<SPOKEN_NOISE>':
                # filter <SPOKEN_NOISE>
                continue
            else:
                # each word only has one pronunciation for e2e system
...@@ -39,12 +54,14 @@ def main(args):
                    continue

                if bpemode:
                    # for English
                    pieces = sp.EncodeAsPieces(word)
                    if contain_oov(pieces):
                        print('Ignoring words {}, which contains oov unit'.
                              format(''.join(word).strip('▁')))
                        continue

                    # `pieces` contains no OOV units here; such words were filtered out by `contain_oov(pieces)` above
                    chars = ' '.join(
                        [p if p in unit_table else '<unk>' for p in pieces])
                else:
...@@ -58,11 +75,14 @@ def main(args):
                    # we assume the model unit of our e2e system is char now.
                    if word.encode('utf8').isalpha() and '▁' in unit_table:
                        word = '▁' + word
                    chars = ' '.join(word)  # word is a char list

                fout.write('{} {}\n'.format(word, chars))
                lexicon_table.add(word)
                out_n += 1

    print(f"Filter lexicon by unit table: filtered out {in_n - out_n}, kept {out_n}/{in_n}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
...@@ -26,23 +26,39 @@ def main(args):
    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
        for line_json in manifest_jsons:
            # utt: str
            # utt2spk: str
            # input: [{name:str, shape:[dur_in_sec, feat_dim], feat:str, filetype:str}, ]
            # output: [{name:str, shape:[tokenlen, vocab_dim], text:str, token:str, tokenid:str}, ]
            utt = line_json['utt']
            utt2spk = line_json['utt2spk']

            # input
            assert (len(line_json['input']) == 1), "only support one input now"
            input_json = line_json['input'][0]
            feat = input_json['feat']
            feat_shape = input_json['shape']
            file_type = input_json['filetype']
            file_ext = Path(feat).suffix  # .wav

            dur = feat_shape[0]
            feat_dim = feat_shape[1]

            if file_ext == '.wav':
                fwav.write(f"{utt} {feat}\n")
                fdur.write(f"{utt} {dur}\n")

            # output
            assert (
                len(line_json['output']) == 1), "only support one output now"
            output_json = line_json['output'][0]
            text = output_json['text']
            if 'token' in output_json:
                tokens = output_json['token']
                tokenids = output_json['tokenid']
                token_shape = output_json['shape']
                token_len = token_shape[0]
                vocab_dim = token_shape[1]

            ftxt.write(f"{utt} {text}\n")
            count += 1
......
This diff is collapsed.