Commit 537aff97 authored by Yang Zhou

refactor example dir & add aishell build TLG

Parent fef3e5f1
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
add_subdirectory(glog)
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
add_executable(glog_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_test.cc)
target_link_libraries(glog_test glog)
add_executable(glog_logtostderr_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_logtostderr_test.cc)
target_link_libraries(glog_logtostderr_test glog)
# [GLOG](https://rpg.ifi.uzh.ch/docs/glog.html)
Unless otherwise specified, glog writes to log files named `/tmp/<program name>.<hostname>.<user name>.log.<severity level>.<date>.<time>.<pid>` (e.g., "/tmp/hello_world.example.com.hamaji.log.INFO.20080709-222411.10474"). By default, glog copies log messages with severity level ERROR or FATAL to standard error (stderr) in addition to the log files.
Several flags influence glog's output behavior. If the Google gflags library is installed on your machine, the configure script (see the INSTALL file in the package for details) will automatically detect and use it, allowing you to pass flags on the command line. For example, to turn on the `--logtostderr` flag, start your application with the following command line:
`./your_application --logtostderr=1`
If the Google gflags library isn't installed, you can set flags via environment variables, prefixing the flag name with "GLOG_", e.g.
`GLOG_logtostderr=1 ./your_application`
You can also modify flag values inside your program by updating the global variables `FLAGS_*`. Most settings take effect immediately after you update a `FLAGS_*` variable. The exceptions are the flags related to destination files; for example, you might want to set `FLAGS_log_dir` before calling `google::InitGoogleLogging`. Here is an example:
```c++
LOG(INFO) << "file";
// Most flags work immediately after updating values.
FLAGS_logtostderr = 1;
LOG(INFO) << "stderr";
FLAGS_logtostderr = 0;
// This won't change the log destination. If you want to set this
// value, you should do this before google::InitGoogleLogging .
FLAGS_log_dir = "/some/log/directory";
LOG(INFO) << "the same file";
```
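For example, the two test programs built in this directory can be driven from the shell without recompiling (a sketch; passing `--logtostderr=1` on the command line only works when glog is built with gflags, as described above):

```bash
# with gflags: pass flags on the command line
./glog_test --logtostderr=1

# without gflags: prefix flag names with GLOG_ in the environment
GLOG_logtostderr=1 ./glog_test

# destination flags such as log_dir must be set before startup
mkdir -p /tmp/glog_demo
GLOG_log_dir=/tmp/glog_demo ./glog_test
```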
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
int main(int argc, char* argv[]) {
// Initialize Google’s logging library.
google::InitGoogleLogging(argv[0]);
FLAGS_logtostderr = 1;
LOG(INFO) << "Found " << 10 << " cookies";
LOG(ERROR) << "Found " << 10 << " errors";
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
int main(int argc, char* argv[]) {
// Initialize Google’s logging library.
google::InitGoogleLogging(argv[0]);
LOG(INFO) << "Found " << 10 << " cookies";
LOG(ERROR) << "Found " << 10 << " errors";
}
# This file contains the locations of the binaries required for running the examples.
SPEECHX_ROOT=$PWD/../../../
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. Please ensure the project was built successfully."; }
SPEECHX_BIN=$SPEECHX_EXAMPLES/dev/glog
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
export LC_ALL=C
#!/bin/bash
set +x
set -e
. ./path.sh
# 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then
pushd ${SPEECHX_ROOT}
bash build.sh
popd
fi
# 2. run
glog_test
echo "------"
export GLOG_logtostderr=1
glog_test
echo "------"
glog_logtostderr_test
@@ -42,3 +42,40 @@ Overall -> 10.93 % N=104765 C=93410 S=9780 D=1575 I=95
Mandarin -> 10.93 % N=104762 C=93410 S=9779 D=1573 I=95
Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
```
## fbank
```
bash run_fbank.sh
```
### CTC Prefix Beam Search w/o LM
```
Overall -> 10.44 % N=104765 C=94194 S=10174 D=397 I=369
Mandarin -> 10.44 % N=104762 C=94194 S=10171 D=397 I=369
Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
```
### CTC Prefix Beam Search w/ LM
LM: zh_giga.no_cna_cmn.prune01244.klm
```
Overall -> 5.82 % N=104765 C=99386 S=4944 D=435 I=720
Mandarin -> 5.82 % N=104762 C=99386 S=4941 D=435 I=720
English -> 0.00 % N=0 C=0 S=0 D=0 I=0
```
### CTC WFST
LM: [aishell train](https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph2.zip)
```
Overall -> 9.58 % N=104765 C=94817 S=4326 D=5622 I=84
Mandarin -> 9.57 % N=104762 C=94817 S=4325 D=5620 I=84
Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
```
## build TLG graph
```
bash run_build_tlg.sh
```
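Under the hood, `run_build_tlg.sh` compiles the token (T) and lexicon (L) FSTs, converts the ARPA LM into the grammar (G) FST, and composes them into `TLG.fst`. The core OpenFst/Kaldi pipeline it drives looks roughly like this (a sketch; the exact commands live in `utils/fst/compile_lexicon_token_fst.sh` and `utils/fst/make_tlg.sh`):

```bash
# G: convert the ARPA LM into an FST
arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true \
    data/local/lm/lm.arpa data/lang_test/G.fst

# LG: compose lexicon and grammar, then determinize and minimize
fsttablecompose data/lang_test/L.fst data/lang_test/G.fst \
    | fstdeterminizestar --use-log=true \
    | fstminimizeencoded > data/lang_test/LG.fst

# TLG: compose the token FST on top of LG
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst data/lang_test/TLG.fst
```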
# This file contains the locations of the binaries required for running the examples.
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=$PWD/../../..
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
@@ -10,5 +11,14 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export LC_ALL=C
# openfst bin & kaldi bin
KALDI_DIR=$SPEECHX_ROOT/build/speechx/kaldi/
OPENFST_DIR=$SPEECHX_ROOT/fc_patch/openfst-build/src
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat:$SPEECHX_EXAMPLES/ds2_ol/websocket
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN:${SRILM}/bin:${SRILM}/bin/i686-m64:$KALDI_DIR/lmbin:$KALDI_DIR/fstbin:$OPENFST_DIR/bin
@@ -3,11 +3,15 @@ set -eo pipefail
. path.sh
# Note: replace the vocab as needed; the one below is only for this script.
# Different acoustic models have different vocabs.
ckpt_dir=data/fbank_model
unit=$ckpt_dir/data/lang_char/vocab.txt # vocab file, line: char/spm_piece
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
stage=-1
stop_stage=100
corpus=aishell
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
@@ -23,6 +27,14 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
tar xvzf speech.ngram.zh.tar.gz
popd
fi
if [ ! -f $ckpt_dir/data/mean_std.json ]; then
mkdir -p $ckpt_dir
pushd $ckpt_dir
wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
tar xzfv WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
popd
fi
fi
if [ ! -f $unit ]; then
@@ -38,12 +50,12 @@ fi
mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# Prepare dict
# line: char/spm_pieces
cp $unit data/local/dict/units.txt
if [ ! -f $lexicon ];then
local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
echo "Generate $lexicon from $text"
fi
@@ -59,10 +71,71 @@ lm=data/local/lm
mkdir -p $lm
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# Train lm
cp $text $lm/text
local/aishell_train_lms.sh
echo "build LM done."
fi
# build TLG
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# build T & L
utils/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
# build G & TLG
utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi
aishell_wav_scp=aishell_test.scp
nj=40
cmvn=$data/cmvn_fbank.ark
wfst=$data/lang_test
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
if [ ! -d $data/test ]; then
pushd $data
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
unzip aishell_test.zip
popd
realpath $data/test/*/*.wav > $data/wavlist
awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
fi
wer=aishell_wer
label_file=aishell_result
export GLOG_logtostderr=1
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/check_tlg.log \
recognizer_test_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--cmvn_file=$cmvn \
--model_path=$model_dir/avg_5.jit.pdmodel \
--streaming_chunk=30 \
--use_fbank=true \
--param_path=$model_dir/avg_5.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--model_cache_shapes="5-1-2048,5-1-2048" \
--graph_path=$wfst/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_check_tlg
cat $data/split${nj}/*/result_check_tlg > $exp/${label_file}_check_tlg
utils/compute-wer.py --char=1 --v=1 $text $exp/${label_file}_check_tlg > $exp/${wer}.check_tlg
echo "recognizer test have finished!!!"
echo "please checkout in ${exp}/${wer}.check_tlg"
fi
echo "build LM done."
exit 0
@@ -154,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--model_path=$model_dir/avg_5.jit.pdmodel \
--streaming_chunk=30 \
--use_fbank=true \
--to_float32=false \
--param_path=$model_dir/avg_5.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
# ngram train for Mandarin
Quick run:
```
bash run.sh --stage -1
```
## input
input files:
```
data/
├── lexicon.txt
├── text
└── vocab.txt
```
```
==> data/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/lexicon.txt <==
SIL sil
<SPOKEN_NOISE> sil
啊 aa a1
啊 aa a2
啊 aa a4
啊 aa a5
啊啊啊 aa a2 aa a2 aa a2
啊啊啊 aa a5 aa a5 aa a5
坐地 z uo4 d i4
坐实 z uo4 sh ix2
坐视 z uo4 sh ix4
坐稳 z uo4 uu un3
坐拥 z uo4 ii iong1
坐诊 z uo4 zh en3
坐庄 z uo4 zh uang1
坐姿 z uo4 z iy1
==> data/vocab.txt <==
<blank>
<unk>
A
B
C
D
E
<eos>
```
## output
```
data/
├── local
│ ├── dict
│ │ ├── lexicon.txt
│ │ └── units.txt
│ └── lm
│ ├── heldout
│ ├── lm.arpa
│ ├── text
│ ├── text.no_oov
│ ├── train
│ ├── unigram.counts
│ ├── word.counts
│ └── wordlist
```
```
/workspace/srilm/bin/i686-m64/ngram-count
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
Ignoring words 矽, which contains oov unit
Ignoring words 傩, which contains oov unit
Ignoring words 堀, which contains oov unit
Ignoring words 莼, which contains oov unit
Ignoring words 菰, which contains oov unit
Ignoring words 摭, which contains oov unit
Ignoring words 帙, which contains oov unit
Ignoring words 迨, which contains oov unit
Ignoring words 孥, which contains oov unit
Ignoring words 瑗, which contains oov unit
...
...
...
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
build LM done.
```
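The LM itself is trained with SRILM's `ngram-count`, driven by `local/aishell_train_lms.sh`. A representative invocation (a sketch under typical settings for this recipe; consult the script for the exact flags):

```bash
# 3-gram LM with Kneser-Ney discounting, restricted to the word list
ngram-count -text data/local/lm/train -order 3 \
    -limit-vocab -vocab data/local/lm/wordlist \
    -unk -map-unk "<UNK>" \
    -kndiscount -interpolate \
    -lm data/local/lm/lm.arpa

# perplexity on the held-out set (compare the log above)
ngram -lm data/local/lm/lm.arpa -ppl data/local/lm/heldout
```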
#!/usr/bin/env bash
set -eo pipefail
data=$1
scp=$2
split_name=$3
numsplit=$4
# Split $scp into $numsplit pieces,
# saved under $data/split{numsplit}/{n}/.
if [[ ! $numsplit -gt 0 ]]; then
echo "Invalid num-split argument";
exit 1;
fi
directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done)
scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done)
# if this mkdir fails due to argument-list being too long, iterate.
if ! mkdir -p $directories >&/dev/null; then
for n in `seq $numsplit`; do
mkdir -p $data/split${numsplit}/$n
done
fi
echo "utils/split_scp.pl $scp $scp_splits"
utils/split_scp.pl $scp $scp_splits
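
# Usage sketch (the same call run_build_tlg.sh makes, with nj=40):
#   ./local/split_data.sh data data/aishell_test.scp aishell_test.scp 40
# -> data/split40/1/aishell_test.scp ... data/split40/40/aishell_test.scp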
# This file contains the locations of the binaries required for running the examples.
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
../../../../utils/
# Build TLG WFST
## Input
```
data/local/
├── dict
│ ├── lexicon.txt
│ └── units.txt
└── lm
├── heldout
├── lm.arpa
├── text
├── text.no_oov
├── train
├── unigram.counts
├── word.counts
└── wordlist
```
```
==> data/local/dict/lexicon.txt <==
啊 啊
啊啊啊 啊 啊 啊
阿 阿
阿尔 阿 尔
阿根廷 阿 根 廷
阿九 阿 九
阿克 阿 克
阿拉伯数字 阿 拉 伯 数 字
阿拉法特 阿 拉 法 特
阿拉木图 阿 拉 木 图
==> data/local/dict/units.txt <==
<blank>
<unk>
A
B
C
D
E
F
G
H
==> data/local/lm/heldout <==
而 对 楼市 成交 抑制 作用 最 大 的 限 购
也 成为 地方 政府 的 眼中 钉
自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
各地 政府 便 纷纷 跟进
仅 一 个 多 月 的 时间 里
除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
四十六 个 限 购 城市 当中
四十一 个 已 正式 取消 或 变相 放松 了 限 购
财政 金融 政策 紧随 其后 而来
显示 出 了 极 强 的 威力
==> data/local/lm/lm.arpa <==
\data\
ngram 1=129356
ngram 2=504661
ngram 3=123455
\1-grams:
-1.531278 </s>
-3.828829 <SPOKEN_NOISE> -0.1600094
-6.157292 <UNK>
==> data/local/lm/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/local/lm/text.no_oov <==
<SPOKEN_NOISE> 而 对 楼市 成交 抑制 作用 最 大 的 限 购
<SPOKEN_NOISE> 也 成为 地方 政府 的 眼中 钉
<SPOKEN_NOISE> 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
<SPOKEN_NOISE> 各地 政府 便 纷纷 跟进
<SPOKEN_NOISE> 仅 一 个 多 月 的 时间 里
<SPOKEN_NOISE> 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
<SPOKEN_NOISE> 四十六 个 限 购 城市 当中
<SPOKEN_NOISE> 四十一 个 已 正式 取消 或 变相 放松 了 限 购
<SPOKEN_NOISE> 财政 金融 政策 紧随 其后 而来
<SPOKEN_NOISE> 显示 出 了 极 强 的 威力
==> data/local/lm/train <==
汉莎 不 得 不 通过 这样 的 方式 寻求 新 的 发展 点
并 计划 朝云 计算 方面 发展
汉莎 的 基础 设施 部门 拥有 一千四百 名 员工
媒体 就 曾 披露 这笔 交易
虽然 双方 已经 正式 签署 了 外包 协议
但是 这笔 交易 还 需要 得到 反 垄断 部门 的 批准
陈 黎明 一九八九 年 获得 美国 康乃尔 大学 硕士 学位
并 于 二零零三 年 顺利 完成 美国 哈佛 商学 院 高级 管理 课程
曾 在 多家 国际 公司 任职
拥有 业务 开发 商务 及 企业 治理
==> data/local/lm/unigram.counts <==
57487 的
13099 在
11862 一
11397 了
10998 不
9913 是
7952 有
6250 和
6152 个
5422 将
==> data/local/lm/word.counts <==
57486 的
13098 在
11861 一
11396 了
10997 不
9912 是
7951 有
6249 和
6151 个
5421 将
==> data/local/lm/wordlist <==
```
## Output
```
fstaddselfloops 'echo 4234 |' 'echo 123660 |'
Lexicon and Token FSTs compiling succeeded
arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
Checking how stochastic G is (the first of these numbers should be small):
fstisstochastic data/lang_test/G.fst
0 -1.14386
fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
fstminimizeencoded
fstdeterminizestar --use-log=true
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
Composing decoding graph TLG.fst succeeded
Aishell build TLG done.
```
```
data/
├── lang_test
│ ├── G.fst
│ ├── L.fst
│ ├── LG.fst
│ ├── T.fst
│ ├── TLG.fst
│ ├── tokens.txt
│ ├── units.txt
│ └── words.txt
└── local
├── lang
│ ├── L.fst
│ ├── T.fst
│ ├── tokens.txt
│ ├── units.txt
│ └── words.txt
└── tmp
├── disambig.list
├── lexiconp_disambig.txt
├── lexiconp.txt
└── units.list
```
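To sanity-check the resulting graph, the standard OpenFst tools on the PATH (see `path.sh`) can be pointed at it (a sketch; the symbol tables follow the output tree above):

```bash
# summary statistics: state/arc counts, FST properties
fstinfo data/lang_test/TLG.fst

# print a few arcs with human-readable input/output symbols
fstprint --isymbols=data/lang_test/tokens.txt \
    --osymbols=data/lang_test/words.txt \
    data/lang_test/TLG.fst | head
```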
# This file contains the locations of the binaries required for running the examples.
MAIN_ROOT=`realpath $PWD/../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present; cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
#!/bin/bash
set -eo pipefail
. path.sh
stage=-1
stop_stage=100
. utils/parse_options.sh
if ! which fstprint ; then
pushd $MAIN_ROOT/tools
make kaldi.done
popd
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# build T & L
# utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>
utils/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
# build G & LG & TLG
# utils/fst/make_tlg.sh <lm_dir> <src_lang> <tgt_lang>
utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi
echo "build TLG done."
exit 0
../../../utils/