add aishell wfst eg script

90d6b6f1 · Yang Zhou · 18b3225b · 90d6b6f1 · 90d6b6f1 · 90d6b6f1
4 changed files
--- a/speechx/examples/aishell/run.sh
+++ b/speechx/examples/aishell/run.sh
@@ -48,7 +48,7 @@ wer=./aishell_wer
 nj=40
 export GLOG_logtostderr=1
-./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+#./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
 data=$PWD/data
 # 3. gen linear feat
@@ -72,10 +72,42 @@ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log \
    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
    --dict_file=$lm_model_dir/vocab.txt \
-    --lm_path=$lm_model_dir/avg_1.jit.klm \
    --result_wspecifier=ark,t:$data/split${nj}/JOB/result
-cat $data/split${nj}/*/result > $label_file
+cat $data/split${nj}/*/result > ${label_file}
+local/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer}
+# 4. decode with lm
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_lm \
+  offline_decoder_sliding_chunk_main \
+    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
+    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
+    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
+    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+    --dict_file=$lm_model_dir/vocab.txt \
+    --lm_path=$lm_model_dir/avg_1.jit.klm \
+    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
+cat $data/split${nj}/*/result_lm > ${label_file}_lm
+local/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm
+graph_dir=./aishell_graph
+# Fetch the prebuilt TLG graph only when $graph_dir is missing.
+# NOTE(review): assumes the archive unpacks into ./aishell_graph — confirm.
+if [ ! -d "$graph_dir" ]; then
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
+    unzip aishell_graph.zip
+fi
+# 5. test TLG decoder
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/log_tlg \
+  offline_wfst_decoder_main \
+    --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
+    --model_path=$aishell_online_model/avg_1.jit.pdmodel \
+    --param_path=$aishell_online_model/avg_1.jit.pdiparams \
+    --word_symbol_table=$graph_dir/words.txt \
+    --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+     --graph_path=$graph_dir/TLG.fst --max_active=7500 \
+    --acoustic_scale=1.2 \
+    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
-local/compute-wer.py --char=1 --v=1 $label_file $text > $wer
+cat $data/split${nj}/*/result_tlg > ${label_file}_tlg
-tail $wer
+local/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg
\ No newline at end of file
--- a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
+++ b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
@@ -27,7 +27,7 @@ DEFINE_string(result_wspecifier, "", "test result wspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
-DEFINE_string(lm_path, "lm.klm", "language model");
+DEFINE_string(lm_path, "", "language model");
 DEFINE_int32(receptive_field_length,
             7,
             "receptive field of two CNN(kernel=5) downsampling module.");
@@ -45,7 +45,6 @@ using kaldi::BaseFloat;
 using kaldi::Matrix;
 using std::vector;
 // test ds2 online decoder by feeding speech feature
 int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, false);
@@ -63,7 +62,6 @@ int main(int argc, char* argv[]) {
    LOG(INFO) << "dict path: " << dict_file;
    LOG(INFO) << "lm path: " << lm_path;
    int32 num_done = 0, num_err = 0;
    ppspeech::CTCBeamSearchOptions opts;
@@ -139,6 +137,10 @@ int main(int argc, char* argv[]) {
        std::string result;
        result = decoder.GetFinalBestPath();
        KALDI_LOG << " the result of " << utt << " is " << result;
+        if (result.empty()) {
+            // The TokenWriter cannot write an empty string, so emit a
+            // single space as a placeholder for an empty result.
+            result = " ";
+        }
        result_writer.Write(utt, result);
        decodable->Reset();
        decoder.Reset();

--- a/speechx/examples/decoder/offline_wfst_decoder_main.cc
+++ b/speechx/examples/decoder/offline_wfst_decoder_main.cc
@@ -22,10 +22,11 @@
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
-DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
+DEFINE_string(result_wspecifier, "", "test result wspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
-DEFINE_string(word_symbol_table, "vocab.txt", "word symbol table");
+DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
 DEFINE_string(graph_path, "TLG", "decoder graph");
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
 DEFINE_int32(max_active, 7500, "decoder graph");
@@ -35,22 +36,33 @@ DEFINE_int32(receptive_field_length,
 DEFINE_int32(downsampling_rate,
             4,
             "two CNN(kernel=5) module downsampling rate.");
+DEFINE_string(model_output_names,
+              "save_infer_model/scale_0.tmp_1,save_infer_model/"
+              "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
+              "scale_3.tmp_1",
+              "model output names");
+DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
 using kaldi::BaseFloat;
 using kaldi::Matrix;
 using std::vector;
-// test clg decoder by feeding speech feature.
+// test TLG decoder by feeding speech feature.
 int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, false);
    google::InitGoogleLogging(argv[0]);
    kaldi::SequentialBaseFloatMatrixReader feature_reader(
-        FLAGS_feature_respecifier);
+        FLAGS_feature_rspecifier);
+    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
    std::string model_graph = FLAGS_model_path;
    std::string model_params = FLAGS_param_path;
    std::string word_symbol_table = FLAGS_word_symbol_table;
    std::string graph_path = FLAGS_graph_path;
+    LOG(INFO) << "model path: " << model_graph;
+    LOG(INFO) << "model param: " << model_params;
+    LOG(INFO) << "word symbol path: " << word_symbol_table;
+    LOG(INFO) << "graph path: " << graph_path;
    int32 num_done = 0, num_err = 0;
@@ -65,7 +77,8 @@ int main(int argc, char* argv[]) {
    ppspeech::ModelOptions model_opts;
    model_opts.model_path = model_graph;
    model_opts.params_path = model_params;
-    model_opts.cache_shape = "5-1-1024,5-1-1024";
+    model_opts.cache_shape = FLAGS_model_cache_names;
+    model_opts.output_names = FLAGS_model_output_names;
    std::shared_ptr<ppspeech::PaddleNnet> nnet(
        new ppspeech::PaddleNnet(model_opts));
    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
@@ -127,6 +140,11 @@ int main(int argc, char* argv[]) {
        std::string result;
        result = decoder.GetFinalBestPath();
        KALDI_LOG << " the result of " << utt << " is " << result;
+        if (result.empty()) {
+            // The TokenWriter cannot write an empty string, so emit a
+            // single space as a placeholder for an empty result.
+            result = " ";
+        }
+        result_writer.Write(utt, result);
        decodable->Reset();
        decoder.Reset();
        ++num_done;

--- a/speechx/speechx/nnet/paddle_nnet.cc
+++ b/speechx/speechx/nnet/paddle_nnet.cc
@@ -94,7 +94,6 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) {
 void PaddleNnet::Reset() { InitCacheEncouts(opts_); }
 paddle_infer::Predictor* PaddleNnet::GetPredictor() {
-    LOG(INFO) << "attempt to get a new predictor instance " << std::endl;
    paddle_infer::Predictor* predictor = nullptr;
    std::lock_guard<std::mutex> guard(pool_mutex);
    int pred_id = 0;
@@ -110,7 +109,6 @@ paddle_infer::Predictor* PaddleNnet::GetPredictor() {
    if (predictor) {
        pool_usages[pred_id] = true;
        predictor_to_thread_id[predictor] = pred_id;
-        LOG(INFO) << pred_id << " predictor create success";
    } else {
        LOG(INFO) << "Failed to get predictor from pool !!!";
    }
@@ -119,7 +117,6 @@ paddle_infer::Predictor* PaddleNnet::GetPredictor() {
 }
 int PaddleNnet::ReleasePredictor(paddle_infer::Predictor* predictor) {
-    LOG(INFO) << "attempt to releae a predictor";
    std::lock_guard<std::mutex> guard(pool_mutex);
    auto iter = predictor_to_thread_id.find(predictor);
@@ -128,10 +125,8 @@ int PaddleNnet::ReleasePredictor(paddle_infer::Predictor* predictor) {
        return 0;
    }
-    LOG(INFO) << iter->second << " predictor will be release";
    pool_usages[iter->second] = false;
    predictor_to_thread_id.erase(predictor);
-    LOG(INFO) << "release success";
    return 0;
 }
@@ -152,7 +147,6 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
    int feat_row = features.Dim() / feature_dim;
    std::vector<std::string> input_names = predictor->GetInputNames();
    std::vector<std::string> output_names = predictor->GetOutputNames();
-    LOG(INFO) << "feat info: rows, cols: " << feat_row << ", " << feature_dim;
    std::unique_ptr<paddle_infer::Tensor> input_tensor =
        predictor->GetInputHandle(input_names[0]);