unify model opts; add attention rescore in decodable; rename ds2 ctc beam search

5c8725e8 · Hui Zhang · 6987751f · 5c8725e8 · 5c8725e8 · 5c8725e8
15 changed files
--- a/speechx/examples/codelab/decoder/run.sh
+++ b/speechx/examples/codelab/decoder/run.sh
@@ -69,7 +69,7 @@ compute_linear_spectrogram_main \
 echo "compute linear spectrogram feature."
 # run ctc beam search decoder as streaming
-ctc_prefix_beam_search_decoder_main \
+ctc_beam_search_decoder_main \
    --result_wspecifier=ark,t:$exp_dir/result.txt \
    --feature_rspecifier=ark:$feat_wspecifier \
    --model_path=$model_dir/avg_1.jit.pdmodel \

--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -84,7 +84,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    #  recognizer
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
-    ctc_prefix_beam_search_decoder_main \
+    ctc_beam_search_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \
@@ -103,7 +103,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    #  decode with lm
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
-    ctc_prefix_beam_search_decoder_main \
+    ctc_beam_search_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
        --model_path=$model_dir/avg_1.jit.pdmodel \
        --param_path=$model_dir/avg_1.jit.pdiparams \

--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -84,7 +84,7 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    #  recognizer
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \
-    ctc_prefix_beam_search_decoder_main \
+    ctc_beam_search_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
        --model_path=$model_dir/avg_5.jit.pdmodel \
        --param_path=$model_dir/avg_5.jit.pdiparams \
@@ -102,7 +102,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    #  decode with lm
    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \
-    ctc_prefix_beam_search_decoder_main \
+    ctc_beam_search_decoder_main \
        --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
        --model_path=$model_dir/avg_5.jit.pdmodel \
        --param_path=$model_dir/avg_5.jit.pdiparams \

--- a/speechx/speechx/decoder/CMakeLists.txt
+++ b/speechx/speechx/decoder/CMakeLists.txt
@@ -12,7 +12,7 @@ add_library(decoder STATIC
 target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder)
 set(BINS 
-  ctc_prefix_beam_search_decoder_main
+  ctc_beam_search_decoder_main
  nnet_logprob_decoder_main
  recognizer_main
  tlg_decoder_main

--- a/speechx/speechx/decoder/ctc_beam_search_decoder.h
+++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+// used by deepspeech2
 #include "base/common.h"
 #include "decoder/ctc_decoders/path_trie.h"
 #include "decoder/ctc_decoders/scorer.h"

--- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-// todo refactor, repalce with gtest
+// used by deepspeech2
 #include "base/flags.h"
 #include "base/log.h"

--- a/speechx/speechx/decoder/ctc_prefix_beam_search.cc
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search.cc
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -67,6 +67,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
    frame_opts.dither = 0.0;
    frame_opts.frame_shift_ms = 10;
    opts.use_fbank = FLAGS_use_fbank;
+    // The conditional must be parenthesized: `<<` binds tighter than `?:`,
+    // so without the parentheses the bool itself is streamed and the chosen
+    // string is discarded.
+    LOG(INFO) << "feature type: " << (opts.use_fbank ? "fbank" : "linear");
    if (opts.use_fbank) {
        opts.to_float32 = false;
        frame_opts.window_type = "povey";

--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@@ -157,4 +157,10 @@ void Decodable::Reset() {
    nnet_out_cache_.Resize(0, 0);
 }
+// Forwards the rescoring request to the wrapped nnet_ with all three
+// arguments passed through unchanged; whatever is written to
+// `rescoring_score` is decided by the nnet implementation.
+void Decodable::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                          float reverse_weight,
+                          std::vector<float>* rescoring_score){
+    nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score);
+}
 }  // namespace ppspeech
\ No newline at end of file
--- a/speechx/speechx/nnet/decodable.h
+++ b/speechx/speechx/nnet/decodable.h
@@ -30,23 +30,31 @@ class Decodable : public kaldi::DecodableInterface {
    // void Init(DecodableOpts config);
-    // nnet logprob output
+    // nnet logprob output, used by wfst
    virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index);
+    // nnet output
+    virtual bool FrameLikelihood(int32 frame,
+                                 std::vector<kaldi::BaseFloat>* likelihood);
+    // forward nnet with feats
+    bool AdvanceChunk();
+    // forward nnet with feats, and get nnet output
+    bool AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
+                      int* vocab_dim);
+    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                          float reverse_weight,
+                          std::vector<float>* rescoring_score);
    virtual bool IsLastFrame(int32 frame);
    // nnet output dim, e.g. vocab size
    virtual int32 NumIndices() const;
-    // nnet prob output
-    virtual bool FrameLikelihood(int32 frame,
-                                 std::vector<kaldi::BaseFloat>* likelihood);
    virtual int32 NumFramesReady() const;
-    // for offline test
-    void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);
    void Reset();
    bool IsInputFinished() const { return frontend_->IsFinished(); }
@@ -57,11 +65,8 @@ class Decodable : public kaldi::DecodableInterface {
    std::shared_ptr<NnetInterface> Nnet() { return nnet_; }
-    // forward nnet with feats
+    // for offline test
-    bool AdvanceChunk();
+    void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood);
-    // forward nnet with feats, and get nnet output
-    bool AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
-                      int* vocab_dim);
  private:
    std::shared_ptr<FrontendInterface> frontend_;

--- a/speechx/speechx/nnet/ds2_nnet.h
+++ b/speechx/speechx/nnet/ds2_nnet.h
@@ -15,56 +15,11 @@
 #include <numeric>
 #include "base/common.h"
 #include "kaldi/matrix/kaldi-matrix.h"
-#include "kaldi/util/options-itf.h"
 #include "nnet/nnet_itf.h"
 #include "paddle_inference_api.h"
 namespace ppspeech {
-struct ModelOptions {
-    std::string model_path;
-    std::string param_path;
-    int thread_num;  // predictor thread pool size
-    bool use_gpu;
-    bool switch_ir_optim;
-    std::string input_names;
-    std::string output_names;
-    std::string cache_names;
-    std::string cache_shape;
-    bool enable_fc_padding;
-    bool enable_profile;
-    ModelOptions()
-        : model_path(""),
-          param_path(""),
-          thread_num(2),
-          use_gpu(false),
-          input_names(""),
-          output_names(""),
-          cache_names(""),
-          cache_shape(""),
-          switch_ir_optim(false),
-          enable_fc_padding(false),
-          enable_profile(false) {}
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("model-path", &model_path, "model file path");
-        opts->Register("model-param", &param_path, "params model file path");
-        opts->Register("thread-num", &thread_num, "thread num");
-        opts->Register("use-gpu", &use_gpu, "if use gpu");
-        opts->Register("input-names", &input_names, "paddle input names");
-        opts->Register("output-names", &output_names, "paddle output names");
-        opts->Register("cache-names", &cache_names, "cache names");
-        opts->Register("cache-shape", &cache_shape, "cache shape");
-        opts->Register("switch-ir-optiom",
-                       &switch_ir_optim,
-                       "paddle SwitchIrOptim option");
-        opts->Register("enable-fc-padding",
-                       &enable_fc_padding,
-                       "paddle EnableFCPadding option");
-        opts->Register(
-            "enable-profile", &enable_profile, "paddle EnableProfile option");
-    }
-};
 template <typename T>
 class Tensor {
@@ -100,6 +55,12 @@ class PaddleNnet : public NnetInterface {
                     const int32& feature_dim,
                     NnetOut* out) override;
+    // No-op implementation of the NnetInterface hook: this override only
+    // emits a verbose (level 2) log message. `hyps` and `reverse_weight` are
+    // ignored and `rescoring_score` is left untouched.
+    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                                    float reverse_weight,
+                                    std::vector<float>* rescoring_score) override {
+      VLOG(2) << "deepspeech2 not has AttentionRescoring.";
+    }
    void Dim();
    void Reset() override;

--- a/speechx/speechx/nnet/nnet_itf.h
+++ b/speechx/speechx/nnet/nnet_itf.h
@@ -18,9 +18,56 @@
 #include "base/basic_types.h"
 #include "kaldi/base/kaldi-types.h"
 #include "kaldi/matrix/kaldi-matrix.h"
+#include "kaldi/util/options-itf.h"
 namespace ppspeech {
+// Model loading/runtime options. The struct lives in the nnet interface
+// header so that every nnet implementation can take the same options type.
+struct ModelOptions {
+    std::string model_path;
+    std::string param_path;
+    int thread_num;  // predictor thread pool size for ds2
+    bool use_gpu;
+    bool switch_ir_optim;
+    std::string input_names;
+    std::string output_names;
+    std::string cache_names;
+    std::string cache_shape;
+    bool enable_fc_padding;
+    bool enable_profile;
+    // Members are listed in declaration order, which is the order C++
+    // actually initializes them in; this avoids -Wreorder warnings.
+    ModelOptions()
+        : model_path(""),
+          param_path(""),
+          thread_num(1),
+          use_gpu(false),
+          switch_ir_optim(false),
+          input_names(""),
+          output_names(""),
+          cache_names(""),
+          cache_shape(""),
+          enable_fc_padding(false),
+          enable_profile(false) {}
+    // Registers every field as a named option on `opts`.
+    void Register(kaldi::OptionsItf* opts) {
+        opts->Register("model-path", &model_path, "model file path");
+        opts->Register("model-param", &param_path, "params model file path");
+        opts->Register("thread-num", &thread_num, "thread num");
+        opts->Register("use-gpu", &use_gpu, "if use gpu");
+        opts->Register("input-names", &input_names, "paddle input names");
+        opts->Register("output-names", &output_names, "paddle output names");
+        opts->Register("cache-names", &cache_names, "cache names");
+        opts->Register("cache-shape", &cache_shape, "cache shape");
+        // NOTE(review): "switch-ir-optiom" looks like a typo of
+        // "switch-ir-optim"; kept as-is because renaming the option would
+        // change the externally visible option name.
+        opts->Register("switch-ir-optiom",
+                       &switch_ir_optim,
+                       "paddle SwitchIrOptim option");
+        opts->Register("enable-fc-padding",
+                       &enable_fc_padding,
+                       "paddle EnableFCPadding option");
+        opts->Register(
+            "enable-profile", &enable_profile, "paddle EnableProfile option");
+    }
+};
 struct NnetOut {
    // nnet out. maybe logprob or prob. Almost time this is logprob.
    kaldi::Vector<kaldi::BaseFloat> logprobs;
@@ -45,6 +92,10 @@ class NnetInterface {
                             const int32& feature_dim,
                             NnetOut* out) = 0;
+    virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
+                                    float reverse_weight,
+                                    std::vector<float>* rescoring_score) = 0;
    // reset nnet state, e.g. nnet_logprob_cache_, offset_, encoder_outs_.
    virtual void Reset() = 0;

--- a/speechx/speechx/nnet/u2_nnet.cc
+++ b/speechx/speechx/nnet/u2_nnet.cc
@@ -166,7 +166,7 @@ void U2Nnet::Warmup() {
    Reset();
 }
-U2Nnet::U2Nnet(const U2ModelOptions& opts) : opts_(opts) {
+U2Nnet::U2Nnet(const ModelOptions& opts) : opts_(opts) {
    LoadModel(opts_.model_path);
 }

--- a/speechx/speechx/nnet/u2_nnet.h
+++ b/speechx/speechx/nnet/u2_nnet.h
@@ -17,28 +17,14 @@
 #include "base/common.h"
 #include "kaldi/matrix/kaldi-matrix.h"
-#include "kaldi/util/options-itf.h"
-#include "nnet/nnet_itf.h"
+#include "nnet/nnet_itf.h"
 #include "paddle/extension.h"
 #include "paddle/jit/all.h"
 #include "paddle/phi/api/all.h"
 namespace ppspeech {
-struct U2ModelOptions {
-    std::string model_path;
-    int thread_num;
-    bool use_gpu;
-    U2ModelOptions() : model_path(""), thread_num(1), use_gpu(false) {}
-    void Register(kaldi::OptionsItf* opts) {
-        opts->Register("model-path", &model_path, "model file path");
-        opts->Register("thread-num", &thread_num, "thread num");
-        opts->Register("use-gpu", &use_gpu, "if use gpu");
-    }
-};
 class U2NnetBase : public NnetInterface {
  public:
@@ -65,10 +51,6 @@ class U2NnetBase : public NnetInterface {
        std::vector<kaldi::BaseFloat>* ctc_probs,
        int32* vocab_dim);
-    virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
-                                    float reverse_weight,
-                                    std::vector<float>* rescoring_score) = 0;
  protected:
    virtual void ForwardEncoderChunkImpl(
        const std::vector<kaldi::BaseFloat>& chunk_feats,
@@ -102,7 +84,7 @@ class U2NnetBase : public NnetInterface {
 class U2Nnet : public U2NnetBase {
  public:
-    U2Nnet(const U2ModelOptions& opts);
+    U2Nnet(const ModelOptions& opts);
    U2Nnet(const U2Nnet& other);
    void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
@@ -143,7 +125,7 @@ class U2Nnet : public U2NnetBase {
        std::vector<kaldi::Vector<kaldi::BaseFloat>>* encoder_out) const;
  private:
-    U2ModelOptions opts_;
+    ModelOptions opts_;
    phi::Place dev_;
    std::shared_ptr<paddle::jit::Layer> model_{nullptr};

--- a/speechx/speechx/nnet/u2_nnet_main.cc
+++ b/speechx/speechx/nnet/u2_nnet_main.cc
@@ -58,7 +58,7 @@ int main(int argc, char* argv[]) {
    kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier);
    kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(FLAGS_nnet_encoder_outs_wspecifier);
-    ppspeech::U2ModelOptions model_opts;
+    ppspeech::ModelOptions model_opts;
    model_opts.model_path = FLAGS_model_path;
    int32 chunk_size =