u2 recog test main ok

17ea30e7 · Hui Zhang · 86eb7189 · 17ea30e7 · 17ea30e7 · 17ea30e7
7 changed file
--- a/speechx/examples/codelab/u2/local/recognizer.sh
+++ b/speechx/examples/codelab/u2/local/recognizer.sh
+#!/bin/bash
+set -e
+
+. path.sh
+
+data=data
+exp=exp
+mkdir -p $exp
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/
+
+u2_recognizer_main \
+    --use_fbank=true \
+    --num_bins=80 \
+    --cmvn_file=$exp/cmvn.ark \
+    --model_path=$model_dir/export.jit \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --downsampling_rate=4 \
+    --vocab_path=$model_dir/unit.txt \
+    --wav_rspecifier=scp:$data/wav.scp \
+    --result_wspecifier=ark,t:$exp/result.ark
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -52,11 +52,12 @@ DEFINE_string(model_cache_names,
              "chunk_state_h_box,chunk_state_c_box",
              "model cache names");
 DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
+DEFINE_string(vocab_path, "", "nnet vocab path.");

 // decoder
-DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
-DEFINE_string(graph_path, "TLG", "decoder graph");
 DEFINE_double(acoustic_scale, 1.0, "acoustic scale");
+DEFINE_string(graph_path, "TLG", "decoder graph");
+DEFINE_string(word_symbol_table, "words.txt", "word symbol table");
 DEFINE_int32(max_active, 7500, "max active");
 DEFINE_double(beam, 15.0, "decoder beam");
 DEFINE_double(lattice_beam, 7.5, "decoder beam");
@@ -72,13 +73,14 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
    frame_opts.dither = 0.0;
    frame_opts.frame_shift_ms = 10;
    opts.use_fbank = FLAGS_use_fbank;
-    LOG(INFO) << "feature type: " << opts.use_fbank ? "fbank" : "linear";
+    LOG(INFO) << "feature type: " << (opts.use_fbank ? "fbank" : "linear");
    if (opts.use_fbank) {
        opts.to_float32 = false;
        frame_opts.window_type = "povey";
        frame_opts.frame_length_ms = 25;
        opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
        opts.fbank_opts.frame_opts = frame_opts;
+        LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins;
    } else {
        opts.to_float32 = true;
        frame_opts.remove_dc_offset = false;

--- a/speechx/speechx/decoder/u2_recognizer.cc
+++ b/speechx/speechx/decoder/u2_recognizer.cc
@@ -33,12 +33,15 @@ U2Recognizer::U2Recognizer(const U2RecognizerResource& resource): opts_(resource
    BaseFloat am_scale = resource.acoustic_scale;
    decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale));

+    CHECK(resource.vocab_path != "");
    decoder_.reset(new CTCPrefixBeamSearch(resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts));

    unit_table_ = decoder_->VocabTable();
    symbol_table_ = unit_table_;

    input_finished_ = false;
+
+    Reset();
 }

 void U2Recognizer::Reset() {
@@ -69,6 +72,7 @@ void U2Recognizer::Accept(const VectorBase<BaseFloat>& waves) {

 void U2Recognizer::Decode() { 
    decoder_->AdvanceDecode(decodable_); 
+    UpdateResult(false);
 }

 void U2Recognizer::Rescoring() {

--- a/speechx/speechx/decoder/u2_recognizer.h
+++ b/speechx/speechx/decoder/u2_recognizer.h
@@ -92,12 +92,13 @@ struct DecodeOptions {


 struct U2RecognizerResource {
+    kaldi::BaseFloat acoustic_scale{1.0};
+    std::string vocab_path{};
+
    FeaturePipelineOptions feature_pipeline_opts{};
    ModelOptions model_opts{};
    DecodeOptions decoder_opts{};
    //    CTCBeamSearchOptions beam_search_opts;
-    kaldi::BaseFloat acoustic_scale{1.0};
-    std::string vocab_path{};
 };



--- a/speechx/speechx/decoder/u2_recognizer_main.cc
+++ b/speechx/speechx/decoder/u2_recognizer_main.cc
@@ -25,13 +25,16 @@ DEFINE_int32(sample_rate, 16000, "sample rate");

 ppspeech::U2RecognizerResource InitOpts() {
    ppspeech::U2RecognizerResource resource;
+    resource.vocab_path = FLAGS_vocab_path;
    resource.acoustic_scale = FLAGS_acoustic_scale;
-    resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions();

+    resource.feature_pipeline_opts = ppspeech::InitFeaturePipelineOptions();
+    LOG(INFO) << "feature!";
    ppspeech::ModelOptions model_opts;
    model_opts.model_path = FLAGS_model_path;

    resource.model_opts = model_opts;
+     LOG(INFO) << "model!";

    ppspeech::DecodeOptions decoder_opts;
    decoder_opts.chunk_size=16;
@@ -44,6 +47,7 @@ ppspeech::U2RecognizerResource InitOpts() {
    decoder_opts.ctc_prefix_search_opts.second_beam_size = 10;

    resource.decoder_opts = decoder_opts;
+    LOG(INFO) << "decoder!";
    return resource;
 }

@@ -57,9 +61,6 @@ int main(int argc, char* argv[]) {
    int32 num_done = 0, num_err = 0;
    double tot_wav_duration = 0.0;

-    ppspeech::U2RecognizerResource resource = InitOpts();
-    ppspeech::U2Recognizer recognizer(resource);
-
    kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
        FLAGS_wav_rspecifier);
    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
@@ -71,8 +72,10 @@ int main(int argc, char* argv[]) {
    LOG(INFO) << "chunk size (s): " << streaming_chunk;
    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;

-    kaldi::Timer timer;
+    ppspeech::U2RecognizerResource resource = InitOpts();
+    ppspeech::U2Recognizer recognizer(resource);

+    kaldi::Timer timer;
    for (; !wav_reader.Done(); wav_reader.Next()) {
        std::string utt = wav_reader.Key();
        const kaldi::WaveData& wave_data = wav_reader.Value();

--- a/speechx/speechx/frontend/audio/cmvn.cc
+++ b/speechx/speechx/frontend/audio/cmvn.cc
@@ -29,7 +29,9 @@ using std::unique_ptr;

 CMVN::CMVN(std::string cmvn_file, unique_ptr<FrontendInterface> base_extractor)
    : var_norm_(true) {
+    CHECK(cmvn_file != "");
    base_extractor_ = std::move(base_extractor);
+
    bool binary;
    kaldi::Input ki(cmvn_file, &binary);
    stats_.Read(ki.Stream(), binary);
@@ -55,11 +57,11 @@ bool CMVN::Read(kaldi::Vector<BaseFloat>* feats) {
 // feats contain num_frames feature.
 void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
    KALDI_ASSERT(feats != NULL);
-    int32 dim = stats_.NumCols() - 1;
+   
    if (stats_.NumRows() > 2 || stats_.NumRows() < 1 ||
-        feats->Dim() % dim != 0) {
-        KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x'
-                  << stats_.NumCols() << ", feats " << feats->Dim() << 'x';
+        feats->Dim() % dim_ != 0) {
+        KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << ','
+                  << stats_.NumCols() - 1 << ", feats " << feats->Dim() << 'x';
    }
    if (stats_.NumRows() == 1 && var_norm_) {
        KALDI_ERR
@@ -67,7 +69,7 @@ void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
            << "are supplied.";
    }

-    double count = stats_(0, dim);
+    double count = stats_(0, dim_);
    // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
    // computing an offset and representing it as stats_, we use a count of one.
    if (count < 1.0)
@@ -77,14 +79,14 @@ void CMVN::Compute(VectorBase<BaseFloat>* feats) const {

    if (!var_norm_) {
        Vector<BaseFloat> offset(feats->Dim());
-        SubVector<double> mean_stats(stats_.RowData(0), dim);
+        SubVector<double> mean_stats(stats_.RowData(0), dim_);
        Vector<double> mean_stats_apply(feats->Dim());
-        // fill the datat of mean_stats in mean_stats_appy whose dim is equal
-        // with the dim of feature.
-        // the dim of feats = dim * num_frames;
-        for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) {
-            SubVector<double> stats_tmp(mean_stats_apply.Data() + dim * idx,
-                                        dim);
+        // fill the datat of mean_stats in mean_stats_appy whose dim_ is equal
+        // with the dim_ of feature.
+        // the dim_ of feats = dim_ * num_frames;
+        for (int32 idx = 0; idx < feats->Dim() / dim_; ++idx) {
+            SubVector<double> stats_tmp(mean_stats_apply.Data() + dim_ * idx,
+                                        dim_);
            stats_tmp.CopyFromVec(mean_stats);
        }
        offset.AddVec(-1.0 / count, mean_stats_apply);
@@ -94,7 +96,7 @@ void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
    // norm(0, d) = mean offset;
    // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
    kaldi::Matrix<BaseFloat> norm(2, feats->Dim());
-    for (int32 d = 0; d < dim; d++) {
+    for (int32 d = 0; d < dim_; d++) {
        double mean, offset, scale;
        mean = stats_(0, d) / count;
        double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20;
@@ -111,7 +113,7 @@ void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
        for (int32 d_skip = d; d_skip < feats->Dim();) {
            norm(0, d_skip) = offset;
            norm(1, d_skip) = scale;
-            d_skip = d_skip + dim;
+            d_skip = d_skip + dim_;
        }
    }
    // Apply the normalization.

--- a/speechx/speechx/frontend/audio/feature_pipeline.cc
+++ b/speechx/speechx/frontend/audio/feature_pipeline.cc
@@ -32,6 +32,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) : opts_(opt
            opts.linear_spectrogram_opts, std::move(data_source)));
    }

+    CHECK(opts.cmvn_file != "");
    unique_ptr<FrontendInterface> cmvn(
        new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));