Add fill_zero option for frontend

28dafea0 · Hui Zhang · 83f885c6 · 28dafea0 · 28dafea0 · 28dafea0
7 changed files
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -20,6 +20,7 @@

 // feature
 DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
+DEFINE_bool(fill_zero, false, "fill zero at last chunk, when chunk < chunk_size");
 // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear
 // feature, or fbank");
 DEFINE_int32(num_bins, 161, "num bins of mel");

--- a/speechx/speechx/frontend/audio/assembler.cc
+++ b/speechx/speechx/frontend/audio/assembler.cc
@@ -47,17 +47,16 @@ bool Assembler::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
 // read frame by frame from base_feature_extractor_ into cache_
 bool Assembler::Compute(Vector<BaseFloat>* feats) {
    // compute and feed frame by frame
-    bool result = false;
    while (feature_cache_.size() < frame_chunk_size_) {
        Vector<BaseFloat> feature;
-        result = base_extractor_->Read(&feature);
+        bool result = base_extractor_->Read(&feature);
        if (result == false || feature.Dim() == 0) {
-            VLOG(1) << "result: " << result << "feature dim: " << feature.Dim();
+            VLOG(1) << "result: " << result << " feature dim: " << feature.Dim();
            if (IsFinished() == false) {
-                LOG(INFO) << "finished reading feature. cache size: " << feature_cache_.size();
+                VLOG(1) << "finished reading feature. cache size: " << feature_cache_.size();
                return false;
            } else {
-                LOG(INFO) << "break";
+                VLOG(1) << "break";
                break;
            }
        }
@@ -103,7 +102,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
  
        counter++;
    }
-    CHECK(feature_cache_.size() == cache_size_ );
+    CHECK(feature_cache_.size() == cache_size_);

    return true;
 }

--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@@ -27,6 +27,7 @@

 // feature
 DECLARE_bool(use_fbank);
+DECLARE_bool(fill_zero);
 DECLARE_int32(num_bins);
 DECLARE_string(cmvn_file);

@@ -80,15 +81,18 @@ struct FeaturePipelineOptions {

        // assembler opts
        opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate;
-        LOG(INFO) << "subsampling rate: "
-                  << opts.assembler_opts.subsampling_rate;
        opts.assembler_opts.receptive_filed_length =
            FLAGS_receptive_field_length;
+        opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk;
+        opts.assembler_opts.fill_zero = FLAGS_fill_zero;
+        LOG(INFO) << "subsampling rate: "
+                  << opts.assembler_opts.subsampling_rate;
        LOG(INFO) << "nnet receptive filed length: "
                  << opts.assembler_opts.receptive_filed_length;
-        opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk;
        LOG(INFO) << "nnet chunk size: "
                  << opts.assembler_opts.nnet_decoder_chunk;
+        LOG(INFO) << "frontend fill zeros: "
+                  << opts.assembler_opts.fill_zero;
        return opts;
    }
 };

--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@@ -114,7 +114,7 @@ bool Decodable::AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
 // read one frame likelihood
 bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
    if (EnsureFrameHaveComputed(frame) == false) {
-        LOG(INFO) << "framelikehood exit.";
+        VLOG(1) << "framelikehood exit.";
        return false;
    }


--- a/speechx/speechx/recognizer/recognizer.h
+++ b/speechx/speechx/recognizer/recognizer.h
@@ -38,6 +38,8 @@ struct RecognizerResource {
        resource.acoustic_scale = FLAGS_acoustic_scale;
        resource.feature_pipeline_opts =
            FeaturePipelineOptions::InitFromFlags();
+        resource.feature_pipeline_opts.assembler_opts.fill_zero = true;
+        LOG(INFO) << "ds2 need fill zero be true: " << resource.feature_pipeline_opts.assembler_opts.fill_zero;
        resource.model_opts = ModelOptions::InitFromFlags();
        resource.tlg_opts = TLGDecoderOptions::InitFromFlags();
        return resource;

--- a/speechx/speechx/recognizer/u2_recognizer.h
+++ b/speechx/speechx/recognizer/u2_recognizer.h
@@ -101,6 +101,8 @@ struct U2RecognizerResource {

        resource.feature_pipeline_opts =
            ppspeech::FeaturePipelineOptions::InitFromFlags();
+        resource.feature_pipeline_opts.assembler_opts.fill_zero = false;
+        LOG(INFO) << "u2 need fill zero be false: " << resource.feature_pipeline_opts.assembler_opts.fill_zero;
        resource.model_opts = ppspeech::ModelOptions::InitFromFlags();
        resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags();
        return resource;

--- a/speechx/speechx/recognizer/u2_recognizer_main.cc
+++ b/speechx/speechx/recognizer/u2_recognizer_main.cc
@@ -85,9 +85,6 @@ int main(int argc, char* argv[]) {
            cnt++;
        }
        CHECK(sample_offset == tot_samples);
-        VLOG(1) << "num decode: " << cnt;
-
-        // recognizer.SetFinished();

        // second pass decoding
        recognizer.Rescoring();