diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index 1f13bbc0e468c97e7206006a9087a93dfa4e6600..1a332755da0ee2a3279da0c06e11ceb1d86790ca 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -20,6 +20,7 @@ // feature DEFINE_bool(use_fbank, false, "False for fbank; or linear feature"); +DEFINE_bool(fill_zero, false, "fill zero at last chunk, when chunk < chunk_size"); // DEFINE_bool(to_float32, true, "audio convert to pcm32. True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc index bbd0944225f845f9c0751dab9c5ca7175469740d..26a3905bdf52bfef64c7689c16451dc647d49663 100644 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -47,17 +47,16 @@ bool Assembler::Read(kaldi::Vector* feats) { // read frame by frame from base_feature_extractor_ into cache_ bool Assembler::Compute(Vector* feats) { // compute and feed frame by frame - bool result = false; while (feature_cache_.size() < frame_chunk_size_) { Vector feature; - result = base_extractor_->Read(&feature); + bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - VLOG(1) << "result: " << result << "feature dim: " << feature.Dim(); + VLOG(1) << "result: " << result << " feature dim: " << feature.Dim(); if (IsFinished() == false) { - LOG(INFO) << "finished reading feature. cache size: " << feature_cache_.size(); + VLOG(1) << "finished reading feature. cache size: " << feature_cache_.size(); return false; } else { - LOG(INFO) << "break"; + VLOG(1) << "break"; break; } } @@ -103,7 +102,7 @@ bool Assembler::Compute(Vector* feats) { counter++; } - CHECK(feature_cache_.size() == cache_size_ ); + CHECK(feature_cache_.size() == cache_size_); return true; } diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index d91a70e352ffe85689e666b37d1b6e2d8f971e81..e06995b17ac7135d8b6812888b12e4a2b07237f0 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -27,6 +27,7 @@ // feature DECLARE_bool(use_fbank); +DECLARE_bool(fill_zero); DECLARE_int32(num_bins); DECLARE_string(cmvn_file); @@ -80,15 +81,18 @@ struct FeaturePipelineOptions { // assembler opts opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; - LOG(INFO) << "subsampling rate: " - << opts.assembler_opts.subsampling_rate; opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; + opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; + opts.assembler_opts.fill_zero = FLAGS_fill_zero; + LOG(INFO) << "subsampling rate: " + << opts.assembler_opts.subsampling_rate; LOG(INFO) << "nnet receptive filed length: " << opts.assembler_opts.receptive_filed_length; - opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; LOG(INFO) << "nnet chunk size: " << opts.assembler_opts.nnet_decoder_chunk; + LOG(INFO) << "frontend fill zeros: " + << opts.assembler_opts.fill_zero; return opts; } }; diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 6956a2cb8f371c1f90c368a70dae84982dda4d9d..a7de58b5831657faee498c3acc2ad9833dc25e83 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -114,7 +114,7 @@ bool Decodable::AdvanceChunk(kaldi::Vector* logprobs, // read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector* likelihood) { if (EnsureFrameHaveComputed(frame) == false) { - LOG(INFO) << "framelikehood exit."; + VLOG(1) << "framelikehood exit."; return false; } diff --git a/speechx/speechx/recognizer/recognizer.h b/speechx/speechx/recognizer/recognizer.h index 0402bcd3ccd5582b4b248c44a309c978046fad16..27f1228aeb35e67f94b61a2eddabf7847c31d6c0 100644 --- a/speechx/speechx/recognizer/recognizer.h +++ b/speechx/speechx/recognizer/recognizer.h @@ -38,6 +38,8 @@ struct RecognizerResource { resource.acoustic_scale = FLAGS_acoustic_scale; resource.feature_pipeline_opts = FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts.assembler_opts.fill_zero = true; + LOG(INFO) << "ds2 need fill zero be true: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ModelOptions::InitFromFlags(); resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); return resource; diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h index 54f4d2580bb73b7d7a9de4d546bc189aac922417..4746d86f840e336bf257e3988699e0d74ad44f1c 100644 --- a/speechx/speechx/recognizer/u2_recognizer.h +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -101,6 +101,8 @@ struct U2RecognizerResource { resource.feature_pipeline_opts = ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts.assembler_opts.fill_zero = false; + LOG(INFO) << "u2 need fill zero be false: " << resource.feature_pipeline_opts.assembler_opts.fill_zero; resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags(); return resource; diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc index bfb37fb8ed440d0b886abe708336d496d8134f20..7e59d6cb0ee2eb5b7b490e073efa36e19710ed9c 100644 --- a/speechx/speechx/recognizer/u2_recognizer_main.cc +++ b/speechx/speechx/recognizer/u2_recognizer_main.cc @@ -85,9 +85,6 @@ int main(int argc, char* argv[]) { cnt++; } CHECK(sample_offset == tot_samples); - VLOG(1) << "num decode: " << cnt; - - // recognizer.SetFinished(); // second pass decoding recognizer.Rescoring();