From cb0b6785b0e403527a323360f301d24f33175181 Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Wed, 4 May 2022 08:50:34 +0800 Subject: [PATCH] add fbank into feature pipeline --- .../ds2_ol/feat/compute_fbank_main.cc | 3 +-- speechx/speechx/decoder/param.h | 23 +++++++++++++------ speechx/speechx/frontend/audio/CMakeLists.txt | 2 +- speechx/speechx/frontend/audio/fbank.cc | 18 ++++++++++++--- speechx/speechx/frontend/audio/fbank.h | 2 +- .../frontend/audio/feature_pipeline.cc | 14 +++++++---- .../speechx/frontend/audio/feature_pipeline.h | 5 ++++ speechx/speechx/kaldi/feat/feature-fbank.cc | 7 +++--- speechx/speechx/kaldi/feat/feature-fbank.h | 2 +- 9 files changed, 53 insertions(+), 23 deletions(-) diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc index 8db0f3b5..7beaa587 100644 --- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc +++ b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc @@ -43,7 +43,7 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - // feature pipeline: wave cache --> hanning window + // feature pipeline: wave cache --> povey window // -->fbank --> global cmvn -> feat cache std::unique_ptr data_source( @@ -78,7 +78,6 @@ int main(int argc, char* argv[]) { LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (sample): " << chunk_sample_size; - for (; !wav_reader.Done(); wav_reader.Next()) { std::string utt = wav_reader.Key(); const kaldi::WaveData& wave_data = wav_reader.Value(); diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h index ef565621..85de08ca 100644 --- a/speechx/speechx/decoder/param.h +++ b/speechx/speechx/decoder/param.h @@ -47,7 +47,8 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); - +DEFINE_bool(use_fbank, false, "use fbank or linear feature"); +DEFINE_int32(num_bins, 161, "num bins of mel"); namespace ppspeech { // todo refactor later @@ -57,13 +58,21 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; opts.to_float32 = FLAGS_to_float32; kaldi::FrameExtractionOptions frame_opts; - frame_opts.frame_length_ms = 20; - frame_opts.frame_shift_ms = 10; - frame_opts.remove_dc_offset = false; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; frame_opts.dither = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; + frame_opts.frame_shift_ms = 10; + opts.use_fbank = FLAGS_use_fbank; + if (opts.use_fbank) { + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + opts.fbank_opts.fbank_opts.frame_opts = frame_opts; + } else { + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + opts.linear_spectrogram_opts.frame_opts = frame_opts; + } opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length; opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate; return opts; diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt index 028da6c9..745832fe 100644 --- a/speechx/speechx/frontend/audio/CMakeLists.txt +++ b/speechx/speechx/frontend/audio/CMakeLists.txt @@ -10,4 +10,4 @@ add_library(frontend STATIC fbank.cc ) -target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common) +target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank) diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc index a3105b39..a865db59 100644 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -29,6 +29,8 @@ using kaldi::VectorBase; using kaldi::Matrix; using std::vector; +// todo refactor later:(SmileGoat) + Fbank::Fbank(const FbankOptions& opts, std::unique_ptr base_extractor) : opts_(opts), @@ -98,12 +100,22 @@ bool Fbank::Compute(const Vector& waves, Vector* feats) { Vector this_feature(computer_.Dim(), kaldi::kUndefined); // note: this online feature-extraction code does not support VTLN. - BaseFloat vtln_warp = 1.0; - computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature); + RealFft(&window, true); + kaldi::ComputePowerSpectrum(&window); + const kaldi::MelBanks &mel_bank = *(computer_.GetMelBanks(1.0)); + SubVector power_spectrum(window, 0, window.Dim() / 2 + 1); + if (!opts_.fbank_opts.use_power) { + power_spectrum.ApplyPow(0.5); + } + int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0); + SubVector mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins); + mel_bank.Compute(power_spectrum, &mel_energies); + mel_energies.ApplyFloor(1e-07); + mel_energies.ApplyLog(); SubVector output_row(feats->Data() + frame * Dim(), Dim()); output_row.CopyFromVec(this_feature); } return true; } -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h index 720e17c7..66957dc6 100644 --- a/speechx/speechx/frontend/audio/fbank.h +++ b/speechx/speechx/frontend/audio/fbank.h @@ -74,4 +74,4 @@ class Fbank : public FrontendInterface { DISALLOW_COPY_AND_ASSIGN(Fbank); }; -} // namespace ppspeech \ No newline at end of file +} // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc index 5914fedb..40891871 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -22,12 +22,18 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { unique_ptr data_source( new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); - unique_ptr linear_spectrogram( - new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts, - std::move(data_source))); + unique_ptr base_feature; + + if (opts.use_fbank) { + base_feature.reset(new ppspeech::Fbank(opts.fbank_opts, + std::move(data_source))); + } else { + base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts, + std::move(data_source))); + } unique_ptr cmvn( - new ppspeech::CMVN(opts.cmvn_file, std::move(linear_spectrogram))); + new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature))); base_extractor_.reset( new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn))); diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h index 580c02fa..4868d37e 100644 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -21,6 +21,7 @@ #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" +#include "frontend/audio/fbank.h" #include "frontend/audio/normalizer.h" namespace ppspeech { @@ -28,12 +29,16 @@ namespace ppspeech { struct FeaturePipelineOptions { std::string cmvn_file; bool to_float32; + bool use_fbank; LinearSpectrogramOptions linear_spectrogram_opts; + FbankOptions fbank_opts; FeatureCacheOptions feature_cache_opts; FeaturePipelineOptions() : cmvn_file(""), to_float32(false), + use_fbank(false), linear_spectrogram_opts(), + fbank_opts(), feature_cache_opts() {} }; diff --git a/speechx/speechx/kaldi/feat/feature-fbank.cc b/speechx/speechx/kaldi/feat/feature-fbank.cc index 8f3fcd52..5d10156b 100644 --- a/speechx/speechx/kaldi/feat/feature-fbank.cc +++ b/speechx/speechx/kaldi/feat/feature-fbank.cc @@ -85,10 +85,9 @@ void FbankComputer::Compute(BaseFloat signal_raw_log_energy, signal_raw_log_energy = Log(std::max(VecVec(*signal_frame, *signal_frame), std::numeric_limits::epsilon())); - // todo : remove later; as align fbank feature in paddleaudio - //if (srfft_ != NULL) // Compute FFT using split-radix algorithm. - // srfft_->Compute(signal_frame->Data(), true); - //else // An alternative algorithm that works for non-powers-of-two. + if (srfft_ != NULL) // Compute FFT using split-radix algorithm. + srfft_->Compute(signal_frame->Data(), true); + else // An alternative algorithm that works for non-powers-of-two. RealFft(signal_frame, true); // Convert the FFT into a power spectrum. diff --git a/speechx/speechx/kaldi/feat/feature-fbank.h b/speechx/speechx/kaldi/feat/feature-fbank.h index f57d185a..d121cc0e 100644 --- a/speechx/speechx/kaldi/feat/feature-fbank.h +++ b/speechx/speechx/kaldi/feat/feature-fbank.h @@ -128,8 +128,8 @@ class FbankComputer { ~FbankComputer(); - private: const MelBanks *GetMelBanks(BaseFloat vtln_warp); + private: FbankOptions opts_; -- GitLab