From cb0b6785b0e403527a323360f301d24f33175181 Mon Sep 17 00:00:00 2001
From: Yang Zhou <goat.zhou@qq.com>
Date: Wed, 4 May 2022 08:50:34 +0800
Subject: [PATCH] add fbank into feature pipeline

---
 .../ds2_ol/feat/compute_fbank_main.cc         |  3 +--
 speechx/speechx/decoder/param.h               | 23 +++++++++++++------
 speechx/speechx/frontend/audio/CMakeLists.txt |  2 +-
 speechx/speechx/frontend/audio/fbank.cc       | 18 ++++++++++++---
 speechx/speechx/frontend/audio/fbank.h        |  2 +-
 .../frontend/audio/feature_pipeline.cc        | 14 +++++++----
 .../speechx/frontend/audio/feature_pipeline.h |  5 ++++
 speechx/speechx/kaldi/feat/feature-fbank.cc   |  7 +++---
 speechx/speechx/kaldi/feat/feature-fbank.h    |  2 +-
 9 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc
index 8db0f3b5..7beaa587 100644
--- a/speechx/examples/ds2_ol/feat/compute_fbank_main.cc
+++ b/speechx/examples/ds2_ol/feat/compute_fbank_main.cc
@@ -43,7 +43,7 @@ int main(int argc, char* argv[]) {
 
     int32 num_done = 0, num_err = 0;
 
-    // feature pipeline: wave cache --> hanning window
+    // feature pipeline: wave cache --> povey window
     // -->fbank --> global cmvn -> feat cache
 
     std::unique_ptr<ppspeech::FrontendInterface> data_source(
@@ -78,7 +78,6 @@ int main(int argc, char* argv[]) {
     LOG(INFO) << "chunk size (s): " << streaming_chunk;
     LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
 
-
     for (; !wav_reader.Done(); wav_reader.Next()) {
         std::string utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
index ef565621..85de08ca 100644
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -47,7 +47,8 @@ DEFINE_string(model_cache_names,
               "chunk_state_h_box,chunk_state_c_box",
               "model cache names");
 DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
-
+DEFINE_bool(use_fbank, false, "use fbank or linear feature");
+DEFINE_int32(num_bins, 161, "num bins of mel");
 
 namespace ppspeech {
 // todo refactor later
@@ -57,13 +58,21 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
     opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
     opts.to_float32 = FLAGS_to_float32;
     kaldi::FrameExtractionOptions frame_opts;
-    frame_opts.frame_length_ms = 20;
-    frame_opts.frame_shift_ms = 10;
-    frame_opts.remove_dc_offset = false;
-    frame_opts.window_type = "hanning";
-    frame_opts.preemph_coeff = 0.0;
     frame_opts.dither = 0.0;
-    opts.linear_spectrogram_opts.frame_opts = frame_opts;
+    frame_opts.frame_shift_ms = 10;
+    opts.use_fbank = FLAGS_use_fbank;
+    if (opts.use_fbank) {
+      frame_opts.window_type = "povey";
+      frame_opts.frame_length_ms = 25;
+      opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
+      opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
+    } else {
+      frame_opts.remove_dc_offset = false;
+      frame_opts.frame_length_ms = 20;
+      frame_opts.window_type = "hanning";
+      frame_opts.preemph_coeff = 0.0;
+      opts.linear_spectrogram_opts.frame_opts = frame_opts;
+    }
     opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length;
     opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate;
     return opts;
diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
index 028da6c9..745832fe 100644
--- a/speechx/speechx/frontend/audio/CMakeLists.txt
+++ b/speechx/speechx/frontend/audio/CMakeLists.txt
@@ -10,4 +10,4 @@ add_library(frontend STATIC
   fbank.cc
 )
 
-target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common)
+target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)
diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc
index a3105b39..a865db59 100644
--- a/speechx/speechx/frontend/audio/fbank.cc
+++ b/speechx/speechx/frontend/audio/fbank.cc
@@ -29,6 +29,8 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;
 
+// TODO(SmileGoat): refactor later.
+
 Fbank::Fbank(const FbankOptions& opts,
              std::unique_ptr<FrontendInterface> base_extractor)
     : opts_(opts),
@@ -98,12 +100,22 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
 
         Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
         // note: this online feature-extraction code does not support VTLN.
-        BaseFloat vtln_warp = 1.0;
-        computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
+        RealFft(&window, true);
+        kaldi::ComputePowerSpectrum(&window);
+        const kaldi::MelBanks &mel_bank = *(computer_.GetMelBanks(1.0));
+        SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1); 
+        if (!opts_.fbank_opts.use_power) {
+            power_spectrum.ApplyPow(0.5);
+        }
+        int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0);
+        SubVector<BaseFloat> mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
+        mel_bank.Compute(power_spectrum, &mel_energies);
+        mel_energies.ApplyFloor(1e-07);
+        mel_energies.ApplyLog();
         SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
         output_row.CopyFromVec(this_feature);
     }
     return true;
 }
 
-}  // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h
index 720e17c7..66957dc6 100644
--- a/speechx/speechx/frontend/audio/fbank.h
+++ b/speechx/speechx/frontend/audio/fbank.h
@@ -74,4 +74,4 @@ class Fbank : public FrontendInterface {
     DISALLOW_COPY_AND_ASSIGN(Fbank);
 };
 
-}  // namespace ppspeech
\ No newline at end of file
+}  // namespace ppspeech
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc
index 5914fedb..40891871 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.cc
+++ b/speechx/speechx/frontend/audio/feature_pipeline.cc
@@ -22,12 +22,18 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
     unique_ptr<FrontendInterface> data_source(
         new ppspeech::AudioCache(1000 * kint16max, opts.to_float32));
 
-    unique_ptr<FrontendInterface> linear_spectrogram(
-        new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,
-                                        std::move(data_source)));
+    unique_ptr<FrontendInterface> base_feature;
+    
+    if (opts.use_fbank) {
+        base_feature.reset(new ppspeech::Fbank(opts.fbank_opts,
+                              std::move(data_source)));
+    } else {
+        base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,
+                              std::move(data_source)));
+    }
 
     unique_ptr<FrontendInterface> cmvn(
-        new ppspeech::CMVN(opts.cmvn_file, std::move(linear_spectrogram)));
+        new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));
 
     base_extractor_.reset(
         new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn)));
diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h
index 580c02fa..4868d37e 100644
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@@ -21,6 +21,7 @@
 #include "frontend/audio/feature_cache.h"
 #include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/linear_spectrogram.h"
+#include "frontend/audio/fbank.h"
 #include "frontend/audio/normalizer.h"
 
 namespace ppspeech {
@@ -28,12 +29,16 @@ namespace ppspeech {
 struct FeaturePipelineOptions {
     std::string cmvn_file;
     bool to_float32;
+    bool use_fbank;  // true: FeaturePipeline builds Fbank as the base feature; false: LinearSpectrogram
     LinearSpectrogramOptions linear_spectrogram_opts;
+    FbankOptions fbank_opts;  // passed to ppspeech::Fbank when use_fbank is true
     FeatureCacheOptions feature_cache_opts;
     FeaturePipelineOptions()
         : cmvn_file(""),
           to_float32(false),
+          use_fbank(false),  // linear spectrogram remains the default front end
           linear_spectrogram_opts(),
+          fbank_opts(),
           feature_cache_opts() {}
 };
 
diff --git a/speechx/speechx/kaldi/feat/feature-fbank.cc b/speechx/speechx/kaldi/feat/feature-fbank.cc
index 8f3fcd52..5d10156b 100644
--- a/speechx/speechx/kaldi/feat/feature-fbank.cc
+++ b/speechx/speechx/kaldi/feat/feature-fbank.cc
@@ -85,10 +85,9 @@ void FbankComputer::Compute(BaseFloat signal_raw_log_energy,
     signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
                                      std::numeric_limits<float>::epsilon()));
   
-  // todo : remove later; as align fbank feature in paddleaudio
-  //if (srfft_ != NULL)  // Compute FFT using split-radix algorithm.
-  //  srfft_->Compute(signal_frame->Data(), true);
-  //else  // An alternative algorithm that works for non-powers-of-two.
+  if (srfft_ != NULL)  // Compute FFT using split-radix algorithm.
+    srfft_->Compute(signal_frame->Data(), true);
+  else  // An alternative algorithm that works for non-powers-of-two.
     RealFft(signal_frame, true);
 
   // Convert the FFT into a power spectrum.
diff --git a/speechx/speechx/kaldi/feat/feature-fbank.h b/speechx/speechx/kaldi/feat/feature-fbank.h
index f57d185a..d121cc0e 100644
--- a/speechx/speechx/kaldi/feat/feature-fbank.h
+++ b/speechx/speechx/kaldi/feat/feature-fbank.h
@@ -128,8 +128,8 @@ class FbankComputer {
 
   ~FbankComputer();
 
- private:
   const MelBanks *GetMelBanks(BaseFloat vtln_warp);
+ private:
 
 
   FbankOptions opts_;
-- 
GitLab