From 88275aff05e918727b562312484d1cad10767194 Mon Sep 17 00:00:00 2001
From: SmileGoat <goat.zhou@qq.com>
Date: Thu, 27 Jan 2022 13:28:58 +0800
Subject: [PATCH] add linear spectrogram feature extractor, test=doc

---
 speechx/speechx/base/common.h                 |  33 +++++
 speechx/speechx/base/flags.h                  |  17 +++
 speechx/speechx/base/log.h                    |  17 +++
 speechx/speechx/base/thread_pool.h            | 100 +++++++++++++
 speechx/speechx/frontend/fbank.h              |  36 +++++
 .../frontend/feature_extractor_controller.h   |   0
 .../feature_extractor_controller_impl.h       |   0
 .../frontend/feature_extractor_interface.h    |  29 ++++
 .../speechx/frontend/linear_spectrogram.cc    | 139 ++++++++++++++++++
 speechx/speechx/frontend/linear_spectrogram.h |  46 ++++++
 .../frontend/linear_spectrogram_main.cc       |  39 +++++
 speechx/speechx/frontend/mfcc.h               |  16 ++
 speechx/speechx/frontend/window.h             |  16 ++
 13 files changed, 488 insertions(+)
 create mode 100644 speechx/speechx/base/common.h
 create mode 100644 speechx/speechx/base/flags.h
 create mode 100644 speechx/speechx/base/log.h
 create mode 100644 speechx/speechx/base/thread_pool.h
 create mode 100644 speechx/speechx/frontend/fbank.h
 create mode 100644 speechx/speechx/frontend/feature_extractor_controller.h
 create mode 100644 speechx/speechx/frontend/feature_extractor_controller_impl.h
 create mode 100644 speechx/speechx/frontend/feature_extractor_interface.h
 create mode 100644 speechx/speechx/frontend/linear_spectrogram.cc
 create mode 100644 speechx/speechx/frontend/linear_spectrogram.h
 create mode 100644 speechx/speechx/frontend/linear_spectrogram_main.cc
 create mode 100644 speechx/speechx/frontend/mfcc.h
 create mode 100644 speechx/speechx/frontend/window.h
diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h
new file mode 100644
index 00000000..a16fc55b
--- /dev/null
+++ b/speechx/speechx/base/common.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <deque>
+#include <iostream>
+#include <istream>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <set>
+#include <sstream>
+#include <stack>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "base/log.h"
+#include "base/basic_types.h"
+#include "base/macros.h"
diff --git a/speechx/speechx/base/flags.h b/speechx/speechx/base/flags.h
new file mode 100644
index 00000000..41df0d45
--- /dev/null
+++ b/speechx/speechx/base/flags.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "fst/flags.h"
diff --git a/speechx/speechx/base/log.h b/speechx/speechx/base/log.h
new file mode 100644
index 00000000..d1b7b169
--- /dev/null
+++ b/speechx/speechx/base/log.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "glog/logging.h"
diff --git a/speechx/speechx/base/thread_pool.h b/speechx/speechx/base/thread_pool.h
new file mode 100644
index 00000000..f6dada90
--- /dev/null
+++ b/speechx/speechx/base/thread_pool.h
@@ -0,0 +1,100 @@
+// this code is from https://github.com/progschj/ThreadPool
+
+#ifndef BASE_THREAD_POOL_H
+#define BASE_THREAD_POOL_H
+
+#include <vector>
+#include <queue>
+#include <memory>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <future>
+#include <functional>
+#include <stdexcept>
+
+class ThreadPool {  // fixed set of worker threads draining one shared task queue
+public:
+    ThreadPool(size_t);  // argument: number of worker threads to launch
+    template<class F, class... Args>
+    auto enqueue(F&& f, Args&&... args) 
+        -> std::future<typename std::result_of<F(Args...)>::type>;  // future for f(args...)
+    ~ThreadPool();  // sets stop, wakes every worker and joins them
+private:
+    // need to keep track of threads so we can join them
+    std::vector< std::thread > workers;
+    // the task queue
+    std::queue< std::function<void()> > tasks;
+    
+    // synchronization
+    std::mutex queue_mutex;  // held whenever tasks or stop is read or written
+    std::condition_variable condition;  // notified on new work and on shutdown
+    bool stop;  // set to true by the destructor; enqueue() throws once it is set
+};
+ 
+// Launches `threads` workers. Each worker repeatedly waits for a task, runs it,
+// and exits once the pool is stopped and the queue is empty.
+inline ThreadPool::ThreadPool(size_t threads)
+    : stop(false)
+{
+    // Body shared by every worker thread.
+    auto worker_loop = [this]
+    {
+        while (true)
+        {
+            std::function<void()> job;
+            {
+                std::unique_lock<std::mutex> guard(queue_mutex);
+                // Sleep until there is work or shutdown was requested.
+                condition.wait(guard, [this] { return stop || !tasks.empty(); });
+                if (stop && tasks.empty())
+                    return;
+                job = std::move(tasks.front());
+                tasks.pop();
+            }
+            // Run outside the lock so the queue stays available to others.
+            job();
+        }
+    };
+    for (size_t n = 0; n < threads; ++n)
+        workers.emplace_back(worker_loop);
+}
+
+// Wraps f(args...) in a packaged_task, queues it, wakes one worker and returns
+// the task's future. Throws std::runtime_error if the pool has been stopped.
+template<class F, class... Args>
+auto ThreadPool::enqueue(F&& f, Args&&... args)
+    -> std::future<typename std::result_of<F(Args...)>::type>
+{
+    using result_t = typename std::result_of<F(Args...)>::type;
+    using job_t = std::packaged_task<result_t()>;
+
+    // Held by shared_ptr so the queued lambda below stays copyable.
+    auto job = std::make_shared<job_t>(
+        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+    std::future<result_t> pending = job->get_future();
+
+    {
+        std::unique_lock<std::mutex> guard(queue_mutex);
+        if (stop) {
+            throw std::runtime_error("enqueue on stopped ThreadPool");
+        }
+        tasks.emplace([job]() { (*job)(); });
+    }
+    condition.notify_one();
+    return pending;
+}
+
+// Flags the pool as stopped, wakes every worker, then waits for each to exit.
+inline ThreadPool::~ThreadPool()
+{
+    {
+        // Set the flag under the lock the workers use when they test it.
+        std::lock_guard<std::mutex> guard(queue_mutex);
+        stop = true;
+    }
+    condition.notify_all();
+    for (auto& t : workers) t.join();
+}
+
+#endif
\ No newline at end of file
diff --git a/speechx/speechx/frontend/fbank.h b/speechx/speechx/frontend/fbank.h
new file mode 100644
index 00000000..6956690d
--- /dev/null
+++ b/speechx/speechx/frontend/fbank.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// wrap the fbank feat of kaldi, todo (SmileGoat)
+#pragma once
+#include <memory>
+#include "frontend/feature_extractor_interface.h"
+#include "kaldi/feat/feature-mfcc.h"
+#include "kaldi/matrix/kaldi-vector.h"
+namespace ppspeech {
+
+class FbankExtractor : public FeatureExtractorInterface {  // public, so it converts to the interface
+  public:
+    // NOTE(review): FbankOptions is not declared by the headers included here - confirm its source.
+    explicit FbankExtractor(const FbankOptions& opts,
+                            std::shared_ptr<FeatureExtractorInterface> pre_extractor);
+    // Re-declared pure: the wrapper is still a todo, so the class stays abstract.
+    virtual void AcceptWaveform(const kaldi::Vector<kaldi::BaseFloat>& input) = 0;
+    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
+    virtual size_t Dim() const = 0;
+  private:
+    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& wave, kaldi::Vector<kaldi::BaseFloat>* feat) const;
+};
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/feature_extractor_controller.h b/speechx/speechx/frontend/feature_extractor_controller.h
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/frontend/feature_extractor_controller_impl.h b/speechx/speechx/frontend/feature_extractor_controller_impl.h
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h
new file mode 100644
index 00000000..3f3f0ff9
--- /dev/null
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/basic_types.h"
+#include "kaldi/matrix/kaldi-vector.h"
+
+namespace ppspeech {
+// Abstract base class implemented by the feature extractors in frontend/.
+class FeatureExtractorInterface {
+  public:
+    virtual ~FeatureExtractorInterface() {}  // lets derived objects be deleted via a base pointer
+    virtual void AcceptWaveform(const kaldi::Vector<kaldi::BaseFloat>& input) = 0;
+    virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
+    virtual size_t Dim() const = 0;
+};
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc
new file mode 100644
index 00000000..327c3f57
--- /dev/null
+++ b/speechx/speechx/frontend/linear_spectrogram.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "frontend/linear_spectrogram.h"
+#include "kaldi/base/kaldi-math.h"
+#include "kaldi/matrix/matrix-functions.h"
+
+using kaldi::int32;
+using kaldi::BaseFloat;
+using kaldi::Vector;
+using kaldi::Matrix;
+using std::vector;
+
+// Stores the options and precomputes the Hann window together with its energy.
+ppspeech::LinearSpectrogram::LinearSpectrogram(const LinearSpectrogramOptions& opts)
+    : hanning_window_energy_(0), opts_(opts) {  // Compute() reads opts_ later
+  const int32 window_size = opts.frame_opts.WindowSize();
+  fft_points_ = window_size;
+  dim_ = fft_points_ / 2 + 1;  // number of bins Compute() emits per frame
+  hanning_window_.resize(window_size);
+  const double a = M_2PI / (window_size - 1);  // angular step between window samples
+  for (int i = 0; i < window_size; ++i) {
+    hanning_window_[i] = 0.5 - 0.5 * cos(a * i);
+    hanning_window_energy_ += hanning_window_[i] * hanning_window_[i];
+  }
+}
+
+// Buffers a copy of `input` in wavefrom_ until ReadFeats() consumes it. Named
+// AcceptWaveform (not "AcceptWavefrom") to match FeatureExtractorInterface and main().
+void ppspeech::LinearSpectrogram::AcceptWaveform(const Vector<BaseFloat>& input) {
+  wavefrom_.resize(input.Dim());
+  for (int32 idx = 0; idx < input.Dim(); ++idx) wavefrom_[idx] = input(idx);
+}
+
+// Multiplies the leading hanning_window_.size() samples of `data` by the window,
+// in place. Takes a reference, as the header declares and Compute() calls it.
+void ppspeech::LinearSpectrogram::Hanning(vector<BaseFloat>& data) const {
+  CHECK_GE(data.size(), hanning_window_.size());
+  for (size_t i = 0; i < hanning_window_.size(); ++i)
+    data[i] *= hanning_window_[i];
+}
+
+// In-place real FFT of `v` via kaldi::RealFft; appends the v->size()/2 + 1 bins to
+// `real`/`img`. The first and last bin are read from v[0] and v[1] with zero imaginary part.
+bool ppspeech::LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
+                                           vector<BaseFloat>* real,
+                                           vector<BaseFloat>* img) const {
+  if (v->empty() || v->size() % 2 != 0) {  // v[0]/v[1] and RealFft need an even, non-empty length
+    LOG(ERROR) << "compute the fft occurs error";
+    return false;
+  }
+  kaldi::SubVector<BaseFloat> view(v->data(), v->size());  // aliases v's storage, no copy
+  kaldi::RealFft(&view, true);  // returns void, so there is no status to test
+  const size_t half = v->size() / 2;
+  for (size_t k = 0; k <= half; ++k) {
+    real->push_back(k == half ? v->at(1) : v->at(2 * k));
+    img->push_back((k == 0 || k == half) ? 0 : v->at(2 * k + 1));
+  }
+  return true;
+}
+
+// Computes features for the buffered waveform into `feats` (frames x bins), then
+// clears the buffer (hence non-const). Returns false if no frame was produced.
+bool ppspeech::LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) {
+  if (wavefrom_.empty()) return false;
+  vector<vector<BaseFloat>> result;  // todo refactor later
+  const int32 status = Compute(wavefrom_, result);
+  wavefrom_.clear();
+  if (status < 0 || result.empty()) return false;  // also guards result[0] below
+  feats->Resize(result.size(), result[0].size());
+  for (size_t row_idx = 0; row_idx < result.size(); ++row_idx) {
+    for (size_t col_idx = 0; col_idx < result[row_idx].size(); ++col_idx) {
+      (*feats)(row_idx, col_idx) = result[row_idx][col_idx];
+    }
+  }
+  return true;
+}
+
+// Compute spectrogram feat, return num frames (0 if the wave is shorter than one
+// frame, -1 on FFT failure). todo: refactor later (SmileGoat)
+int32 ppspeech::LinearSpectrogram::Compute(const vector<float>& wave,
+                                           vector<vector<float>>& feat) const {
+  const int num_samples = wave.size();
+  const int frame_length = opts_.frame_opts.WindowSize();
+  const int frame_shift = opts_.frame_opts.WindowShift();
+  const int num_bins = fft_points_ / 2 + 1;  // the last dimension is Fs/2 Hz
+  // NOTE(review): the original also read opts.frame_opts.samp_freq but never used it;
+  // confirm whether the scale should use the sample rate instead of frame_shift.
+  const float scale = hanning_window_energy_ * frame_shift;
+
+  if (num_samples < frame_length) {
+    return 0;
+  }
+
+  const int num_frames = 1 + ((num_samples - frame_length) / frame_shift);
+  feat.resize(num_frames);
+  vector<float> fft_real;
+  vector<float> fft_img;
+
+  for (int i = 0; i < num_frames; ++i) {
+    // Copy the i-th frame so windowing and the in-place FFT leave `wave` untouched.
+    vector<float> data(wave.data() + i * frame_shift,
+                       wave.data() + i * frame_shift + frame_length);
+    Hanning(data);
+    fft_img.clear();
+    fft_real.clear();
+    // NumpyFft returns true on success, so only a false result is an error.
+    if (!NumpyFft(&data, &fft_real, &fft_img)) {
+      LOG(ERROR) << i << " fft compute occurs error, please checkout the input data";
+      return -1;
+    }
+
+    feat[i].resize(num_bins);
+    for (int j = 0; j < num_bins; ++j) {
+      const float power = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j];
+      // First and last bin are divided by scale; every other bin gets 2 / scale.
+      if (j == 0 || j == num_bins - 1) {
+        feat[i][j] = power / scale;
+      } else {
+        feat[i][j] = power * (2.0 / scale);
+      }
+      // log added eps=1e-14
+      feat[i][j] = std::log(feat[i][j] + 1e-14);
+    }
+  }
+
+  return num_frames;
+}
diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h
new file mode 100644
index 00000000..b69050d1
--- /dev/null
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@@ -0,0 +1,46 @@
+
+#pragma once
+
+#include "frontend/feature_extractor_interface.h"
+#include "kaldi/feat/feature-window.h"
+#include "base/common.h"
+
+namespace ppspeech {
+
+// Options for LinearSpectrogram: wraps Kaldi's frame-extraction options.
+struct LinearSpectrogramOptions {
+    kaldi::FrameExtractionOptions frame_opts;  // was misspelled "FrameExtrationOptions"
+    LinearSpectrogramOptions() : frame_opts() {}
+
+    void Register(kaldi::OptionsItf* opts) {
+        frame_opts.Register(opts);
+    }
+};
+
+class LinearSpectrogram : public FeatureExtractorInterface {
+  public:
+    explicit LinearSpectrogram(const LinearSpectrogramOptions& opts);
+    // Spelled AcceptWaveform (not "AcceptWavefrom") so it overrides the interface method.
+    void AcceptWaveform(const kaldi::Vector<kaldi::BaseFloat>& input) override;
+    void Read(kaldi::Vector<kaldi::BaseFloat>* feat) override;  // NOTE(review): no definition in the .cc yet
+    size_t Dim() const override;  // NOTE(review): no definition in the .cc yet
+    bool ReadFeats(kaldi::Matrix<kaldi::BaseFloat>* feats);  // non-const: clears the buffered waveform
+
+  private:
+    void Hanning(std::vector<kaldi::BaseFloat>& data) const;
+    kaldi::int32 Compute(const std::vector<kaldi::BaseFloat>& wave,
+                         std::vector<std::vector<kaldi::BaseFloat>>& feat) const;
+    bool NumpyFft(std::vector<kaldi::BaseFloat>* v, std::vector<kaldi::BaseFloat>* real,
+                  std::vector<kaldi::BaseFloat>* img) const;
+
+    kaldi::int32 fft_points_;
+    size_t dim_;
+    std::vector<kaldi::BaseFloat> hanning_window_;
+    kaldi::BaseFloat hanning_window_energy_;
+    LinearSpectrogramOptions opts_;
+    std::vector<kaldi::BaseFloat> wavefrom_; // remove later, todo(SmileGoat)
+    DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
+};
+
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/linear_spectrogram_main.cc b/speechx/speechx/frontend/linear_spectrogram_main.cc
new file mode 100644
index 00000000..455f4f91
--- /dev/null
+++ b/speechx/speechx/frontend/linear_spectrogram_main.cc
@@ -0,0 +1,39 @@
+
+#include "frontend/linear_spectrogram.h"
+#include "kaldi/util/table-types.h"
+#include "base/log.h"
+#include "base/flags.h"
+#include "kaldi/feat/wave-reader.h"
+
+DEFINE_string(wav_rspecifier, "", "test wav path");
+DEFINE_string(feature_wspecifier, "", "test wav ark");
+
+int main(int argc, char* argv[]) {
+  google::ParseCommandLineFlags(&argc, &argv, false);
+  google::InitGoogleLogging(argv[0]);
+
+  kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(FLAGS_wav_rspecifier);
+  kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
+
+  kaldi::int32 num_done = 0, num_err = 0;
+  ppspeech::LinearSpectrogramOptions opt;
+  ppspeech::LinearSpectrogram linear_spectrogram(opt);
+  for (; !wav_reader.Done(); wav_reader.Next()) {
+    const std::string utt = wav_reader.Key();
+    const kaldi::WaveData& wave_data = wav_reader.Value();
+    const kaldi::int32 this_channel = 0;  // row 0 of the wave data matrix
+    // AcceptWaveform takes a Vector; build one explicitly from the SubVector row view.
+    const kaldi::Vector<kaldi::BaseFloat> waveform(
+        kaldi::SubVector<kaldi::BaseFloat>(wave_data.Data(), this_channel));
+    kaldi::Matrix<kaldi::BaseFloat> features;
+    linear_spectrogram.AcceptWaveform(waveform);
+    linear_spectrogram.ReadFeats(&features);
+    if (features.NumRows() == 0) { ++num_err; continue; }  // nothing was extracted
+    feat_writer.Write(utt, features);
+    ++num_done;
+    if (num_done % 50 == 0) KALDI_VLOG(2) << "Processed " << num_done << " utterances";
+  }
+  KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+            << " with errors.";
+  return (num_done != 0 ? 0 : 1);
+}
\ No newline at end of file
diff --git a/speechx/speechx/frontend/mfcc.h b/speechx/speechx/frontend/mfcc.h
new file mode 100644
index 00000000..aa369655
--- /dev/null
+++ b/speechx/speechx/frontend/mfcc.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// wrap the mfcc feat of kaldi, todo (SmileGoat)
+#include "kaldi/feat/feature-mfcc.h"
\ No newline at end of file
diff --git a/speechx/speechx/frontend/window.h b/speechx/speechx/frontend/window.h
new file mode 100644
index 00000000..5303cad8
--- /dev/null
+++ b/speechx/speechx/frontend/window.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// extract the window of kaldi feat.
+
-- 
GitLab