From 88275aff05e918727b562312484d1cad10767194 Mon Sep 17 00:00:00 2001 From: SmileGoat Date: Thu, 27 Jan 2022 13:28:58 +0800 Subject: [PATCH] add linear spectrogram feature extractor, test=doc --- speechx/speechx/base/common.h | 33 +++++ speechx/speechx/base/flags.h | 17 +++ speechx/speechx/base/log.h | 17 +++ speechx/speechx/base/thread_pool.h | 100 +++++++++++++ speechx/speechx/frontend/fbank.h | 36 +++++ .../frontend/feature_extractor_controller.h | 0 .../feature_extractor_controller_impl.h | 0 .../frontend/feature_extractor_interface.h | 29 ++++ .../speechx/frontend/linear_spectrogram.cc | 139 ++++++++++++++++++ speechx/speechx/frontend/linear_spectrogram.h | 46 ++++++ .../frontend/linear_spectrogram_main.cc | 39 +++++ speechx/speechx/frontend/mfcc.h | 16 ++ speechx/speechx/frontend/window.h | 16 ++ 13 files changed, 488 insertions(+) create mode 100644 speechx/speechx/base/common.h create mode 100644 speechx/speechx/base/flags.h create mode 100644 speechx/speechx/base/log.h create mode 100644 speechx/speechx/base/thread_pool.h create mode 100644 speechx/speechx/frontend/fbank.h create mode 100644 speechx/speechx/frontend/feature_extractor_controller.h create mode 100644 speechx/speechx/frontend/feature_extractor_controller_impl.h create mode 100644 speechx/speechx/frontend/feature_extractor_interface.h create mode 100644 speechx/speechx/frontend/linear_spectrogram.cc create mode 100644 speechx/speechx/frontend/linear_spectrogram.h create mode 100644 speechx/speechx/frontend/linear_spectrogram_main.cc create mode 100644 speechx/speechx/frontend/mfcc.h create mode 100644 speechx/speechx/frontend/window.h diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h new file mode 100644 index 00000000..a16fc55b --- /dev/null +++ b/speechx/speechx/base/common.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "base/log.h" +#include "base/basic_types.h" +#include "base/macros.h" diff --git a/speechx/speechx/base/flags.h b/speechx/speechx/base/flags.h new file mode 100644 index 00000000..41df0d45 --- /dev/null +++ b/speechx/speechx/base/flags.h @@ -0,0 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "fst/flags.h" diff --git a/speechx/speechx/base/log.h b/speechx/speechx/base/log.h new file mode 100644 index 00000000..d1b7b169 --- /dev/null +++ b/speechx/speechx/base/log.h @@ -0,0 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "glog/logging.h" diff --git a/speechx/speechx/base/thread_pool.h b/speechx/speechx/base/thread_pool.h new file mode 100644 index 00000000..f6dada90 --- /dev/null +++ b/speechx/speechx/base/thread_pool.h @@ -0,0 +1,100 @@ +// this code is from https://github.com/progschj/ThreadPool + +#ifndef BASE_THREAD_POOL_H +#define BASE_THREAD_POOL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class ThreadPool { +public: + ThreadPool(size_t); + template + auto enqueue(F&& f, Args&&... args) + -> std::future::type>; + ~ThreadPool(); +private: + // need to keep track of threads so we can join them + std::vector< std::thread > workers; + // the task queue + std::queue< std::function > tasks; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) + : stop(false) +{ + for(size_t i = 0;i task; + + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this]{ return this->stop || !this->tasks.empty(); }); + if(this->stop && this->tasks.empty()) + return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + } + ); +} + +// add new work item to the pool +template +auto ThreadPool::enqueue(F&& f, Args&&... args) + -> std::future::type> +{ + using return_type = typename std::result_of::type; + + auto task = std::make_shared< std::packaged_task >( + std::bind(std::forward(f), std::forward(args)...) + ); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if(stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace([task](){ (*task)(); }); + } + condition.notify_one(); + return res; +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() +{ + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for(std::thread &worker: workers) + worker.join(); +} + +#endif \ No newline at end of file diff --git a/speechx/speechx/frontend/fbank.h b/speechx/speechx/frontend/fbank.h new file mode 100644 index 00000000..6956690d --- /dev/null +++ b/speechx/speechx/frontend/fbank.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// wrap the fbank feat of kaldi, todo (SmileGoat) + +#include "kaldi/feat/feature-mfcc.h" + +#incldue "kaldi/matrix/kaldi-vector.h" + +namespace ppspeech { + +class FbankExtractor : FeatureExtractorInterface { + public: + explicit FbankExtractor(const FbankOptions& opts, + share_ptr pre_extractor); + virtual void AcceptWaveform(const kaldi::Vector& input) = 0; + virtual void Read(kaldi::Vector* feat) = 0; + virtual size_t Dim() const = 0; + + private: + bool Compute(const kaldi::Vector& wave, + kaldi::Vector* feat) const; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/feature_extractor_controller.h b/speechx/speechx/frontend/feature_extractor_controller.h new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/frontend/feature_extractor_controller_impl.h b/speechx/speechx/frontend/feature_extractor_controller_impl.h new file mode 100644 index 00000000..e69de29b diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/feature_extractor_interface.h new file mode 100644 index 00000000..3f3f0ff9 --- /dev/null +++ b/speechx/speechx/frontend/feature_extractor_interface.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "base/basic_types.h" +#incldue "kaldi/matrix/kaldi-vector.h" + +namespace ppspeech { + +class FeatureExtractorInterface { + public: + virtual void AcceptWaveform(const kaldi::Vector& input) = 0; + virtual void Read(kaldi::Vector* feat) = 0; + virtual size_t Dim() const = 0; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc new file mode 100644 index 00000000..327c3f57 --- /dev/null +++ b/speechx/speechx/frontend/linear_spectrogram.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "frontend/linear_spectrogram.h" +#include "kaldi/base/kaldi-math.h" +#include "kaldi/matrix/matrix-functions.h" + +using kaldi::int32; +using kaldi::BaseFloat; +using kaldi::Vector; +using kaldi::Matrix; +using std::vector; + +LinearSpectrogram::LinearSpectrogram(const LinearSpectrogramOptions& opts) { + int32 window_size = opts.frame_opts.WindowSize(); + int32 window_shift = opts.frame_opts.WindowShift(); + fft_points_ = window_size; + hanning_window_.resize(window_size); + + double a = M_2PI / (window_size - 1); + hanning_window_energy_ = 0; + for (int i = 0; i < window_size; ++i) { + hanning_window_[i] = 0.5 - 0.5 * cos(a * i); + hanning_window_energy_ += hanning_window_[i] * hanning_window_[i]; + } +} + +void LinearSpectrogram::AcceptWavefrom(const Vector& input) { + wavefrom_.resize(input.Dim()); + for (size_t idx = 0; idx < input.Dim(); ++idx) { + waveform_[idx] = input(idx); + } +} + +void LinearSpectrogram::Hanning(vector* data) const { + CHECK_GE(data->size(), hanning_window_.size()); + + for (size_t i = 0; i < hanning_window_.size(); ++i) { + data->at(i) *= hanning_window_[i]; + } +} + +bool LinearSpectrogram::NumpyFft(vector* v, + vector* real, + vector* img) { + if (RealFft(v, true)) { + LOG(ERROR) << "compute the fft occurs error"; + return false; + } + real->push_back(v->at(0)); + img->push_back(0); + for (int i = 1; i < v->size() / 2; i++) { + real->push_back(v->at(2 * i)); + img->push_back(v->at(2 * i + 1)); + } + real->push_back(v->at(1)); + img->push_back(0); + + return true; +} + +// todo refactor later +bool LinearSpectrogram::ReadFeats(Matrix* feats) const { + vector> feat; + if (wavefrom_.empty()) { + return false; + } + vector> result; + Compute(wavefrom_, result); + feats->Resize(result.size(), result[0].size()); + for (int row_idx = 0; row_idx < result.size(); ++row_idx) { + for (int col_idx = 0; col_idx < result.size(); ++col_idx) { + feats(row_idx, col_idx) = result[row_idx][col_idx]; + } + wavefrom_.clear(); + return true; +} + +// Compute spectrogram feat, return num frames +// todo: refactor later (SmileGoat) +int32 LinearSpectrogram::Compute(const vector& wave, + vector>& feat) { + int num_samples = wave.size(); + const int& frame_length = opts.frame_opts.WindowSize(); + const int& sample_rate = opts.frame_opts.samp_freq; + const int& frame_shift = opts.frame_opts.WindowShift(); + const int& fft_points = fft_points_; + const float scale = hanning_window_energy_ * frame_shift; + + if (num_samples < frame_length) { + return 0; + } + + int num_frames = 1 + ((num_samples - frame_length) / frame_shift); + feat.resize(num_frames); + vector fft_real((fft_points_ / 2 + 1), 0); + vector fft_img((fft_points_ / 2 + 1), 0); + vector v(frame_length, 0); + vector power((fft_points / 2 + 1)); + + for (int i = 0; i < num_frames; ++i) { + vector data(wave.data() + i * frame_shift, + wave.data() + i * frame_shift + frame_length); + Hanning(data); + fft_img.clear(); + fft_real.clear(); + v.assign(data.begin(), data.end()); + if (NumpyFft(&v, fft_real, fft_img)) { + LOG(ERROR)<< i << " fft compute occurs error, please checkout the input data"; + return -1; + } + + feat[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz + for (int j = 0; j < (fft_points / 2 + 1); ++j) { + power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j]; + feat[i][j] = power[j]; + + if (j == 0 || j == feat[0].size() - 1) { + feat[i][j] /= scale; + } else { + feat[i][j] *= (2.0 / scale); + } + + // log added eps=1e-14 + feat[i][j] = std::log(feat[i][j] + 1e-14); + } + return 0; +} diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/linear_spectrogram.h new file mode 100644 index 00000000..b69050d1 --- /dev/null +++ b/speechx/speechx/frontend/linear_spectrogram.h @@ -0,0 +1,46 @@ + +#pragma once + +#include "frontend/feature_extractor_interface.h" +#include "kaldi/feat/feature-window.h" +#include "base/common.h" + +namespace ppspeech { + +struct LinearSpectrogramOptions { + kaldi::FrameExtrationOptions frame_opts; + LinearSpectrogramOptions(): + frame_opts() {} + + void Register(kaldi::OptionsItf* opts) { + frame_opts.Register(opts); + } +}; + +class LinearSpectrogram : public FeatureExtractorInterface { + public: + explict LinearSpectrogram(const LinearSpectrogramOptions& opts); + virtual void AcceptWavefrom(const kaldi::Vector& input); + virtual void Read(kaldi::Vector* feat); + virtual size_t Dim() const; + void ReadFeats(kaldi::Matrix* feats) const; + + private: + void Hanning(std::vector& data) const; + kaldi::int32 Compute(const std::vector& wave, + std::vector>& feat) const; + bool NumpyFft(std::vector* v, + std::vector* real, + std::vector* img) const; + + kaldi::int32 fft_points_; + size_t dim_; + std::vector hanning_window_; + kaldi::BaseFloat hanning_window_energy_; + LinearSpectrogramOptions opts_; + std::vector wavefrom_; // remove later, todo(SmileGoat) + DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); +}; + + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/linear_spectrogram_main.cc b/speechx/speechx/frontend/linear_spectrogram_main.cc new file mode 100644 index 00000000..455f4f91 --- /dev/null +++ b/speechx/speechx/frontend/linear_spectrogram_main.cc @@ -0,0 +1,39 @@ + +#include "frontend/linear_spectrogram.h" +#include "kaldi/util/table-types.h" +#include "base/log.h" +#include "base/flags.h" +#include "kaldi/feat/wave-reader.h" + +DEFINE_string(wav_rspecifier, "", "test wav path"); +DEFINE_string(feature_wspecifier, "", "test wav ark"); + +int main(int argc, char* argv[]) { + google::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + + kaldi::SequentialTableReader wav_reader(FLAGS_wav_rspecifier); + kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); + + int32 num_done = 0, num_err = 0; + ppspeech::LinearSpectrogramOptions opt; + ppspeech::LinearSpectrogram linear_spectrogram(opt); + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string utt = wav_reader.Key(); + const kaldi::WaveData &wave_data = wav_reader.Value(); + + int32 this_channel = 0; + kaldi::SubVector waveform(wave_data.Data(), this_channel); + kaldi::Matrix features; + linear_spectrogram.AcceptWaveform(waveform); + linear_spectrogram.ReadFeats(&features); + + feat_writer.Write(utt, features); + if (num_done % 50 == 0 && num_done != 0) + KALDI_VLOG(2) << "Processed " << num_done << " utterances"; + num_done++; + } + KALDI_LOG << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} \ No newline at end of file diff --git a/speechx/speechx/frontend/mfcc.h b/speechx/speechx/frontend/mfcc.h new file mode 100644 index 00000000..aa369655 --- /dev/null +++ b/speechx/speechx/frontend/mfcc.h @@ -0,0 +1,16 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// wrap the mfcc feat of kaldi, todo (SmileGoat) +#include "kaldi/feat/feature-mfcc.h" \ No newline at end of file diff --git a/speechx/speechx/frontend/window.h b/speechx/speechx/frontend/window.h new file mode 100644 index 00000000..5303cad8 --- /dev/null +++ b/speechx/speechx/frontend/window.h @@ -0,0 +1,16 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// extract the window of kaldi feat. + -- GitLab