提交 cb0b6785 编写于 作者: Y Yang Zhou

add fbank into feature pipeline

上级 a36ecccf
...@@ -43,7 +43,7 @@ int main(int argc, char* argv[]) { ...@@ -43,7 +43,7 @@ int main(int argc, char* argv[]) {
int32 num_done = 0, num_err = 0; int32 num_done = 0, num_err = 0;
// feature pipeline: wave cache --> hanning window // feature pipeline: wave cache --> povey window
// -->fbank --> global cmvn -> feat cache // -->fbank --> global cmvn -> feat cache
std::unique_ptr<ppspeech::FrontendInterface> data_source( std::unique_ptr<ppspeech::FrontendInterface> data_source(
...@@ -78,7 +78,6 @@ int main(int argc, char* argv[]) { ...@@ -78,7 +78,6 @@ int main(int argc, char* argv[]) {
LOG(INFO) << "chunk size (s): " << streaming_chunk; LOG(INFO) << "chunk size (s): " << streaming_chunk;
LOG(INFO) << "chunk size (sample): " << chunk_sample_size; LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
for (; !wav_reader.Done(); wav_reader.Next()) { for (; !wav_reader.Done(); wav_reader.Next()) {
std::string utt = wav_reader.Key(); std::string utt = wav_reader.Key();
const kaldi::WaveData& wave_data = wav_reader.Value(); const kaldi::WaveData& wave_data = wav_reader.Value();
......
...@@ -47,7 +47,8 @@ DEFINE_string(model_cache_names, ...@@ -47,7 +47,8 @@ DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box", "chunk_state_h_box,chunk_state_c_box",
"model cache names"); "model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
DEFINE_bool(use_fbank, false, "use fbank or linear feature");
DEFINE_int32(num_bins, 161, "num bins of mel");
namespace ppspeech { namespace ppspeech {
// todo refactor later // todo refactor later
...@@ -57,13 +58,21 @@ FeaturePipelineOptions InitFeaturePipelineOptions() { ...@@ -57,13 +58,21 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk; opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
opts.to_float32 = FLAGS_to_float32; opts.to_float32 = FLAGS_to_float32;
kaldi::FrameExtractionOptions frame_opts; kaldi::FrameExtractionOptions frame_opts;
frame_opts.frame_length_ms = 20;
frame_opts.frame_shift_ms = 10;
frame_opts.remove_dc_offset = false;
frame_opts.window_type = "hanning";
frame_opts.preemph_coeff = 0.0;
frame_opts.dither = 0.0; frame_opts.dither = 0.0;
opts.linear_spectrogram_opts.frame_opts = frame_opts; frame_opts.frame_shift_ms = 10;
opts.use_fbank = FLAGS_use_fbank;
if (opts.use_fbank) {
frame_opts.window_type = "povey";
frame_opts.frame_length_ms = 25;
opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
} else {
frame_opts.remove_dc_offset = false;
frame_opts.frame_length_ms = 20;
frame_opts.window_type = "hanning";
frame_opts.preemph_coeff = 0.0;
opts.linear_spectrogram_opts.frame_opts = frame_opts;
}
opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length; opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length;
opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate; opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate;
return opts; return opts;
......
...@@ -10,4 +10,4 @@ add_library(frontend STATIC ...@@ -10,4 +10,4 @@ add_library(frontend STATIC
fbank.cc fbank.cc
) )
target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common) target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)
...@@ -29,6 +29,8 @@ using kaldi::VectorBase; ...@@ -29,6 +29,8 @@ using kaldi::VectorBase;
using kaldi::Matrix; using kaldi::Matrix;
using std::vector; using std::vector;
// todo refactor later:(SmileGoat)
Fbank::Fbank(const FbankOptions& opts, Fbank::Fbank(const FbankOptions& opts,
std::unique_ptr<FrontendInterface> base_extractor) std::unique_ptr<FrontendInterface> base_extractor)
: opts_(opts), : opts_(opts),
...@@ -98,12 +100,22 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) { ...@@ -98,12 +100,22 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined); Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
// note: this online feature-extraction code does not support VTLN. // note: this online feature-extraction code does not support VTLN.
BaseFloat vtln_warp = 1.0; RealFft(&window, true);
computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature); kaldi::ComputePowerSpectrum(&window);
const kaldi::MelBanks &mel_bank = *(computer_.GetMelBanks(1.0));
SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
if (!opts_.fbank_opts.use_power) {
power_spectrum.ApplyPow(0.5);
}
int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0);
SubVector<BaseFloat> mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
mel_bank.Compute(power_spectrum, &mel_energies);
mel_energies.ApplyFloor(1e-07);
mel_energies.ApplyLog();
SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim()); SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
output_row.CopyFromVec(this_feature); output_row.CopyFromVec(this_feature);
} }
return true; return true;
} }
} // namespace ppspeech } // namespace ppspeech
\ No newline at end of file
...@@ -74,4 +74,4 @@ class Fbank : public FrontendInterface { ...@@ -74,4 +74,4 @@ class Fbank : public FrontendInterface {
DISALLOW_COPY_AND_ASSIGN(Fbank); DISALLOW_COPY_AND_ASSIGN(Fbank);
}; };
} // namespace ppspeech } // namespace ppspeech
\ No newline at end of file
...@@ -22,12 +22,18 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { ...@@ -22,12 +22,18 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
unique_ptr<FrontendInterface> data_source( unique_ptr<FrontendInterface> data_source(
new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); new ppspeech::AudioCache(1000 * kint16max, opts.to_float32));
unique_ptr<FrontendInterface> linear_spectrogram( unique_ptr<FrontendInterface> base_feature;
new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,
std::move(data_source))); if (opts.use_fbank) {
base_feature.reset(new ppspeech::Fbank(opts.fbank_opts,
std::move(data_source)));
} else {
base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,
std::move(data_source)));
}
unique_ptr<FrontendInterface> cmvn( unique_ptr<FrontendInterface> cmvn(
new ppspeech::CMVN(opts.cmvn_file, std::move(linear_spectrogram))); new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));
base_extractor_.reset( base_extractor_.reset(
new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn))); new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn)));
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "frontend/audio/feature_cache.h" #include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h" #include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/normalizer.h" #include "frontend/audio/normalizer.h"
namespace ppspeech { namespace ppspeech {
...@@ -28,12 +29,16 @@ namespace ppspeech { ...@@ -28,12 +29,16 @@ namespace ppspeech {
struct FeaturePipelineOptions { struct FeaturePipelineOptions {
std::string cmvn_file; std::string cmvn_file;
bool to_float32; bool to_float32;
bool use_fbank;
LinearSpectrogramOptions linear_spectrogram_opts; LinearSpectrogramOptions linear_spectrogram_opts;
FbankOptions fbank_opts;
FeatureCacheOptions feature_cache_opts; FeatureCacheOptions feature_cache_opts;
FeaturePipelineOptions() FeaturePipelineOptions()
: cmvn_file(""), : cmvn_file(""),
to_float32(false), to_float32(false),
use_fbank(false),
linear_spectrogram_opts(), linear_spectrogram_opts(),
fbank_opts(),
feature_cache_opts() {} feature_cache_opts() {}
}; };
......
...@@ -85,10 +85,9 @@ void FbankComputer::Compute(BaseFloat signal_raw_log_energy, ...@@ -85,10 +85,9 @@ void FbankComputer::Compute(BaseFloat signal_raw_log_energy,
signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame), signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
std::numeric_limits<float>::epsilon())); std::numeric_limits<float>::epsilon()));
// todo : remove later; as align fbank feature in paddleaudio if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
//if (srfft_ != NULL) // Compute FFT using split-radix algorithm. srfft_->Compute(signal_frame->Data(), true);
// srfft_->Compute(signal_frame->Data(), true); else // An alternative algorithm that works for non-powers-of-two.
//else // An alternative algorithm that works for non-powers-of-two.
RealFft(signal_frame, true); RealFft(signal_frame, true);
// Convert the FFT into a power spectrum. // Convert the FFT into a power spectrum.
......
...@@ -128,8 +128,8 @@ class FbankComputer { ...@@ -128,8 +128,8 @@ class FbankComputer {
~FbankComputer(); ~FbankComputer();
private:
const MelBanks *GetMelBanks(BaseFloat vtln_warp); const MelBanks *GetMelBanks(BaseFloat vtln_warp);
private:
FbankOptions opts_; FbankOptions opts_;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册