提交 cb0b6785 编写于 作者: Y Yang Zhou

add fbank into feature pipeline

上级 a36ecccf
......@@ -43,7 +43,7 @@ int main(int argc, char* argv[]) {
int32 num_done = 0, num_err = 0;
// feature pipeline: wave cache --> hanning window
// feature pipeline: wave cache --> povey window
// --> fbank --> global cmvn --> feat cache
std::unique_ptr<ppspeech::FrontendInterface> data_source(
......@@ -78,7 +78,6 @@ int main(int argc, char* argv[]) {
LOG(INFO) << "chunk size (s): " << streaming_chunk;
LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
for (; !wav_reader.Done(); wav_reader.Next()) {
std::string utt = wav_reader.Key();
const kaldi::WaveData& wave_data = wav_reader.Value();
......
......@@ -47,7 +47,8 @@ DEFINE_string(model_cache_names,
"chunk_state_h_box,chunk_state_c_box",
"model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
DEFINE_bool(use_fbank, false, "use fbank or linear feature");
DEFINE_int32(num_bins, 161, "num bins of mel");
namespace ppspeech {
// todo refactor later
......@@ -57,13 +58,21 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
opts.to_float32 = FLAGS_to_float32;
kaldi::FrameExtractionOptions frame_opts;
frame_opts.frame_length_ms = 20;
frame_opts.frame_shift_ms = 10;
frame_opts.remove_dc_offset = false;
frame_opts.window_type = "hanning";
frame_opts.preemph_coeff = 0.0;
frame_opts.dither = 0.0;
opts.linear_spectrogram_opts.frame_opts = frame_opts;
frame_opts.frame_shift_ms = 10;
opts.use_fbank = FLAGS_use_fbank;
if (opts.use_fbank) {
frame_opts.window_type = "povey";
frame_opts.frame_length_ms = 25;
opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
opts.fbank_opts.fbank_opts.frame_opts = frame_opts;
} else {
frame_opts.remove_dc_offset = false;
frame_opts.frame_length_ms = 20;
frame_opts.window_type = "hanning";
frame_opts.preemph_coeff = 0.0;
opts.linear_spectrogram_opts.frame_opts = frame_opts;
}
opts.feature_cache_opts.frame_chunk_size = FLAGS_receptive_field_length;
opts.feature_cache_opts.frame_chunk_stride = FLAGS_downsampling_rate;
return opts;
......
......@@ -10,4 +10,4 @@ add_library(frontend STATIC
fbank.cc
)
target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common)
target_link_libraries(frontend PUBLIC kaldi-matrix kaldi-feat-common kaldi-fbank)
......@@ -29,6 +29,8 @@ using kaldi::VectorBase;
using kaldi::Matrix;
using std::vector;
// TODO(SmileGoat): refactor later
Fbank::Fbank(const FbankOptions& opts,
std::unique_ptr<FrontendInterface> base_extractor)
: opts_(opts),
......@@ -98,12 +100,22 @@ bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) {
Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
// note: this online feature-extraction code does not support VTLN.
BaseFloat vtln_warp = 1.0;
computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature);
RealFft(&window, true);
kaldi::ComputePowerSpectrum(&window);
const kaldi::MelBanks &mel_bank = *(computer_.GetMelBanks(1.0));
SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
if (!opts_.fbank_opts.use_power) {
power_spectrum.ApplyPow(0.5);
}
int32 mel_offset = ((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1 : 0);
SubVector<BaseFloat> mel_energies(this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
mel_bank.Compute(power_spectrum, &mel_energies);
mel_energies.ApplyFloor(1e-07);
mel_energies.ApplyLog();
SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
output_row.CopyFromVec(this_feature);
}
return true;
}
} // namespace ppspeech
\ No newline at end of file
} // namespace ppspeech
......@@ -74,4 +74,4 @@ class Fbank : public FrontendInterface {
DISALLOW_COPY_AND_ASSIGN(Fbank);
};
} // namespace ppspeech
\ No newline at end of file
} // namespace ppspeech
......@@ -22,12 +22,18 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) {
unique_ptr<FrontendInterface> data_source(
new ppspeech::AudioCache(1000 * kint16max, opts.to_float32));
unique_ptr<FrontendInterface> linear_spectrogram(
new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,
std::move(data_source)));
unique_ptr<FrontendInterface> base_feature;
if (opts.use_fbank) {
base_feature.reset(new ppspeech::Fbank(opts.fbank_opts,
std::move(data_source)));
} else {
base_feature.reset(new ppspeech::LinearSpectrogram(opts.linear_spectrogram_opts,
std::move(data_source)));
}
unique_ptr<FrontendInterface> cmvn(
new ppspeech::CMVN(opts.cmvn_file, std::move(linear_spectrogram)));
new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature)));
base_extractor_.reset(
new ppspeech::FeatureCache(opts.feature_cache_opts, std::move(cmvn)));
......
......@@ -21,6 +21,7 @@
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/normalizer.h"
namespace ppspeech {
......@@ -28,12 +29,16 @@ namespace ppspeech {
// Configuration bundle for building a feature pipeline: selects the base
// feature extractor and carries the option sets of the individual stages.
struct FeaturePipelineOptions {
    // CMVN file handed to the normalization stage; empty by default.
    std::string cmvn_file = "";
    // Forwarded to the audio cache at the head of the pipeline.
    bool to_float32 = false;
    // true: use the Fbank extractor; false: use the linear spectrogram.
    bool use_fbank = false;
    // Options for the linear-spectrogram extractor.
    LinearSpectrogramOptions linear_spectrogram_opts{};
    // Options for the fbank extractor.
    FbankOptions fbank_opts{};
    // Options for the feature cache at the tail of the pipeline.
    FeatureCacheOptions feature_cache_opts{};

    // Every member takes the default declared next to it.
    FeaturePipelineOptions() {}
};
......
......@@ -85,10 +85,9 @@ void FbankComputer::Compute(BaseFloat signal_raw_log_energy,
signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
std::numeric_limits<float>::epsilon()));
// todo : remove later; as align fbank feature in paddleaudio
//if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
// srfft_->Compute(signal_frame->Data(), true);
//else // An alternative algorithm that works for non-powers-of-two.
if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
srfft_->Compute(signal_frame->Data(), true);
else // An alternative algorithm that works for non-powers-of-two.
RealFft(signal_frame, true);
// Convert the FFT into a power spectrum.
......
......@@ -128,8 +128,8 @@ class FbankComputer {
~FbankComputer();
private:
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
private:
FbankOptions opts_;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册