提交 c6027751 编写于 作者: S SmileGoat

add frontend cmakelist

上级 f03d48f7
......@@ -35,3 +35,7 @@ We borrowed a lot of code from these repos to build `model` and `engine`, thanks
* [librosa](https://github.com/librosa/librosa/blob/main/LICENSE.md)
- ISC License
- Audio feature
* [ThreadPool](https://github.com/progschj/ThreadPool/blob/master/COPYING)
- zlib License
- ThreadPool
......@@ -65,7 +65,7 @@ FetchContent_Declare(
URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc
include_directories(${glog_BINARY_DIR} ${glog_SOURCE_DIR}/src)
# gtest
......@@ -4,11 +4,22 @@ project(speechx LANGUAGES CXX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
add_executable(mfcc-test codelab/feat_test/feature-mfcc-test.cc)
target_link_libraries(mfcc-test kaldi-mfcc)
# Executable built from the feat_test codelab source for the linear spectrogram frontend.
add_executable(linear_spectrogram_main codelab/feat_test/linear_spectrogram_main.cc)
# Kaldi library targets use the `kaldi-` prefix (cf. kaldi-mfcc, kaldi-feat);
# the previous `kaildi-util` spelling did not name any such target.
target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat)
......@@ -16,7 +16,7 @@
#include "kaldi/base/kaldi-types.h"
#include <limits.h>
#include <limits>
typedef float BaseFloat;
typedef double double64;
......@@ -35,7 +35,7 @@ typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
#if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
typedef unsigned long uint64;
typedef unsigned long long uint64;
// Copyright (c) 2012 Jakob Progsch, Václav Zeman
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgment in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source
// distribution.
// this code is from https://github.com/progschj/ThreadPool
......@@ -97,4 +117,4 @@ inline ThreadPool::~ThreadPool()
\ No newline at end of file
......@@ -2,6 +2,7 @@
#include "frontend/linear_spectrogram.h"
#include "frontend/normalizer.h"
#include "frontend/feature_extractor_interface.h"
#include "kaldi/util/table-types.h"
#include "base/log.h"
#include "base/flags.h"
......@@ -22,7 +23,7 @@ int main(int argc, char* argv[]) {
ppspeech::LinearSpectrogramOptions opt;
ppspeech::DecibelNormalizerOptions db_norm_opt;
std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor =
new DecibelNormalizer(db_norm_opt);
new ppspeech::DecibelNormalizer(db_norm_opt);
ppspeech::LinearSpectrogram linear_spectrogram(opt, base_featrue_extractor);
for (; !wav_reader.Done(); wav_reader.Next()) {
# Link the frontend target against kaldi-matrix; the frontend headers include
# kaldi/matrix/kaldi-vector.h.
target_link_libraries(frontend kaldi-matrix)
\ No newline at end of file
......@@ -15,16 +15,14 @@
#pragma once
#include "base/basic_types.h"
#incldue "kaldi/matrix/kaldi-vector.h"
#include "kaldi/matrix/kaldi-vector.h"
namespace ppspeech {
class FeatureExtractorInterface {
virtual void AcceptWaveform(const kaldi::Vector<kaldi::BaseFloat>& input) = 0;
virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat) = 0;
virtual void Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
kaldi::VectorBae<kaldi::BaseFloat>* feature) = 0;
virtual void AcceptWaveform(const kaldi::VectorBase<kaldi::BaseFloat>& input) = 0;
virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat) = 0;
virtual size_t Dim() const = 0;
......@@ -16,15 +16,36 @@
#include "kaldi/base/kaldi-math.h"
#include "kaldi/matrix/matrix-functions.h"
namespace ppspeech {
using kaldi::int32;
using kaldi::BaseFloat;
using kaldi::Vector;
using kaldi::Matrix;
using std::vector;
//todo remove later
void CopyVector2StdVector(const kaldi::Vector<BaseFloat>& input,
vector<BaseFloat>* output) {
if (input.Dim() == 0) return;
for (size_t idx = 0; idx < input.Dim(); ++idx) {
(*output)[idx] = input(idx);
void CopyStdVector2Vector(const vector<BaseFloat>& input,
Vector<BaseFloat>* output) {
if (input.empty()) return;
for (size_t idx = 0; idx < input.size(); ++idx) {
(*output)(idx) = input[idx];
const LinearSpectrogramOptions& opts,
const std::unique_ptr<FeatureExtractorInterface> base_extractor) {
std::unique_ptr<FeatureExtractorInterface> base_extractor) {
base_extractor_ = std::move(base_extractor);
int32 window_size = opts.frame_opts.WindowSize();
int32 window_shift = opts.frame_opts.WindowShift();
......@@ -41,11 +62,8 @@ LinearSpectrogram::LinearSpectrogram(
dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz
void LinearSpectrogram::AcceptWavefrom(const Vector<BaseFloat>& input) {
for (size_t idx = 0; idx < input.Dim(); ++idx) {
waveform_[idx] = input(idx);
void LinearSpectrogram::AcceptWavefrom(const kaldi::VectorBase<BaseFloat>& input) {
void LinearSpectrogram::Hanning(vector<float>* data) const {
......@@ -58,11 +76,11 @@ void LinearSpectrogram::Hanning(vector<float>* data) const {
bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
vector<BaseFloat>* real,
vector<BaseFloat>* img) {
if (RealFft(v, true)) {
LOG(ERROR) << "compute the fft occurs error";
return false;
vector<BaseFloat>* img) const {
Vector<BaseFloat> v_tmp;
CopyStdVector2Vector(*v, &v_tmp);
RealFft(&v_tmp, true);
CopyVector2StdVector(v_tmp, v);
for (int i = 1; i < v->size() / 2; i++) {
......@@ -75,36 +93,28 @@ bool LinearSpectrogram::NumpyFft(vector<BaseFloat>* v,
return true;
//todo remove later
void CopyVector2StdVector(const kaldi::Vector<BaseFloat>& input,
vector<BaseFloat>* output) {
// todo remove later
bool LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) const {
if (wavefrom_.Dim() == 0) {
return false;
kaldi::Vector<BaseFloat> feats;
Compute(wavefrom_, &feats);
void LinearSpectrogram::ReadFeats(Matrix<BaseFloat>* feats) {
Vector<BaseFloat> tmp;
Compute(tmp, &waveform_);
vector<vector<BaseFloat>> result;
vector<BaseFloat> feats_vec;
CopyVector2StdVector(feats, &feats_vec);
CopyVector2StdVector(waveform_, &feats_vec);
Compute(feats_vec, result);
feats->Resize(result.size(), result[0].size());
for (int row_idx = 0; row_idx < result.size(); ++row_idx) {
for (int col_idx = 0; col_idx < result.size(); ++col_idx) {
feats(row_idx, col_idx) = result[row_idx][col_idx];
(*feats)(row_idx, col_idx) = result[row_idx][col_idx];
return true;
// only for test, remove later
// todo: compute the feature frame by frame.
void LinearSpectrogram::Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
kaldi::VectorBae<kaldi::BaseFloat>* feature) {
base_extractor_->Compute(input, feature);
void LinearSpectrogram::Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
kaldi::Vector<kaldi::BaseFloat>* feature) {
// Compute spectrogram feat, only for test, remove later
......@@ -112,9 +122,9 @@ void LinearSpectrogram::Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input
bool LinearSpectrogram::Compute(const vector<float>& wave,
vector<vector<float>>& feat) {
int num_samples = wave.size();
const int& frame_length = opts.frame_opts.WindowSize();
const int& sample_rate = opts.frame_opts.samp_freq;
const int& frame_shift = opts.frame_opts.WindowShift();
const int& frame_length = opts_.frame_opts.WindowSize();
const int& sample_rate = opts_.frame_opts.samp_freq;
const int& frame_shift = opts_.frame_opts.WindowShift();
const int& fft_points = fft_points_;
const float scale = hanning_window_energy_ * frame_shift;
......@@ -132,11 +142,11 @@ bool LinearSpectrogram::Compute(const vector<float>& wave,
for (int i = 0; i < num_frames; ++i) {
vector<float> data(wave.data() + i * frame_shift,
wave.data() + i * frame_shift + frame_length);
v.assign(data.begin(), data.end());
if (NumpyFft(&v, fft_real, fft_img)) {
if (NumpyFft(&v, &fft_real, &fft_img)) {
LOG(ERROR)<< i << " fft compute occurs error, please checkout the input data";
return false;
......@@ -155,5 +165,8 @@ bool LinearSpectrogram::Compute(const vector<float>& wave,
// log added eps=1e-14
feat[i][j] = std::log(feat[i][j] + 1e-14);
return true;
} // namespace ppspeech
\ No newline at end of file
......@@ -8,7 +8,7 @@
namespace ppspeech {
struct LinearSpectrogramOptions {
kaldi::FrameExtrationOptions frame_opts;
kaldi::FrameExtractionOptions frame_opts;
frame_opts() {}
......@@ -19,19 +19,19 @@ struct LinearSpectrogramOptions {
class LinearSpectrogram : public FeatureExtractorInterface {
explict LinearSpectrogram(const LinearSpectrogramOptions& opts,
const std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input);
virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
explicit LinearSpectrogram(const LinearSpectrogramOptions& opts,
std::unique_ptr<FeatureExtractorInterface> base_extractor);
virtual void AcceptWavefrom(const kaldi::VectorBase<kaldi::BaseFloat>& input);
virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
virtual size_t Dim() const { return dim_; }
void ReadFeats(kaldi::Matrix<kaldi::BaesFloat>* feats) const;
void ReadFeats(kaldi::Matrix<kaldi::BaseFloat>* feats);
void Hanning(std::vector<kaldi::BaseFloat>& data) const;
kaldi::int32 Compute(const std::vector<kaldi::BaseFloat>& wave,
std::vector<std::vector<kaldi::BaseFloat>>& feat);
void Compute(const kaldi::VectorBase<kaldi::BaseFloat>& input,
kaldi::VectorBae<kaldi::BaseFloat>* feature);
void Hanning(std::vector<kaldi::BaseFloat>* data) const;
bool Compute(const std::vector<kaldi::BaseFloat>& wave,
std::vector<std::vector<kaldi::BaseFloat>>& feat);
void Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
kaldi::Vector<kaldi::BaseFloat>* feature);
bool NumpyFft(std::vector<kaldi::BaseFloat>* v,
std::vector<kaldi::BaseFloat>* real,
std::vector<kaldi::BaseFloat>* img) const;
......@@ -41,7 +41,7 @@ class LinearSpectrogram : public FeatureExtractorInterface {
std::vector<kaldi::BaseFloat> hanning_window_;
kaldi::BaseFloat hanning_window_energy_;
LinearSpectrogramOptions opts_;
kaldi::Vector<kaldi::BaseFloat> wavefrom_; // remove later, todo(SmileGoat)
kaldi::Vector<kaldi::BaseFloat> waveform_; // remove later, todo(SmileGoat)
std::unique_ptr<FeatureExtractorInterface> base_extractor_;
#include "frontend/normalizer.h"
const DecibelNormalizerOptions& opts) {
namespace ppspeech {
using kaldi::Vector;
using kaldi::BaseFloat;
using std::vector;
DecibelNormalizer::DecibelNormalizer(const DecibelNormalizerOptions& opts) {
opts_ = opts;
void DecibelNormalizer::AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input) {
void DecibelNormalizer::AcceptWavefrom(const Vector<BaseFloat>& input) {
waveform_ = input;
void DecibelNormalizer::Read(Vector<BaseFloat>* feat) {
if (waveform_.Dim() == 0) return;
Compute(waveform_, feat);
void DecibelNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
//todo remove later
void CopyVector2StdVector(const kaldi::Vector<BaseFloat>& input,
vector<BaseFloat>* output) {
if (input.Dim() == 0) return;
for (size_t idx = 0; idx < input.Dim(); ++idx) {
(*output)[idx] = input(idx);
void CopyStdVector2Vector(const vector<BaseFloat>& input,
Vector<BaseFloat>* output) {
if (input.empty()) return;
for (size_t idx = 0; idx < input.size(); ++idx) {
(*output)(idx) = input[idx];
bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
kaldi::Vector<kaldi::BaseFloat>* feat) {
bool DecibelNormalizer::Compute(const Vector<BaseFloat>& input,
Vector<BaseFloat>* feat) const {
// calculate db rms
float rms_db = 0.0;
float mean_square = 0.0;
float gain = 0.0;
vector<BaseFloat> smaples;
BaseFloat rms_db = 0.0;
BaseFloat mean_square = 0.0;
BaseFloat gain = 0.0;
BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
vector<BaseFloat> samples;
for (int32 i = 0; i < samples.size(); ++i) {
samples[i] = input(i);
// square
for (auto &d : samples) {
if (_opts.convert_int_float) {
if (opts_.convert_int_float) {
d = d * wave_float_normlization;
mean_square += d * d;
......@@ -37,12 +64,12 @@ bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
// mean
mean_square /= samples.size();
rms_db = 10 * std::log10(mean_square);
gain = opts.target_db - rms_db;
gain = opts_.target_db - rms_db;
if (gain > opts.max_gain_db) {
LOG(ERROR) << "Unable to normalize segment to " << opts.target_db << "dB,"
<< "because the the probable gain have exceeds opts.max_gain_db"
<< opts.max_gain_db << "dB.";
if (gain > opts_.max_gain_db) {
LOG(ERROR) << "Unable to normalize segment to " << opts_.target_db << "dB,"
<< "because the the probable gain have exceeds opts_.max_gain_db"
<< opts_.max_gain_db << "dB.";
return false;
......@@ -51,27 +78,28 @@ bool DecibelNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
// python item *= 10.0 ** (gain / 20.0)
item *= std::pow(10.0, gain / 20.0);
CopyStdVector2Vector(samples, feat);
return true;
const PPNormalizerOptions& opts,
const std::unique_ptr<FeatureExtractorInterface>& pre_extractor) {
void PPNormalizer::AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input) {
void PPNormalizer::AcceptWavefrom(const Vector<BaseFloat>& input) {
void PPNormalizer::Read(kaldi::Vector<kaldi::BaseFloat>* feat) {
void PPNormalizer::Read(Vector<BaseFloat>* feat) {
bool PPNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
kaldi::Vector<kaldi::BaseFloat>>* feat) {
bool PPNormalizer::Compute(const Vector<BaseFloat>& input,
Vector<BaseFloat>>* feat) {
if ((input.Dim() % mean_.Dim()) == 0) {
LOG(ERROR) << "CMVN dimension is wrong!";
return false;
......@@ -93,4 +121,6 @@ bool PPNormalizer::Compute(const Vector<kaldi::BaseFloat>& input,
return true;
} // namespace ppspeech
\ No newline at end of file
#pragma once
#include "base/common.h"
#include "frontend/feature_extractor_interface.h"
#include "kaldi/util/options-itf.h"
namespace ppspeech {
......@@ -9,6 +11,7 @@ namespace ppspeech {
struct DecibelNormalizerOptions {
float target_db;
float max_gain_db;
bool convert_int_float;
DecibelNormalizerOptions() :
......@@ -23,16 +26,19 @@ struct DecibelNormalizerOptions {
class DecibelNormalizer : public FeatureExtractorInterface {
explict DecibelNormalizer(const DecibelNormalizerOptions& opts,
const std::unique_ptr<FeatureExtractorInterface>& pre_extractor);
virtual void AcceptWavefrom(const kaldi::Vector<kaldi::BaseFloat>& input);
virtual void Read(kaldi::Vector<kaldi::BaseFloat>* feat);
virtual size_t Dim() const;
explicit DecibelNormalizer(const DecibelNormalizerOptions& opts);
virtual void AcceptWavefrom(const kaldi::VectorBase<kaldi::BaseFloat>& input);
virtual void Read(kaldi::VectorBase<kaldi::BaseFloat>* feat);
virtual size_t Dim() const { return 0; }
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& input,
kaldi::Vector<kaldi::BaseFloat>>* feat);
kaldi::Vector<kaldi::BaseFloat>* feat) const;
DecibelNormalizerOptions opts_;
std::unique_ptr<FeatureExtractorInterface> base_extractor_;
kaldi::Vector<kaldi::BaseFloat> waveform_;
struct NormalizerOptions {
std::string mean_std_path;
NormalizerOptions() :
......@@ -61,5 +67,5 @@ class PPNormalizer : public FeatureExtractorInterface {
kaldi::Vector<float> variance_;
NormalizerOptions _opts;
} // namespace ppspeech
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册