未验证 提交 ee7c266f 编写于 作者: Y YangZhou 提交者: GitHub

[speechx] rm openblas && refactor kaldi-matrix, kaldi-vector (#2824)

* rm openblas && refactor kaldi-matrix kaldi-vector
上级 c1b1ae05
......@@ -53,9 +53,6 @@ include(gflags)
include(glog)
#openblas
include(openblas)
# openfst
include(openfst)
add_dependencies(openfst gflags glog)
......
......@@ -14,7 +14,7 @@
#include "decoder/ctc_prefix_beam_search_decoder.h"
#include "base/common.h"
#include "frontend/audio/data_cache.h"
#include "frontend/data_cache.h"
#include "fst/symbol-table.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
......
......@@ -14,7 +14,7 @@
#include "base/common.h"
#include "kaldi/decoder/decodable-itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "matrix/kaldi-matrix.h"
#include "nnet/nnet_itf.h"
#include "nnet/nnet_producer.h"
......
......@@ -15,7 +15,6 @@
#include "base/basic_types.h"
#include "kaldi/base/kaldi-types.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
DECLARE_int32(subsampling_rate);
......
......@@ -13,10 +13,10 @@
// limitations under the License.
#include "nnet/nnet_producer.h"
#include "matrix/kaldi-matrix.h"
namespace ppspeech {
using kaldi::Vector;
using std::vector;
using kaldi::BaseFloat;
......
......@@ -16,7 +16,7 @@
#include "base/common.h"
#include "base/safe_queue.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
#include "nnet/nnet_itf.h"
namespace ppspeech {
......
......@@ -18,7 +18,7 @@
#pragma once
#include "base/common.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "matrix/kaldi-matrix.h"
#include "nnet/nnet_itf.h"
#include "paddle/extension.h"
#include "paddle/jit/all.h"
......
......@@ -15,8 +15,8 @@
#include "base/common.h"
#include "decoder/param.h"
#include "frontend/audio/assembler.h"
#include "frontend/audio/data_cache.h"
#include "frontend/assembler.h"
#include "frontend/data_cache.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/u2_nnet.h"
......
......@@ -15,7 +15,7 @@ set(TEST_BINS
foreach(bin_name IN LISTS TEST_BINS)
add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-feat-common)
target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util)
target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
......
......@@ -18,7 +18,7 @@
#include "decoder/ctc_beam_search_opt.h"
#include "decoder/ctc_prefix_beam_search_decoder.h"
#include "decoder/decoder_itf.h"
#include "frontend/audio/feature_pipeline.h"
#include "frontend/feature_pipeline.h"
#include "fst/fstlib.h"
#include "fst/symbol-table.h"
#include "nnet/decodable.h"
......
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "decoder/param.h"
#include "kaldi/feat/wave-reader.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/table-types.h"
#include "recognizer/u2_recognizer.h"
......
......@@ -14,7 +14,7 @@
#include "recognizer/u2_recognizer.h"
#include "decoder/param.h"
#include "kaldi/feat/wave-reader.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/table-types.h"
DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
......
......@@ -4,6 +4,8 @@ ${CMAKE_CURRENT_SOURCE_DIR}/../
)
add_subdirectory(utils)
add_subdirectory(matrix)
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/frontend
)
......
# Builds the kaldi-native-fbank-core library from the sources listed below.
# No library type is given, so CMake chooses STATIC or SHARED from
# BUILD_SHARED_LIBS.  Note that fftsg.c is a C source; the rest are C++.
add_library(kaldi-native-fbank-core
    feature-fbank.cc
    feature-functions.cc
    feature-window.cc
    fftsg.c
    mel-computations.cc
    rfft.cc
)
add_subdirectory(audio)
\ No newline at end of file
# The frontend static library, built from the sources listed below.
add_library(frontend STATIC
    cmvn.cc
    audio_cache.cc
    feature_cache.cc
    feature_pipeline.cc
    assembler.cc
    wave-reader.cc
)

# PUBLIC: targets that link frontend also link these two libraries.
target_link_libraries(frontend
    PUBLIC
        kaldi-native-fbank-core
        utils
)
# Executables built from this directory; each name maps to <name>.cc.
set(BINS
    compute_fbank_main
)

# One executable per entry in BINS.  Executables are end points of the link
# graph, so their dependencies are linked PRIVATE (PUBLIC would only describe
# a link interface that nothing consumes).
foreach(bin_name IN LISTS BINS)
    add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
    target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
    target_link_libraries(${bin_name} PRIVATE frontend utils kaldi-util gflags glog)
endforeach()
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/assembler.h"
#include "frontend/assembler.h"
namespace ppspeech {
......
......@@ -15,7 +15,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
namespace ppspeech {
......
# Builds the kaldi-native-fbank-core library from the sources listed below.
# No library type is given, so CMake chooses STATIC or SHARED from
# BUILD_SHARED_LIBS.  Note that fftsg.c is a C source; the rest are C++.
add_library(kaldi-native-fbank-core
    feature-fbank.cc
    feature-functions.cc
    feature-window.cc
    fftsg.c
    mel-computations.cc
    rfft.cc
)
# The frontend static library, built from the sources listed below.
add_library(frontend STATIC
    cmvn.cc
    audio_cache.cc
    feature_cache.cc
    feature_pipeline.cc
    assembler.cc
)

# PUBLIC: targets that link frontend also link these two libraries.
target_link_libraries(frontend
    PUBLIC
        kaldi-native-fbank-core
        utils
)
# Executables built from this directory; each name maps to <name>.cc.
set(BINS
    compute_fbank_main
)

# One executable per entry in BINS.  Executables are end points of the link
# graph, so their dependencies are linked PRIVATE (PUBLIC would only describe
# a link interface that nothing consumes).
foreach(bin_name IN LISTS BINS)
    add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
    target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
    target_link_libraries(${bin_name} PRIVATE frontend utils kaldi-util gflags glog kaldi-feat-common)
endforeach()
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/audio_cache.h"
#include "frontend/audio_cache.h"
#include "kaldi/base/timer.h"
......
......@@ -16,7 +16,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
namespace ppspeech {
......
......@@ -13,7 +13,7 @@
// limitations under the License.
#include "frontend/audio/cmvn.h"
#include "frontend/cmvn.h"
#include "utils/file_utils.h"
#include "utils/picojson.h"
......
......@@ -15,8 +15,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "frontend/frontend_itf.h"
#include "kaldi/util/options-itf.h"
namespace ppspeech {
......
......@@ -16,13 +16,13 @@
#include "base/flags.h"
#include "base/log.h"
#include "frontend/audio/audio_cache.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/normalizer.h"
#include "kaldi/feat/wave-reader.h"
#include "frontend/audio_cache.h"
#include "frontend/data_cache.h"
#include "frontend/fbank.h"
#include "frontend/feature_cache.h"
#include "frontend/frontend_itf.h"
#include "frontend/normalizer.h"
#include "frontend/wave-reader.h"
#include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h"
......
......@@ -16,7 +16,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
using std::vector;
......
......@@ -15,8 +15,8 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/feature-fbank.h"
#include "frontend/feature_common.h"
#include "frontend/feature-fbank.h"
namespace ppspeech {
......
......@@ -18,11 +18,11 @@
// This file is copied/modified from kaldi/src/feat/feature-fbank.cc
//
#include "frontend/audio/feature-fbank.h"
#include "frontend/feature-fbank.h"
#include <cmath>
#include "frontend/audio/feature-functions.h"
#include "frontend/feature-functions.h"
namespace knf {
......
......@@ -23,9 +23,9 @@
#include <map>
#include "frontend/audio/feature-window.h"
#include "frontend/audio/mel-computations.h"
#include "frontend/audio/rfft.h"
#include "frontend/feature-window.h"
#include "frontend/mel-computations.h"
#include "frontend/rfft.h"
namespace knf {
......
......@@ -18,7 +18,7 @@
// This file is copied/modified from kaldi/src/feat/feature-functions.cc
#include "frontend/audio/feature-functions.h"
#include "frontend/feature-functions.h"
#include <cstdint>
#include <vector>
......
......@@ -4,7 +4,7 @@
// This file is copied/modified from kaldi/src/feat/feature-window.cc
#include "frontend/audio/feature-window.h"
#include "frontend/feature-window.h"
#include <cmath>
#include <vector>
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/feature_cache.h"
#include "frontend/feature_cache.h"
namespace ppspeech {
......
......@@ -15,7 +15,7 @@
#pragma once
#include "base/common.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/frontend_itf.h"
namespace ppspeech {
......
......@@ -15,7 +15,7 @@
#pragma once
#include "frontend_itf.h"
#include "frontend/audio/feature-window.h"
#include "frontend/feature-window.h"
namespace ppspeech {
......@@ -52,4 +52,4 @@ class StreamingFeatureTpl : public FrontendInterface {
} // namespace ppspeech
#include "frontend/audio/feature_common_inl.h"
#include "frontend/feature_common_inl.h"
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "frontend/audio/feature_pipeline.h"
#include "frontend/feature_pipeline.h"
namespace ppspeech {
......
......@@ -16,13 +16,13 @@
#pragma once
#include "frontend/audio/assembler.h"
#include "frontend/audio/audio_cache.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/feature_cache.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/normalizer.h"
#include "frontend/assembler.h"
#include "frontend/audio_cache.h"
#include "frontend/data_cache.h"
#include "frontend/fbank.h"
#include "frontend/feature_cache.h"
#include "frontend/frontend_itf.h"
#include "frontend/cmvn.h"
// feature
DECLARE_bool(fill_zero);
......
......@@ -15,7 +15,7 @@
#pragma once
#include "base/basic_types.h"
#include "kaldi/matrix/kaldi-vector.h"
#include "matrix/kaldi-vector.h"
namespace ppspeech {
......
......@@ -18,12 +18,12 @@
// This file is copied/modified from kaldi/src/feat/mel-computations.cc
#include "frontend/audio/mel-computations.h"
#include "frontend/mel-computations.h"
#include <algorithm>
#include <sstream>
#include "frontend/audio/feature-window.h"
#include "frontend/feature-window.h"
namespace knf {
......
......@@ -22,7 +22,7 @@
#include <cmath>
#include <string>
#include "frontend/audio/feature-window.h"
#include "frontend/feature-window.h"
namespace knf {
......
......@@ -14,5 +14,4 @@
#pragma once
#include "frontend/audio/cmvn.h"
#include "frontend/audio/db_norm.h"
\ No newline at end of file
#include "frontend/cmvn.h"
\ No newline at end of file
......@@ -16,7 +16,7 @@
* limitations under the License.
*/
#include "frontend/audio/rfft.h"
#include "frontend/rfft.h"
#include <cmath>
#include <vector>
......
......@@ -25,7 +25,7 @@
#include <sstream>
#include <vector>
#include "feat/wave-reader.h"
#include "frontend/wave-reader.h"
#include "base/kaldi-error.h"
#include "base/kaldi-utils.h"
......
# Builds the kaldi-matrix library from the two sources below.  No library type
# is given, so CMake chooses STATIC or SHARED from BUILD_SHARED_LIBS.
add_library(kaldi-matrix
    kaldi-matrix.cc
    kaldi-vector.cc
)

# Explicit PUBLIC keyword instead of the legacy keyword-less signature: the
# dependency on kaldi-base stays transitive for consumers, and the call now
# matches the keyword form used by the other library targets.
target_link_libraries(kaldi-matrix PUBLIC kaldi-base)
......@@ -28,7 +28,7 @@ namespace kaldi {
/// Default constructor: initializes the MatrixBase part with a NULL data
/// pointer and zeros for the three remaining size arguments, i.e. an empty
/// matrix that owns no storage.
template<typename Real>
Matrix<Real>::Matrix(): MatrixBase<Real>(NULL, 0, 0, 0) { }
/*
template<>
template<>
void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float> &ra, const VectorBase<float> &rb);
......@@ -36,6 +36,7 @@ void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float> &ra
template<>
template<>
void MatrixBase<double>::AddVecVec(const double alpha, const VectorBase<double> &ra, const VectorBase<double> &rb);
*/
template<typename Real>
inline std::ostream & operator << (std::ostream & os, const MatrixBase<Real> & M) {
......
......@@ -23,17 +23,9 @@
// limitations under the License.
#include "matrix/kaldi-matrix.h"
#include "matrix/sp-matrix.h"
#include "matrix/jama-svd.h"
#include "matrix/jama-eig.h"
#include "matrix/compressed-matrix.h"
#include "matrix/sparse-matrix.h"
static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans),
"kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!");
namespace kaldi {
/*
template<typename Real>
void MatrixBase<Real>::Invert(Real *log_det, Real *det_sign,
bool inverse_needed) {
......@@ -206,29 +198,30 @@ void MatrixBase<Real>::SetMatMatDivMat(const MatrixBase<Real>& A,
}
}
}
*/
//template<typename Real>
//void MatrixBase<Real>::CopyLowerToUpper() {
//KALDI_ASSERT(num_rows_ == num_cols_);
//Real *data = data_;
//MatrixIndexT num_rows = num_rows_, stride = stride_;
//for (int32 i = 0; i < num_rows; i++)
//for (int32 j = 0; j < i; j++)
//data[j * stride + i ] = data[i * stride + j];
//}
// Copies every element strictly below the diagonal onto its mirror position
// above the diagonal, in place.  The matrix must be square (asserted).
template<typename Real>
void MatrixBase<Real>::CopyLowerToUpper() {
  KALDI_ASSERT(num_rows_ == num_cols_);
  const MatrixIndexT dim = num_rows_, row_stride = stride_;
  Real *elems = data_;
  for (int32 row = 0; row < dim; ++row) {
    // Row `row` is the source; only its first `row` entries lie below the
    // diagonal.
    const Real *lower_row = elems + row * row_stride;
    for (int32 col = 0; col < row; ++col) {
      elems[col * row_stride + row] = lower_row[col];
    }
  }
}
//template<typename Real>
//void MatrixBase<Real>::CopyUpperToLower() {
//KALDI_ASSERT(num_rows_ == num_cols_);
//Real *data = data_;
//MatrixIndexT num_rows = num_rows_, stride = stride_;
//for (int32 i = 0; i < num_rows; i++)
//for (int32 j = 0; j < i; j++)
//data[i * stride + j] = data[j * stride + i];
//}
// Copies every element strictly above the diagonal onto its mirror position
// below the diagonal, in place.  The matrix must be square (asserted).
template<typename Real>
void MatrixBase<Real>::CopyUpperToLower() {
  KALDI_ASSERT(num_rows_ == num_cols_);
  const MatrixIndexT dim = num_rows_, row_stride = stride_;
  Real *elems = data_;
  for (int32 row = 0; row < dim; ++row) {
    // Row `row` is the destination; only its first `row` entries lie below
    // the diagonal.
    Real *lower_row = elems + row * row_stride;
    for (int32 col = 0; col < row; ++col) {
      lower_row[col] = elems[col * row_stride + row];
    }
  }
}
/*
template<typename Real>
void MatrixBase<Real>::SymAddMat2(const Real alpha,
const MatrixBase<Real> &A,
......@@ -734,7 +727,7 @@ void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
}
#endif
*/
// Copy constructor. Copies data to newly allocated memory.
template<typename Real>
Matrix<Real>::Matrix (const MatrixBase<Real> & M,
......@@ -898,6 +891,7 @@ template
void MatrixBase<double>::CopyFromMat(const MatrixBase<double> & M,
MatrixTransposeType Trans);
/*
// Specialize the template for CopyFromSp for float, float.
template<>
template<>
......@@ -992,7 +986,7 @@ template
void MatrixBase<double>::CopyFromTp(const TpMatrix<double> & M,
MatrixTransposeType trans);
*/
template<typename Real>
void MatrixBase<Real>::CopyRowsFromVec(const VectorBase<Real> &rv) {
if (rv.Dim() == num_rows_*num_cols_) {
......@@ -1076,7 +1070,6 @@ void MatrixBase<Real>::CopyColsFromVec(const VectorBase<Real> &rv) {
}
}
template<typename Real>
void MatrixBase<Real>::CopyRowFromVec(const VectorBase<Real> &rv, const MatrixIndexT row) {
KALDI_ASSERT(rv.Dim() == num_cols_ &&
......@@ -1088,7 +1081,7 @@ void MatrixBase<Real>::CopyRowFromVec(const VectorBase<Real> &rv, const MatrixIn
std::memcpy(row_data, rv_data, num_cols_ * sizeof(Real));
}
/*
template<typename Real>
void MatrixBase<Real>::CopyDiagFromVec(const VectorBase<Real> &rv) {
KALDI_ASSERT(rv.Dim() == std::min(num_cols_, num_rows_));
......@@ -1096,7 +1089,7 @@ void MatrixBase<Real>::CopyDiagFromVec(const VectorBase<Real> &rv) {
Real *my_data = this->Data();
for (; rv_data != rv_end; rv_data++, my_data += (this->stride_+1))
*my_data = *rv_data;
}
}*/
template<typename Real>
void MatrixBase<Real>::CopyColFromVec(const VectorBase<Real> &rv,
......@@ -1135,7 +1128,7 @@ void Matrix<Real>::Destroy() {
}
/*
template<typename Real>
void MatrixBase<Real>::MulElements(const MatrixBase<Real> &a) {
KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_);
......@@ -1325,6 +1318,7 @@ void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) {
}
}
}
*/
template<typename Real>
void MatrixBase<Real>::SetZero() {
......@@ -1344,6 +1338,7 @@ void MatrixBase<Real>::Set(Real value) {
}
}
/*
template<typename Real>
void MatrixBase<Real>::SetUnit() {
SetZero();
......@@ -1374,6 +1369,7 @@ void MatrixBase<Real>::SetRandUniform() {
}
}
}
*/
template<typename Real>
void MatrixBase<Real>::Write(std::ostream &os, bool binary) const {
......@@ -1420,23 +1416,11 @@ void MatrixBase<Real>::Write(std::ostream &os, bool binary) const {
template<typename Real>
void MatrixBase<Real>::Read(std::istream & is, bool binary, bool add) {
if (add) {
Matrix<Real> tmp(num_rows_, num_cols_);
tmp.Read(is, binary, false); // read without adding.
if (tmp.num_rows_ != this->num_rows_ || tmp.num_cols_ != this->num_cols_)
KALDI_ERR << "MatrixBase::Read, size mismatch "
<< this->num_rows_ << ", " << this->num_cols_
<< " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_;
this->AddMat(1.0, tmp);
return;
}
// now assume add == false.
void MatrixBase<Real>::Read(std::istream & is, bool binary) {
// In order to avoid rewriting this, we just declare a Matrix and
// use it to read the data, then copy.
Matrix<Real> tmp;
tmp.Read(is, binary, false);
tmp.Read(is, binary);
if (tmp.NumRows() != NumRows() || tmp.NumCols() != NumCols()) {
KALDI_ERR << "MatrixBase<Real>::Read, size mismatch "
<< NumRows() << " x " << NumCols() << " versus "
......@@ -1447,23 +1431,7 @@ void MatrixBase<Real>::Read(std::istream & is, bool binary, bool add) {
template<typename Real>
void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
if (add) {
Matrix<Real> tmp;
tmp.Read(is, binary, false); // read without adding.
if (this->num_rows_ == 0) this->Resize(tmp.num_rows_, tmp.num_cols_);
else {
if (this->num_rows_ != tmp.num_rows_ || this->num_cols_ != tmp.num_cols_) {
if (tmp.num_rows_ == 0) return; // do nothing in this case.
else KALDI_ERR << "Matrix::Read, size mismatch "
<< this->num_rows_ << ", " << this->num_cols_
<< " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_;
}
}
this->AddMat(1.0, tmp);
return;
}
void Matrix<Real>::Read(std::istream & is, bool binary) {
// now assume add == false.
MatrixIndexT pos_at_start = is.tellg();
std::ostringstream specific_error;
......@@ -1472,10 +1440,10 @@ void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
int peekval = Peek(is, binary);
if (peekval == 'C') {
  // A leading 'C' marks a CompressedMatrix.  The code that used to decode it
  // into a regular matrix is disabled here, so report the problem instead of
  // returning silently with *this unmodified and the payload still unread
  // in the stream.
  KALDI_ERR << "Matrix::Read, reading a CompressedMatrix is not supported";
}
const char *my_token = (sizeof(Real) == 4 ? "FM" : "DM");
......@@ -1483,7 +1451,7 @@ void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
if (peekval == other_token_start) { // need to instantiate the other type to read it.
typedef typename OtherReal<Real>::Real OtherType; // if Real == float, OtherType == double, and vice versa.
Matrix<OtherType> other(this->num_rows_, this->num_cols_);
other.Read(is, binary, false); // add is false at this point anyway.
other.Read(is, binary); // add is false at this point anyway.
this->Resize(other.NumRows(), other.NumCols());
this->CopyFromMat(other);
return;
......@@ -1672,7 +1640,7 @@ SubMatrix<Real>::SubMatrix(Real *data,
}
}
/*
template<typename Real>
void MatrixBase<Real>::Add(const Real alpha) {
Real *data = data_;
......@@ -1812,15 +1780,15 @@ void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
for(int32 i = 0; i < NumRows(); i++)
(*this)(i, i) *= 1.00001;
}*/
bool ans = JamaSvd(s, U, Vt);
if (Vt != NULL) Vt->Transpose(); // possibly to do: change this and also the transpose inside the JamaSvd routine. note, Vt is square.
if (!ans) {
KALDI_ERR << "Error doing Svd"; // This one will be caught.
}
#endif
if (prescale != 1.0) s->Scale(1.0/prescale);
}
// bool ans = JamaSvd(s, U, Vt);
//if (Vt != NULL) Vt->Transpose(); // possibly to do: change this and also the transpose inside the JamaSvd routine. note, Vt is square.
//if (!ans) {
//KALDI_ERR << "Error doing Svd"; // This one will be caught.
//}
//#endif
//if (prescale != 1.0) s->Scale(1.0/prescale);
//}
/*
template<typename Real>
void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) const {
try {
......@@ -2052,17 +2020,18 @@ void MatrixBase<Real>::InvertDouble(Real *log_det, Real *det_sign,
if (log_det) *log_det = log_det_tmp;
if (det_sign) *det_sign = det_sign_tmp;
}
*/
/// Fills this matrix from a CompressedMatrix by delegating to
/// CompressedMatrix::CopyToMat(this).  No resizing is done in this function.
template<class Real>
void MatrixBase<Real>::CopyFromMat(const CompressedMatrix &mat) {
mat.CopyToMat(this);
}
//template<class Real>
//void MatrixBase<Real>::CopyFromMat(const CompressedMatrix &mat) {
//mat.CopyToMat(this);
//}
/// Constructs a Matrix from a CompressedMatrix: resizes to M's dimensions
/// and then lets M copy its contents into *this.
template<class Real>
Matrix<Real>::Matrix(const CompressedMatrix &M): MatrixBase<Real>() {
// kUndefined: the resize mode passed here; the very next statement
// overwrites the contents via CopyToMat.
Resize(M.NumRows(), M.NumCols(), kUndefined);
M.CopyToMat(this);
}
//template<class Real>
//Matrix<Real>::Matrix(const CompressedMatrix &M): MatrixBase<Real>() {
//Resize(M.NumRows(), M.NumCols(), kUndefined);
//M.CopyToMat(this);
//}
......@@ -2074,7 +2043,7 @@ void MatrixBase<Real>::InvertElements() {
}
}
}
/*
template<typename Real>
void MatrixBase<Real>::Transpose() {
KALDI_ASSERT(num_rows_ == num_cols_);
......@@ -2250,7 +2219,7 @@ bool MatrixBase<Real>::Power(Real power) {
(*this).AddMatMat(1.0, tmp, kNoTrans, P, kNoTrans, 0.0);
return true;
}
*/
template<typename Real>
void Matrix<Real>::Swap(Matrix<Real> *other) {
std::swap(this->data_, other->data_);
......@@ -2258,7 +2227,7 @@ void Matrix<Real>::Swap(Matrix<Real> *other) {
std::swap(this->num_rows_, other->num_rows_);
std::swap(this->stride_, other->stride_);
}
/*
// Repeating this comment that appeared in the header:
// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
// P^{-1}. Be careful: the relationship of D to the eigenvalues we output is
......@@ -2298,7 +2267,7 @@ void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
// INT_32 mVersion;
// INT_32 mSampSize;
// };
/*
template<typename Real>
bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
{
......@@ -2821,7 +2790,7 @@ void MatrixBase<Real>::GroupMax(const MatrixBase<Real> &src) {
}
}
}
*/
template<typename Real>
void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
const MatrixIndexT *indices) {
......@@ -2847,7 +2816,7 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
}
}
/*
template<typename Real>
void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
const MatrixIndexT *indices) {
......@@ -2871,8 +2840,9 @@ void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
this_data[c] += src_data[*index_ptr];
}
}
}
}*/
/*
template<typename Real>
void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
const MatrixIndexT *indices) {
......@@ -3022,9 +2992,9 @@ void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
value_data += value_stride;
diff_data += diff_stride;
}
}
}*/
/*
template<typename Real>
template<typename OtherReal>
void MatrixBase<Real>::AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v) {
......@@ -3087,7 +3057,7 @@ template void MatrixBase<double>::AddVecToCols(const double alpha,
const VectorBase<float> &v);
template void MatrixBase<double>::AddVecToCols(const double alpha,
const VectorBase<double> &v);
*/
//Explicit instantiation of the classes
//Apparently, it seems to be necessary that the instantiation
//happens at the end of the file. Otherwise, not all the member
......
......@@ -44,14 +44,14 @@ std::istream &operator >> (std::istream &is, Vector<Real> &rv) {
return is;
}
// Declarations of explicit specializations of the member template AddVec for
// the same-type cases (float += float, double += double).  Only the
// declarations appear here; the definitions are not visible in this view.
template<>
template<>
void VectorBase<float>::AddVec(const float alpha, const VectorBase<float> &rv);
template<>
template<>
void VectorBase<double>::AddVec<double>(const double alpha,
const VectorBase<double> &rv);
//template<>
//template<>
//void VectorBase<float>::AddVec(const float alpha, const VectorBase<float> &rv);
//template<>
//template<>
//void VectorBase<double>::AddVec<double>(const double alpha,
//const VectorBase<double> &rv);
} // namespace kaldi
......
......@@ -49,17 +49,6 @@ class VectorBase {
/// Set all members of a vector to a specified value.
void Set(Real f);
/// Set vector to random normally-distributed noise.
void SetRandn();
/// Sets to numbers uniformly distributed on (0,1)
void SetRandUniform();
/// This function returns a random index into this vector,
/// chosen with probability proportional to the corresponding
/// element. Requires that this->Min() >= 0 and this->Sum() > 0.
MatrixIndexT RandCategorical() const;
/// Returns the dimension of the vector.
inline MatrixIndexT Dim() const { return dim_; }
......@@ -108,178 +97,15 @@ class VectorBase {
/// Copy data from another vector (must match own size).
void CopyFromVec(const VectorBase<Real> &v);
/// Copy data from a SpMatrix or TpMatrix (must match own size).
template<typename OtherReal>
void CopyFromPacked(const PackedMatrix<OtherReal> &M);
/// Copy data from another vector of different type (double vs. float)
template<typename OtherReal>
void CopyFromVec(const VectorBase<OtherReal> &v);
/// Copy from CuVector. This is defined in ../cudamatrix/cu-vector.h
template<typename OtherReal>
void CopyFromVec(const CuVectorBase<OtherReal> &v);
/// Applies floor to all elements. Returns number of elements
/// floored in floored_count if it is non-null.
void Floor(const VectorBase<Real> &v, Real floor_val, MatrixIndexT *floored_count = nullptr);
/// Applies ceiling to all elements. Returns number of elements
/// changed in ceiled_count if it is non-null.
void Ceiling(const VectorBase<Real> &v, Real ceil_val, MatrixIndexT *ceiled_count = nullptr);
void Pow(const VectorBase<Real> &v, Real power);
/// Apply natural log to all elements. Throws if any element of
/// the vector is negative (but doesn't complain about zero; the
/// log will be -infinity).
void ApplyLog();
/// Apply natural log to another vector and put result in *this.
void ApplyLogAndCopy(const VectorBase<Real> &v);
/// Apply exponential to each value in vector.
void ApplyExp();
/// Take absolute value of each of the elements
void ApplyAbs();
/// Applies floor_val as a floor to all elements in place, by calling Floor()
/// with *this as the source vector.  Returns the number of elements floored
/// in floored_count if it is non-null.
inline void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = nullptr) {
  this->Floor(*this, floor_val, floored_count);
}
/// Applies ceil_val as a ceiling to all elements in place, by calling
/// Ceiling() with *this as the source vector.  Returns the number of elements
/// changed in ceiled_count if it is non-null.
inline void ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count = nullptr) {
  this->Ceiling(*this, ceil_val, ceiled_count);
}
/// Applies floor to all elements. Returns number of elements floored.
MatrixIndexT ApplyFloor(const VectorBase<Real> &floor_vec);
/// Apply soft-max to vector and return normalizer (log sum of exponentials).
/// This is the same as: \f$ x(i) = exp(x(i)) / \sum_i exp(x(i)) \f$
Real ApplySoftMax();
/// Applies log soft-max to vector and returns normalizer (log sum of
/// exponentials).
/// This is the same as: \f$ x(i) = x(i) - log(\sum_i exp(x(i))) \f$
Real ApplyLogSoftMax();
/// Sets each element of *this to the tanh of the corresponding element of "src".
void Tanh(const VectorBase<Real> &src);
/// Sets each element of *this to the sigmoid function of the corresponding
/// element of "src".
void Sigmoid(const VectorBase<Real> &src);
/// Takes all elements of the vector to the given power, in place, by calling
/// Pow() with *this as the source vector.
inline void ApplyPow(Real power) {
  this->Pow(*this, power);
}
/// Take the absolute value of all elements of a vector to a power.
/// Include the sign of the input element if include_sign == true.
/// If power is negative and the input value is zero, the output is set zero.
void ApplyPowAbs(Real power, bool include_sign=false);
/// Compute the p-th norm of the vector.
Real Norm(Real p) const;
/// Returns true if ((*this)-other).Norm(2.0) <= tol * (*this).Norm(2.0).
bool ApproxEqual(const VectorBase<Real> &other, float tol = 0.01) const;
/// Invert all elements.
void InvertElements();
/// Add vector : *this = *this + alpha * rv (with casting between floats and
/// doubles)
template<typename OtherReal>
void AddVec(const Real alpha, const VectorBase<OtherReal> &v);
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring].
void AddVec2(const Real alpha, const VectorBase<Real> &v);
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring],
/// with casting between floats and doubles.
template<typename OtherReal>
void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
/// Add matrix times vector : this <-- beta*this + alpha*M*v.
/// Calls BLAS GEMV.
void AddMatVec(const Real alpha, const MatrixBase<Real> &M,
const MatrixTransposeType trans, const VectorBase<Real> &v,
const Real beta); // **beta previously defaulted to 0.0**
/// This is as AddMatVec, except optimized for where v contains a lot
/// of zeros.
void AddMatSvec(const Real alpha, const MatrixBase<Real> &M,
const MatrixTransposeType trans, const VectorBase<Real> &v,
const Real beta); // **beta previously defaulted to 0.0**
/// Add symmetric positive definite matrix times vector:
/// this <-- beta*this + alpha*M*v. Calls BLAS SPMV.
void AddSpVec(const Real alpha, const SpMatrix<Real> &M,
const VectorBase<Real> &v, const Real beta); // **beta previously defaulted to 0.0**
/// Add triangular matrix times vector: this <-- beta*this + alpha*M*v.
/// Works even if rv == *this.
void AddTpVec(const Real alpha, const TpMatrix<Real> &M,
const MatrixTransposeType trans, const VectorBase<Real> &v,
const Real beta); // **beta previously defaulted to 0.0**
/// Set each element to y = (x == orig ? changed : x).
void ReplaceValue(Real orig, Real changed);
/// Multiply element-by-element by another vector.
void MulElements(const VectorBase<Real> &v);
/// Multiply element-by-element by another vector of different type.
template<typename OtherReal>
void MulElements(const VectorBase<OtherReal> &v);
/// Divide element-by-element by a vector.
void DivElements(const VectorBase<Real> &v);
/// Divide element-by-element by a vector of different type.
template<typename OtherReal>
void DivElements(const VectorBase<OtherReal> &v);
/// Add a constant to each element of a vector.
void Add(Real c);
/// Add element-by-element product of vectors:
// this <-- alpha * v .* r + beta*this .
void AddVecVec(Real alpha, const VectorBase<Real> &v,
const VectorBase<Real> &r, Real beta);
/// Add element-by-element quotient of two vectors.
/// this <---- alpha*v/r + beta*this
void AddVecDivVec(Real alpha, const VectorBase<Real> &v,
const VectorBase<Real> &r, Real beta);
/// Multiplies all elements by this constant.
void Scale(Real alpha);
/// Multiplies this vector by lower-triangular matrix: *this <-- *this *M
void MulTp(const TpMatrix<Real> &M, const MatrixTransposeType trans);
/// If trans == kNoTrans, solves M x = b, where b is the value of *this at input
/// and x is the value of *this at output.
/// If trans == kTrans, solves M' x = b.
/// Does not test for M being singular or near-singular, so test it before
/// calling this routine.
void Solve(const TpMatrix<Real> &M, const MatrixTransposeType trans);
/// Performs a row stack of the matrix M
void CopyRowsFromMat(const MatrixBase<Real> &M);
template<typename OtherReal>
void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
/// The following is implemented in ../cudamatrix/cu-matrix.cc
void CopyRowsFromMat(const CuMatrixBase<Real> &M);
/// Performs a column stack of the matrix M
void CopyColsFromMat(const MatrixBase<Real> &M);
......@@ -290,85 +116,19 @@ class VectorBase {
template<typename OtherReal>
void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);
/// Extracts a row of the symmetric matrix S.
template<typename OtherReal>
void CopyRowFromSp(const SpMatrix<OtherReal> &S, MatrixIndexT row);
/// Extracts a column of the matrix M.
template<typename OtherReal>
void CopyColFromMat(const MatrixBase<OtherReal> &M , MatrixIndexT col);
/// Extracts the diagonal of the matrix M.
void CopyDiagFromMat(const MatrixBase<Real> &M);
/// Extracts the diagonal of a packed matrix M; works for Sp or Tp.
void CopyDiagFromPacked(const PackedMatrix<Real> &M);
/// Extracts the diagonal of a symmetric matrix.
inline void CopyDiagFromSp(const SpMatrix<Real> &M) { CopyDiagFromPacked(M); }
/// Extracts the diagonal of a triangular matrix.
inline void CopyDiagFromTp(const TpMatrix<Real> &M) { CopyDiagFromPacked(M); }
/// Returns the maximum value of any element, or -infinity for the empty vector.
Real Max() const;
/// Returns the maximum value of any element, and the associated index.
/// Error if vector is empty.
Real Max(MatrixIndexT *index) const;
/// Returns the minimum value of any element, or +infinity for the empty vector.
Real Min() const;
/// Returns the minimum value of any element, and the associated index.
/// Error if vector is empty.
Real Min(MatrixIndexT *index) const;
/// Returns sum of the elements
Real Sum() const;
/// Returns sum of the logs of the elements. More efficient than
/// just taking log of each. Will return NaN if any elements are
/// negative.
Real SumLog() const;
/// Does *this = alpha * (sum of rows of M) + beta * *this.
void AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);
/// Does *this = alpha * (sum of columns of M) + beta * *this.
void AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);
/// Add the diagonal of a matrix times itself:
/// *this = diag(M M^T) + beta * *this (if trans == kNoTrans), or
/// *this = diag(M^T M) + beta * *this (if trans == kTrans).
void AddDiagMat2(Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType trans = kNoTrans, Real beta = 1.0);
/// Add the diagonal of a matrix product: *this = diag(M N), assuming the
/// "trans" arguments are both kNoTrans; for transpose arguments, it behaves
/// as you would expect.
void AddDiagMatMat(Real alpha, const MatrixBase<Real> &M, MatrixTransposeType transM,
const MatrixBase<Real> &N, MatrixTransposeType transN,
Real beta = 1.0);
/// Returns log(sum(exp())) without exp overflow
/// If prune > 0.0, ignores terms less than the max - prune.
/// [Note: in future, if prune = 0.0, it will take the max.
/// For now, use -1 if you don't want it to prune.]
Real LogSumExp(Real prune = -1.0) const;
/// Reads from C++ stream (option to add to existing contents).
/// Throws exception on failure
void Read(std::istream &in, bool binary, bool add = false);
void Read(std::istream &in, bool binary);
/// Writes to C++ stream (option to write in binary).
void Write(std::ostream &Out, bool binary) const;
friend class VectorBase<double>;
friend class VectorBase<float>;
friend class CuVectorBase<Real>;
friend class CuVector<Real>;
protected:
/// Destructor; does not deallocate memory, this is handled by child classes.
/// This destructor is protected so this object can only be
......@@ -380,17 +140,6 @@ class VectorBase {
KALDI_ASSERT_IS_FLOATING_TYPE(Real);
}
// Took this out since it is not currently used, and it is possible to create
// objects where the allocated memory is not the same size as dim_ : Arnab
// /// Initializer from a pointer and a size; keeps the pointer internally
// /// (ownership or non-ownership depends on the child class).
// explicit VectorBase(Real* data, MatrixIndexT dim)
// : data_(data), dim_(dim) {}
// Arnab : made this protected since it is unsafe too.
/// Load data into the vector: sz must match own size.
void CopyFromPtr(const Real* Data, MatrixIndexT sz);
/// data memory area
Real* data_;
/// dimension of vector
......@@ -416,8 +165,8 @@ class Vector: public VectorBase<Real> {
/// Copy constructor from CUDA vector
/// This is defined in ../cudamatrix/cu-vector.h
template<typename OtherReal>
explicit Vector(const CuVectorBase<OtherReal> &cu);
//template<typename OtherReal>
//explicit Vector(const CuVectorBase<OtherReal> &cu);
/// Copy constructor. The need for this is controversial.
Vector(const Vector<Real> &v) : VectorBase<Real>() { // (cannot be explicit)
......@@ -455,7 +204,7 @@ class Vector: public VectorBase<Real> {
/// Read function using C++ streams. Can also add to existing contents
/// of matrix.
void Read(std::istream &in, bool binary, bool add = false);
void Read(std::istream &in, bool binary);
/// Set vector to a specified size (can be zero).
/// The value of the new data depends on resize_type:
......@@ -516,10 +265,10 @@ class SubVector : public VectorBase<Real> {
/// This constructor initializes the vector to point at the contents
/// of this packed matrix (SpMatrix or TpMatrix).
SubVector(const PackedMatrix<Real> &M) {
VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
VectorBase<Real>::dim_ = (M.NumRows()*(M.NumRows()+1))/2;
}
// SubVector(const PackedMatrix<Real> &M) {
//VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
//VectorBase<Real>::dim_ = (M.NumRows()*(M.NumRows()+1))/2;
//}
/// Copy constructor
SubVector(const SubVector &other) : VectorBase<Real> () {
......@@ -572,34 +321,18 @@ std::istream & operator >> (std::istream & in, Vector<Real> & v);
/// @{
template<typename Real>
bool ApproxEqual(const VectorBase<Real> &a,
const VectorBase<Real> &b, Real tol = 0.01) {
return a.ApproxEqual(b, tol);
}
template<typename Real>
inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
float tol = 0.01) {
KALDI_ASSERT(a.ApproxEqual(b, tol));
}
//template<typename Real>
//bool ApproxEqual(const VectorBase<Real> &a,
//const VectorBase<Real> &b, Real tol = 0.01) {
//return a.ApproxEqual(b, tol);
//}
//template<typename Real>
//inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
//float tol = 0.01) {
//KALDI_ASSERT(a.ApproxEqual(b, tol));
//}
/// Returns dot product between v1 and v2.
template<typename Real>
Real VecVec(const VectorBase<Real> &v1, const VectorBase<Real> &v2);
template<typename Real, typename OtherReal>
Real VecVec(const VectorBase<Real> &v1, const VectorBase<OtherReal> &v2);
/// Returns \f$ v_1^T M v_2 \f$ .
/// Not as efficient as it could be where v1 == v2.
template<typename Real>
Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
const VectorBase<Real> &v2);
/// @} End of "addtogroup matrix_funcs_scalar"
} // namespace kaldi
......
......@@ -59,26 +59,7 @@ template<typename Real> class SubVector;
template<typename Real> class MatrixBase;
template<typename Real> class SubMatrix;
template<typename Real> class Matrix;
template<typename Real> class SpMatrix;
template<typename Real> class TpMatrix;
template<typename Real> class PackedMatrix;
template<typename Real> class SparseMatrix;
// these are classes that won't be defined in this
// directory; they're mostly needed for friend declarations.
template<typename Real> class CuMatrixBase;
template<typename Real> class CuSubMatrix;
template<typename Real> class CuMatrix;
template<typename Real> class CuVectorBase;
template<typename Real> class CuSubVector;
template<typename Real> class CuVector;
template<typename Real> class CuPackedMatrix;
template<typename Real> class CuSpMatrix;
template<typename Real> class CuTpMatrix;
template<typename Real> class CuSparseMatrix;
class CompressedMatrix;
class GeneralMatrix;
/// This class provides a way for switching between double and float types.
template<typename T> class OtherReal { }; // useful in reading+writing routines
......
......@@ -5,8 +5,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}
add_subdirectory(base)
add_subdirectory(util)
add_subdirectory(feat)
add_subdirectory(matrix)
add_subdirectory(lat)
add_subdirectory(fstext)
add_subdirectory(decoder)
......
# MFCC feature extractor; depends on the shared feature code below.
add_library(kaldi-mfcc feature-mfcc.cc)
target_link_libraries(kaldi-mfcc
    PUBLIC
        kaldi-feat-common
)

# Filterbank (FBANK) feature extractor; depends on the shared feature code below.
add_library(kaldi-fbank feature-fbank.cc)
target_link_libraries(kaldi-fbank
    PUBLIC
        kaldi-feat-common
)

# Code shared by the feature extractors above: wave reading, signal helpers,
# windowing, resampling, mel computations and CMVN.
add_library(kaldi-feat-common
    wave-reader.cc
    signal.cc
    feature-functions.cc
    feature-window.cc
    resample.cc
    mel-computations.cc
    cmvn.cc
)
target_link_libraries(kaldi-feat-common
    PUBLIC
        kaldi-base
        kaldi-matrix
        kaldi-util
)
// transform/cmvn.cc
// Copyright 2009-2013 Microsoft Corporation
// Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/cmvn.h"
namespace kaldi {
// Sizes 'stats' to 2 x (dim + 1) so it can hold CMVN accumulators: the other
// functions in this file read row 0 as per-dimension sums (with the count in
// the last column) and row 1 as per-dimension sums of squares.
//   dim:   feature dimension; must be positive.
//   stats: output matrix; must not be NULL.
// NOTE(review): the initial contents depend on Resize()'s default resize
// type, which is not visible here -- confirm it zeroes the data.
void InitCmvnStats(int32 dim, Matrix<double> *stats) {
  KALDI_ASSERT(dim > 0);
  // Fail with a clear assertion rather than a raw null dereference, matching
  // the check AccCmvnStats() performs on its 'stats' argument.
  KALDI_ASSERT(stats != NULL);
  stats->Resize(2, dim+1);
}
void AccCmvnStats(const VectorBase<BaseFloat> &feats, BaseFloat weight, MatrixBase<double> *stats) {
int32 dim = feats.Dim();
KALDI_ASSERT(stats != NULL);
KALDI_ASSERT(stats->NumRows() == 2 && stats->NumCols() == dim + 1);
// Remove these __restrict__ modifiers if they cause compilation problems.
// It's just an optimization.
double *__restrict__ mean_ptr = stats->RowData(0),
*__restrict__ var_ptr = stats->RowData(1),
*__restrict__ count_ptr = mean_ptr + dim;
const BaseFloat * __restrict__ feats_ptr = feats.Data();
*count_ptr += weight;
// Careful-- if we change the format of the matrix, the "mean_ptr < count_ptr"
// statement below might become wrong.
for (; mean_ptr < count_ptr; mean_ptr++, var_ptr++, feats_ptr++) {
*mean_ptr += *feats_ptr * weight;
*var_ptr += *feats_ptr * *feats_ptr * weight;
}
}
void AccCmvnStats(const MatrixBase<BaseFloat> &feats,
const VectorBase<BaseFloat> *weights,
MatrixBase<double> *stats) {
int32 num_frames = feats.NumRows();
if (weights != NULL) {
KALDI_ASSERT(weights->Dim() == num_frames);
}
for (int32 i = 0; i < num_frames; i++) {
SubVector<BaseFloat> this_frame = feats.Row(i);
BaseFloat weight = (weights == NULL ? 1.0 : (*weights)(i));
if (weight != 0.0)
AccCmvnStats(this_frame, weight, stats);
}
}
// Normalizes 'feats' in place using the accumulated CMVN statistics 'stats'.
//   stats:    1 or 2 rows and (feature dim + 1) columns.  Row 0 holds the
//             per-dimension sums with the count in the last column; row 1
//             (needed only when var_norm is true) holds the sums of squares.
//   var_norm: if false only the mean is subtracted; if true each dimension is
//             also scaled by 1/sqrt(variance).
//   feats:    features to normalize; must not be NULL and must have
//             stats.NumCols() - 1 columns.
// Raises a KALDI_ERR on dimension mismatch, on var_norm with single-row
// stats, on a count below 1.0, or on a NaN/infinite scale.
void ApplyCmvn(const MatrixBase<double> &stats,
bool var_norm,
MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(feats != NULL);
// The last column of 'stats' is the count, so the feature dim is one less.
int32 dim = stats.NumCols() - 1;
if (stats.NumRows() > 2 || stats.NumRows() < 1 || feats->NumCols() != dim) {
KALDI_ERR << "Dim mismatch: cmvn "
<< stats.NumRows() << 'x' << stats.NumCols()
<< ", feats " << feats->NumRows() << 'x' << feats->NumCols();
}
if (stats.NumRows() == 1 && var_norm)
KALDI_ERR << "You requested variance normalization but no variance stats "
<< "are supplied.";
double count = stats(0, dim);
// Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
// computing an offset and representing it as stats, we use a count of one.
if (count < 1.0)
KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: "
<< "count = " << count;
// Mean-only path: build offset = -(sum / count) and add it to every row.
if (!var_norm) {
Vector<BaseFloat> offset(dim);
SubVector<double> mean_stats(stats.RowData(0), dim);
offset.AddVec(-1.0 / count, mean_stats);
feats->AddVecToRows(1.0, offset);
return;
}
// norm(0, d) = mean offset;
// norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
Matrix<BaseFloat> norm(2, dim);
for (int32 d = 0; d < dim; d++) {
double mean, offset, scale;
mean = stats(0, d)/count;
// Variance as E[x^2] - E[x]^2, floored so the square root below stays
// finite and non-zero.
double var = (stats(1, d)/count) - mean*mean,
floor = 1.0e-20;
if (var < floor) {
KALDI_WARN << "Flooring cepstral variance from " << var << " to "
<< floor;
var = floor;
}
scale = 1.0 / sqrt(var);
// 'scale != scale' is true only for NaN; '1/scale == 0.0' catches infinity.
if (scale != scale || 1/scale == 0.0)
KALDI_ERR << "NaN or infinity in cepstral mean/variance computation";
offset = -(mean*scale);
norm(0, d) = offset;
norm(1, d) = scale;
}
// Apply the normalization.
feats->MulColsVec(norm.Row(1));
feats->AddVecToRows(1.0, norm.Row(0));
}
void ApplyCmvnReverse(const MatrixBase<double> &stats,
bool var_norm,
MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(feats != NULL);
int32 dim = stats.NumCols() - 1;
if (stats.NumRows() > 2 || stats.NumRows() < 1 || feats->NumCols() != dim) {
KALDI_ERR << "Dim mismatch: cmvn "
<< stats.NumRows() << 'x' << stats.NumCols()
<< ", feats " << feats->NumRows() << 'x' << feats->NumCols();
}
if (stats.NumRows() == 1 && var_norm)
KALDI_ERR << "You requested variance normalization but no variance stats "
<< "are supplied.";
double count = stats(0, dim);
// Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
// computing an offset and representing it as stats, we use a count of one.
if (count < 1.0)
KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: "
<< "count = " << count;
Matrix<BaseFloat> norm(2, dim); // norm(0, d) = mean offset
// norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
for (int32 d = 0; d < dim; d++) {
double mean, offset, scale;
mean = stats(0, d) / count;
if (!var_norm) {
scale = 1.0;
offset = mean;
} else {
double var = (stats(1, d)/count) - mean*mean,
floor = 1.0e-20;
if (var < floor) {
KALDI_WARN << "Flooring cepstral variance from " << var << " to "
<< floor;
var = floor;
}
// we aim to transform zero-mean, unit-variance input into data
// with the given mean and variance.
scale = sqrt(var);
offset = mean;
}
norm(0, d) = offset;
norm(1, d) = scale;
}
if (var_norm)
feats->MulColsVec(norm.Row(1));
feats->AddVecToRows(1.0, norm.Row(0));
}
void FakeStatsForSomeDims(const std::vector<int32> &dims,
MatrixBase<double> *stats) {
KALDI_ASSERT(stats->NumRows() == 2 && stats->NumCols() > 1);
int32 dim = stats->NumCols() - 1;
double count = (*stats)(0, dim);
for (size_t i = 0; i < dims.size(); i++) {
int32 d = dims[i];
KALDI_ASSERT(d >= 0 && d < dim);
(*stats)(0, d) = 0.0;
(*stats)(1, d) = count;
}
}
} // namespace kaldi
// transform/cmvn.h
// Copyright 2009-2013 Microsoft Corporation
// Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_TRANSFORM_CMVN_H_
#define KALDI_TRANSFORM_CMVN_H_
#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
namespace kaldi {
// Cepstral mean and variance normalization (CMVN) helpers.  All functions
// share one stats layout: a matrix with (dim + 1) columns whose row 0 holds
// per-dimension sums plus the count in the last column, and whose row 1
// holds per-dimension sums of squares.

/// This function initializes the matrix to dimension 2 by (dim+1);
/// 1st "dim" elements of 1st row are mean stats, 1st "dim" elements
/// of 2nd row are var stats, last element of 1st row is count,
/// last element of 2nd row is zero.
/// "dim" must be greater than zero.
void InitCmvnStats(int32 dim, Matrix<double> *stats);
/// Accumulation from a single frame (weighted).
/// "stats" must be non-NULL and of dimension 2 by (feat.Dim() + 1).
void AccCmvnStats(const VectorBase<BaseFloat> &feat,
BaseFloat weight,
MatrixBase<double> *stats);
/// Accumulation from a feature file (possibly weighted-- useful in excluding silence).
/// If "weights" is non-NULL it must have one entry per row of "feats";
/// frames whose weight is exactly zero are skipped.
void AccCmvnStats(const MatrixBase<BaseFloat> &feats,
const VectorBase<BaseFloat> *weights, // or NULL
MatrixBase<double> *stats);
/// Apply cepstral mean and variance normalization to a matrix of features.
/// If norm_vars == true, expects stats to be of dimension 2 by (dim+1), but
/// if norm_vars == false, will accept stats of dimension 1 by (dim+1); these
/// are produced by the balanced-cmvn code when it computes an offset and
/// represents it as "fake stats".
/// It is an error if the count stored in the stats is less than 1.0.
void ApplyCmvn(const MatrixBase<double> &stats,
bool norm_vars,
MatrixBase<BaseFloat> *feats);
/// This is as ApplyCmvn, but does so in the reverse sense, i.e. applies a transform
/// that would take zero-mean, unit-variance input and turn it into output with the
/// stats of "stats". This can be useful if you trained without CMVN but later want
/// to correct a mismatch, so you would first apply CMVN and then do the "reverse"
/// CMVN with the summed stats of your training data.
/// If norm_vars == false only the mean is added back; no scaling is applied.
void ApplyCmvnReverse(const MatrixBase<double> &stats,
bool norm_vars,
MatrixBase<BaseFloat> *feats);
/// Modify the stats so that for some dimensions (specified in "dims"), we
/// replace them with "fake" stats that have zero mean and unit variance; this
/// is done to disable CMVN for those dimensions.
/// "stats" must have exactly 2 rows, and every entry of "dims" must be a
/// valid feature index (0 <= d < stats->NumCols() - 1).
void FakeStatsForSomeDims(const std::vector<int32> &dims,
MatrixBase<double> *stats);
} // namespace kaldi
#endif // KALDI_TRANSFORM_CMVN_H_
// feat/feature-common-inl.h
// Copyright 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_COMMON_INL_H_
#define KALDI_FEAT_FEATURE_COMMON_INL_H_
#include "feat/resample.h"
// Do not include this file directly. It is included by feat/feature-common.h
namespace kaldi {
template <class F>
void OfflineFeatureTpl<F>::ComputeFeatures(
    const VectorBase<BaseFloat> &wave,
    BaseFloat sample_freq,
    BaseFloat vtln_warp,
    Matrix<BaseFloat> *output) {
  // Computes features for 'wave', first resampling it to the rate configured
  // in the frame options when 'sample_freq' differs from that rate.
  // Resampling in either direction is an error unless the matching
  // allow_downsample / allow_upsample option is set.
  KALDI_ASSERT(output != NULL);
  const BaseFloat target_freq = computer_.GetFrameOptions().samp_freq;

  // Rates already agree: no resampling needed.
  if (sample_freq == target_freq) {
    Compute(wave, vtln_warp, output);
    return;
  }

  const bool needs_downsample = target_freq < sample_freq;
  const bool needs_upsample = target_freq > sample_freq;
  if (needs_downsample && !computer_.GetFrameOptions().allow_downsample) {
    KALDI_ERR << "Waveform and config sample Frequency mismatch: "
              << sample_freq << " .vs " << target_freq
              << " (use --allow-downsample=true to allow "
              << " downsampling the waveform).";
  } else if (needs_upsample && !computer_.GetFrameOptions().allow_upsample) {
    KALDI_ERR << "Waveform and config sample Frequency mismatch: "
              << sample_freq << " .vs " << target_freq
              << " (use --allow-upsample=true option to allow "
              << " upsampling the waveform).";
  }

  // Resample the waveform, then run the normal feature computation on it.
  Vector<BaseFloat> resampled_wave(wave);
  ResampleWaveform(sample_freq, wave, target_freq, &resampled_wave);
  Compute(resampled_wave, vtln_warp, output);
}
template <class F>
void OfflineFeatureTpl<F>::Compute(
    const VectorBase<BaseFloat> &wave,
    BaseFloat vtln_warp,
    Matrix<BaseFloat> *output) {
  // Splits 'wave' into frames, and lets the wrapped computer turn each
  // windowed frame into one row of 'output'.  'output' is resized to
  // (number of frames) x (feature dimension), or to 0 x 0 when the waveform
  // yields no frames.
  KALDI_ASSERT(output != NULL);
  const int32 num_frames = NumFrames(wave.Dim(), computer_.GetFrameOptions());
  const int32 feat_dim = computer_.Dim();
  if (num_frames == 0) {
    output->Resize(0, 0);
    return;
  }
  output->Resize(num_frames, feat_dim);

  Vector<BaseFloat> frame;  // windowed waveform, reused for every frame.
  const bool want_raw_energy = computer_.NeedRawLogEnergy();
  for (int32 t = 0; t < num_frames; t++) {  // t is the frame index.
    BaseFloat raw_log_energy = 0.0;
    // Only ask ExtractWindow for the raw log-energy when the computer
    // actually needs it.
    BaseFloat *energy_out = (want_raw_energy ? &raw_log_energy : NULL);
    ExtractWindow(0, wave, t, computer_.GetFrameOptions(),
                  feature_window_function_, &frame, energy_out);
    SubVector<BaseFloat> feature_row(*output, t);
    computer_.Compute(raw_log_energy, vtln_warp, &frame, &feature_row);
  }
}
template <class F>
void OfflineFeatureTpl<F>::Compute(
    const VectorBase<BaseFloat> &wave,
    BaseFloat vtln_warp,
    Matrix<BaseFloat> *output) const {
  // Const-correct wrapper: the real work happens in the non-const overload,
  // which is invoked on a private copy of this object.  That keeps *this
  // untouched (handy in multi-threaded callers) at the cost of the copy.
  OfflineFeatureTpl<F> scratch(*this);
  scratch.Compute(wave, vtln_warp, output);
}
} // end namespace kaldi
#endif
// feat/feature-common.h
// Copyright 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_COMMON_H_
#define KALDI_FEAT_FEATURE_COMMON_H_
#include <map>
#include <string>
#include "feat/feature-window.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// This class is only added for documentation, it is not intended to ever be
/// used.  It illustrates the one member ExampleFeatureComputer relies on:
/// a FrameExtractionOptions named frame_opts, which
/// ExampleFeatureComputer::GetFrameOptions() returns by reference.
struct ExampleFeatureComputerOptions {
// Frame-extraction options exposed through GetFrameOptions().
FrameExtractionOptions frame_opts;
// .. more would go here.
};
/// This class is only added for documentation, it is not intended to ever be
/// used. It documents the interface of the *Computer classes which wrap the
/// low-level feature extraction. The template argument F of OfflineFeatureTpl must
/// follow this interface. This interface is intended for features such as
/// MFCCs and PLPs which can be computed frame by frame.
class ExampleFeatureComputer {
public:
/// Options type; OfflineFeatureTpl<F> picks this up as F::Options.
typedef ExampleFeatureComputerOptions Options;
/// Returns a reference to the frame-extraction options class, which
/// will be part of our own options class.
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
/// Returns the feature dimension
int32 Dim() const;
/// Returns true if this function may inspect the raw log-energy of the signal
/// (before windowing and pre-emphasis); it's safe to always return true, but
/// setting it to false enables an optimization.
bool NeedRawLogEnergy() const { return true; }
/// constructor from options class; it should not store a reference or pointer
/// to the options class but should copy it.
explicit ExampleFeatureComputer(const ExampleFeatureComputerOptions &opts):
opts_(opts) { }
/// Copy constructor; all of these classes must have one.
ExampleFeatureComputer(const ExampleFeatureComputer &other);
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
private:
// disallow assignment.
ExampleFeatureComputer &operator = (const ExampleFeatureComputer &in);
// Private copy of the options passed to the constructor.
Options opts_;
};
/// This templated class is intended for offline feature extraction, i.e. where
/// you have access to the entire signal at the start. It exists mainly to be
/// drop-in replacement for the old (pre-2016) classes Mfcc, Plp and so on, for
/// use in the offline case. In April 2016 we reorganized the online
/// feature-computation code for greater modularity and to have correct support
/// for the snip-edges=false option.
/// The template argument F must follow the interface documented by
/// ExampleFeatureComputer.
template <class F>
class OfflineFeatureTpl {
public:
typedef typename F::Options Options;
// Note: feature_window_function_ is the windowing function, which is
// initialized using the options class, and which we cache at this level.
OfflineFeatureTpl(const Options &opts):
computer_(opts),
feature_window_function_(computer_.GetFrameOptions()) { }
// Internal (and back-compatibility) interface for computing features, which
// requires that the user has already checked that the sampling frequency
// of the waveform is equal to the sampling frequency specified in
// the frame-extraction options.
void Compute(const VectorBase<BaseFloat> &wave,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output);
// This const version of Compute() is a wrapper that
// calls the non-const version on a temporary object.
// It's less efficient than the non-const version.
void Compute(const VectorBase<BaseFloat> &wave,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output) const;
/**
Computes the features for one file (one sequence of features).
This is the newer interface where you specify the sample frequency
of the input waveform.
@param [in] wave The input waveform
@param [in] sample_freq The sampling frequency with which
'wave' was sampled.
If sample_freq differs from the frequency
specified in the config, the waveform is
resampled to the config frequency:
downsampling requires the frame option
allow_downsample to be set and upsampling
requires allow_upsample; otherwise it's an
error.
@param [in] vtln_warp The VTLN warping factor (will normally
be 1.0)
@param [out] output The matrix of features, where the row-index
is the frame index.
*/
void ComputeFeatures(const VectorBase<BaseFloat> &wave,
BaseFloat sample_freq,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output);
// Feature dimension, as reported by the wrapped computer.
int32 Dim() const { return computer_.Dim(); }
// Copy constructor.
OfflineFeatureTpl(const OfflineFeatureTpl<F> &other):
computer_(other.computer_),
feature_window_function_(other.feature_window_function_) { }
private:
// Disallow assignment.
OfflineFeatureTpl<F> &operator =(const OfflineFeatureTpl<F> &other);
// The per-frame feature computer.  It is declared before
// feature_window_function_ because the constructor initializes the window
// function from computer_.GetFrameOptions().
F computer_;
// Cached windowing function built from the frame-extraction options.
FeatureWindowFunction feature_window_function_;
};
/// @} End of "addtogroup feat"
} // namespace kaldi
#include "feat/feature-common-inl.h"
#endif // KALDI_FEAT_FEATURE_COMMON_H_
// feat/feature-fbank.cc
// Copyright 2009-2012 Karel Vesely
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-fbank.h"
namespace kaldi {
// Builds a filterbank computer from 'opts' (which is copied).
//  - Caches log(energy_floor) when an energy floor is configured.
//  - Allocates a split-radix FFT when the padded window size is a power of
//    two; otherwise srfft_ stays NULL and Compute() uses the generic RealFft.
//  - Pre-builds the mel banks for VTLN warp factor 1.0.
FbankComputer::FbankComputer(const FbankOptions &opts):
    opts_(opts), srfft_(NULL) {
  if (opts.energy_floor > 0.0) {
    log_energy_floor_ = Log(opts.energy_floor);
  } else {
    // No floor requested.  Compute() never consults log_energy_floor_ in this
    // case, but the copy constructor copies it, so give it a well-defined
    // "no floor" value instead of leaving it uninitialized.
    log_energy_floor_ = -std::numeric_limits<BaseFloat>::infinity();
  }

  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);

  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0);
}
FbankComputer::FbankComputer(const FbankComputer &other):
    opts_(other.opts_), log_energy_floor_(other.log_energy_floor_),
    mel_banks_(other.mel_banks_), srfft_(NULL) {
  // The member-wise map copy above only duplicated the pointers; replace
  // each entry with a freshly allocated copy so the two computers do not
  // share (and later double-delete) the same MelBanks objects.
  typedef std::map<BaseFloat, MelBanks*>::iterator BankIter;
  for (BankIter it = mel_banks_.begin(); it != mel_banks_.end(); ++it) {
    MelBanks *theirs = it->second;
    it->second = new MelBanks(*theirs);
  }
  // Likewise give this object its own FFT helper if the source had one.
  if (other.srfft_) {
    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
  }
}
FbankComputer::~FbankComputer() {
  // Release every cached MelBanks object, then the FFT helper
  // (deleting a NULL srfft_ is a no-op).
  std::map<BaseFloat, MelBanks*>::iterator it = mel_banks_.begin();
  while (it != mel_banks_.end()) {
    delete it->second;
    ++it;
  }
  delete srfft_;
}
const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) {
  // Returns the mel banks for the given VTLN warp factor, building and
  // caching them in mel_banks_ on first use.  The returned object is owned
  // by this computer.
  std::map<BaseFloat, MelBanks*>::iterator cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end())
    return cached->second;

  MelBanks *fresh = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp);
  mel_banks_[vtln_warp] = fresh;
  return fresh;
}
// Computes one frame of filterbank features.
//   signal_raw_log_energy: log-energy of the frame before windowing; it is
//       replaced by the post-window energy when use_energy is set and
//       raw_energy is not.
//   vtln_warp:    VTLN warp factor used to select (or build) the mel banks.
//   signal_frame: windowed frame of size PaddedWindowSize(); it is used as
//       workspace and overwritten by the FFT / power spectrum.
//   feature:      output vector of size this->Dim().
void FbankComputer::Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature) {
const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
feature->Dim() == this->Dim());
// Compute energy after window function (not the raw one).
// This must happen before the FFT below overwrites signal_frame.
if (opts_.use_energy && !opts_.raw_energy)
signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
std::numeric_limits<float>::epsilon()));
if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
srfft_->Compute(signal_frame->Data(), true);
else // An alternative algorithm that works for non-powers-of-two.
RealFft(signal_frame, true);
// Convert the FFT into a power spectrum.
ComputePowerSpectrum(signal_frame);
// View of the first Dim()/2 + 1 elements of signal_frame (no copy).
SubVector<BaseFloat> power_spectrum(*signal_frame, 0,
signal_frame->Dim() / 2 + 1);
// Use magnitude instead of power if requested.
if (!opts_.use_power)
power_spectrum.ApplyPow(0.5);
// When energy comes first (use_energy without htk_compat), the mel
// energies start at index 1 of the output; otherwise at index 0.
int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
SubVector<BaseFloat> mel_energies(*feature,
mel_offset,
opts_.mel_opts.num_bins);
// Sum with mel filterbanks over the power spectrum
mel_banks.Compute(power_spectrum, &mel_energies);
if (opts_.use_log_fbank) {
// Avoid log of zero (which should be prevented anyway by dithering).
mel_energies.ApplyFloor(std::numeric_limits<float>::epsilon());
mel_energies.ApplyLog(); // take the log.
}
// Copy energy as first value (or the last, if htk_compat == true).
if (opts_.use_energy) {
// Apply the configured floor; log_energy_floor_ is only consulted when
// energy_floor > 0.
if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
signal_raw_log_energy = log_energy_floor_;
}
int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
(*feature)(energy_index) = signal_raw_log_energy;
}
}
} // namespace kaldi
// feat/feature-fbank.h
// Copyright 2009-2012 Karel Vesely
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_FBANK_H_
#define KALDI_FEAT_FEATURE_FBANK_H_
#include <map>
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// FbankOptions contains basic options for computing filterbank features.
/// It only includes things that can be done in a "stateless" way, i.e.
/// it does not include energy max-normalization.
/// It does not include delta computation.
/// Register() exposes every field below as a command-line/config option.
struct FbankOptions {
FrameExtractionOptions frame_opts;
MelBanksOptions mel_opts;
bool use_energy; // append an extra dimension with energy to the filter banks
BaseFloat energy_floor; // floor on the energy; only applied when > 0.0
bool raw_energy; // If true, compute energy before preemphasis and windowing
bool htk_compat; // If true, put energy last (if using energy)
bool use_log_fbank; // if true (default), produce log-filterbank, else linear
bool use_power; // if true (default), use power in filterbank analysis, else magnitude.
// Default constructor; see the initializer list for the default values.
FbankOptions(): mel_opts(23),
// defaults the #mel-banks to 23 for the FBANK computations.
// this seems to be common for 16khz-sampled data,
// but for 8khz-sampled data, 15 may be better.
use_energy(false),
energy_floor(0.0),
raw_energy(true),
htk_compat(false),
use_log_fbank(true),
use_power(true) {}
// Registers the frame-extraction and mel-bank options, followed by the
// FBANK-specific options, with the given options interface.
void Register(OptionsItf *opts) {
frame_opts.Register(opts);
mel_opts.Register(opts);
opts->Register("use-energy", &use_energy,
"Add an extra dimension with energy to the FBANK output.");
opts->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in FBANK computation. "
"Only makes a difference if --use-energy=true; only necessary if "
"--dither=0.0. Suggested values: 0.1 or 1.0");
opts->Register("raw-energy", &raw_energy,
"If true, compute energy before preemphasis and windowing");
opts->Register("htk-compat", &htk_compat, "If true, put energy last. "
"Warning: not sufficient to get HTK compatible features (need "
"to change other parameters).");
opts->Register("use-log-fbank", &use_log_fbank,
"If true, produce log-filterbank, else produce linear.");
opts->Register("use-power", &use_power,
"If true, use power, else use magnitude.");
}
};
/// Class for computing mel-filterbank features; see \ref feat_mfcc for more
/// information.
class FbankComputer {
public:
// The options type associated with this computer.
typedef FbankOptions Options;
explicit FbankComputer(const FbankOptions &opts);
// Copy constructor; copy-assignment is disallowed (see the private section).
FbankComputer(const FbankComputer &other);
// Output feature dimension: one value per mel bin, plus one extra value when
// the energy is appended.
int32 Dim() const {
return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
}
// True if the energy is used and must be measured on the raw frame, i.e.
// before preemphasis and windowing.
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
~FbankComputer();
// Returns the mel banks for the given VTLN warp factor.
// NOTE(review): presumably created on demand and cached in mel_banks_, as
// MfccComputer::GetMelBanks does -- confirm against the .cc file.
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
private:
FbankOptions opts_;
// Log of opts_.energy_floor; compared against the frame log-energy when the
// energy floor is positive.
BaseFloat log_energy_floor_;
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
// Split-radix FFT object, held by raw pointer.
// NOTE(review): presumably NULL when the padded window size is not a power
// of two, as in MfccComputer -- confirm against the constructor.
SplitRadixRealFft<BaseFloat> *srfft_;
// Disallow assignment.
FbankComputer &operator =(const FbankComputer &other);
};
// Fbank is the OfflineFeatureTpl wrapper instantiated with FbankComputer.
typedef OfflineFeatureTpl<FbankComputer> Fbank;
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_FBANK_H_
// feat/feature-functions.cc
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
// 2013 Johns Hopkins University (author: Daniel Povey)
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-functions.h"
#include "matrix/matrix-functions.h"
namespace kaldi {
// Converts, in place, the packed output of a real FFT into a power spectrum.
// On entry the vector holds [re_0, re_{N/2}, re_1, im_1, re_2, im_2, ...];
// on exit element k (for 0 <= k <= N/2) holds re_k^2 + im_k^2.  The elements
// past index N/2 are left with whatever they contained.
// The dimension is not required to be a power of two.
void ComputePowerSpectrum(VectorBase<BaseFloat> *waveform) {
  VectorBase<BaseFloat> &spectrum = *waveform;
  const int32 nyquist_bin = spectrum.Dim() / 2;

  // The DC and Nyquist terms are purely real and share the first two slots;
  // read them before the loop below overwrites slot 1.
  const BaseFloat dc_power = spectrum(0) * spectrum(0);
  const BaseFloat nyquist_power = spectrum(1) * spectrum(1);

  for (int32 bin = 1; bin < nyquist_bin; ++bin) {
    const BaseFloat re = spectrum(2 * bin), im = spectrum(2 * bin + 1);
    spectrum(bin) = re * re + im * im;
  }

  spectrum(0) = dc_power;
  // Normally close to zero for a sensibly band-limited signal.
  spectrum(nyquist_bin) = nyquist_power;
}
// Precomputes one filter window per delta order.  Order 0 is the single tap
// 1.0 (the features themselves); every higher order is the previous window
// convolved with the ramp [-window, ..., window] and divided by the sum of
// squares of that ramp.
DeltaFeatures::DeltaFeatures(const DeltaFeaturesOptions &opts): opts_(opts) {
  // Sanity checks that mostly catch uninitialized ("binary junk") options;
  // the order is normally 2 or 3 and the window normally 2.
  KALDI_ASSERT(opts.order >= 0 && opts.order < 1000);
  KALDI_ASSERT(opts.window > 0 && opts.window < 1000);

  scales_.resize(opts.order+1);
  Vector<BaseFloat> &identity = scales_[0];
  identity.Resize(1);
  identity(0) = 1.0;

  for (int32 order = 1; order <= opts.order; ++order) {
    const Vector<BaseFloat> &prev_scales = scales_[order - 1];
    Vector<BaseFloat> &cur_scales = scales_[order];
    // Read inside the loop so that a per-order window could be substituted
    // later; the full window width is 2*window + 1.
    const int32 window = opts.window;
    KALDI_ASSERT(window != 0);
    // Half-width of the previous window (its length is always odd).
    const int32 prev_offset = (static_cast<int32>(prev_scales.Dim()-1))/2;
    cur_scales.Resize(prev_scales.Dim() + 2*window);  // Resize() zero-fills.

    BaseFloat normalizer = 0.0;
    for (int32 j = -window; j <= window; ++j) {
      normalizer += j*j;
      const BaseFloat ramp = static_cast<BaseFloat>(j);
      // `tap` indexes the previous window from its left edge.
      for (int32 tap = 0; tap <= 2 * prev_offset; ++tap)
        cur_scales(j + window + tap) += ramp * prev_scales(tap);
    }
    cur_scales.Scale(1.0 / normalizer);
  }
}
void DeltaFeatures::Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
VectorBase<BaseFloat> *output_frame) const {
KALDI_ASSERT(frame < input_feats.NumRows());
int32 num_frames = input_feats.NumRows(),
feat_dim = input_feats.NumCols();
KALDI_ASSERT(static_cast<int32>(output_frame->Dim()) == feat_dim * (opts_.order+1));
output_frame->SetZero();
for (int32 i = 0; i <= opts_.order; i++) {
const Vector<BaseFloat> &scales = scales_[i];
int32 max_offset = (scales.Dim() - 1) / 2;
SubVector<BaseFloat> output(*output_frame, i*feat_dim, feat_dim);
for (int32 j = -max_offset; j <= max_offset; j++) {
// if asked to read
int32 offset_frame = frame + j;
if (offset_frame < 0) offset_frame = 0;
else if (offset_frame >= num_frames)
offset_frame = num_frames - 1;
BaseFloat scale = scales(j + max_offset);
if (scale != 0.0)
output.AddVec(scale, input_feats.Row(offset_frame));
}
}
}
// Builds the single delta window used for every block: the ramp
// [-window, ..., window] divided by the sum of squares of its entries.
ShiftedDeltaFeatures::ShiftedDeltaFeatures(
    const ShiftedDeltaFeaturesOptions &opts): opts_(opts) {
  KALDI_ASSERT(opts.window > 0 && opts.window < 1000);
  // The default window is 1.
  int32 window = opts.window;
  KALDI_ASSERT(window != 0);

  const int32 num_taps = 1 + 2*window;
  scales_.Resize(num_taps);  // Resize() zero-fills.
  BaseFloat normalizer = 0.0;
  for (int32 tap = 0; tap < num_taps; ++tap) {
    const int32 j = tap - window;  // signed offset from the window centre.
    normalizer += j*j;
    scales_(tap) += static_cast<BaseFloat>(j);
  }
  scales_.Scale(1.0 / normalizer);
}
void ShiftedDeltaFeatures::Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
SubVector<BaseFloat> *output_frame) const {
KALDI_ASSERT(frame < input_feats.NumRows());
int32 num_frames = input_feats.NumRows(),
feat_dim = input_feats.NumCols();
KALDI_ASSERT(static_cast<int32>(output_frame->Dim())
== feat_dim * (opts_.num_blocks + 1));
output_frame->SetZero();
// The original features
SubVector<BaseFloat> output(*output_frame, 0, feat_dim);
output.AddVec(1.0, input_feats.Row(frame));
// Concatenate the delta-blocks. Each block is block_shift
// (usually 3) frames apart.
for (int32 i = 0; i < opts_.num_blocks; i++) {
int32 max_offset = (scales_.Dim() - 1) / 2;
SubVector<BaseFloat> output(*output_frame, (i + 1) * feat_dim, feat_dim);
for (int32 j = -max_offset; j <= max_offset; j++) {
int32 offset_frame = frame + j + i * opts_.block_shift;
if (offset_frame < 0) offset_frame = 0;
else if (offset_frame >= num_frames)
offset_frame = num_frames - 1;
BaseFloat scale = scales_(j + max_offset);
if (scale != 0.0)
output.AddVec(scale, input_feats.Row(offset_frame));
}
}
}
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features) {
output_features->Resize(input_features.NumRows(),
input_features.NumCols()
*(delta_opts.order + 1));
DeltaFeatures delta(delta_opts);
for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
SubVector<BaseFloat> row(*output_features, r);
delta.Process(input_features, r, &row);
}
}
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features) {
output_features->Resize(input_features.NumRows(),
input_features.NumCols()
* (delta_opts.num_blocks + 1));
ShiftedDeltaFeatures delta(delta_opts);
for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
SubVector<BaseFloat> row(*output_features, r);
delta.Process(input_features, r, &row);
}
}
// Resizes *mat_out to n_bases x dimension and fills row i with cosine bases:
//   column 0             -> scale
//   column j (interior)  -> 2 * scale * cos(angle * i * j)
//   column dimension - 1 -> scale * cos(angle * i * (dimension - 1))
// where angle = pi / (dimension - 1) and scale = 1 / (2 * (dimension - 1)).
void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out) {
  const int32 last_col = dimension - 1;
  BaseFloat angle = M_PI / static_cast<BaseFloat>(dimension - 1);
  BaseFloat scale = 1.0f / (2.0 * static_cast<BaseFloat>(dimension - 1));

  Matrix<BaseFloat> &bases = *mat_out;
  bases.Resize(n_bases, dimension);
  for (int32 row = 0; row < n_bases; ++row) {
    const BaseFloat row_fl = static_cast<BaseFloat>(row);
    bases(row, 0) = 1.0 * scale;
    for (int32 col = 1; col < last_col; ++col) {
      const BaseFloat col_fl = static_cast<BaseFloat>(col);
      bases(row, col) = 2.0 * scale * cos(angle * row_fl * col_fl);
    }
    bases(row, last_col)
        = scale * cos(angle * row_fl * static_cast<BaseFloat>(dimension-1));
  }
}
void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
int32 left_context,
int32 right_context,
Matrix<BaseFloat> *output_features) {
int32 T = input_features.NumRows(), D = input_features.NumCols();
if (T == 0 || D == 0)
KALDI_ERR << "SpliceFrames: empty input";
KALDI_ASSERT(left_context >= 0 && right_context >= 0);
int32 N = 1 + left_context + right_context;
output_features->Resize(T, D*N);
for (int32 t = 0; t < T; t++) {
SubVector<BaseFloat> dst_row(*output_features, t);
for (int32 j = 0; j < N; j++) {
int32 t2 = t + j - left_context;
if (t2 < 0) t2 = 0;
if (t2 >= T) t2 = T-1;
SubVector<BaseFloat> dst(dst_row, j*D, D),
src(input_features, t2);
dst.CopyFromVec(src);
}
}
}
void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features) {
int32 T = input_features.NumRows(), D = input_features.NumCols();
if (T == 0 || D == 0)
KALDI_ERR << "ReverseFrames: empty input";
output_features->Resize(T, D);
for (int32 t = 0; t < T; t++) {
SubVector<BaseFloat> dst_row(*output_features, t);
SubVector<BaseFloat> src_row(input_features, T-1-t);
dst_row.CopyFromVec(src_row);
}
}
// Validates the option values: the window must be positive and, for a
// centered window, min_window must lie in (0, cmn_window].
void SlidingWindowCmnOptions::Check() const {
  KALDI_ASSERT(cmn_window > 0);
  if (!center)
    return;  // min_window is not range-checked for a non-centered window.
  KALDI_ASSERT(min_window > 0 && min_window <= cmn_window);
}
// Internal version of SlidingWindowCmn with double-precision arguments.
// For every frame t it subtracts the mean of a window of frames around (or
// to the left of) t and, if opts.normalize_variance is set, also divides by
// the standard deviation over that window.  The window sums are kept as
// running totals that are updated as the window slides, so the statements
// below depend on each other's order.
void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts,
const MatrixBase<double> &input,
MatrixBase<double> *output) {
opts.Check();
int32 num_frames = input.NumRows(), dim = input.NumCols(),
last_window_start = -1, last_window_end = -1,
warning_count = 0;
// Running per-dimension sum and sum of squares over the current window.
Vector<double> cur_sum(dim), cur_sumsq(dim);
for (int32 t = 0; t < num_frames; t++) {
int32 window_start, window_end; // note: window_end will be one
// past the end of the window we use for normalization.
if (opts.center) {
// Window of cmn_window frames centered (as far as possible) on t.
window_start = t - (opts.cmn_window / 2);
window_end = window_start + opts.cmn_window;
} else {
// Window to the left of t: frames t - cmn_window .. t inclusive.
window_start = t - opts.cmn_window;
window_end = t + 1;
}
if (window_start < 0) { // shift window right if starts <0.
window_end -= window_start;
window_start = 0; // or: window_start -= window_start
}
if (!opts.center) {
// window_end is at least t + 1 at this point, so this test always passes
// and the end becomes max(t + 1, min_window), undoing the right-shift above
// apart from the min_window look-ahead at the start of the utterance.
if (window_end > t)
window_end = std::max(t + 1, opts.min_window);
}
if (window_end > num_frames) {
// Window runs past the last frame: shift it left, but not below frame 0.
window_start -= (window_end - num_frames);
window_end = num_frames;
if (window_start < 0) window_start = 0;
}
if (last_window_start == -1) {
// First frame: compute the window statistics directly.
SubMatrix<double> input_part(input,
window_start, window_end - window_start,
0, dim);
cur_sum.AddRowSumMat(1.0, input_part , 0.0);
if (opts.normalize_variance)
cur_sumsq.AddDiagMat2(1.0, input_part, kTrans, 0.0);
} else {
// Later frames: update the running statistics.  Each end of the window is
// expected to advance by at most one frame per step (asserted below).
if (window_start > last_window_start) {
KALDI_ASSERT(window_start == last_window_start + 1);
SubVector<double> frame_to_remove(input, last_window_start);
cur_sum.AddVec(-1.0, frame_to_remove);
if (opts.normalize_variance)
cur_sumsq.AddVec2(-1.0, frame_to_remove);
}
if (window_end > last_window_end) {
KALDI_ASSERT(window_end == last_window_end + 1);
SubVector<double> frame_to_add(input, last_window_end);
cur_sum.AddVec(1.0, frame_to_add);
if (opts.normalize_variance)
cur_sumsq.AddVec2(1.0, frame_to_add);
}
}
int32 window_frames = window_end - window_start;
last_window_start = window_start;
last_window_end = window_end;
KALDI_ASSERT(window_frames > 0);
SubVector<double> input_frame(input, t),
output_frame(*output, t);
// output = input - (window sum / window size), i.e. mean-normalized frame.
output_frame.CopyFromVec(input_frame);
output_frame.AddVec(-1.0 / window_frames, cur_sum);
if (opts.normalize_variance) {
if (window_frames == 1) {
// A one-frame window has no variance to normalize by; output zeros.
output_frame.Set(0.0);
} else {
// variance = sumsq / N - (sum / N)^2, computed per dimension.
Vector<double> variance(cur_sumsq);
variance.Scale(1.0 / window_frames);
variance.AddVec2(-1.0 / (window_frames * window_frames), cur_sum);
// now "variance" is the variance of the features in the window,
// around their own mean.
int32 num_floored;
variance.ApplyFloor(1.0e-10, &num_floored);
if (num_floored > 0 && num_frames > 1) {
// Exactly when the warning budget is reached, announce the suppression once.
if (opts.max_warnings == warning_count) {
KALDI_WARN << "Suppressing the remaining variance flooring "
<< "warnings. Run program with --max-warnings=-1 to "
<< "see all warnings.";
}
// If opts.max_warnings is a negative number, we won't restrict the
// number of times that the warning is printed out.
else if (opts.max_warnings < 0
|| opts.max_warnings > warning_count) {
KALDI_WARN << "Flooring when normalizing variance, floored "
<< num_floored << " elements; num-frames was "
<< window_frames;
}
// Counts every flooring event, whether or not a warning was printed.
warning_count++;
}
variance.ApplyPow(-0.5); // get inverse standard deviation.
output_frame.MulElements(variance);
}
}
}
}
// BaseFloat front end for sliding-window CMN/CMVN: copies the input to double
// precision, normalizes it with SlidingWindowCmnInternal(), and copies the
// result back into *output.  `input` and `*output` must have the same
// dimensions and at least one row.
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts,
                      const MatrixBase<BaseFloat> &input,
                      MatrixBase<BaseFloat> *output) {
  KALDI_ASSERT(SameDim(input, *output) && input.NumRows() > 0);

  Matrix<double> input_dbl(input);
  Matrix<double> output_dbl(input.NumRows(), input.NumCols());

  // All of the arithmetic happens in the double-precision version.
  SlidingWindowCmnInternal(opts, input_dbl, &output_dbl);
  output->CopyFromMat(output_dbl);
}
} // namespace kaldi
// feat/feature-functions.h
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_FUNCTIONS_H_
#define KALDI_FEAT_FEATURE_FUNCTIONS_H_
#include <string>
#include <vector>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
// ComputePowerSpectrum takes a complex FFT (as produced by the FFT
// functions in matrix/matrix-functions.h) and converts it, in place, into
// a power spectrum. If the complex FFT is a vector of size n (representing
// half the complex FFT of a real signal of size n, as described there),
// this function computes in the first (n/2) + 1 elements of it, the
// energies of the fft bins from zero to the Nyquist frequency. Contents of the
// remaining (n/2) - 1 elements are undefined at output.
void ComputePowerSpectrum(VectorBase<BaseFloat> *complex_fft);
// Options for delta-feature computation (see class DeltaFeatures).
struct DeltaFeaturesOptions {
// Highest delta order to compute (default 2); order 0 is the input features.
int32 order;
int32 window; // e.g. 2; controls window size (window size is 2*window + 1)
// the behavior at the edges is to replicate the first or last frame.
// this is not configurable.
DeltaFeaturesOptions(int32 order = 2, int32 window = 2):
order(order), window(window) { }
// Registers "delta-order" and "delta-window" on `opts`.
void Register(OptionsItf *opts) {
opts->Register("delta-order", &order, "Order of delta computation");
opts->Register("delta-window", &window,
"Parameter controlling window for delta computation (actual window"
" size for each delta order is 1 + 2*delta-window-size)");
}
};
class DeltaFeatures {
public:
// This class provides a low-level function to compute delta features.
// The function takes as input a matrix of features and a frame index
// that it should compute the deltas on. It puts its output in an object
// of type VectorBase, of size (original-feature-dimension) * (opts.order+1).
// This is not the most efficient way to do the computation, but it's
// state-free and thus easier to understand
explicit DeltaFeatures(const DeltaFeaturesOptions &opts);
// Computes the features plus deltas for row `frame` of input_feats and
// writes them to *output_frame, which must already have the size given
// above.  Frames needed beyond either end of the matrix are replaced by
// the first or last frame.
void Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
VectorBase<BaseFloat> *output_frame) const;
private:
DeltaFeaturesOptions opts_;
std::vector<Vector<BaseFloat> > scales_; // a scaling window for each
// of the orders, including zero: multiply the features for each
// dimension by this window.
};
// Options for shifted-delta computation (see class ShiftedDeltaFeatures).
// Defaults: window = 1, num_blocks = 7, block_shift = 3.
struct ShiftedDeltaFeaturesOptions {
int32 window, // The time delay and advance
num_blocks, // Number of delta blocks appended to each frame
block_shift; // Distance between consecutive blocks
ShiftedDeltaFeaturesOptions():
window(1), num_blocks(7), block_shift(3) { }
// Registers "delta-window", "num-blocks" and "block-shift" on `opts`.
void Register(OptionsItf *opts) {
opts->Register("delta-window", &window, "Size of delta advance and delay.");
opts->Register("num-blocks", &num_blocks, "Number of delta blocks in advance"
" of each frame to be concatenated");
opts->Register("block-shift", &block_shift, "Distance between each block");
}
};
class ShiftedDeltaFeatures {
public:
// This class provides a low-level function to compute shifted
// delta cepstra (SDC).
// The function takes as input a matrix of features and a frame index
// that it should compute the deltas on. It puts its output in an object
// of type SubVector, of size
// (original-feature-dimension) * (opts.num_blocks + 1): the original
// features followed by one delta block per shift.
explicit ShiftedDeltaFeatures(const ShiftedDeltaFeaturesOptions &opts);
// Computes the shifted deltas for row `frame` of input_feats and writes
// them to *output_frame, which must already have the size given above.
void Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
SubVector<BaseFloat> *output_frame) const;
private:
ShiftedDeltaFeaturesOptions opts_;
Vector<BaseFloat> scales_; // the single delta window (2*window + 1 taps)
// that is applied to every block.
};
// ComputeDeltas is a convenience function that computes deltas on a whole
// matrix of features: it resizes output_features to
// NumRows() x (NumCols() * (delta_opts.order + 1)) and runs
// DeltaFeatures::Process on every frame. If you want to deal with features
// coming in bit by bit you would have to use the DeltaFeatures class
// directly, and do the computation frame by frame.
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// ComputeShiftedDeltas computes shifted deltas for a whole matrix of
// features by applying ShiftedDeltaFeatures to every frame; output_features
// is resized to NumRows() x (NumCols() * (delta_opts.num_blocks + 1)).
// This function is provided for convenience; ShiftedDeltaFeatures can also
// be used directly.
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// SpliceFrames will normally be used together with LDA.
// It splices frames together to make a window. At the
// start and end of an utterance, it duplicates the first
// and last frames.
// Will throw if input features are empty.
// left_context and right_context must be nonnegative.
// These both represent a number of frames (e.g. 4, 4 is
// a good choice).
// output_features is resized to
// NumRows() x (NumCols() * (1 + left_context + right_context)).
void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
int32 left_context,
int32 right_context,
Matrix<BaseFloat> *output_features);
// ReverseFrames reverses the frames in time (used for backwards decoding).
// output_features is resized to the dimensions of input_features.
// Will throw if input features are empty.
void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// InitIdftBases resizes mat_out to n_bases x dimension and fills it with
// scaled cosine bases: element (i, j) is proportional to
// cos(pi * i * j / (dimension - 1)), with the first and last columns given
// half the weight of the interior columns.
// NOTE(review): dimension is presumably expected to be at least 2, since the
// implementation divides by (dimension - 1) -- confirm with callers.
void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out);
// This is used for speaker-id. Also see OnlineCmnOptions in ../online2/, which
// is online CMN with no latency, for online speech recognition.
struct SlidingWindowCmnOptions {
int32 cmn_window; // window size in frames over which statistics are taken
int32 min_window; // minimum window at the start; only used if !center
int32 max_warnings; // max variance-flooring warnings printed; negative
// means print all of them
bool normalize_variance; // if true, also normalize the variance to one
bool center; // if true, center the window on the current frame; otherwise
// the window lies to the left of it
SlidingWindowCmnOptions():
cmn_window(600),
min_window(100),
max_warnings(5),
normalize_variance(false),
center(false) { }
// Registers the options above on `opts`.
void Register(OptionsItf *opts) {
opts->Register("cmn-window", &cmn_window, "Window in frames for running "
"average CMN computation");
opts->Register("min-cmn-window", &min_window, "Minimum CMN window "
"used at start of decoding (adds latency only at start). "
"Only applicable if center == false, ignored if center==true");
opts->Register("max-warnings", &max_warnings, "Maximum warnings to report "
"per utterance. 0 to disable, -1 to show all.");
opts->Register("norm-vars", &normalize_variance, "If true, normalize "
"variance to one."); // naming this as in apply-cmvn.cc
opts->Register("center", &center, "If true, use a window centered on the "
"current frame (to the extent possible, modulo end effects). "
"If false, window is to the left.");
}
// Asserts that cmn_window > 0 and, when center is true, that
// 0 < min_window <= cmn_window.
void Check() const;
};
/// Applies sliding-window cepstral mean and/or variance normalization. See the
/// strings registering the options in the options class for information on how
/// this works and what the options are. input and output must have the same
/// dimension, and input must have at least one row. The computation is
/// carried out internally in double precision.
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts,
const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output);
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_FUNCTIONS_H_
// feat/feature-mfcc.cc
// Copyright 2009-2011 Karel Vesely; Petr Motlicek
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-mfcc.h"
namespace kaldi {
// Computes one frame of MFCCs.
//   signal_raw_log_energy  log-energy of the raw frame; recomputed here from
//                          the processed frame when energy is used but
//                          raw_energy is false.
//   vtln_warp              VTLN warp factor selecting the mel banks.
//   signal_frame           windowed frame of size PaddedWindowSize(); it is
//                          used as workspace and overwritten.
//   feature                output vector of size Dim().
void MfccComputer::Compute(BaseFloat signal_raw_log_energy,
                           BaseFloat vtln_warp,
                           VectorBase<BaseFloat> *signal_frame,
                           VectorBase<BaseFloat> *feature) {
  KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
               feature->Dim() == this->Dim());

  const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
  VectorBase<BaseFloat> &frame = *signal_frame;
  VectorBase<BaseFloat> &cepstra = *feature;

  // Energy after preemphasis/windowing, floored to avoid log(0).
  if (opts_.use_energy && !opts_.raw_energy) {
    signal_raw_log_energy = Log(std::max<BaseFloat>(
        VecVec(frame, frame), std::numeric_limits<float>::epsilon()));
  }

  // FFT: the general routine handles sizes that are not a power of two,
  // otherwise the split-radix object is used.
  if (srfft_ == NULL)
    RealFft(signal_frame, true);
  else
    srfft_->Compute(frame.Data(), true);

  // Power spectrum, then log mel-filterbank energies.
  ComputePowerSpectrum(signal_frame);
  SubVector<BaseFloat> power_spectrum(frame, 0, frame.Dim() / 2 + 1);
  mel_banks.Compute(power_spectrum, &mel_energies_);
  // Floor before the log (dithering should normally prevent exact zeros).
  mel_energies_.ApplyFloor(std::numeric_limits<float>::epsilon());
  mel_energies_.ApplyLog();

  // cepstra = dct_matrix_ * log-mel-energies; zeroed first in case the
  // output held NaNs.
  cepstra.SetZero();
  cepstra.AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0);

  if (opts_.cepstral_lifter != 0.0)
    cepstra.MulElements(lifter_coeffs_);

  // When energy is requested it replaces C0.
  if (opts_.use_energy) {
    const bool below_floor = opts_.energy_floor > 0.0 &&
        signal_raw_log_energy < log_energy_floor_;
    if (below_floor)
      signal_raw_log_energy = log_energy_floor_;
    cepstra(0) = signal_raw_log_energy;
  }

  // HTK ordering: move energy/C0 from the first slot to the last one.
  if (opts_.htk_compat) {
    const int32 last = opts_.num_ceps - 1;
    BaseFloat c0_or_energy = cepstra(0);
    for (int32 i = 0; i < last; ++i)
      cepstra(i) = cepstra(i + 1);
    // Scale C0 by sqrt(2); this removes a scale that is part of one common
    // definition of the cosine transform.
    if (!opts_.use_energy)
      c0_or_energy *= M_SQRT2;
    cepstra(last) = c0_or_energy;
  }
}
// Constructs an MFCC computer from `opts`:
//  - rejects num_ceps > num_bins with an error;
//  - stores the first num_ceps rows of the DCT matrix;
//  - precomputes the lifter coefficients when cepstral_lifter != 0;
//  - precomputes log(energy_floor) when energy_floor > 0;
//  - creates a split-radix FFT object when the padded window size is a power
//    of two (otherwise srfft_ stays NULL and Compute() uses RealFft);
//  - caches the mel banks for VTLN warp factor 1.0.
MfccComputer::MfccComputer(const MfccOptions &opts):
    opts_(opts),
    // Always give the floor a defined value.  It is only consulted when
    // opts_.energy_floor > 0 (in which case it is overwritten below), but
    // the copy constructor reads it unconditionally, and reading an
    // uninitialized member there would be undefined behaviour.
    log_energy_floor_(0.0),
    srfft_(NULL),
    mel_energies_(opts.mel_opts.num_bins) {
  int32 num_bins = opts.mel_opts.num_bins;
  if (opts.num_ceps > num_bins)
    KALDI_ERR << "num-ceps cannot be larger than num-mel-bins."
              << " It should be smaller or equal. You provided num-ceps: "
              << opts.num_ceps << " and num-mel-bins: "
              << num_bins;

  Matrix<BaseFloat> dct_matrix(num_bins, num_bins);
  ComputeDctMatrix(&dct_matrix);
  // Note that we include zeroth dct in either case.  If using the
  // energy we replace this with the energy.  This means a different
  // ordering of features than HTK.
  SubMatrix<BaseFloat> dct_rows(dct_matrix, 0, opts.num_ceps, 0, num_bins);
  dct_matrix_.Resize(opts.num_ceps, num_bins);
  dct_matrix_.CopyFromMat(dct_rows);  // subset of rows.

  if (opts.cepstral_lifter != 0.0) {
    lifter_coeffs_.Resize(opts.num_ceps);
    ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
  }
  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);

  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);

  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0);
}
// Copy constructor.  Plain members are copied; the mel banks and the FFT
// object are held by pointer, so each is duplicated to give this object its
// own instance.  The mel_energies_ workspace is only sized, not copied.
MfccComputer::MfccComputer(const MfccComputer &other):
    opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_),
    dct_matrix_(other.dct_matrix_),
    log_energy_floor_(other.log_energy_floor_),
    mel_banks_(other.mel_banks_),
    srfft_(NULL),
    mel_energies_(other.mel_energies_.Dim(), kUndefined) {
  // mel_banks_ currently holds other's pointers; replace each one with a
  // freshly allocated copy of the object it points to.
  typedef std::map<BaseFloat, MelBanks*>::iterator BankIter;
  BankIter bank = mel_banks_.begin();
  while (bank != mel_banks_.end()) {
    bank->second = new MelBanks(*(bank->second));
    ++bank;
  }
  if (other.srfft_ != NULL)
    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
}
// Releases every cached MelBanks object and the FFT object (which may be
// NULL; deleting a null pointer is a no-op).
MfccComputer::~MfccComputer() {
  typedef std::map<BaseFloat, MelBanks*>::iterator BankIter;
  BankIter bank = mel_banks_.begin();
  while (bank != mel_banks_.end()) {
    delete bank->second;
    ++bank;
  }
  delete srfft_;
}
// Returns the mel banks for the given VTLN warp factor, creating and caching
// them on first use.  The returned object stays owned by this computer.
const MelBanks *MfccComputer::GetMelBanks(BaseFloat vtln_warp) {
  std::map<BaseFloat, MelBanks*>::iterator cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end())
    return cached->second;

  MelBanks *created = new MelBanks(opts_.mel_opts,
                                   opts_.frame_opts,
                                   vtln_warp);
  mel_banks_[vtln_warp] = created;
  return created;
}
} // namespace kaldi
// feat/feature-mfcc.h
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University
// 2014-2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_MFCC_H_
#define KALDI_FEAT_FEATURE_MFCC_H_
#include <map>
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// MfccOptions contains basic options for computing MFCC features.
struct MfccOptions {
// Frame-extraction options; registered on the same OptionsItf by Register().
FrameExtractionOptions frame_opts;
// Mel filterbank options; the constructor below defaults the bin count to 23.
MelBanksOptions mel_opts;
int32 num_ceps; // e.g. 13: num cepstral coeffs, counting zero.
bool use_energy; // use energy; else C0
BaseFloat energy_floor; // 0 by default; set to a value like 1.0 or 0.1 if
// you disable dithering.
bool raw_energy; // If true, compute energy before preemphasis and windowing
BaseFloat cepstral_lifter; // Scaling factor on cepstra for HTK compatibility.
// if 0.0, no liftering is done.
bool htk_compat; // if true, put energy/C0 last and introduce a factor of
// sqrt(2) on C0 to be the same as HTK.
// Sets the defaults: 23 mel bins, 13 cepstra, energy in place of C0, no
// energy floor, raw energy, lifter 22.0, non-HTK ordering.
MfccOptions() : mel_opts(23),
// defaults the #mel-banks to 23 for the MFCC computations.
// this seems to be common for 16khz-sampled data,
// but for 8khz-sampled data, 15 may be better.
num_ceps(13),
use_energy(true),
energy_floor(0.0),
raw_energy(true),
cepstral_lifter(22.0),
htk_compat(false) {}
// Registers the frame and mel sub-options first, then the MFCC-specific
// options, on `opts`.
void Register(OptionsItf *opts) {
frame_opts.Register(opts);
mel_opts.Register(opts);
opts->Register("num-ceps", &num_ceps,
"Number of cepstra in MFCC computation (including C0)");
opts->Register("use-energy", &use_energy,
"Use energy (not C0) in MFCC computation");
opts->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in MFCC computation. "
"Only makes a difference if --use-energy=true; only necessary if "
"--dither=0.0. Suggested values: 0.1 or 1.0");
opts->Register("raw-energy", &raw_energy,
"If true, compute energy before preemphasis and windowing");
opts->Register("cepstral-lifter", &cepstral_lifter,
"Constant that controls scaling of MFCCs");
opts->Register("htk-compat", &htk_compat,
"If true, put energy or C0 last and use a factor of sqrt(2) on "
"C0. Warning: not sufficient to get HTK compatible features "
"(need to change other parameters).");
}
};
// This is the new-style interface to the MFCC computation.
class MfccComputer {
public:
// The options type associated with this computer.
typedef MfccOptions Options;
explicit MfccComputer(const MfccOptions &opts);
// Copy constructor: duplicates the cached mel banks and the FFT object.
MfccComputer(const MfccComputer &other);
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
// Output feature dimension: the number of cepstral coefficients.
int32 Dim() const { return opts_.num_ceps; }
// True if the energy is used and must be measured on the raw frame, i.e.
// before preemphasis and windowing.
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
// Frees the cached mel banks and the FFT object.
~MfccComputer();
private:
// disallow assignment.
MfccComputer &operator = (const MfccComputer &in);
protected:
// Returns the mel banks for the given warp factor, creating and caching them
// in mel_banks_ on first use.
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
MfccOptions opts_;
// Liftering coefficients; only filled in when opts_.cepstral_lifter != 0.
Vector<BaseFloat> lifter_coeffs_;
Matrix<BaseFloat> dct_matrix_; // matrix we left-multiply by to perform DCT.
// Log of opts_.energy_floor; only consulted when opts_.energy_floor > 0.
BaseFloat log_energy_floor_;
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
// Split-radix FFT object; NULL when the padded window size is not a power
// of two.
SplitRadixRealFft<BaseFloat> *srfft_;
// note: mel_energies_ is specific to the frame we're processing, it's
// just a temporary workspace.
Vector<BaseFloat> mel_energies_;
};
typedef OfflineFeatureTpl<MfccComputer> Mfcc;
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_MFCC_H_
// feat/feature-plp.cc
// Copyright 2009-2011 Petr Motlicek; Karel Vesely
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-plp.h"
namespace kaldi {
// Builds a PLP computer from `opts`.
//
// Allocates the per-frame work vectors (with kUndefined, i.e. without asking
// for their contents to be set), fills the lifter coefficients when liftering
// is enabled, initializes the IDFT bases, and creates the mel filterbanks for
// VTLN warp factor 1.0 so that they are already cached for the common case.
PlpComputer::PlpComputer(const PlpOptions &opts):
    opts_(opts), srfft_(NULL),
    mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
    autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
    lpc_coeffs_(opts_.lpc_order, kUndefined),
    raw_cepstrum_(opts_.lpc_order, kUndefined) {
  // A cepstral_lifter of exactly 0 disables liftering; lifter_coeffs_ then
  // stays empty and Compute() skips the multiplication.
  if (opts.cepstral_lifter != 0.0) {
    lifter_coeffs_.Resize(opts.num_ceps);
    ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
  }
  InitIdftBases(opts_.lpc_order + 1, opts_.mel_opts.num_bins + 2,
                &idft_bases_);

  // Always give log_energy_floor_ a defined value. Compute() only consults it
  // when energy_floor > 0, but the copy constructor reads it unconditionally,
  // so leaving it uninitialized would make copying read an indeterminate
  // value.
  log_energy_floor_ = 0.0;
  if (opts.energy_floor > 0.0)
    log_energy_floor_ = Log(opts.energy_floor);

  // The split-radix FFT is only created for power-of-two window sizes;
  // otherwise srfft_ stays NULL and Compute() falls back to RealFft().
  int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
  if ((padded_window_size & (padded_window_size-1)) == 0)  // Is a power of two...
    srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);

  // We'll definitely need the filterbanks info for VTLN warping factor 1.0.
  // [note: this call caches it.]
  GetMelBanks(1.0);
}
// Copy constructor.
//
// The initializer list copies the two cache maps member-wise, which leaves
// their values pointing at objects that `other` deletes in its destructor.
// The body therefore replaces every cached pointer with a freshly allocated
// copy, and clones the FFT helper when `other` has one. The per-frame work
// vectors are allocated fresh (kUndefined) rather than copied.
PlpComputer::PlpComputer(const PlpComputer &other):
    opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_),
    idft_bases_(other.idft_bases_), log_energy_floor_(other.log_energy_floor_),
    mel_banks_(other.mel_banks_), equal_loudness_(other.equal_loudness_),
    srfft_(NULL),
    mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
    autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
    lpc_coeffs_(opts_.lpc_order, kUndefined),
    raw_cepstrum_(opts_.lpc_order, kUndefined) {
  typedef std::map<BaseFloat, MelBanks*>::iterator BanksIter;
  typedef std::map<BaseFloat, Vector<BaseFloat>*>::iterator LoudnessIter;

  // Give this object its own copy of every cached filterbank.
  BanksIter banks_it = mel_banks_.begin();
  while (banks_it != mel_banks_.end()) {
    MelBanks *own_banks = new MelBanks(*(banks_it->second));
    banks_it->second = own_banks;
    ++banks_it;
  }

  // Likewise for every cached equal-loudness vector.
  LoudnessIter loud_it = equal_loudness_.begin();
  while (loud_it != equal_loudness_.end()) {
    Vector<BaseFloat> *own_vec = new Vector<BaseFloat>(*(loud_it->second));
    loud_it->second = own_vec;
    ++loud_it;
  }

  // srfft_ was set to NULL above; only clone when the source has an FFT object.
  if (other.srfft_ != NULL) {
    srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
  }
}
// Destructor: frees every object held by pointer -- the cached mel
// filterbanks, the cached equal-loudness vectors, and the FFT helper.
PlpComputer::~PlpComputer() {
  typedef std::map<BaseFloat, MelBanks*>::iterator BanksIter;
  typedef std::map<BaseFloat, Vector<BaseFloat>*>::iterator LoudnessIter;

  BanksIter banks_it = mel_banks_.begin();
  while (banks_it != mel_banks_.end()) {
    delete banks_it->second;
    ++banks_it;
  }

  LoudnessIter loud_it = equal_loudness_.begin();
  while (loud_it != equal_loudness_.end()) {
    delete loud_it->second;
    ++loud_it;
  }

  // Deleting a NULL srfft_ (non-power-of-two window size) is a no-op.
  delete srfft_;
}
// Returns the mel filterbanks for `vtln_warp`, creating them and storing them
// in mel_banks_ on first request. The returned object stays owned by this
// PlpComputer (it is deleted in the destructor).
const MelBanks *PlpComputer::GetMelBanks(BaseFloat vtln_warp) {
  std::map<BaseFloat, MelBanks*>::iterator cached = mel_banks_.find(vtln_warp);
  if (cached != mel_banks_.end())
    return cached->second;

  // Not cached yet: build the banks for this warp factor and remember them.
  MelBanks *created = new MelBanks(opts_.mel_opts,
                                   opts_.frame_opts,
                                   vtln_warp);
  mel_banks_[vtln_warp] = created;
  return created;
}
// Returns the equal-loudness vector for `vtln_warp`, computing it from the
// matching mel filterbanks and storing it in equal_loudness_ on first request.
// The returned vector stays owned by this PlpComputer.
const Vector<BaseFloat> *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) {
  // Fetch (or create) the filterbanks first, exactly as before the lookup.
  const MelBanks *banks = GetMelBanks(vtln_warp);

  std::map<BaseFloat, Vector<BaseFloat>*>::iterator cached =
      equal_loudness_.find(vtln_warp);
  if (cached != equal_loudness_.end())
    return cached->second;

  // Not cached yet: derive the vector from the filterbanks and remember it.
  Vector<BaseFloat> *loudness = new Vector<BaseFloat>;
  GetEqualLoudnessVector(*banks, loudness);
  equal_loudness_[vtln_warp] = loudness;
  return loudness;
}
// Computes one frame of PLP features.
//
// signal_raw_log_energy: log-energy supplied by the caller; it is replaced
//   below when use_energy is set and raw_energy is not.
// vtln_warp: VTLN warp factor used to select the mel filterbanks and the
//   equal-loudness vector.
// signal_frame: one frame of signal, of size PaddedWindowSize(); it is
//   overwritten (FFT and power spectrum are computed in place).
// feature: output vector of size Dim(), fully overwritten.
void PlpComputer::Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature) {
// Both vectors must already have the sizes this computer was configured for.
KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
feature->Dim() == this->Dim());
// Look up (creating on first use) the per-warp-factor filterbanks and
// equal-loudness vector.
const MelBanks &mel_banks = *GetMelBanks(vtln_warp);
const Vector<BaseFloat> &equal_loudness = *GetEqualLoudness(vtln_warp);
KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1); // our num-ceps includes C0.
// When energy is wanted but not the "raw" one, recompute it from the frame as
// passed in, before the FFT below overwrites the frame. The argument of Log is
// floored at the smallest positive normalized float.
if (opts_.use_energy && !opts_.raw_energy)
signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
std::numeric_limits<float>::min()));
// srfft_ is only non-NULL for power-of-two window sizes (see the constructor).
if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
srfft_->Compute(signal_frame->Data(), true);
else // An alternative algorithm that works for non-powers-of-two.
RealFft(signal_frame, true);
// Convert the FFT into a power spectrum.
ComputePowerSpectrum(signal_frame); // elements 0 ... signal_frame->Dim()/2
SubVector<BaseFloat> power_spectrum(*signal_frame,
0, signal_frame->Dim() / 2 + 1);
int32 num_mel_bins = opts_.mel_opts.num_bins;
// mel_energies covers elements 1..num_mel_bins of mel_energies_duplicated_,
// leaving one slot free at each end for the duplication step below.
SubVector<BaseFloat> mel_energies(mel_energies_duplicated_, 1, num_mel_bins);
mel_banks.Compute(power_spectrum, &mel_energies);
// Apply the equal-loudness weighting, then the compression exponent.
mel_energies.MulElements(equal_loudness);
mel_energies.ApplyPow(opts_.compress_factor);
// duplicate first and last elements
mel_energies_duplicated_(0) = mel_energies_duplicated_(1);
mel_energies_duplicated_(num_mel_bins + 1) =
mel_energies_duplicated_(num_mel_bins);
autocorr_coeffs_.SetZero(); // In case of NaNs or infs
// autocorr_coeffs_ = idft_bases_ * mel_energies_duplicated_ (beta is 0.0).
autocorr_coeffs_.AddMatVec(1.0, idft_bases_, kNoTrans,
mel_energies_duplicated_, 0.0);
BaseFloat residual_log_energy = ComputeLpc(autocorr_coeffs_, &lpc_coeffs_);
// NOTE(review): the floor applied here is numeric_limits<float>::min() itself,
// not its log, even though the value is named a log-energy -- confirm intent.
residual_log_energy = std::max<BaseFloat>(residual_log_energy,
std::numeric_limits<float>::min());
Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs_.Data(), raw_cepstrum_.Data());
// Output layout at this point: element 0 is the residual log-energy, elements
// 1..num_ceps-1 are the first num_ceps-1 raw cepstral coefficients.
feature->Range(1, opts_.num_ceps - 1).CopyFromVec(
raw_cepstrum_.Range(0, opts_.num_ceps - 1));
(*feature)(0) = residual_log_energy;
// Optional liftering and global scaling of the whole feature vector.
if (opts_.cepstral_lifter != 0.0)
feature->MulElements(lifter_coeffs_);
if (opts_.cepstral_scale != 1.0)
feature->Scale(opts_.cepstral_scale);
// With use_energy, element 0 is replaced by the signal log-energy (floored at
// log_energy_floor_ when an energy floor is configured); this happens after
// liftering/scaling, so the energy itself is not liftered or scaled.
if (opts_.use_energy) {
if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_)
signal_raw_log_energy = log_energy_floor_;
(*feature)(0) = signal_raw_log_energy;
}
// HTK-compatible ordering: shift every coefficient down by one position and
// put the former element 0 last.
if (opts_.htk_compat) { // reorder the features.
BaseFloat log_energy = (*feature)(0);
for (int32 i = 0; i < opts_.num_ceps-1; i++)
(*feature)(i) = (*feature)(i+1);
(*feature)(opts_.num_ceps-1) = log_energy;
}
}
} // namespace kaldi
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
# kaldi-matrix: library target built from the matrix/vector sources below.
add_library(kaldi-matrix
    compressed-matrix.cc
    kaldi-matrix.cc
    kaldi-vector.cc
    matrix-functions.cc
    optimization.cc
    packed-matrix.cc
    qr.cc
    sparse-matrix.cc
    sp-matrix.cc
    srfft.cc
    tp-matrix.cc
)

# Use the keyword signature instead of the legacy keyword-less one. PUBLIC
# keeps these dependencies transitive for targets that link kaldi-matrix,
# which is what the keyword-less form did by default.
# NOTE(review): gfortran and libopenblas.a are still linked by bare name
# rather than through imported targets -- confirm they remain available on the
# linker search path in every supported build configuration.
target_link_libraries(kaldi-matrix PUBLIC gfortran kaldi-base libopenblas.a)
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册