Unverified Commit ee7c266f, authored by Y YangZhou, committed by GitHub

[speechx] rm openblas && refactor kaldi-matrix, kaldi-vector (#2824)

* rm openblas && refactor kaldi-matrix kaldi-vector
Parent c1b1ae05
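Most of this commit is a mechanical include-path migration: headers under `frontend/audio/` move up to `frontend/`, the trimmed matrix code is included as `matrix/` instead of `kaldi/matrix/`, and the WAV reader moves from `kaldi/feat/` into the frontend library. A minimal before/after sketch of a consumer translation unit (the file itself is hypothetical; the paths are taken from the hunks below):

```cpp
// Before this commit:
//   #include "frontend/audio/data_cache.h"
//   #include "kaldi/matrix/kaldi-matrix.h"
//   #include "kaldi/feat/wave-reader.h"

// After this commit:
#include "frontend/data_cache.h"   // frontend/audio/* -> frontend/*
#include "matrix/kaldi-matrix.h"   // kaldi/matrix/* -> matrix/* (refactored copy)
#include "frontend/wave-reader.h"  // wave reader now ships with the frontend lib
```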
@@ -53,9 +53,6 @@ include(gflags)
 include(glog)
-#openblas
-include(openblas)
 # openfst
 include(openfst)
 add_dependencies(openfst gflags glog)
......
@@ -14,7 +14,7 @@
 #include "decoder/ctc_prefix_beam_search_decoder.h"
 #include "base/common.h"
-#include "frontend/audio/data_cache.h"
+#include "frontend/data_cache.h"
 #include "fst/symbol-table.h"
 #include "kaldi/util/table-types.h"
 #include "nnet/decodable.h"
......
@@ -14,7 +14,7 @@
 #include "base/common.h"
 #include "kaldi/decoder/decodable-itf.h"
-#include "kaldi/matrix/kaldi-matrix.h"
+#include "matrix/kaldi-matrix.h"
 #include "nnet/nnet_itf.h"
 #include "nnet/nnet_producer.h"
......
@@ -15,7 +15,6 @@
 #include "base/basic_types.h"
 #include "kaldi/base/kaldi-types.h"
-#include "kaldi/matrix/kaldi-matrix.h"
 #include "kaldi/util/options-itf.h"
 DECLARE_int32(subsampling_rate);
......
@@ -13,10 +13,10 @@
 // limitations under the License.
 #include "nnet/nnet_producer.h"
+#include "matrix/kaldi-matrix.h"
 namespace ppspeech {
-using kaldi::Vector;
 using std::vector;
 using kaldi::BaseFloat;
......
@@ -16,7 +16,7 @@
 #include "base/common.h"
 #include "base/safe_queue.h"
-#include "frontend/audio/frontend_itf.h"
+#include "frontend/frontend_itf.h"
 #include "nnet/nnet_itf.h"
 namespace ppspeech {
......
@@ -18,7 +18,7 @@
 #pragma once
 #include "base/common.h"
-#include "kaldi/matrix/kaldi-matrix.h"
+#include "matrix/kaldi-matrix.h"
 #include "nnet/nnet_itf.h"
 #include "paddle/extension.h"
 #include "paddle/jit/all.h"
......
@@ -15,8 +15,8 @@
 #include "base/common.h"
 #include "decoder/param.h"
-#include "frontend/audio/assembler.h"
-#include "frontend/audio/data_cache.h"
+#include "frontend/assembler.h"
+#include "frontend/data_cache.h"
 #include "kaldi/util/table-types.h"
 #include "nnet/decodable.h"
 #include "nnet/u2_nnet.h"
......
@@ -15,7 +15,7 @@ set(TEST_BINS
 foreach(bin_name IN LISTS TEST_BINS)
 add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
 target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-feat-common)
+target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util)
 target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS})
 target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
 target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
......
@@ -18,7 +18,7 @@
 #include "decoder/ctc_beam_search_opt.h"
 #include "decoder/ctc_prefix_beam_search_decoder.h"
 #include "decoder/decoder_itf.h"
-#include "frontend/audio/feature_pipeline.h"
+#include "frontend/feature_pipeline.h"
 #include "fst/fstlib.h"
 #include "fst/symbol-table.h"
 #include "nnet/decodable.h"
......
@@ -13,7 +13,7 @@
 // limitations under the License.
 #include "decoder/param.h"
-#include "kaldi/feat/wave-reader.h"
+#include "frontend/wave-reader.h"
 #include "kaldi/util/table-types.h"
 #include "recognizer/u2_recognizer.h"
......
@@ -14,7 +14,7 @@
 #include "recognizer/u2_recognizer.h"
 #include "decoder/param.h"
-#include "kaldi/feat/wave-reader.h"
+#include "frontend/wave-reader.h"
 #include "kaldi/util/table-types.h"
 DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
......
@@ -4,6 +4,8 @@ ${CMAKE_CURRENT_SOURCE_DIR}/../
 )
 add_subdirectory(utils)
+add_subdirectory(matrix)
 include_directories(
 ${CMAKE_CURRENT_SOURCE_DIR}/frontend
 )
......
+add_library(kaldi-native-fbank-core
+feature-fbank.cc
+feature-functions.cc
+feature-window.cc
+fftsg.c
+mel-computations.cc
+rfft.cc
+)
-add_subdirectory(audio)
\ No newline at end of file
+add_library(frontend STATIC
+cmvn.cc
+audio_cache.cc
+feature_cache.cc
+feature_pipeline.cc
+assembler.cc
+wave-reader.cc
+)
+target_link_libraries(frontend PUBLIC kaldi-native-fbank-core utils)
+set(BINS
+compute_fbank_main
+)
+foreach(bin_name IN LISTS BINS)
+add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
+target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(${bin_name} PUBLIC frontend utils kaldi-util gflags glog)
+endforeach()
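Because `wave-reader.cc` is now compiled into the new `frontend` target, binaries can read WAV files without linking `kaldi-feat-common`. A minimal sketch of reading a file through the relocated header, assuming the usual kaldi `WaveData` interface is unchanged by the move (the file name is hypothetical):

```cpp
#include <fstream>
#include "frontend/wave-reader.h"  // was kaldi/feat/wave-reader.h

int main() {
    std::ifstream is("sample.wav", std::ios::binary);  // hypothetical input
    kaldi::WaveData wave;
    wave.Read(is);  // parse the RIFF header and samples
    // wave.Data() is a Matrix<BaseFloat> with one row per channel.
    kaldi::BaseFloat sample_rate = wave.SampFreq();
    (void)sample_rate;
    return 0;
}
```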
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "frontend/audio/assembler.h"
+#include "frontend/assembler.h"
 namespace ppspeech {
......
@@ -15,7 +15,7 @@
 #pragma once
 #include "base/common.h"
-#include "frontend/audio/frontend_itf.h"
+#include "frontend/frontend_itf.h"
 namespace ppspeech {
......
-add_library(kaldi-native-fbank-core
-feature-fbank.cc
-feature-functions.cc
-feature-window.cc
-fftsg.c
-mel-computations.cc
-rfft.cc
-)
-add_library(frontend STATIC
-cmvn.cc
-audio_cache.cc
-feature_cache.cc
-feature_pipeline.cc
-assembler.cc
-)
-target_link_libraries(frontend PUBLIC kaldi-native-fbank-core utils)
-set(BINS
-compute_fbank_main
-)
-foreach(bin_name IN LISTS BINS)
-add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc)
-target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
-target_link_libraries(${bin_name} PUBLIC frontend utils kaldi-util gflags glog kaldi-feat-common)
-endforeach()
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "frontend/audio/audio_cache.h"
+#include "frontend/audio_cache.h"
 #include "kaldi/base/timer.h"
......
@@ -16,7 +16,7 @@
 #pragma once
 #include "base/common.h"
-#include "frontend/audio/frontend_itf.h"
+#include "frontend/frontend_itf.h"
 namespace ppspeech {
......
@@ -13,7 +13,7 @@
 // limitations under the License.
-#include "frontend/audio/cmvn.h"
+#include "frontend/cmvn.h"
 #include "utils/file_utils.h"
 #include "utils/picojson.h"
......
@@ -15,8 +15,7 @@
 #pragma once
 #include "base/common.h"
-#include "frontend/audio/frontend_itf.h"
-#include "kaldi/matrix/kaldi-matrix.h"
+#include "frontend/frontend_itf.h"
 #include "kaldi/util/options-itf.h"
 namespace ppspeech {
......
@@ -16,13 +16,13 @@
 #include "base/flags.h"
 #include "base/log.h"
-#include "frontend/audio/audio_cache.h"
-#include "frontend/audio/data_cache.h"
-#include "frontend/audio/fbank.h"
-#include "frontend/audio/feature_cache.h"
-#include "frontend/audio/frontend_itf.h"
-#include "frontend/audio/normalizer.h"
-#include "kaldi/feat/wave-reader.h"
+#include "frontend/audio_cache.h"
+#include "frontend/data_cache.h"
+#include "frontend/fbank.h"
+#include "frontend/feature_cache.h"
+#include "frontend/frontend_itf.h"
+#include "frontend/normalizer.h"
+#include "frontend/wave-reader.h"
 #include "kaldi/util/kaldi-io.h"
 #include "kaldi/util/table-types.h"
......
@@ -16,7 +16,7 @@
 #pragma once
 #include "base/common.h"
-#include "frontend/audio/frontend_itf.h"
+#include "frontend/frontend_itf.h"
 using std::vector;
......
@@ -15,8 +15,8 @@
 #pragma once
 #include "base/common.h"
-#include "frontend/audio/feature_common.h"
-#include "frontend/audio/feature-fbank.h"
+#include "frontend/feature_common.h"
+#include "frontend/feature-fbank.h"
 namespace ppspeech {
......
@@ -18,11 +18,11 @@
 // This file is copied/modified from kaldi/src/feat/feature-fbank.cc
 //
-#include "frontend/audio/feature-fbank.h"
+#include "frontend/feature-fbank.h"
 #include <cmath>
-#include "frontend/audio/feature-functions.h"
+#include "frontend/feature-functions.h"
 namespace knf {
......
@@ -23,9 +23,9 @@
 #include <map>
-#include "frontend/audio/feature-window.h"
-#include "frontend/audio/mel-computations.h"
-#include "frontend/audio/rfft.h"
+#include "frontend/feature-window.h"
+#include "frontend/mel-computations.h"
+#include "frontend/rfft.h"
 namespace knf {
......
@@ -18,7 +18,7 @@
 // This file is copied/modified from kaldi/src/feat/feature-functions.cc
-#include "frontend/audio/feature-functions.h"
+#include "frontend/feature-functions.h"
 #include <cstdint>
 #include <vector>
......
@@ -4,7 +4,7 @@
 // This file is copied/modified from kaldi/src/feat/feature-window.cc
-#include "frontend/audio/feature-window.h"
+#include "frontend/feature-window.h"
 #include <cmath>
 #include <vector>
......
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "frontend/audio/feature_cache.h"
+#include "frontend/feature_cache.h"
 namespace ppspeech {
......
@@ -15,7 +15,7 @@
 #pragma once
 #include "base/common.h"
-#include "frontend/audio/frontend_itf.h"
+#include "frontend/frontend_itf.h"
 namespace ppspeech {
......
@@ -15,7 +15,7 @@
 #pragma once
 #include "frontend_itf.h"
-#include "frontend/audio/feature-window.h"
+#include "frontend/feature-window.h"
 namespace ppspeech {
@@ -52,4 +52,4 @@ class StreamingFeatureTpl : public FrontendInterface {
 } // namespace ppspeech
-#include "frontend/audio/feature_common_inl.h"
+#include "frontend/feature_common_inl.h"
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "frontend/audio/feature_pipeline.h"
+#include "frontend/feature_pipeline.h"
 namespace ppspeech {
......
@@ -16,13 +16,13 @@
 #pragma once
-#include "frontend/audio/assembler.h"
-#include "frontend/audio/audio_cache.h"
-#include "frontend/audio/data_cache.h"
-#include "frontend/audio/fbank.h"
-#include "frontend/audio/feature_cache.h"
-#include "frontend/audio/frontend_itf.h"
-#include "frontend/audio/normalizer.h"
+#include "frontend/assembler.h"
+#include "frontend/audio_cache.h"
+#include "frontend/data_cache.h"
+#include "frontend/fbank.h"
+#include "frontend/feature_cache.h"
+#include "frontend/frontend_itf.h"
+#include "frontend/cmvn.h"
 // feature
 DECLARE_bool(fill_zero);
......
@@ -15,7 +15,7 @@
 #pragma once
 #include "base/basic_types.h"
-#include "kaldi/matrix/kaldi-vector.h"
+#include "matrix/kaldi-vector.h"
 namespace ppspeech {
......
@@ -18,12 +18,12 @@
 // This file is copied/modified from kaldi/src/feat/mel-computations.cc
-#include "frontend/audio/mel-computations.h"
+#include "frontend/mel-computations.h"
 #include <algorithm>
 #include <sstream>
-#include "frontend/audio/feature-window.h"
+#include "frontend/feature-window.h"
 namespace knf {
......
@@ -22,7 +22,7 @@
 #include <cmath>
 #include <string>
-#include "frontend/audio/feature-window.h"
+#include "frontend/feature-window.h"
 namespace knf {
......
@@ -14,5 +14,4 @@
 #pragma once
-#include "frontend/audio/cmvn.h"
-#include "frontend/audio/db_norm.h"
\ No newline at end of file
+#include "frontend/cmvn.h"
\ No newline at end of file
@@ -16,7 +16,7 @@
 * limitations under the License.
 */
-#include "frontend/audio/rfft.h"
+#include "frontend/rfft.h"
 #include <cmath>
 #include <vector>
......
@@ -25,7 +25,7 @@
 #include <sstream>
 #include <vector>
-#include "feat/wave-reader.h"
+#include "frontend/wave-reader.h"
 #include "base/kaldi-error.h"
 #include "base/kaldi-utils.h"
......
+add_library(kaldi-matrix
+kaldi-matrix.cc
+kaldi-vector.cc
+)
+target_link_libraries(kaldi-matrix kaldi-base)
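The new `matrix/` target builds just the two refactored sources, so anything that links `kaldi-matrix` gets the plain containers plus the copy and I/O routines, while the BLAS/LAPACK-backed operations are commented out in the hunks below. A minimal sketch against the retained API (method names are taken from the surviving declarations; the function itself is hypothetical):

```cpp
#include "matrix/kaldi-matrix.h"
#include "matrix/kaldi-vector.h"

void FillRows() {
    kaldi::Vector<kaldi::BaseFloat> row(4);
    row.Set(1.0f);  // plain element-wise ops survive the refactor
    kaldi::Matrix<kaldi::BaseFloat> m(2, 4);
    m.CopyRowFromVec(row, 0);  // CopyRowFromVec is kept (see kaldi-matrix.cc below)
    m.CopyRowFromVec(row, 1);
    // BLAS-backed calls such as AddMatMat are gone from this build.
}
```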
@@ -28,7 +28,7 @@ namespace kaldi {
 template<typename Real>
 Matrix<Real>::Matrix(): MatrixBase<Real>(NULL, 0, 0, 0) { }
+/*
 template<>
 template<>
 void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float> &ra, const VectorBase<float> &rb);
@@ -36,6 +36,7 @@ void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float> &ra
 template<>
 template<>
 void MatrixBase<double>::AddVecVec(const double alpha, const VectorBase<double> &ra, const VectorBase<double> &rb);
+*/
 template<typename Real>
 inline std::ostream & operator << (std::ostream & os, const MatrixBase<Real> & M) {
......
@@ -23,17 +23,9 @@
 // limitations under the License.
 #include "matrix/kaldi-matrix.h"
-#include "matrix/sp-matrix.h"
-#include "matrix/jama-svd.h"
-#include "matrix/jama-eig.h"
-#include "matrix/compressed-matrix.h"
-#include "matrix/sparse-matrix.h"
-static_assert(int(kaldi::kNoTrans) == int(CblasNoTrans) && int(kaldi::kTrans) == int(CblasTrans),
-"kaldi::kNoTrans and kaldi::kTrans must be equal to the appropriate CBLAS library constants!");
 namespace kaldi {
+/*
 template<typename Real>
 void MatrixBase<Real>::Invert(Real *log_det, Real *det_sign,
 bool inverse_needed) {
@@ -206,29 +198,30 @@ void MatrixBase<Real>::SetMatMatDivMat(const MatrixBase<Real>& A,
 }
 }
 }
+*/
-//template<typename Real>
-//void MatrixBase<Real>::CopyLowerToUpper() {
-//KALDI_ASSERT(num_rows_ == num_cols_);
-//Real *data = data_;
-//MatrixIndexT num_rows = num_rows_, stride = stride_;
-//for (int32 i = 0; i < num_rows; i++)
-//for (int32 j = 0; j < i; j++)
-//data[j * stride + i ] = data[i * stride + j];
-//}
+template<typename Real>
+void MatrixBase<Real>::CopyLowerToUpper() {
+KALDI_ASSERT(num_rows_ == num_cols_);
+Real *data = data_;
+MatrixIndexT num_rows = num_rows_, stride = stride_;
+for (int32 i = 0; i < num_rows; i++)
+for (int32 j = 0; j < i; j++)
+data[j * stride + i ] = data[i * stride + j];
+}
+//template<typename Real>
+//void MatrixBase<Real>::CopyUpperToLower() {
+//KALDI_ASSERT(num_rows_ == num_cols_);
+//Real *data = data_;
+//MatrixIndexT num_rows = num_rows_, stride = stride_;
+//for (int32 i = 0; i < num_rows; i++)
+//for (int32 j = 0; j < i; j++)
+//data[i * stride + j] = data[j * stride + i];
+//}
-template<typename Real>
-void MatrixBase<Real>::CopyUpperToLower() {
-KALDI_ASSERT(num_rows_ == num_cols_);
-Real *data = data_;
-MatrixIndexT num_rows = num_rows_, stride = stride_;
-for (int32 i = 0; i < num_rows; i++)
-for (int32 j = 0; j < i; j++)
-data[i * stride + j] = data[j * stride + i];
-}
+/*
 template<typename Real>
 void MatrixBase<Real>::SymAddMat2(const Real alpha,
 const MatrixBase<Real> &A,
@@ -734,7 +727,7 @@ void MatrixBase<Real>::LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U_in,
 }
 #endif
+*/
 // Copy constructor. Copies data to newly allocated memory.
 template<typename Real>
 Matrix<Real>::Matrix (const MatrixBase<Real> & M,
@@ -898,6 +891,7 @@ template
 void MatrixBase<double>::CopyFromMat(const MatrixBase<double> & M,
 MatrixTransposeType Trans);
+/*
 // Specialize the template for CopyFromSp for float, float.
 template<>
 template<>
@@ -992,7 +986,7 @@ template
 void MatrixBase<double>::CopyFromTp(const TpMatrix<double> & M,
 MatrixTransposeType trans);
+*/
 template<typename Real>
 void MatrixBase<Real>::CopyRowsFromVec(const VectorBase<Real> &rv) {
 if (rv.Dim() == num_rows_*num_cols_) {
@@ -1076,7 +1070,6 @@ void MatrixBase<Real>::CopyColsFromVec(const VectorBase<Real> &rv) {
 }
 }
 template<typename Real>
 void MatrixBase<Real>::CopyRowFromVec(const VectorBase<Real> &rv, const MatrixIndexT row) {
 KALDI_ASSERT(rv.Dim() == num_cols_ &&
@@ -1088,7 +1081,7 @@ void MatrixBase<Real>::CopyRowFromVec(const VectorBase<Real> &rv, const MatrixIn
 std::memcpy(row_data, rv_data, num_cols_ * sizeof(Real));
 }
+/*
 template<typename Real>
 void MatrixBase<Real>::CopyDiagFromVec(const VectorBase<Real> &rv) {
 KALDI_ASSERT(rv.Dim() == std::min(num_cols_, num_rows_));
@@ -1096,7 +1089,7 @@ void MatrixBase<Real>::CopyDiagFromVec(const VectorBase<Real> &rv) {
 Real *my_data = this->Data();
 for (; rv_data != rv_end; rv_data++, my_data += (this->stride_+1))
 *my_data = *rv_data;
-}
+}*/
 template<typename Real>
 void MatrixBase<Real>::CopyColFromVec(const VectorBase<Real> &rv,
@@ -1135,7 +1128,7 @@ void Matrix<Real>::Destroy() {
 }
+/*
 template<typename Real>
 void MatrixBase<Real>::MulElements(const MatrixBase<Real> &a) {
 KALDI_ASSERT(a.NumRows() == num_rows_ && a.NumCols() == num_cols_);
@@ -1325,6 +1318,7 @@ void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) {
 }
 }
 }
+*/
 template<typename Real>
 void MatrixBase<Real>::SetZero() {
@@ -1344,6 +1338,7 @@ void MatrixBase<Real>::Set(Real value) {
 }
 }
+/*
 template<typename Real>
 void MatrixBase<Real>::SetUnit() {
 SetZero();
@@ -1374,6 +1369,7 @@ void MatrixBase<Real>::SetRandUniform() {
 }
 }
 }
+*/
 template<typename Real>
 void MatrixBase<Real>::Write(std::ostream &os, bool binary) const {
@@ -1420,23 +1416,11 @@ void MatrixBase<Real>::Write(std::ostream &os, bool binary) const {
 template<typename Real>
-void MatrixBase<Real>::Read(std::istream & is, bool binary, bool add) {
-if (add) {
-Matrix<Real> tmp(num_rows_, num_cols_);
-tmp.Read(is, binary, false); // read without adding.
-if (tmp.num_rows_ != this->num_rows_ || tmp.num_cols_ != this->num_cols_)
-KALDI_ERR << "MatrixBase::Read, size mismatch "
-<< this->num_rows_ << ", " << this->num_cols_
-<< " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_;
-this->AddMat(1.0, tmp);
-return;
-}
-// now assume add == false.
+void MatrixBase<Real>::Read(std::istream & is, bool binary) {
 // In order to avoid rewriting this, we just declare a Matrix and
 // use it to read the data, then copy.
 Matrix<Real> tmp;
-tmp.Read(is, binary, false);
+tmp.Read(is, binary);
 if (tmp.NumRows() != NumRows() || tmp.NumCols() != NumCols()) {
 KALDI_ERR << "MatrixBase<Real>::Read, size mismatch "
 << NumRows() << " x " << NumCols() << " versus "
@@ -1447,23 +1431,7 @@ void MatrixBase<Real>::Read(std::istream & is, bool binary, bool add) {
 template<typename Real>
-void Matrix<Real>::Read(std::istream & is, bool binary, bool add) {
-if (add) {
-Matrix<Real> tmp;
-tmp.Read(is, binary, false); // read without adding.
-if (this->num_rows_ == 0) this->Resize(tmp.num_rows_, tmp.num_cols_);
-else {
-if (this->num_rows_ != tmp.num_rows_ || this->num_cols_ != tmp.num_cols_) {
-if (tmp.num_rows_ == 0) return; // do nothing in this case.
-else KALDI_ERR << "Matrix::Read, size mismatch "
-<< this->num_rows_ << ", " << this->num_cols_
-<< " vs. " << tmp.num_rows_ << ", " << tmp.num_cols_;
-}
-}
-this->AddMat(1.0, tmp);
-return;
-}
+void Matrix<Real>::Read(std::istream & is, bool binary) {
 // now assume add == false.
 MatrixIndexT pos_at_start = is.tellg();
 std::ostringstream specific_error;
@@ -1472,10 +1440,10 @@
 int peekval = Peek(is, binary);
 if (peekval == 'C') {
 // This code enables us to read CompressedMatrix as a regular matrix.
-CompressedMatrix compressed_mat;
-compressed_mat.Read(is, binary); // at this point, add == false.
-this->Resize(compressed_mat.NumRows(), compressed_mat.NumCols());
-compressed_mat.CopyToMat(this);
+//CompressedMatrix compressed_mat;
+//compressed_mat.Read(is, binary); // at this point, add == false.
+//this->Resize(compressed_mat.NumRows(), compressed_mat.NumCols());
+//compressed_mat.CopyToMat(this);
 return;
 }
 const char *my_token = (sizeof(Real) == 4 ? "FM" : "DM");
@@ -1483,7 +1451,7 @@
 if (peekval == other_token_start) { // need to instantiate the other type to read it.
 typedef typename OtherReal<Real>::Real OtherType; // if Real == float, OtherType == double, and vice versa.
 Matrix<OtherType> other(this->num_rows_, this->num_cols_);
-other.Read(is, binary, false); // add is false at this point anyway.
+other.Read(is, binary); // add is false at this point anyway.
 this->Resize(other.NumRows(), other.NumCols());
 this->CopyFromMat(other);
 return;
@@ -1672,7 +1640,7 @@ SubMatrix<Real>::SubMatrix(Real *data,
 }
 }
+/*
 template<typename Real>
 void MatrixBase<Real>::Add(const Real alpha) {
 Real *data = data_;
@@ -1812,15 +1780,15 @@ void MatrixBase<Real>::DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
 for(int32 i = 0; i < NumRows(); i++)
 (*this)(i, i) *= 1.00001;
 }*/
-bool ans = JamaSvd(s, U, Vt);
-if (Vt != NULL) Vt->Transpose(); // possibly to do: change this and also the transpose inside the JamaSvd routine. note, Vt is square.
-if (!ans) {
-KALDI_ERR << "Error doing Svd"; // This one will be caught.
-}
-#endif
-if (prescale != 1.0) s->Scale(1.0/prescale);
-}
+// bool ans = JamaSvd(s, U, Vt);
+//if (Vt != NULL) Vt->Transpose(); // possibly to do: change this and also the transpose inside the JamaSvd routine. note, Vt is square.
+//if (!ans) {
+//KALDI_ERR << "Error doing Svd"; // This one will be caught.
+//}
+//#endif
+//if (prescale != 1.0) s->Scale(1.0/prescale);
+//}
+/*
 template<typename Real>
 void MatrixBase<Real>::Svd(VectorBase<Real> *s, MatrixBase<Real> *U, MatrixBase<Real> *Vt) const {
 try {
@@ -2052,17 +2020,18 @@ void MatrixBase<Real>::InvertDouble(Real *log_det, Real *det_sign,
 if (log_det) *log_det = log_det_tmp;
 if (det_sign) *det_sign = det_sign_tmp;
 }
+*/
-template<class Real>
-void MatrixBase<Real>::CopyFromMat(const CompressedMatrix &mat) {
-mat.CopyToMat(this);
-}
-template<class Real>
-Matrix<Real>::Matrix(const CompressedMatrix &M): MatrixBase<Real>() {
-Resize(M.NumRows(), M.NumCols(), kUndefined);
-M.CopyToMat(this);
-}
+//template<class Real>
+//void MatrixBase<Real>::CopyFromMat(const CompressedMatrix &mat) {
+//mat.CopyToMat(this);
+//}
+//template<class Real>
+//Matrix<Real>::Matrix(const CompressedMatrix &M): MatrixBase<Real>() {
+//Resize(M.NumRows(), M.NumCols(), kUndefined);
+//M.CopyToMat(this);
+//}
@@ -2074,7 +2043,7 @@ void MatrixBase<Real>::InvertElements() {
 }
 }
 }
+/*
 template<typename Real>
 void MatrixBase<Real>::Transpose() {
 KALDI_ASSERT(num_rows_ == num_cols_);
@@ -2250,7 +2219,7 @@ bool MatrixBase<Real>::Power(Real power) {
 (*this).AddMatMat(1.0, tmp, kNoTrans, P, kNoTrans, 0.0);
 return true;
 }
+*/
 template<typename Real>
 void Matrix<Real>::Swap(Matrix<Real> *other) {
 std::swap(this->data_, other->data_);
@@ -2258,7 +2227,7 @@ void Matrix<Real>::Swap(Matrix<Real> *other) {
 std::swap(this->num_rows_, other->num_rows_);
 std::swap(this->stride_, other->stride_);
 }
+/*
 // Repeating this comment that appeared in the header:
 // Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
 // P^{-1}. Be careful: the relationship of D to the eigenvalues we output is
@@ -2298,7 +2267,7 @@ void MatrixBase<Real>::Eig(MatrixBase<Real> *P,
 // INT_32 mVersion;
 // INT_32 mSampSize;
 // };
+/*
 template<typename Real>
 bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
 {
@@ -2821,7 +2790,7 @@ void MatrixBase<Real>::GroupMax(const MatrixBase<Real> &src) {
 }
 }
 }
+*/
 template<typename Real>
 void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
 const MatrixIndexT *indices) {
@@ -2847,7 +2816,7 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
 }
 }
+/*
 template<typename Real>
 void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
 const MatrixIndexT *indices) {
@@ -2871,8 +2840,9 @@ void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
 this_data[c] += src_data[*index_ptr];
 }
 }
-}
+}*/
+/*
 template<typename Real>
 void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
 const MatrixIndexT *indices) {
@@ -3022,9 +2992,9 @@ void MatrixBase<Real>::DiffTanh(const MatrixBase<Real> &value,
 value_data += value_stride;
 diff_data += diff_stride;
 }
-}
+}*/
+/*
 template<typename Real>
 template<typename OtherReal>
 void MatrixBase<Real>::AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v) {
@@ -3087,7 +3057,7 @@ template void MatrixBase<double>::AddVecToCols(const double alpha,
 const VectorBase<float> &v);
 template void MatrixBase<double>::AddVecToCols(const double alpha,
 const VectorBase<double> &v);
+*/
 //Explicit instantiation of the classes
 //Apparently, it seems to be necessary that the instantiation
 //happens at the end of the file. Otherwise, not all the member
......
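The most visible interface change in the matrix code is that `Read()` drops its trailing `add` parameter, along with the accumulate-on-read path and `CompressedMatrix` support. A round-trip sketch against the new two-argument signature:

```cpp
#include <sstream>
#include "matrix/kaldi-matrix.h"

void RoundTrip(const kaldi::Matrix<kaldi::BaseFloat> &m) {
    std::stringstream ss;
    m.Write(ss, /*binary=*/true);
    kaldi::Matrix<kaldi::BaseFloat> m2;
    m2.Read(ss, /*binary=*/true);  // previously Read(is, binary, add = false)
}
```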
...@@ -32,13 +32,6 @@ namespace kaldi { ...@@ -32,13 +32,6 @@ namespace kaldi {
/// @{ \addtogroup matrix_funcs_scalar /// @{ \addtogroup matrix_funcs_scalar
/// We need to declare this here as it will be a friend function.
/// tr(A B), or tr(A B^T).
template<typename Real>
Real TraceMatMat(const MatrixBase<Real> &A, const MatrixBase<Real> &B,
MatrixTransposeType trans = kNoTrans);
/// @}
/// \addtogroup matrix_group /// \addtogroup matrix_group
/// @{ /// @{
...@@ -50,15 +43,8 @@ class MatrixBase { ...@@ -50,15 +43,8 @@ class MatrixBase {
public: public:
// so this child can access protected members of other instances. // so this child can access protected members of other instances.
friend class Matrix<Real>; friend class Matrix<Real>;
friend class SubMatrix<Real>;
// friend declarations for CUDA matrices (see ../cudamatrix/) // friend declarations for CUDA matrices (see ../cudamatrix/)
friend class CuMatrixBase<Real>;
friend class CuMatrix<Real>;
friend class CuSubMatrix<Real>;
friend class CuPackedMatrix<Real>;
friend class PackedMatrix<Real>;
friend class SparseMatrix<Real>;
friend class SparseMatrix<float>;
friend class SparseMatrix<double>;
/// Returns number of rows (or zero for empty matrix). /// Returns number of rows (or zero for empty matrix).
inline MatrixIndexT NumRows() const { return num_rows_; } inline MatrixIndexT NumRows() const { return num_rows_; }
...@@ -127,14 +113,6 @@ class MatrixBase { ...@@ -127,14 +113,6 @@ class MatrixBase {
/// Sets all elements to a specific value. /// Sets all elements to a specific value.
void Set(Real); void Set(Real);
/// Sets to zero, except ones along diagonal [for non-square matrices too] /// Sets to zero, except ones along diagonal [for non-square matrices too]
void SetUnit();
/// Sets to random values of a normal distribution
void SetRandn();
/// Sets to numbers uniformly distributed on (0, 1)
void SetRandUniform();
/* Copying functions. These do not resize the matrix! */
/// Copy given matrix. (no resize is done). /// Copy given matrix. (no resize is done).
template<typename OtherReal> template<typename OtherReal>
...@@ -142,21 +120,17 @@ class MatrixBase { ...@@ -142,21 +120,17 @@ class MatrixBase {
MatrixTransposeType trans = kNoTrans); MatrixTransposeType trans = kNoTrans);
/// Copy from compressed matrix. /// Copy from compressed matrix.
void CopyFromMat(const CompressedMatrix &M); //void CopyFromMat(const CompressedMatrix &M);
/// Copy given spmatrix. (no resize is done).
template<typename OtherReal>
void CopyFromSp(const SpMatrix<OtherReal> &M);
/// Copy given tpmatrix. (no resize is done). /// Copy given tpmatrix. (no resize is done).
template<typename OtherReal> //template<typename OtherReal>
void CopyFromTp(const TpMatrix<OtherReal> &M, //void CopyFromTp(const TpMatrix<OtherReal> &M,
MatrixTransposeType trans = kNoTrans); //MatrixTransposeType trans = kNoTrans);
/// Copy from CUDA matrix. Implemented in ../cudamatrix/cu-matrix.h /// Copy from CUDA matrix. Implemented in ../cudamatrix/cu-matrix.h
template<typename OtherReal> //template<typename OtherReal>
void CopyFromMat(const CuMatrixBase<OtherReal> &M, //void CopyFromMat(const CuMatrixBase<OtherReal> &M,
MatrixTransposeType trans = kNoTrans); //MatrixTransposeType trans = kNoTrans);
/// This function has two modes of operation. If v.Dim() == NumRows() * /// This function has two modes of operation. If v.Dim() == NumRows() *
/// NumCols(), then treats the vector as a row-by-row concatenation of a /// NumCols(), then treats the vector as a row-by-row concatenation of a
...@@ -165,7 +139,7 @@ class MatrixBase { ...@@ -165,7 +139,7 @@ class MatrixBase {
void CopyRowsFromVec(const VectorBase<Real> &v); void CopyRowsFromVec(const VectorBase<Real> &v);
/// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc /// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc
void CopyRowsFromVec(const CuVectorBase<Real> &v); //void CopyRowsFromVec(const CuVectorBase<Real> &v);
template<typename OtherReal> template<typename OtherReal>
void CopyRowsFromVec(const VectorBase<OtherReal> &v); void CopyRowsFromVec(const VectorBase<OtherReal> &v);
...@@ -215,7 +189,7 @@ class MatrixBase { ...@@ -215,7 +189,7 @@ class MatrixBase {
return SubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols); return SubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
} }
/* Various special functions. */ /*
/// Returns sum of all elements in matrix. /// Returns sum of all elements in matrix.
Real Sum() const; Real Sum() const;
/// Returns trace of matrix. /// Returns trace of matrix.
...@@ -268,15 +242,16 @@ class MatrixBase { ...@@ -268,15 +242,16 @@ class MatrixBase {
/// Does inversion in double precision even if matrix was not double. /// Does inversion in double precision even if matrix was not double.
void InvertDouble(Real *LogDet = NULL, Real *det_sign = NULL, void InvertDouble(Real *LogDet = NULL, Real *det_sign = NULL,
bool inverse_needed = true); bool inverse_needed = true);
*/
/// Inverts all the elements of the matrix /// Inverts all the elements of the matrix
void InvertElements(); void InvertElements();
/*
/// Transpose the matrix. This one is only /// Transpose the matrix. This one is only
/// applicable to square matrices (the one in the /// applicable to square matrices (the one in the
/// Matrix child class works also for non-square. /// Matrix child class works also for non-square.
void Transpose(); void Transpose();
*/
/// Copies column r from column indices[r] of src. /// Copies column r from column indices[r] of src.
/// As a special case, if indexes[i] == -1, sets column i to zero. /// As a special case, if indexes[i] == -1, sets column i to zero.
/// all elements of "indices" must be in [-1, src.NumCols()-1], /// all elements of "indices" must be in [-1, src.NumCols()-1],
...@@ -296,8 +271,8 @@ class MatrixBase { ...@@ -296,8 +271,8 @@ class MatrixBase {
/// indices.size() must equal this->NumCols(), /// indices.size() must equal this->NumCols(),
/// all elements of "reorder" must be in [-1, src.NumCols()-1], /// all elements of "reorder" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this.NumRows() /// and src.NumRows() must equal this.NumRows()
void AddCols(const MatrixBase<Real> &src, //void AddCols(const MatrixBase<Real> &src,
const MatrixIndexT *indices); // const MatrixIndexT *indices);
/// Copies row r of this matrix from an array of floats at the location given /// Copies row r of this matrix from an array of floats at the location given
/// by src[r]. If any src[r] is NULL then this.Row(r) will be set to zero. /// by src[r]. If any src[r] is NULL then this.Row(r) will be set to zero.
...@@ -314,30 +289,30 @@ class MatrixBase { ...@@ -314,30 +289,30 @@ class MatrixBase {
/// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]). /// Does for each row r, this.Row(r) += alpha * src.row(indexes[r]).
/// If indexes[r] < 0, does not add anything. all elements of "indexes" must /// If indexes[r] < 0, does not add anything. all elements of "indexes" must
/// be in [-1, src.NumRows()-1], and src.NumCols() must equal this.NumCols(). /// be in [-1, src.NumRows()-1], and src.NumCols() must equal this.NumCols().
void AddRows(Real alpha, // void AddRows(Real alpha,
const MatrixBase<Real> &src, // const MatrixBase<Real> &src,
const MatrixIndexT *indexes); // const MatrixIndexT *indexes);
/// Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as the /// Does for each row r, this.Row(r) += alpha * src[r], treating src[r] as the
/// beginning of a region of memory representing a vector of floats, of the /// beginning of a region of memory representing a vector of floats, of the
/// same length as this.NumCols(). If src[r] is NULL, does not add anything. /// same length as this.NumCols(). If src[r] is NULL, does not add anything.
void AddRows(Real alpha, const Real *const *src); //void AddRows(Real alpha, const Real *const *src);
/// For each row r of this matrix, adds it (times alpha) to the array of /// For each row r of this matrix, adds it (times alpha) to the array of
/// floats at the location given by dst[r]. If dst[r] is NULL, does not do /// floats at the location given by dst[r]. If dst[r] is NULL, does not do
/// anything for that row. Requires that none of the memory regions pointed /// anything for that row. Requires that none of the memory regions pointed
/// to by the pointers in "dst" overlap (e.g. none of the pointers should be /// to by the pointers in "dst" overlap (e.g. none of the pointers should be
/// the same). /// the same).
void AddToRows(Real alpha, Real *const *dst) const; //void AddToRows(Real alpha, Real *const *dst) const;
/// For each row i of *this, adds this->Row(i) to /// For each row i of *this, adds this->Row(i) to
/// dst->Row(indexes(i)) if indexes(i) >= 0, else do nothing. /// dst->Row(indexes(i)) if indexes(i) >= 0, else do nothing.
/// Requires that all the indexes[i] that are >= 0 /// Requires that all the indexes[i] that are >= 0
/// be distinct, otherwise the behavior is undefined. /// be distinct, otherwise the behavior is undefined.
void AddToRows(Real alpha, //void AddToRows(Real alpha,
const MatrixIndexT *indexes, // const MatrixIndexT *indexes,
MatrixBase<Real> *dst) const; // MatrixBase<Real> *dst) const;
/*
inline void ApplyPow(Real power) { inline void ApplyPow(Real power) {
this -> Pow(*this, power); this -> Pow(*this, power);
} }
...@@ -374,7 +349,7 @@ class MatrixBase { ...@@ -374,7 +349,7 @@ class MatrixBase {
inline void ApplyLog() { inline void ApplyLog() {
this -> Log(*this); this -> Log(*this);
} }
*/
/// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D /// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
/// P^{-1}. Be careful: the relationship of D to the eigenvalues we output is /// P^{-1}. Be careful: the relationship of D to the eigenvalues we output is
/// slightly complicated, due to the need for P to be real. In the symmetric /// slightly complicated, due to the need for P to be real. In the symmetric
...@@ -389,9 +364,9 @@ class MatrixBase { ...@@ -389,9 +364,9 @@ class MatrixBase {
/// instead (*this) P = P D. /// instead (*this) P = P D.
/// ///
/// The non-member function CreateEigenvalueMatrix creates D from eigs_real and eigs_imag. /// The non-member function CreateEigenvalueMatrix creates D from eigs_real and eigs_imag.
void Eig(MatrixBase<Real> *P, //void Eig(MatrixBase<Real> *P,
VectorBase<Real> *eigs_real, // VectorBase<Real> *eigs_real,
VectorBase<Real> *eigs_imag) const; // VectorBase<Real> *eigs_imag) const;
/// The Power method attempts to take the matrix to a power using a method that /// The Power method attempts to take the matrix to a power using a method that
/// works in general for fractional and negative powers. The input matrix must /// works in general for fractional and negative powers. The input matrix must
...@@ -400,7 +375,7 @@ class MatrixBase { ...@@ -400,7 +375,7 @@ class MatrixBase {
/// return false and leave the matrix unchanged, if at entry the matrix had /// return false and leave the matrix unchanged, if at entry the matrix had
/// real negative eigenvalues (or if it had zero eigenvalues and the power was /// real negative eigenvalues (or if it had zero eigenvalues and the power was
/// negative). /// negative).
bool Power(Real pow); // bool Power(Real pow);
/** Singular value decomposition /** Singular value decomposition
Major limitations: Major limitations:
...@@ -413,31 +388,32 @@ class MatrixBase { ...@@ -413,31 +388,32 @@ class MatrixBase {
expect that S.Dim() == m, U is either NULL or m by n, expect that S.Dim() == m, U is either NULL or m by n,
and v is either NULL or n by n. and v is either NULL or n by n.
The singular values are not sorted (use SortSvd for that). */ The singular values are not sorted (use SortSvd for that). */
void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U, //void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real> *Vt); // Destroys calling matrix. // MatrixBase<Real> *Vt); // Destroys calling matrix.
/// Compute SVD (*this) = U diag(s) Vt. Note that the V in the call is already /// Compute SVD (*this) = U diag(s) Vt. Note that the V in the call is already
/// transposed; the normal formulation is U diag(s) V^T. /// transposed; the normal formulation is U diag(s) V^T.
/// Null pointers for U or V mean we don't want that output (this saves /// Null pointers for U or V mean we don't want that output (this saves
/// compute). The singular values are not sorted (use SortSvd for that). /// compute). The singular values are not sorted (use SortSvd for that).
void Svd(VectorBase<Real> *s, MatrixBase<Real> *U, //void Svd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real> *Vt) const; // MatrixBase<Real> *Vt) const;
/// Compute SVD but only retain the singular values. /// Compute SVD but only retain the singular values.
void Svd(VectorBase<Real> *s) const { Svd(s, NULL, NULL); } //void Svd(VectorBase<Real> *s) const { Svd(s, NULL, NULL); }
/// Returns smallest singular value. /// Returns smallest singular value.
Real MinSingularValue() const { //Real MinSingularValue() const {
Vector<Real> tmp(std::min(NumRows(), NumCols())); // Vector<Real> tmp(std::min(NumRows(), NumCols()));
Svd(&tmp); //Svd(&tmp);
return tmp.Min(); //return tmp.Min();
} //}
void TestUninitialized() const; // This function is designed so that if any element //void TestUninitialized() const; // This function is designed so that if any element
// if the matrix is uninitialized memory, valgrind will complain. // if the matrix is uninitialized memory, valgrind will complain.
/// Returns condition number by computing Svd. Works even if cols > rows. /// Returns condition number by computing Svd. Works even if cols > rows.
/// Returns infinity if all singular values are zero. /// Returns infinity if all singular values are zero.
/*
Real Cond() const; Real Cond() const;
/// Returns true if matrix is Symmetric. /// Returns true if matrix is Symmetric.
...@@ -559,7 +535,7 @@ class MatrixBase { ...@@ -559,7 +535,7 @@ class MatrixBase {
// element-by-element, set *this = diff * (1.0 - value^2). // element-by-element, set *this = diff * (1.0 - value^2).
void DiffTanh(const MatrixBase<Real> &value, void DiffTanh(const MatrixBase<Real> &value,
const MatrixBase<Real> &diff); const MatrixBase<Real> &diff);
*/
/** Uses Svd to compute the eigenvalue decomposition of a symmetric positive /** Uses Svd to compute the eigenvalue decomposition of a symmetric positive
* semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an * semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an
* orthogonal matrix so rP^{-1} = rP^T. Throws exception if input was not * orthogonal matrix so rP^{-1} = rP^T. Throws exception if input was not
...@@ -571,208 +547,15 @@ class MatrixBase { ...@@ -571,208 +547,15 @@ class MatrixBase {
* SpMatrix and use Eig() function there, which uses eigenvalue decomposition * SpMatrix and use Eig() function there, which uses eigenvalue decomposition
* directly rather than SVD. * directly rather than SVD.
*/ */
void SymPosSemiDefEig(VectorBase<Real> *s, MatrixBase<Real> *P,
Real check_thresh = 0.001);
friend Real kaldi::TraceMatMat<Real>(const MatrixBase<Real> &A,
const MatrixBase<Real> &B, MatrixTransposeType trans); // tr (A B)
// so it can get around const restrictions on the pointer to data_.
friend class SubMatrix<Real>;
/// Add a scalar to each element
void Add(const Real alpha);
/// Add a scalar to each diagonal element.
void AddToDiag(const Real alpha);
/// *this += alpha * a * b^T
template<typename OtherReal>
void AddVecVec(const Real alpha, const VectorBase<OtherReal> &a,
const VectorBase<OtherReal> &b);
/// [each row of *this] += alpha * v
template<typename OtherReal>
void AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v);
/// [each col of *this] += alpha * v
template<typename OtherReal>
void AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v);
/// *this += alpha * M [or M^T]
void AddMat(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transA = kNoTrans);
/// *this += alpha * A [or A^T].
void AddSmat(Real alpha, const SparseMatrix<Real> &A,
MatrixTransposeType trans = kNoTrans);
/// (*this) = alpha * op(A) * B + beta * (*this), where A is sparse.
/// Multiplication of sparse with dense matrix. See also AddMatSmat.
void AddSmatMat(Real alpha, const SparseMatrix<Real> &A,
MatrixTransposeType transA, const MatrixBase<Real> &B,
Real beta);
/// (*this) = alpha * A * op(B) + beta * (*this), where B is sparse
/// and op(B) is either B or trans(B) depending on the 'transB' argument.
/// This is multiplication of a dense by a sparse matrix. See also
/// AddSmatMat.
void AddMatSmat(Real alpha, const MatrixBase<Real> &A,
const SparseMatrix<Real> &B, MatrixTransposeType transB,
Real beta);
/// *this = beta * *this + alpha * M M^T, for symmetric matrices. It only
/// updates the lower triangle of *this. It will leave the matrix asymmetric;
/// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
void SymAddMat2(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transA, Real beta);
/// *this = beta * *this + alpha * diag(v) * M [or M^T].
/// The same as adding M but scaling each row M_i by v(i).
void AddDiagVecMat(const Real alpha, const VectorBase<Real> &v,
const MatrixBase<Real> &M, MatrixTransposeType transM,
Real beta = 1.0);
/// *this = beta * *this + alpha * M [or M^T] * diag(v)
/// The same as adding M but scaling each column M_j by v(j).
void AddMatDiagVec(const Real alpha,
const MatrixBase<Real> &M, MatrixTransposeType transM,
VectorBase<Real> &v,
Real beta = 1.0);
/// *this = beta * *this + alpha * A .* B (.* element by element multiplication)
void AddMatMatElements(const Real alpha,
const MatrixBase<Real>& A,
const MatrixBase<Real>& B,
const Real beta);
/// *this += alpha * S
template<typename OtherReal>
void AddSp(const Real alpha, const SpMatrix<OtherReal> &S);
void AddMatMat(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta);
/// *this = a * b / c (by element; when c = 0, *this = a)
void SetMatMatDivMat(const MatrixBase<Real>& A,
const MatrixBase<Real>& B,
const MatrixBase<Real>& C);
/// A version of AddMatMat specialized for when the second argument
/// contains a lot of zeroes.
void AddMatSmat(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta);
/// A version of AddMatMat specialized for when the first argument
/// contains a lot of zeroes.
void AddSmatMat(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta);
/// this <-- beta*this + alpha*A*B*C.
void AddMatMatMat(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const MatrixBase<Real>& C, MatrixTransposeType transC,
const Real beta);
/// this <-- beta*this + alpha*SpA*B.
// This and the routines below are really
// stubs that need to be made more efficient.
void AddSpMat(const Real alpha,
const SpMatrix<Real>& A,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta) {
Matrix<Real> M(A);
return AddMatMat(alpha, M, kNoTrans, B, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddTpMat(const Real alpha,
const TpMatrix<Real>& A, MatrixTransposeType transA,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const Real beta) {
Matrix<Real> M(A);
return AddMatMat(alpha, M, transA, B, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddMatSp(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const SpMatrix<Real>& B,
const Real beta) {
Matrix<Real> M(B);
return AddMatMat(alpha, A, transA, M, kNoTrans, beta);
}
/// this <-- beta*this + alpha*A*B*C.
void AddSpMatSp(const Real alpha,
const SpMatrix<Real> &A,
const MatrixBase<Real>& B, MatrixTransposeType transB,
const SpMatrix<Real>& C,
const Real beta) {
Matrix<Real> M(A), N(C);
return AddMatMatMat(alpha, M, kNoTrans, B, transB, N, kNoTrans, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddMatTp(const Real alpha,
const MatrixBase<Real>& A, MatrixTransposeType transA,
const TpMatrix<Real>& B, MatrixTransposeType transB,
const Real beta) {
Matrix<Real> M(B);
return AddMatMat(alpha, A, transA, M, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
void AddTpTp(const Real alpha,
const TpMatrix<Real>& A, MatrixTransposeType transA,
const TpMatrix<Real>& B, MatrixTransposeType transB,
const Real beta) {
Matrix<Real> M(A), N(B);
return AddMatMat(alpha, M, transA, N, transB, beta);
}
/// this <-- beta*this + alpha*A*B.
// This one is more efficient, not like the others above.
void AddSpSp(const Real alpha,
const SpMatrix<Real>& A, const SpMatrix<Real>& B,
const Real beta);
/// Copy lower triangle to upper triangle (symmetrize)
void CopyLowerToUpper();
/// Copy upper triangle to lower triangle (symmetrize)
void CopyUpperToLower();
/// This function orthogonalizes the rows of a matrix using the Gram-Schmidt
/// process. It is only applicable if NumRows() <= NumCols(). It will use
/// random number generation to fill in rows with something nonzero, in cases
/// where the original matrix was of deficient row rank.
void OrthogonalizeRows();
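// A classical Gram-Schmidt sketch of the process named above, written as a
// hypothetical standalone helper over a raw row-major array (assumes <cmath>;
// the real routine additionally re-randomizes rows that cancel to near zero
// when the input is row-rank-deficient):
//
//   void GramSchmidtRowsSketch(double *m, int rows, int cols) {
//     for (int i = 0; i < rows; i++) {
//       double *ri = m + i * cols;
//       for (int j = 0; j < i; j++) {         // remove projection onto row j,
//         const double *rj = m + j * cols;    // which is already unit-norm
//         double p = 0.0;
//         for (int k = 0; k < cols; k++) p += ri[k] * rj[k];
//         for (int k = 0; k < cols; k++) ri[k] -= p * rj[k];
//       }
//       double n2 = 0.0;                      // then normalize row i
//       for (int k = 0; k < cols; k++) n2 += ri[k] * ri[k];
//       double inv = 1.0 / std::sqrt(n2);     // assumes a nonzero row here
//       for (int k = 0; k < cols; k++) ri[k] *= inv;
//     }
//   }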
/// stream read. /// stream read.
/// Use instead of stream<<*this, if you want to add to existing contents. /// Use instead of stream<<*this, if you want to add to existing contents.
// Will throw exception on failure. // Will throw exception on failure.
void Read(std::istream & in, bool binary, bool add = false); void Read(std::istream & in, bool binary);
/// write to stream. /// write to stream.
void Write(std::ostream & out, bool binary) const; void Write(std::ostream & out, bool binary) const;
// Below are internal methods for Svd; the user does not need to know about them. // Below are internal methods for Svd; the user does not need to know about them.
#if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD)
// protected:
// Should be protected but used directly in testing routine.
// destroys *this!
void LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real> *Vt);
#else
protected:
// destroys *this!
bool JamaSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
MatrixBase<Real> *V);
#endif
protected: protected:
/// Initializer, callable only from child. /// Initializer, callable only from child.
...@@ -827,19 +610,9 @@ class Matrix : public MatrixBase<Real> { ...@@ -827,19 +610,9 @@ class Matrix : public MatrixBase<Real> {
MatrixStrideType stride_type = kDefaultStride): MatrixStrideType stride_type = kDefaultStride):
MatrixBase<Real>() { Resize(r, c, resize_type, stride_type); } MatrixBase<Real>() { Resize(r, c, resize_type, stride_type); }
/// Copy constructor from CUDA matrix
/// This is defined in ../cudamatrix/cu-matrix.h
template<typename OtherReal>
explicit Matrix(const CuMatrixBase<OtherReal> &cu,
MatrixTransposeType trans = kNoTrans);
/// Swaps the contents of *this and *other. Shallow swap. /// Swaps the contents of *this and *other. Shallow swap.
void Swap(Matrix<Real> *other); void Swap(Matrix<Real> *other);
/// Defined in ../cudamatrix/cu-matrix.cc
void Swap(CuMatrix<Real> *mat);
/// Constructor from any MatrixBase. Can also copy with transpose. /// Constructor from any MatrixBase. Can also copy with transpose.
/// Allocates new memory. /// Allocates new memory.
explicit Matrix(const MatrixBase<Real> & M, explicit Matrix(const MatrixBase<Real> & M,
...@@ -853,40 +626,29 @@ class Matrix : public MatrixBase<Real> { ...@@ -853,40 +626,29 @@ class Matrix : public MatrixBase<Real> {
explicit Matrix(const MatrixBase<OtherReal> & M, explicit Matrix(const MatrixBase<OtherReal> & M,
MatrixTransposeType trans = kNoTrans); MatrixTransposeType trans = kNoTrans);
/// Copy constructor taking SpMatrix...
/// It is symmetric, so no option for transpose, and NumRows == Cols
template<typename OtherReal>
explicit Matrix(const SpMatrix<OtherReal> & M) : MatrixBase<Real>() {
Resize(M.NumRows(), M.NumRows(), kUndefined);
this->CopyFromSp(M);
}
/// Constructor from CompressedMatrix
explicit Matrix(const CompressedMatrix &C);
/// Copy constructor taking TpMatrix... /// Copy constructor taking TpMatrix...
template <typename OtherReal> //template <typename OtherReal>
explicit Matrix(const TpMatrix<OtherReal> & M, //explicit Matrix(const TpMatrix<OtherReal> & M,
MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() { //MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() {
if (trans == kNoTrans) { //if (trans == kNoTrans) {
Resize(M.NumRows(), M.NumCols(), kUndefined); //Resize(M.NumRows(), M.NumCols(), kUndefined);
this->CopyFromTp(M); //this->CopyFromTp(M);
} else { //} else {
Resize(M.NumCols(), M.NumRows(), kUndefined); //Resize(M.NumCols(), M.NumRows(), kUndefined);
this->CopyFromTp(M, kTrans); //this->CopyFromTp(M, kTrans);
} //}
} //}
/// read from stream. /// read from stream.
// Unlike one in base, allows resizing. // Unlike one in base, allows resizing.
void Read(std::istream & in, bool binary, bool add = false); void Read(std::istream & in, bool binary);
/// Remove a specified row. /// Remove a specified row.
void RemoveRow(MatrixIndexT i); void RemoveRow(MatrixIndexT i);
/// Transpose the matrix. Works for non-square /// Transpose the matrix. Works for non-square
/// matrices as well as square ones. /// matrices as well as square ones.
void Transpose(); //void Transpose();
/// Destructor to free matrices. /// Destructor to free matrices.
~Matrix() { Destroy(); } ~Matrix() { Destroy(); }
...@@ -947,37 +709,6 @@ class Matrix : public MatrixBase<Real> { ...@@ -947,37 +709,6 @@ class Matrix : public MatrixBase<Real> {
/// A structure containing the HTK header. /// A structure containing the HTK header.
/// [TODO: change the style of the variables to Kaldi-compliant] /// [TODO: change the style of the variables to Kaldi-compliant]
struct HtkHeader {
/// Number of samples.
int32 mNSamples;
/// Sample period.
int32 mSamplePeriod;
/// Sample size
int16 mSampleSize;
/// Sample kind.
uint16 mSampleKind;
};
// Read HTK formatted features from file into matrix.
template<typename Real>
bool ReadHtk(std::istream &is, Matrix<Real> *M, HtkHeader *header_ptr);
// Write (HTK format) features to file from matrix.
template<typename Real>
bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr);
// Write (CMUSphinx format) features to file from matrix.
template<typename Real>
bool WriteSphinx(std::ostream &os, const MatrixBase<Real> &M);
/// @} end of "addtogroup matrix_funcs_io"
/**
Sub-matrix representation.
Can work with sub-parts of a matrix using this class.
Note that SubMatrix is not very const-correct-- it allows you to
change the contents of a const Matrix. Be careful!
*/
template<typename Real> template<typename Real>
class SubMatrix : public MatrixBase<Real> { class SubMatrix : public MatrixBase<Real> {
...@@ -1012,6 +743,7 @@ class SubMatrix : public MatrixBase<Real> { ...@@ -1012,6 +743,7 @@ class SubMatrix : public MatrixBase<Real> {
/// Disallow assignment. /// Disallow assignment.
SubMatrix<Real> &operator = (const SubMatrix<Real> &other); SubMatrix<Real> &operator = (const SubMatrix<Real> &other);
}; };
/// @} End of "addtogroup matrix_funcs_io". /// @} End of "addtogroup matrix_funcs_io".
/// \addtogroup matrix_funcs_scalar /// \addtogroup matrix_funcs_scalar
...@@ -1019,7 +751,7 @@ class SubMatrix : public MatrixBase<Real> { ...@@ -1019,7 +751,7 @@ class SubMatrix : public MatrixBase<Real> {
// Some declarations. These are traces of products. // Some declarations. These are traces of products.
/************************
template<typename Real> template<typename Real>
bool ApproxEqual(const MatrixBase<Real> &A, bool ApproxEqual(const MatrixBase<Real> &A,
const MatrixBase<Real> &B, Real tol = 0.01) { const MatrixBase<Real> &B, Real tol = 0.01) {
...@@ -1085,7 +817,7 @@ void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real> ...@@ -1085,7 +817,7 @@ void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real>
template<typename Real> template<typename Real>
bool AttemptComplexPower(Real *x_re, Real *x_im, Real power); bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);
**********/
/// @} end of addtogroup matrix_funcs_misc /// @} end of addtogroup matrix_funcs_misc
...@@ -1101,7 +833,6 @@ std::istream & operator >> (std::istream & In, MatrixBase<Real> & M); ...@@ -1101,7 +833,6 @@ std::istream & operator >> (std::istream & In, MatrixBase<Real> & M);
template<typename Real> template<typename Real>
std::istream & operator >> (std::istream & In, Matrix<Real> & M); std::istream & operator >> (std::istream & In, Matrix<Real> & M);
template<typename Real> template<typename Real>
bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) { bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols()); return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
......
...@@ -44,14 +44,14 @@ std::istream &operator >> (std::istream &is, Vector<Real> &rv) { ...@@ -44,14 +44,14 @@ std::istream &operator >> (std::istream &is, Vector<Real> &rv) {
return is; return is;
} }
template<> //template<>
template<> //template<>
void VectorBase<float>::AddVec(const float alpha, const VectorBase<float> &rv); //void VectorBase<float>::AddVec(const float alpha, const VectorBase<float> &rv);
template<> //template<>
template<> //template<>
void VectorBase<double>::AddVec<double>(const double alpha, //void VectorBase<double>::AddVec<double>(const double alpha,
const VectorBase<double> &rv); //const VectorBase<double> &rv);
} // namespace kaldi } // namespace kaldi
......
...@@ -25,144 +25,11 @@ ...@@ -25,144 +25,11 @@
#include <algorithm> #include <algorithm>
#include <string> #include <string>
#include "matrix/cblas-wrappers.h"
#include "matrix/kaldi-vector.h" #include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h" #include "matrix/kaldi-matrix.h"
#include "matrix/sp-matrix.h"
#include "matrix/sparse-matrix.h"
namespace kaldi { namespace kaldi {
template<typename Real>
Real VecVec(const VectorBase<Real> &a,
const VectorBase<Real> &b) {
MatrixIndexT adim = a.Dim();
KALDI_ASSERT(adim == b.Dim());
return cblas_Xdot(adim, a.Data(), 1, b.Data(), 1);
}
template
float VecVec<>(const VectorBase<float> &a,
const VectorBase<float> &b);
template
double VecVec<>(const VectorBase<double> &a,
const VectorBase<double> &b);
template<typename Real, typename OtherReal>
Real VecVec(const VectorBase<Real> &ra,
const VectorBase<OtherReal> &rb) {
MatrixIndexT adim = ra.Dim();
KALDI_ASSERT(adim == rb.Dim());
const Real *a_data = ra.Data();
const OtherReal *b_data = rb.Data();
Real sum = 0.0;
for (MatrixIndexT i = 0; i < adim; i++)
sum += a_data[i]*b_data[i];
return sum;
}
// instantiate the template above.
template
float VecVec<>(const VectorBase<float> &ra,
const VectorBase<double> &rb);
template
double VecVec<>(const VectorBase<double> &ra,
const VectorBase<float> &rb);
template<>
template<>
void VectorBase<float>::AddVec(const float alpha,
const VectorBase<float> &v) {
KALDI_ASSERT(dim_ == v.dim_);
KALDI_ASSERT(&v != this);
cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1);
}
template<>
template<>
void VectorBase<double>::AddVec(const double alpha,
const VectorBase<double> &v) {
KALDI_ASSERT(dim_ == v.dim_);
KALDI_ASSERT(&v != this);
cblas_Xaxpy(dim_, alpha, v.Data(), 1, data_, 1);
}
template<typename Real>
void VectorBase<Real>::AddMatVec(const Real alpha,
const MatrixBase<Real> &M,
MatrixTransposeType trans,
const VectorBase<Real> &v,
const Real beta) {
KALDI_ASSERT((trans == kNoTrans && M.NumCols() == v.dim_ && M.NumRows() == dim_)
|| (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
KALDI_ASSERT(&v != this);
cblas_Xgemv(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
v.Data(), 1, beta, data_, 1);
}
template<typename Real>
void VectorBase<Real>::AddMatSvec(const Real alpha,
const MatrixBase<Real> &M,
MatrixTransposeType trans,
const VectorBase<Real> &v,
const Real beta) {
KALDI_ASSERT((trans == kNoTrans && M.NumCols() == v.dim_ && M.NumRows() == dim_)
|| (trans == kTrans && M.NumRows() == v.dim_ && M.NumCols() == dim_));
KALDI_ASSERT(&v != this);
Xgemv_sparsevec(trans, M.NumRows(), M.NumCols(), alpha, M.Data(), M.Stride(),
v.Data(), 1, beta, data_, 1);
return;
/*
MatrixIndexT this_dim = this->dim_, v_dim = v.dim_,
M_stride = M.Stride();
Real *this_data = this->data_;
const Real *M_data = M.Data(), *v_data = v.data_;
if (beta != 1.0) this->Scale(beta);
if (trans == kNoTrans) {
for (MatrixIndexT i = 0; i < v_dim; i++) {
Real v_i = v_data[i];
if (v_i == 0.0) continue;
// Add to *this, the i'th column of the Matrix, times v_i.
cblas_Xaxpy(this_dim, v_i * alpha, M_data + i, M_stride, this_data, 1);
}
} else { // The transposed case is slightly more efficient, I guess.
for (MatrixIndexT i = 0; i < v_dim; i++) {
Real v_i = v.data_[i];
if (v_i == 0.0) continue;
// Add to *this, the i'th row of the Matrix, times v_i.
cblas_Xaxpy(this_dim, v_i * alpha,
M_data + (i * M_stride), 1, this_data, 1);
}
}*/
}
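// Design note on AddMatSvec: a dense GEMV costs O(rows * cols) no matter
// what v contains, while the sparse path (see the commented reference
// implementation above) issues one axpy per nonzero v(i) and skips zero
// entries entirely, so it wins whenever v is mostly zeros, e.g. one-hot
// indicator vectors or heavily pruned posteriors.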
template<typename Real>
void VectorBase<Real>::AddSpVec(const Real alpha,
const SpMatrix<Real> &M,
const VectorBase<Real> &v,
const Real beta) {
KALDI_ASSERT(M.NumRows() == v.dim_ && dim_ == v.dim_);
KALDI_ASSERT(&v != this);
cblas_Xspmv(alpha, M.NumRows(), M.Data(), v.Data(), 1, beta, data_, 1);
}
template<typename Real>
void VectorBase<Real>::MulTp(const TpMatrix<Real> &M,
const MatrixTransposeType trans) {
KALDI_ASSERT(M.NumRows() == dim_);
cblas_Xtpmv(trans,M.Data(),M.NumRows(),data_,1);
}
template<typename Real>
void VectorBase<Real>::Solve(const TpMatrix<Real> &M,
const MatrixTransposeType trans) {
KALDI_ASSERT(M.NumRows() == dim_);
cblas_Xtpsv(trans, M.Data(), M.NumRows(), data_, 1);
}
template<typename Real> template<typename Real>
inline void Vector<Real>::Init(const MatrixIndexT dim) { inline void Vector<Real>::Init(const MatrixIndexT dim) {
KALDI_ASSERT(dim >= 0); KALDI_ASSERT(dim >= 0);
...@@ -232,6 +99,7 @@ void VectorBase<Real>::CopyFromVec(const VectorBase<Real> &v) { ...@@ -232,6 +99,7 @@ void VectorBase<Real>::CopyFromVec(const VectorBase<Real> &v) {
} }
} }
/*
template<typename Real> template<typename Real>
template<typename OtherReal> template<typename OtherReal>
void VectorBase<Real>::CopyFromPacked(const PackedMatrix<OtherReal>& M) { void VectorBase<Real>::CopyFromPacked(const PackedMatrix<OtherReal>& M) {
...@@ -249,7 +117,7 @@ template<typename Real> ...@@ -249,7 +117,7 @@ template<typename Real>
void VectorBase<Real>::CopyFromPtr(const Real *data, MatrixIndexT sz) { void VectorBase<Real>::CopyFromPtr(const Real *data, MatrixIndexT sz) {
KALDI_ASSERT(dim_ == sz); KALDI_ASSERT(dim_ == sz);
std::memcpy(this->data_, data, Dim() * sizeof(Real)); std::memcpy(this->data_, data, Dim() * sizeof(Real));
} }*/
template<typename Real> template<typename Real>
template<typename OtherReal> template<typename OtherReal>
...@@ -297,6 +165,7 @@ bool VectorBase<Real>::IsZero(Real cutoff) const { ...@@ -297,6 +165,7 @@ bool VectorBase<Real>::IsZero(Real cutoff) const {
return (abs_max <= cutoff); return (abs_max <= cutoff);
} }
/*
template<typename Real> template<typename Real>
void VectorBase<Real>::SetRandn() { void VectorBase<Real>::SetRandn() {
kaldi::RandomState rstate; kaldi::RandomState rstate;
...@@ -330,7 +199,7 @@ MatrixIndexT VectorBase<Real>::RandCategorical() const { ...@@ -330,7 +199,7 @@ MatrixIndexT VectorBase<Real>::RandCategorical() const {
} }
return dim_ - 1; // Should only happen if RandUniform() return dim_ - 1; // Should only happen if RandUniform()
// returns exactly 1, or due to roundoff. // returns exactly 1, or due to roundoff.
} }*/
template<typename Real> template<typename Real>
void VectorBase<Real>::Set(Real f) { void VectorBase<Real>::Set(Real f) {
...@@ -426,6 +295,7 @@ void VectorBase<float>::CopyRowFromMat(const MatrixBase<double> &mat, MatrixInde ...@@ -426,6 +295,7 @@ void VectorBase<float>::CopyRowFromMat(const MatrixBase<double> &mat, MatrixInde
template template
void VectorBase<double>::CopyRowFromMat(const MatrixBase<float> &mat, MatrixIndexT row); void VectorBase<double>::CopyRowFromMat(const MatrixBase<float> &mat, MatrixIndexT row);
/*
template<typename Real> template<typename Real>
template<typename OtherReal> template<typename OtherReal>
void VectorBase<Real>::CopyRowFromSp(const SpMatrix<OtherReal> &sp, MatrixIndexT row) { void VectorBase<Real>::CopyRowFromSp(const SpMatrix<OtherReal> &sp, MatrixIndexT row) {
...@@ -451,28 +321,6 @@ void VectorBase<float>::CopyRowFromSp(const SpMatrix<float> &mat, MatrixIndexT r ...@@ -451,28 +321,6 @@ void VectorBase<float>::CopyRowFromSp(const SpMatrix<float> &mat, MatrixIndexT r
template template
void VectorBase<double>::CopyRowFromSp(const SpMatrix<double> &mat, MatrixIndexT row); void VectorBase<double>::CopyRowFromSp(const SpMatrix<double> &mat, MatrixIndexT row);
#ifdef HAVE_MKL
template<>
void VectorBase<float>::Pow(const VectorBase<float> &v, float power) {
vsPowx(dim_, data_, power, v.data_);
}
template<>
void VectorBase<double>::Pow(const VectorBase<double> &v, double power) {
vdPowx(dim_, data_, power, v.data_);
}
#else
// takes elements to a power. Does not check output.
template<typename Real>
void VectorBase<Real>::Pow(const VectorBase<Real> &v, Real power) {
KALDI_ASSERT(dim_ == v.dim_);
for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] = pow(v.data_[i], power);
}
}
#endif
// takes absolute value of the elements to a power. // takes absolute value of the elements to a power.
// Throws an exception if it cannot (but only for power != 1 and power != 2). // Throws an exception if it cannot (but only for power != 1 and power != 2).
template<typename Real> template<typename Real>
...@@ -648,7 +496,7 @@ Real VectorBase<Real>::Min(MatrixIndexT *index_out) const { ...@@ -648,7 +496,7 @@ Real VectorBase<Real>::Min(MatrixIndexT *index_out) const {
if (data[i] < ans) { ans = data[i]; index = i; } if (data[i] < ans) { ans = data[i]; index = i; }
*index_out = index; *index_out = index;
return ans; return ans;
} }*/
template<typename Real> template<typename Real>
...@@ -670,434 +518,424 @@ void VectorBase<double>::CopyColFromMat(const MatrixBase<float> &mat, MatrixInde ...@@ -670,434 +518,424 @@ void VectorBase<double>::CopyColFromMat(const MatrixBase<float> &mat, MatrixInde
template template
void VectorBase<double>::CopyColFromMat(const MatrixBase<double> &mat, MatrixIndexT col); void VectorBase<double>::CopyColFromMat(const MatrixBase<double> &mat, MatrixIndexT col);
template<typename Real> //template<typename Real>
void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) { //void VectorBase<Real>::CopyDiagFromMat(const MatrixBase<Real> &M) {
KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols())); //KALDI_ASSERT(dim_ == std::min(M.NumRows(), M.NumCols()));
cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1); //cblas_Xcopy(dim_, M.Data(), M.Stride() + 1, data_, 1);
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::CopyDiagFromPacked(const PackedMatrix<Real> &M) { //void VectorBase<Real>::CopyDiagFromPacked(const PackedMatrix<Real> &M) {
KALDI_ASSERT(dim_ == M.NumCols()); //KALDI_ASSERT(dim_ == M.NumCols());
for (MatrixIndexT i = 0; i < dim_; i++) //for (MatrixIndexT i = 0; i < dim_; i++)
data_[i] = M(i, i); //data_[i] = M(i, i);
// could make this more efficient. //// could make this more efficient.
} //}
template<typename Real> //template<typename Real>
Real VectorBase<Real>::Sum() const { //Real VectorBase<Real>::Sum() const {
// Do a dot-product with a size-1 array with a stride of 0 to //// Do a dot-product with a size-1 array with a stride of 0 to
// implement sum. This allows us to access SIMD operations in a //// implement sum. This allows us to access SIMD operations in a
// cross-platform way via your BLAS library. //// cross-platform way via your BLAS library.
Real one(1); //Real one(1);
return cblas_Xdot(dim_, data_, 1, &one, 0); //return cblas_Xdot(dim_, data_, 1, &one, 0);
} //}
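// A plain-C++ sketch of the stride-0 trick used above (hypothetical helper,
// not part of the library): passing increment 0 for the second operand makes
// the BLAS dot product re-read the single value `one` for every element, so
// cblas_sdot(n, x, 1, &one, 0) degenerates into a SIMD-accelerated sum of x.
// Scalar equivalent:
static float SumViaDotSketch(const float *x, int n) {
  const float one = 1.0f;
  float sum = 0.0f;
  for (int i = 0; i < n; i++) sum += x[i] * one;  // second stride is 0
  return sum;
}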
template<typename Real> //template<typename Real>
Real VectorBase<Real>::SumLog() const { //Real VectorBase<Real>::SumLog() const {
double sum_log = 0.0; //double sum_log = 0.0;
double prod = 1.0; //double prod = 1.0;
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
prod *= data_[i]; //prod *= data_[i];
// Possible future work (arnab): change these magic values to pre-defined //// Possible future work (arnab): change these magic values to pre-defined
// constants //// constants
if (prod < 1.0e-10 || prod > 1.0e+10) { //if (prod < 1.0e-10 || prod > 1.0e+10) {
sum_log += Log(prod); //sum_log += Log(prod);
prod = 1.0; //prod = 1.0;
} //}
} //}
if (prod != 1.0) sum_log += Log(prod); //if (prod != 1.0) sum_log += Log(prod);
return sum_log; //return sum_log;
} //}
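// Why SumLog() flushes the running product at 1e-10 / 1e+10: a raw product
// of many small (or large) values hits zero or infinity in double precision
// long before the sum of their logs loses accuracy; a few hundred factors of
// 0.1 already drive a raw product to zero, while the flushed version keeps
// each partial product inside [1e-10, 1e+10] and accumulates the rest in
// sum_log.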
template<typename Real> //template<typename Real>
void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) { //void VectorBase<Real>::AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
KALDI_ASSERT(dim_ == M.NumCols()); //KALDI_ASSERT(dim_ == M.NumCols());
MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_; //MatrixIndexT num_rows = M.NumRows(), stride = M.Stride(), dim = dim_;
Real *data = data_; //Real *data = data_;
// implement the function according to a dimension cutoff for computation efficiency //// implement the function according to a dimension cutoff for computation efficiency
if (num_rows <= 64) { //if (num_rows <= 64) {
cblas_Xscal(dim, beta, data, 1); //cblas_Xscal(dim, beta, data, 1);
const Real *m_data = M.Data(); //const Real *m_data = M.Data();
for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride) //for (MatrixIndexT i = 0; i < num_rows; i++, m_data += stride)
cblas_Xaxpy(dim, alpha, m_data, 1, data, 1); //cblas_Xaxpy(dim, alpha, m_data, 1, data, 1);
} else { //} else {
Vector<Real> ones(M.NumRows()); //Vector<Real> ones(M.NumRows());
ones.Set(1.0); //ones.Set(1.0);
this->AddMatVec(alpha, M, kTrans, ones, beta); //this->AddMatVec(alpha, M, kTrans, ones, beta);
} //}
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) { //void VectorBase<Real>::AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta) {
KALDI_ASSERT(dim_ == M.NumRows()); //KALDI_ASSERT(dim_ == M.NumRows());
MatrixIndexT num_cols = M.NumCols(); //MatrixIndexT num_cols = M.NumCols();
// implement the function according to a dimension cutoff for computation efficiency //// implement the function according to a dimension cutoff for computation efficiency
if (num_cols <= 64) { //if (num_cols <= 64) {
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
double sum = 0.0; //double sum = 0.0;
const Real *src = M.RowData(i); //const Real *src = M.RowData(i);
for (MatrixIndexT j = 0; j < num_cols; j++) //for (MatrixIndexT j = 0; j < num_cols; j++)
sum += src[j]; //sum += src[j];
data_[i] = alpha * sum + beta * data_[i]; //data_[i] = alpha * sum + beta * data_[i];
} //}
} else { //} else {
Vector<Real> ones(M.NumCols()); //Vector<Real> ones(M.NumCols());
ones.Set(1.0); //ones.Set(1.0);
this->AddMatVec(alpha, M, kNoTrans, ones, beta); //this->AddMatVec(alpha, M, kNoTrans, ones, beta);
} //}
} //}
template<typename Real> //template<typename Real>
Real VectorBase<Real>::LogSumExp(Real prune) const { //Real VectorBase<Real>::LogSumExp(Real prune) const {
Real sum; //Real sum;
if (sizeof(sum) == 8) sum = kLogZeroDouble; //if (sizeof(sum) == 8) sum = kLogZeroDouble;
else sum = kLogZeroFloat; //else sum = kLogZeroFloat;
Real max_elem = Max(), cutoff; //Real max_elem = Max(), cutoff;
if (sizeof(Real) == 4) cutoff = max_elem + kMinLogDiffFloat; //if (sizeof(Real) == 4) cutoff = max_elem + kMinLogDiffFloat;
else cutoff = max_elem + kMinLogDiffDouble; //else cutoff = max_elem + kMinLogDiffDouble;
if (prune > 0.0 && max_elem - prune > cutoff) // explicit pruning... //if (prune > 0.0 && max_elem - prune > cutoff) // explicit pruning...
cutoff = max_elem - prune; //cutoff = max_elem - prune;
double sum_relto_max_elem = 0.0; //double sum_relto_max_elem = 0.0;
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
BaseFloat f = data_[i]; //BaseFloat f = data_[i];
if (f >= cutoff) //if (f >= cutoff)
sum_relto_max_elem += Exp(f - max_elem); //sum_relto_max_elem += Exp(f - max_elem);
} //}
return max_elem + Log(sum_relto_max_elem); //return max_elem + Log(sum_relto_max_elem);
} //}
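// A standalone sketch of the max-shift used above (hypothetical helper,
// assumes <cmath> and n >= 1). Subtracting the max keeps every argument to
// exp() at or below zero, so nothing overflows and the largest term
// contributes exactly 1.0; the real routine can additionally drop terms more
// than `prune` below the max:
static double LogSumExpSketch(const double *x, int n) {
  double max = x[0];
  for (int i = 1; i < n; i++)
    if (x[i] > max) max = x[i];
  double sum = 0.0;
  for (int i = 0; i < n; i++) sum += std::exp(x[i] - max);  // each term <= 1
  return max + std::log(sum);
}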
template<typename Real> //template<typename Real>
void VectorBase<Real>::InvertElements() { //void VectorBase<Real>::InvertElements() {
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] = static_cast<Real>(1 / data_[i]); //data_[i] = static_cast<Real>(1 / data_[i]);
} //}
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::ApplyLog() { //void VectorBase<Real>::ApplyLog() {
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
if (data_[i] < 0.0) //if (data_[i] < 0.0)
KALDI_ERR << "Trying to take log of a negative number."; //KALDI_ERR << "Trying to take log of a negative number.";
data_[i] = Log(data_[i]); //data_[i] = Log(data_[i]);
} //}
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::ApplyLogAndCopy(const VectorBase<Real> &v) { //void VectorBase<Real>::ApplyLogAndCopy(const VectorBase<Real> &v) {
KALDI_ASSERT(dim_ == v.Dim()); //KALDI_ASSERT(dim_ == v.Dim());
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] = Log(v(i)); //data_[i] = Log(v(i));
} //}
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::ApplyExp() { //void VectorBase<Real>::ApplyExp() {
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] = Exp(data_[i]); //data_[i] = Exp(data_[i]);
} //}
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::ApplyAbs() { //void VectorBase<Real>::ApplyAbs() {
for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = std::abs(data_[i]); } //for (MatrixIndexT i = 0; i < dim_; i++) { data_[i] = std::abs(data_[i]); }
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::Floor(const VectorBase<Real> &v, Real floor_val, MatrixIndexT *floored_count) { //void VectorBase<Real>::Floor(const VectorBase<Real> &v, Real floor_val, MatrixIndexT *floored_count) {
KALDI_ASSERT(dim_ == v.dim_); //KALDI_ASSERT(dim_ == v.dim_);
if (floored_count == nullptr) { //if (floored_count == nullptr) {
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] = std::max(v.data_[i], floor_val); //data_[i] = std::max(v.data_[i], floor_val);
} //}
} else { //} else {
MatrixIndexT num_floored = 0; //MatrixIndexT num_floored = 0;
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
if (v.data_[i] < floor_val) { //if (v.data_[i] < floor_val) {
data_[i] = floor_val; //data_[i] = floor_val;
num_floored++; //num_floored++;
} else { //} else {
data_[i] = v.data_[i]; //data_[i] = v.data_[i];
} //}
} //}
*floored_count = num_floored; //*floored_count = num_floored;
} //}
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::Ceiling(const VectorBase<Real> &v, Real ceil_val, MatrixIndexT *ceiled_count) { //void VectorBase<Real>::Ceiling(const VectorBase<Real> &v, Real ceil_val, MatrixIndexT *ceiled_count) {
KALDI_ASSERT(dim_ == v.dim_); //KALDI_ASSERT(dim_ == v.dim_);
if (ceiled_count == nullptr) { //if (ceiled_count == nullptr) {
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] = std::min(v.data_[i], ceil_val); //data_[i] = std::min(v.data_[i], ceil_val);
} //}
} else { //} else {
MatrixIndexT num_changed = 0; //MatrixIndexT num_changed = 0;
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
if (v.data_[i] > ceil_val) { //if (v.data_[i] > ceil_val) {
data_[i] = ceil_val; //data_[i] = ceil_val;
num_changed++; //num_changed++;
} else { //} else {
data_[i] = v.data_[i]; //data_[i] = v.data_[i];
} //}
} //}
*ceiled_count = num_changed; //*ceiled_count = num_changed;
} //}
} //}
template<typename Real> //template<typename Real>
MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) { //MatrixIndexT VectorBase<Real>::ApplyFloor(const VectorBase<Real> &floor_vec) {
KALDI_ASSERT(floor_vec.Dim() == dim_); //KALDI_ASSERT(floor_vec.Dim() == dim_);
MatrixIndexT num_floored = 0; //MatrixIndexT num_floored = 0;
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
if (data_[i] < floor_vec(i)) { //if (data_[i] < floor_vec(i)) {
data_[i] = floor_vec(i); //data_[i] = floor_vec(i);
num_floored++; //num_floored++;
} //}
} //}
return num_floored; //return num_floored;
} //}
template<typename Real> //template<typename Real>
Real VectorBase<Real>::ApplySoftMax() { //Real VectorBase<Real>::ApplySoftMax() {
Real max = this->Max(), sum = 0.0; //Real max = this->Max(), sum = 0.0;
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
sum += (data_[i] = Exp(data_[i] - max)); //sum += (data_[i] = Exp(data_[i] - max));
} //}
this->Scale(1.0 / sum); //this->Scale(1.0 / sum);
return max + Log(sum); //return max + Log(sum);
} //}
template<typename Real> //template<typename Real>
Real VectorBase<Real>::ApplyLogSoftMax() { //Real VectorBase<Real>::ApplyLogSoftMax() {
Real max = this->Max(), sum = 0.0; //Real max = this->Max(), sum = 0.0;
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
sum += Exp((data_[i] -= max)); //sum += Exp((data_[i] -= max));
} //}
sum = Log(sum); //sum = Log(sum);
this->Add(-1.0 * sum); //this->Add(-1.0 * sum);
return max + sum; //return max + sum;
} //}
#ifdef HAVE_MKL //#ifdef HAVE_MKL
template<> //template<>
void VectorBase<float>::Tanh(const VectorBase<float> &src) { //void VectorBase<float>::Tanh(const VectorBase<float> &src) {
KALDI_ASSERT(dim_ == src.dim_); //KALDI_ASSERT(dim_ == src.dim_);
vsTanh(dim_, src.data_, data_); //vsTanh(dim_, src.data_, data_);
} //}
template<> //template<>
void VectorBase<double>::Tanh(const VectorBase<double> &src) { //void VectorBase<double>::Tanh(const VectorBase<double> &src) {
KALDI_ASSERT(dim_ == src.dim_); //KALDI_ASSERT(dim_ == src.dim_);
vdTanh(dim_, src.data_, data_); //vdTanh(dim_, src.data_, data_);
} //}
#else //#else
template<typename Real> //template<typename Real>
void VectorBase<Real>::Tanh(const VectorBase<Real> &src) { //void VectorBase<Real>::Tanh(const VectorBase<Real> &src) {
KALDI_ASSERT(dim_ == src.dim_); //KALDI_ASSERT(dim_ == src.dim_);
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
Real x = src.data_[i]; //Real x = src.data_[i];
if (x > 0.0) { //if (x > 0.0) {
Real inv_expx = Exp(-x); //Real inv_expx = Exp(-x);
x = -1.0 + 2.0 / (1.0 + inv_expx * inv_expx); //x = -1.0 + 2.0 / (1.0 + inv_expx * inv_expx);
} else { //} else {
Real expx = Exp(x); //Real expx = Exp(x);
x = 1.0 - 2.0 / (1.0 + expx * expx); //x = 1.0 - 2.0 / (1.0 + expx * expx);
} //}
data_[i] = x; //data_[i] = x;
} //}
} //}
#endif //#endif
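// A standalone sketch of the overflow-safe tanh above (hypothetical helper,
// assumes <cmath>). Both branches feed exp() a non-positive argument only,
// via the identities
//   tanh(x) = -1 + 2 / (1 + e^{-2x})   for x > 0
//   tanh(x) =  1 - 2 / (1 + e^{+2x})   for x <= 0
static double TanhSketch(double x) {
  if (x > 0.0) {
    double e = std::exp(-x);            // e * e == exp(-2x), cannot overflow
    return -1.0 + 2.0 / (1.0 + e * e);
  } else {
    double e = std::exp(x);             // x <= 0, so exp(x) <= 1
    return 1.0 - 2.0 / (1.0 + e * e);
  }
}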
#ifdef HAVE_MKL //#ifdef HAVE_MKL
// Implementing sigmoid based on tanh. //// Implementing sigmoid based on tanh.
template<> //template<>
void VectorBase<float>::Sigmoid(const VectorBase<float> &src) { //void VectorBase<float>::Sigmoid(const VectorBase<float> &src) {
KALDI_ASSERT(dim_ == src.dim_); //KALDI_ASSERT(dim_ == src.dim_);
this->CopyFromVec(src); //this->CopyFromVec(src);
this->Scale(0.5); //this->Scale(0.5);
vsTanh(dim_, data_, data_); //vsTanh(dim_, data_, data_);
this->Add(1.0); //this->Add(1.0);
this->Scale(0.5); //this->Scale(0.5);
} //}
template<> //template<>
void VectorBase<double>::Sigmoid(const VectorBase<double> &src) { //void VectorBase<double>::Sigmoid(const VectorBase<double> &src) {
KALDI_ASSERT(dim_ == src.dim_); //KALDI_ASSERT(dim_ == src.dim_);
this->CopyFromVec(src); //this->CopyFromVec(src);
this->Scale(0.5); //this->Scale(0.5);
vdTanh(dim_, data_, data_); //vdTanh(dim_, data_, data_);
this->Add(1.0); //this->Add(1.0);
this->Scale(0.5); //this->Scale(0.5);
} //}
#else //#else
template<typename Real> //template<typename Real>
void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) { //void VectorBase<Real>::Sigmoid(const VectorBase<Real> &src) {
KALDI_ASSERT(dim_ == src.dim_); //KALDI_ASSERT(dim_ == src.dim_);
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
Real x = src.data_[i]; //Real x = src.data_[i];
// We aim to avoid floating-point overflow here. //// We aim to avoid floating-point overflow here.
if (x > 0.0) { //if (x > 0.0) {
x = 1.0 / (1.0 + Exp(-x)); //x = 1.0 / (1.0 + Exp(-x));
} else { //} else {
Real ex = Exp(x); //Real ex = Exp(x);
x = ex / (ex + 1.0); //x = ex / (ex + 1.0);
} //}
data_[i] = x; //data_[i] = x;
} //}
} //}
#endif //#endif
template<typename Real> //template<typename Real>
void VectorBase<Real>::Add(Real c) { //void VectorBase<Real>::Add(Real c) {
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] += c; //data_[i] += c;
} //}
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::Scale(Real alpha) { //void VectorBase<Real>::Scale(Real alpha) {
cblas_Xscal(dim_, alpha, data_, 1); //cblas_Xscal(dim_, alpha, data_, 1);
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::MulElements(const VectorBase<Real> &v) { //void VectorBase<Real>::MulElements(const VectorBase<Real> &v) {
KALDI_ASSERT(dim_ == v.dim_); //KALDI_ASSERT(dim_ == v.dim_);
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] *= v.data_[i]; //data_[i] *= v.data_[i];
} //}
} //}
template<typename Real> // Set each element to y = (x == orig ? changed : x). //template<typename Real> // Set each element to y = (x == orig ? changed : x).
void VectorBase<Real>::ReplaceValue(Real orig, Real changed) { //void VectorBase<Real>::ReplaceValue(Real orig, Real changed) {
Real *data = data_; //Real *data = data_;
for (MatrixIndexT i = 0; i < dim_; i++) //for (MatrixIndexT i = 0; i < dim_; i++)
if (data[i] == orig) data[i] = changed; //if (data[i] == orig) data[i] = changed;
} //}
template<typename Real> //template<typename Real>
template<typename OtherReal> //template<typename OtherReal>
void VectorBase<Real>::MulElements(const VectorBase<OtherReal> &v) { //void VectorBase<Real>::MulElements(const VectorBase<OtherReal> &v) {
KALDI_ASSERT(dim_ == v.Dim()); //KALDI_ASSERT(dim_ == v.Dim());
const OtherReal *other_ptr = v.Data(); //const OtherReal *other_ptr = v.Data();
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] *= other_ptr[i]; //data_[i] *= other_ptr[i];
} //}
} //}
// instantiate template. //// instantiate template.
template //template
void VectorBase<float>::MulElements(const VectorBase<double> &v); //void VectorBase<float>::MulElements(const VectorBase<double> &v);
template //template
void VectorBase<double>::MulElements(const VectorBase<float> &v); //void VectorBase<double>::MulElements(const VectorBase<float> &v);
template<typename Real> //template<typename Real>
void VectorBase<Real>::AddVecVec(Real alpha, const VectorBase<Real> &v, //void VectorBase<Real>::AddVecVec(Real alpha, const VectorBase<Real> &v,
const VectorBase<Real> &r, Real beta) { //const VectorBase<Real> &r, Real beta) {
KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_); //KALDI_ASSERT(v.data_ != this->data_ && r.data_ != this->data_);
// We pretend that v is a band-diagonal matrix. //// We pretend that v is a band-diagonal matrix.
KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_); //KALDI_ASSERT(dim_ == v.dim_ && dim_ == r.dim_);
cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1, //cblas_Xgbmv(kNoTrans, dim_, dim_, 0, 0, alpha, v.data_, 1,
r.data_, 1, beta, this->data_, 1); //r.data_, 1, beta, this->data_, 1);
} //}
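// Design note: cblas_?gbmv with zero sub- and super-diagonal bandwidth (the
// two 0 arguments above) treats v as the band storage of the n x n diagonal
// matrix diag(v), so the single BLAS call computes
//   this = beta * this + alpha * diag(v) * r,
// which is exactly the element-wise product v .* r being added here.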
template<typename Real> //template<typename Real>
void VectorBase<Real>::DivElements(const VectorBase<Real> &v) { //void VectorBase<Real>::DivElements(const VectorBase<Real> &v) {
KALDI_ASSERT(dim_ == v.dim_); //KALDI_ASSERT(dim_ == v.dim_);
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] /= v.data_[i]; //data_[i] /= v.data_[i];
} //}
} //}
template<typename Real> //template<typename Real>
template<typename OtherReal> //template<typename OtherReal>
void VectorBase<Real>::DivElements(const VectorBase<OtherReal> &v) { //void VectorBase<Real>::DivElements(const VectorBase<OtherReal> &v) {
KALDI_ASSERT(dim_ == v.Dim()); //KALDI_ASSERT(dim_ == v.Dim());
const OtherReal *other_ptr = v.Data(); //const OtherReal *other_ptr = v.Data();
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] /= other_ptr[i]; //data_[i] /= other_ptr[i];
} //}
} //}
// instantiate template. //// instantiate template.
template //template
void VectorBase<float>::DivElements(const VectorBase<double> &v); //void VectorBase<float>::DivElements(const VectorBase<double> &v);
template //template
void VectorBase<double>::DivElements(const VectorBase<float> &v); //void VectorBase<double>::DivElements(const VectorBase<float> &v);
template<typename Real> //template<typename Real>
void VectorBase<Real>::AddVecDivVec(Real alpha, const VectorBase<Real> &v, //void VectorBase<Real>::AddVecDivVec(Real alpha, const VectorBase<Real> &v,
const VectorBase<Real> &rr, Real beta) { //const VectorBase<Real> &rr, Real beta) {
KALDI_ASSERT((dim_ == v.dim_ && dim_ == rr.dim_)); //KALDI_ASSERT((dim_ == v.dim_ && dim_ == rr.dim_));
for (MatrixIndexT i = 0; i < dim_; i++) { //for (MatrixIndexT i = 0; i < dim_; i++) {
data_[i] = alpha * v.data_[i]/rr.data_[i] + beta * data_[i] ; //data_[i] = alpha * v.data_[i]/rr.data_[i] + beta * data_[i] ;
} //}
} //}
template<typename Real> //template<typename Real>
template<typename OtherReal> //template<typename OtherReal>
void VectorBase<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v) { //void VectorBase<Real>::AddVec(const Real alpha, const VectorBase<OtherReal> &v) {
KALDI_ASSERT(dim_ == v.dim_); //KALDI_ASSERT(dim_ == v.dim_);
// remove __restrict__ if it causes compilation problems. //// remove __restrict__ if it causes compilation problems.
Real *__restrict__ data = data_; //Real *__restrict__ data = data_;
OtherReal *__restrict__ other_data = v.data_; //OtherReal *__restrict__ other_data = v.data_;
MatrixIndexT dim = dim_; //MatrixIndexT dim = dim_;
if (alpha != 1.0) //if (alpha != 1.0)
for (MatrixIndexT i = 0; i < dim; i++) //for (MatrixIndexT i = 0; i < dim; i++)
data[i] += alpha * other_data[i]; //data[i] += alpha * other_data[i];
else //else
for (MatrixIndexT i = 0; i < dim; i++) //for (MatrixIndexT i = 0; i < dim; i++)
data[i] += other_data[i]; //data[i] += other_data[i];
} //}
template //template
void VectorBase<float>::AddVec(const float alpha, const VectorBase<double> &v); //void VectorBase<float>::AddVec(const float alpha, const VectorBase<double> &v);
template //template
void VectorBase<double>::AddVec(const double alpha, const VectorBase<float> &v); //void VectorBase<double>::AddVec(const double alpha, const VectorBase<float> &v);
template<typename Real> //template<typename Real>
template<typename OtherReal> //template<typename OtherReal>
void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal> &v) { //void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal> &v) {
KALDI_ASSERT(dim_ == v.dim_); //KALDI_ASSERT(dim_ == v.dim_);
// remove __restrict__ if it causes compilation problems. //// remove __restrict__ if it causes compilation problems.
Real *__restrict__ data = data_; //Real *__restrict__ data = data_;
OtherReal *__restrict__ other_data = v.data_; //OtherReal *__restrict__ other_data = v.data_;
MatrixIndexT dim = dim_; //MatrixIndexT dim = dim_;
if (alpha != 1.0) //if (alpha != 1.0)
for (MatrixIndexT i = 0; i < dim; i++) //for (MatrixIndexT i = 0; i < dim; i++)
data[i] += alpha * other_data[i] * other_data[i]; //data[i] += alpha * other_data[i] * other_data[i];
else //else
for (MatrixIndexT i = 0; i < dim; i++) //for (MatrixIndexT i = 0; i < dim; i++)
data[i] += other_data[i] * other_data[i]; //data[i] += other_data[i] * other_data[i];
} //}
template //template
void VectorBase<float>::AddVec2(const float alpha, const VectorBase<double> &v); //void VectorBase<float>::AddVec2(const float alpha, const VectorBase<double> &v);
template //template
void VectorBase<double>::AddVec2(const double alpha, const VectorBase<float> &v); //void VectorBase<double>::AddVec2(const double alpha, const VectorBase<float> &v);
template<typename Real> template<typename Real>
void VectorBase<Real>::Read(std::istream &is, bool binary, bool add) { void VectorBase<Real>::Read(std::istream &is, bool binary) {
if (add) {
Vector<Real> tmp(Dim());
tmp.Read(is, binary, false); // read without adding.
if (this->Dim() != tmp.Dim()) {
KALDI_ERR << "VectorBase::Read, size mismatch " << this->Dim()<<" vs. "<<tmp.Dim();
}
this->AddVec(1.0, tmp);
return;
} // now assume add == false.
// In order to avoid rewriting this, we just declare a Vector and // In order to avoid rewriting this, we just declare a Vector and
// use it to read the data, then copy. // use it to read the data, then copy.
Vector<Real> tmp; Vector<Real> tmp;
tmp.Read(is, binary, false); tmp.Read(is, binary);
if (tmp.Dim() != Dim()) if (tmp.Dim() != Dim())
KALDI_ERR << "VectorBase<Real>::Read, size mismatch " KALDI_ERR << "VectorBase<Real>::Read, size mismatch "
<< Dim() << " vs. " << tmp.Dim(); << Dim() << " vs. " << tmp.Dim();
...@@ -1106,19 +944,7 @@ void VectorBase<Real>::Read(std::istream &is, bool binary, bool add) { ...@@ -1106,19 +944,7 @@ void VectorBase<Real>::Read(std::istream &is, bool binary, bool add) {
template<typename Real> template<typename Real>
void Vector<Real>::Read(std::istream &is, bool binary, bool add) { void Vector<Real>::Read(std::istream &is, bool binary) {
if (add) {
Vector<Real> tmp(this->Dim());
tmp.Read(is, binary, false); // read without adding.
if (this->Dim() == 0) this->Resize(tmp.Dim());
if (this->Dim() != tmp.Dim()) {
KALDI_ERR << "Vector<Real>::Read, adding but dimensions mismatch "
<< this->Dim() << " vs. " << tmp.Dim();
}
this->AddVec(1.0, tmp);
return;
} // now assume add == false.
std::ostringstream specific_error; std::ostringstream specific_error;
MatrixIndexT pos_at_start = is.tellg(); MatrixIndexT pos_at_start = is.tellg();
...@@ -1129,7 +955,7 @@ void Vector<Real>::Read(std::istream &is, bool binary, bool add) { ...@@ -1129,7 +955,7 @@ void Vector<Real>::Read(std::istream &is, bool binary, bool add) {
if (peekval == other_token_start) { // need to instantiate the other type to read it. if (peekval == other_token_start) { // need to instantiate the other type to read it.
typedef typename OtherReal<Real>::Real OtherType; // if Real == float, OtherType == double, and vice versa. typedef typename OtherReal<Real>::Real OtherType; // if Real == float, OtherType == double, and vice versa.
Vector<OtherType> other(this->Dim()); Vector<OtherType> other(this->Dim());
other.Read(is, binary, false); // add is false at this point. other.Read(is, binary);
if (this->Dim() != other.Dim()) this->Resize(other.Dim()); if (this->Dim() != other.Dim()) this->Resize(other.Dim());
this->CopyFromVec(other); this->CopyFromVec(other);
return; return;
...@@ -1251,47 +1077,47 @@ void VectorBase<Real>::Write(std::ostream & os, bool binary) const { ...@@ -1251,47 +1077,47 @@ void VectorBase<Real>::Write(std::ostream & os, bool binary) const {
} }
template<typename Real> //template<typename Real>
void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) { //void VectorBase<Real>::AddVec2(const Real alpha, const VectorBase<Real> &v) {
KALDI_ASSERT(dim_ == v.dim_); //KALDI_ASSERT(dim_ == v.dim_);
for (MatrixIndexT i = 0; i < dim_; i++) //for (MatrixIndexT i = 0; i < dim_; i++)
data_[i] += alpha * v.data_[i] * v.data_[i]; //data_[i] += alpha * v.data_[i] * v.data_[i];
} //}
// this <-- beta*this + alpha*M*v. //// this <-- beta*this + alpha*M*v.
template<typename Real> //template<typename Real>
void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M, //void VectorBase<Real>::AddTpVec(const Real alpha, const TpMatrix<Real> &M,
const MatrixTransposeType trans, //const MatrixTransposeType trans,
const VectorBase<Real> &v, //const VectorBase<Real> &v,
const Real beta) { //const Real beta) {
KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows()); //KALDI_ASSERT(dim_ == v.dim_ && dim_ == M.NumRows());
if (beta == 0.0) { //if (beta == 0.0) {
if (&v != this) CopyFromVec(v); //if (&v != this) CopyFromVec(v);
MulTp(M, trans); //MulTp(M, trans);
if (alpha != 1.0) Scale(alpha); //if (alpha != 1.0) Scale(alpha);
} else { //} else {
Vector<Real> tmp(v); //Vector<Real> tmp(v);
tmp.MulTp(M, trans); //tmp.MulTp(M, trans);
if (beta != 1.0) Scale(beta); // *this <-- beta * *this //if (beta != 1.0) Scale(beta); // *this <-- beta * *this
AddVec(alpha, tmp); // *this += alpha * M * v //AddVec(alpha, tmp); // *this += alpha * M * v
} //}
} //}
template<typename Real> //template<typename Real>
Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M, //Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
const VectorBase<Real> &v2) { //const VectorBase<Real> &v2) {
KALDI_ASSERT(v1.Dim() == M.NumRows() && v2.Dim() == M.NumCols()); //KALDI_ASSERT(v1.Dim() == M.NumRows() && v2.Dim() == M.NumCols());
Vector<Real> vtmp(M.NumRows()); //Vector<Real> vtmp(M.NumRows());
vtmp.AddMatVec(1.0, M, kNoTrans, v2, 0.0); //vtmp.AddMatVec(1.0, M, kNoTrans, v2, 0.0);
return VecVec(v1, vtmp); //return VecVec(v1, vtmp);
} //}
template //template
float VecMatVec(const VectorBase<float> &v1, const MatrixBase<float> &M, //float VecMatVec(const VectorBase<float> &v1, const MatrixBase<float> &M,
const VectorBase<float> &v2); //const VectorBase<float> &v2);
template //template
double VecMatVec(const VectorBase<double> &v1, const MatrixBase<double> &M, //double VecMatVec(const VectorBase<double> &v1, const MatrixBase<double> &M,
const VectorBase<double> &v2); //const VectorBase<double> &v2);
template<typename Real> template<typename Real>
void Vector<Real>::Swap(Vector<Real> *other) { void Vector<Real>::Swap(Vector<Real> *other) {
...@@ -1300,51 +1126,51 @@ void Vector<Real>::Swap(Vector<Real> *other) { ...@@ -1300,51 +1126,51 @@ void Vector<Real>::Swap(Vector<Real> *other) {
} }
template<typename Real> //template<typename Real>
void VectorBase<Real>::AddDiagMat2( //void VectorBase<Real>::AddDiagMat2(
Real alpha, const MatrixBase<Real> &M, //Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType trans, Real beta) { //MatrixTransposeType trans, Real beta) {
if (trans == kNoTrans) { //if (trans == kNoTrans) {
KALDI_ASSERT(this->dim_ == M.NumRows()); //KALDI_ASSERT(this->dim_ == M.NumRows());
MatrixIndexT rows = this->dim_, cols = M.NumCols(), //MatrixIndexT rows = this->dim_, cols = M.NumCols(),
mat_stride = M.Stride(); //mat_stride = M.Stride();
Real *data = this->data_; //Real *data = this->data_;
const Real *mat_data = M.Data(); //const Real *mat_data = M.Data();
for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++) //for (MatrixIndexT i = 0; i < rows; i++, mat_data += mat_stride, data++)
*data = beta * *data + alpha * cblas_Xdot(cols,mat_data,1,mat_data,1); //*data = beta * *data + alpha * cblas_Xdot(cols,mat_data,1,mat_data,1);
} else { //} else {
KALDI_ASSERT(this->dim_ == M.NumCols()); //KALDI_ASSERT(this->dim_ == M.NumCols());
MatrixIndexT rows = M.NumRows(), cols = this->dim_, //MatrixIndexT rows = M.NumRows(), cols = this->dim_,
mat_stride = M.Stride(); //mat_stride = M.Stride();
Real *data = this->data_; //Real *data = this->data_;
const Real *mat_data = M.Data(); //const Real *mat_data = M.Data();
for (MatrixIndexT i = 0; i < cols; i++, mat_data++, data++) //for (MatrixIndexT i = 0; i < cols; i++, mat_data++, data++)
*data = beta * *data + alpha * cblas_Xdot(rows, mat_data, mat_stride, //*data = beta * *data + alpha * cblas_Xdot(rows, mat_data, mat_stride,
mat_data, mat_stride); //mat_data, mat_stride);
} //}
} //}
template<typename Real> //template<typename Real>
void VectorBase<Real>::AddDiagMatMat( //void VectorBase<Real>::AddDiagMatMat(
Real alpha, //Real alpha,
const MatrixBase<Real> &M, MatrixTransposeType transM, //const MatrixBase<Real> &M, MatrixTransposeType transM,
const MatrixBase<Real> &N, MatrixTransposeType transN, //const MatrixBase<Real> &N, MatrixTransposeType transN,
Real beta) { //Real beta) {
MatrixIndexT dim = this->dim_, //MatrixIndexT dim = this->dim_,
M_col_dim = (transM == kTrans ? M.NumRows() : M.NumCols()), //M_col_dim = (transM == kTrans ? M.NumRows() : M.NumCols()),
N_row_dim = (transN == kTrans ? N.NumCols() : N.NumRows()); //N_row_dim = (transN == kTrans ? N.NumCols() : N.NumRows());
KALDI_ASSERT(M_col_dim == N_row_dim); // this is the dimension we sum over //KALDI_ASSERT(M_col_dim == N_row_dim); // this is the dimension we sum over
MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1; //MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
if (transM == kTrans) std::swap(M_row_stride, M_col_stride); //if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1; //MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1;
if (transN == kTrans) std::swap(N_row_stride, N_col_stride); //if (transN == kTrans) std::swap(N_row_stride, N_col_stride);
Real *data = this->data_; //Real *data = this->data_;
const Real *Mdata = M.Data(), *Ndata = N.Data(); //const Real *Mdata = M.Data(), *Ndata = N.Data();
for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride, data++) { //for (MatrixIndexT i = 0; i < dim; i++, Mdata += M_row_stride, Ndata += N_col_stride, data++) {
*data = beta * *data + alpha * cblas_Xdot(M_col_dim, Mdata, M_col_stride, Ndata, N_row_stride); //*data = beta * *data + alpha * cblas_Xdot(M_col_dim, Mdata, M_col_stride, Ndata, N_row_stride);
} //}
} //}
template class Vector<float>; template class Vector<float>;
......
...@@ -49,17 +49,6 @@ class VectorBase { ...@@ -49,17 +49,6 @@ class VectorBase {
/// Set all members of a vector to a specified value. /// Set all members of a vector to a specified value.
void Set(Real f); void Set(Real f);
/// Set vector to random normally-distributed noise.
void SetRandn();
/// Sets to numbers uniformly distributed on (0,1)
void SetRandUniform();
/// This function returns a random index into this vector,
/// chosen with probability proportional to the corresponding
/// element. Requires that this->Min() >= 0 and this->Sum() > 0.
MatrixIndexT RandCategorical() const;
/// Returns the dimension of the vector. /// Returns the dimension of the vector.
inline MatrixIndexT Dim() const { return dim_; } inline MatrixIndexT Dim() const { return dim_; }
...@@ -108,178 +97,15 @@ class VectorBase { ...@@ -108,178 +97,15 @@ class VectorBase {
/// Copy data from another vector (must match own size). /// Copy data from another vector (must match own size).
void CopyFromVec(const VectorBase<Real> &v); void CopyFromVec(const VectorBase<Real> &v);
/// Copy data from a SpMatrix or TpMatrix (must match own size).
template<typename OtherReal>
void CopyFromPacked(const PackedMatrix<OtherReal> &M);
/// Copy data from another vector of different type (double vs. float) /// Copy data from another vector of different type (double vs. float)
template<typename OtherReal> template<typename OtherReal>
void CopyFromVec(const VectorBase<OtherReal> &v); void CopyFromVec(const VectorBase<OtherReal> &v);
/// Copy from CuVector. This is defined in ../cudamatrix/cu-vector.h
template<typename OtherReal>
void CopyFromVec(const CuVectorBase<OtherReal> &v);
/// Applies floor to all elements. Returns number of elements
/// floored in floored_count if it is non-null.
void Floor(const VectorBase<Real> &v, Real floor_val, MatrixIndexT *floored_count = nullptr);
/// Applies ceiling to all elements. Returns number of elements
/// changed in ceiled_count if it is non-null.
void Ceiling(const VectorBase<Real> &v, Real ceil_val, MatrixIndexT *ceiled_count = nullptr);
void Pow(const VectorBase<Real> &v, Real power);
/// Applies natural log to all elements. Throws if any element of
/// the vector is negative (but doesn't complain about zero; the
/// log will be -infinity).
void ApplyLog();
/// Apply natural log to another vector and put result in *this.
void ApplyLogAndCopy(const VectorBase<Real> &v);
/// Apply exponential to each value in vector.
void ApplyExp();
/// Take absolute value of each of the elements
void ApplyAbs();
/// Applies floor to all elements. Returns number of elements
/// floored in floored_count if it is non-null.
inline void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = nullptr) {
this->Floor(*this, floor_val, floored_count);
};
/// Applies ceiling to all elements. Returns number of elements
/// changed in ceiled_count if it is non-null.
inline void ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count = nullptr) {
this->Ceiling(*this, ceil_val, ceiled_count);
};
/// Applies floor to all elements. Returns number of elements floored.
MatrixIndexT ApplyFloor(const VectorBase<Real> &floor_vec);
/// Apply soft-max to vector and return normalizer (log sum of exponentials).
/// This is the same as: \f$ x(i) = exp(x(i)) / \sum_i exp(x(i)) \f$
Real ApplySoftMax();
/// Applies log soft-max to vector and returns normalizer (log sum of
/// exponentials).
/// This is the same as: \f$ x(i) = x(i) - log(\sum_i exp(x(i))) \f$
Real ApplyLogSoftMax();
/// Sets each element of *this to the tanh of the corresponding element of "src".
void Tanh(const VectorBase<Real> &src);
/// Sets each element of *this to the sigmoid function of the corresponding
/// element of "src".
void Sigmoid(const VectorBase<Real> &src);
/// Take all elements of vector to a power.
inline void ApplyPow(Real power) {
this->Pow(*this, power);
};
/// Take the absolute value of all elements of a vector to a power.
/// Include the sign of the input element if include_sign == true.
/// If power is negative and the input value is zero, the output is set to zero.
void ApplyPowAbs(Real power, bool include_sign=false);
/// Compute the p-th norm of the vector.
Real Norm(Real p) const;
/// Returns true if ((*this)-other).Norm(2.0) <= tol * (*this).Norm(2.0).
bool ApproxEqual(const VectorBase<Real> &other, float tol = 0.01) const;
/// Invert all elements.
void InvertElements();
/// Add vector : *this = *this + alpha * rv (with casting between floats and
/// doubles)
template<typename OtherReal>
void AddVec(const Real alpha, const VectorBase<OtherReal> &v);
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring].
void AddVec2(const Real alpha, const VectorBase<Real> &v);
/// Add vector : *this = *this + alpha * rv^2 [element-wise squaring],
/// with casting between floats and doubles.
template<typename OtherReal>
void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
/// Add matrix times vector : this <-- beta*this + alpha*M*v.
/// Calls BLAS GEMV.
void AddMatVec(const Real alpha, const MatrixBase<Real> &M,
const MatrixTransposeType trans, const VectorBase<Real> &v,
const Real beta); // **beta previously defaulted to 0.0**
/// This is as AddMatVec, except optimized for where v contains a lot
/// of zeros.
void AddMatSvec(const Real alpha, const MatrixBase<Real> &M,
const MatrixTransposeType trans, const VectorBase<Real> &v,
const Real beta); // **beta previously defaulted to 0.0**
/// Add symmetric positive definite matrix times vector:
/// this <-- beta*this + alpha*M*v. Calls BLAS SPMV.
void AddSpVec(const Real alpha, const SpMatrix<Real> &M,
const VectorBase<Real> &v, const Real beta); // **beta previously defaulted to 0.0**
/// Add triangular matrix times vector: this <-- beta*this + alpha*M*v.
/// Works even if rv == *this.
void AddTpVec(const Real alpha, const TpMatrix<Real> &M,
const MatrixTransposeType trans, const VectorBase<Real> &v,
const Real beta); // **beta previously defaulted to 0.0**
/// Set each element to y = (x == orig ? changed : x).
void ReplaceValue(Real orig, Real changed);
/// Multiply element-by-element by another vector.
void MulElements(const VectorBase<Real> &v);
/// Multiply element-by-element by another vector of different type.
template<typename OtherReal>
void MulElements(const VectorBase<OtherReal> &v);
/// Divide element-by-element by a vector.
void DivElements(const VectorBase<Real> &v);
/// Divide element-by-element by a vector of different type.
template<typename OtherReal>
void DivElements(const VectorBase<OtherReal> &v);
/// Add a constant to each element of a vector.
void Add(Real c);
/// Add element-by-element product of vectors:
// this <-- alpha * v .* r + beta * this.
void AddVecVec(Real alpha, const VectorBase<Real> &v,
const VectorBase<Real> &r, Real beta);
/// Add element-by-element quotient of two vectors.
/// this <---- alpha*v/r + beta*this
void AddVecDivVec(Real alpha, const VectorBase<Real> &v,
const VectorBase<Real> &r, Real beta);
/// Multiplies all elements by this constant.
void Scale(Real alpha);
/// Multiplies this vector by lower-triangular matrix: *this <-- *this *M
void MulTp(const TpMatrix<Real> &M, const MatrixTransposeType trans);
/// If trans == kNoTrans, solves M x = b, where b is the value of *this at input
/// and x is the value of *this at output.
/// If trans == kTrans, solves M' x = b.
/// Does not test for M being singular or near-singular, so test it before
/// calling this routine.
void Solve(const TpMatrix<Real> &M, const MatrixTransposeType trans);
/// Performs a row stack of the matrix M
void CopyRowsFromMat(const MatrixBase<Real> &M);
template<typename OtherReal>
void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
/// The following is implemented in ../cudamatrix/cu-matrix.cc
void CopyRowsFromMat(const CuMatrixBase<Real> &M);
/// Performs a column stack of the matrix M
void CopyColsFromMat(const MatrixBase<Real> &M);
...@@ -290,85 +116,19 @@ class VectorBase {
template<typename OtherReal>
void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);
/// Extracts a row of the symmetric matrix S.
template<typename OtherReal>
void CopyRowFromSp(const SpMatrix<OtherReal> &S, MatrixIndexT row);
/// Extracts a column of the matrix M.
template<typename OtherReal>
void CopyColFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT col);
/// Extracts the diagonal of the matrix M.
void CopyDiagFromMat(const MatrixBase<Real> &M);
/// Extracts the diagonal of a packed matrix M; works for Sp or Tp.
void CopyDiagFromPacked(const PackedMatrix<Real> &M);
/// Extracts the diagonal of a symmetric matrix.
inline void CopyDiagFromSp(const SpMatrix<Real> &M) { CopyDiagFromPacked(M); }
/// Extracts the diagonal of a triangular matrix.
inline void CopyDiagFromTp(const TpMatrix<Real> &M) { CopyDiagFromPacked(M); }
/// Returns the maximum value of any element, or -infinity for the empty vector.
Real Max() const;
/// Returns the maximum value of any element, and the associated index.
/// Error if vector is empty.
Real Max(MatrixIndexT *index) const;
/// Returns the minimum value of any element, or +infinity for the empty vector.
Real Min() const;
/// Returns the minimum value of any element, and the associated index.
/// Error if vector is empty.
Real Min(MatrixIndexT *index) const;
/// Returns sum of the elements
Real Sum() const;
/// Returns sum of the logs of the elements. More efficient than
/// just taking log of each. Will return NaN if any elements are
/// negative.
Real SumLog() const;
/// Does *this = alpha * (sum of rows of M) + beta * *this.
void AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);
/// Does *this = alpha * (sum of columns of M) + beta * *this.
void AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);
/// Add the diagonal of a matrix times itself:
/// *this = diag(M M^T) + beta * *this (if trans == kNoTrans), or
/// *this = diag(M^T M) + beta * *this (if trans == kTrans).
void AddDiagMat2(Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType trans = kNoTrans, Real beta = 1.0);
/// Add the diagonal of a matrix product: *this = diag(M N), assuming the
/// "trans" arguments are both kNoTrans; for transpose arguments, it behaves
/// as you would expect.
void AddDiagMatMat(Real alpha, const MatrixBase<Real> &M, MatrixTransposeType transM,
const MatrixBase<Real> &N, MatrixTransposeType transN,
Real beta = 1.0);
/// Returns log(sum(exp())) without exp overflow
/// If prune > 0.0, ignores terms less than the max - prune.
/// [Note: in future, if prune = 0.0, it will take the max.
/// For now, use -1 if you don't want it to prune.]
Real LogSumExp(Real prune = -1.0) const;
/// Reads from C++ stream.
/// Throws exception on failure
void Read(std::istream &in, bool binary);
/// Writes to C++ stream (option to write in binary).
void Write(std::ostream &Out, bool binary) const;
friend class VectorBase<double>;
friend class VectorBase<float>;
friend class CuVectorBase<Real>;
friend class CuVector<Real>;
protected:
/// Destructor; does not deallocate memory, this is handled by child classes.
/// This destructor is protected so this object can only be
...@@ -380,17 +140,6 @@ class VectorBase {
KALDI_ASSERT_IS_FLOATING_TYPE(Real);
}
// Took this out since it is not currently used, and it is possible to create
// objects where the allocated memory is not the same size as dim_ : Arnab
// /// Initializer from a pointer and a size; keeps the pointer internally
// /// (ownership or non-ownership depends on the child class).
// explicit VectorBase(Real* data, MatrixIndexT dim)
// : data_(data), dim_(dim) {}
// Arnab : made this protected since it is unsafe too.
/// Load data into the vector: sz must match own size.
void CopyFromPtr(const Real* Data, MatrixIndexT sz);
/// data memory area
Real* data_;
/// dimension of vector
...@@ -416,8 +165,8 @@ class Vector: public VectorBase<Real> {
/// Copy constructor from CUDA vector
/// This is defined in ../cudamatrix/cu-vector.h
//template<typename OtherReal>
//explicit Vector(const CuVectorBase<OtherReal> &cu);
/// Copy constructor. The need for this is controversial.
Vector(const Vector<Real> &v) : VectorBase<Real>() { // (cannot be explicit)
...@@ -455,7 +204,7 @@ class Vector: public VectorBase<Real> {
/// Read function using C++ streams.
void Read(std::istream &in, bool binary);
/// Set vector to a specified size (can be zero).
/// The value of the new data depends on resize_type:
...@@ -516,10 +265,10 @@ class SubVector : public VectorBase<Real> {
/// This constructor initializes the vector to point at the contents
/// of this packed matrix (SpMatrix or TpMatrix).
// SubVector(const PackedMatrix<Real> &M) {
//VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
//VectorBase<Real>::dim_ = (M.NumRows()*(M.NumRows()+1))/2;
//}
/// Copy constructor
SubVector(const SubVector &other) : VectorBase<Real> () {
...@@ -572,34 +321,18 @@ std::istream & operator >> (std::istream & in, Vector<Real> & v);
/// @{
//template<typename Real>
//bool ApproxEqual(const VectorBase<Real> &a,
//const VectorBase<Real> &b, Real tol = 0.01) {
//return a.ApproxEqual(b, tol);
//}
//template<typename Real>
//inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
//float tol = 0.01) {
//KALDI_ASSERT(a.ApproxEqual(b, tol));
//}
/// Returns dot product between v1 and v2.
template<typename Real>
Real VecVec(const VectorBase<Real> &v1, const VectorBase<Real> &v2);
template<typename Real, typename OtherReal>
Real VecVec(const VectorBase<Real> &v1, const VectorBase<OtherReal> &v2);
/// Returns \f$ v_1^T M v_2 \f$ .
/// Not as efficient as it could be where v1 == v2.
template<typename Real>
Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
const VectorBase<Real> &v2);
/// @} End of "addtogroup matrix_funcs_scalar"
} // namespace kaldi
......
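Since beta no longer defaults to 0.0 in AddMatVec/AddMatSvec/AddSpVec/AddTpVec (see the "beta previously defaulted to 0.0" annotations above), every caller now has to spell it out. A minimal usage sketch, not part of this diff, assuming the refactored matrix/ headers:

#include "matrix/kaldi-matrix.h"
#include "matrix/kaldi-vector.h"

void AddMatVecExample() {
  kaldi::Matrix<float> M(3, 4);
  M.Set(1.0);                    // fill with ones
  kaldi::Vector<float> v(4), y(3);
  v.Set(2.0);
  // y = 0.0 * y + 1.0 * M * v; beta must now be written out by the caller.
  y.AddMatVec(1.0, M, kaldi::kNoTrans, v, 0.0);
}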
...@@ -59,26 +59,7 @@ template<typename Real> class SubVector;
template<typename Real> class MatrixBase;
template<typename Real> class SubMatrix;
template<typename Real> class Matrix;
template<typename Real> class SpMatrix;
template<typename Real> class TpMatrix;
template<typename Real> class PackedMatrix;
template<typename Real> class SparseMatrix;
// these are classes that won't be defined in this
// directory; they're mostly needed for friend declarations.
template<typename Real> class CuMatrixBase;
template<typename Real> class CuSubMatrix;
template<typename Real> class CuMatrix;
template<typename Real> class CuVectorBase;
template<typename Real> class CuSubVector;
template<typename Real> class CuVector;
template<typename Real> class CuPackedMatrix;
template<typename Real> class CuSpMatrix;
template<typename Real> class CuTpMatrix;
template<typename Real> class CuSparseMatrix;
class CompressedMatrix;
class GeneralMatrix;
/// This class provides a way for switching between double and float types.
template<typename T> class OtherReal { }; // useful in reading+writing routines
......
...@@ -5,8 +5,6 @@ ${CMAKE_CURRENT_SOURCE_DIR}
add_subdirectory(base)
add_subdirectory(util)
add_subdirectory(feat)
add_subdirectory(matrix)
add_subdirectory(lat)
add_subdirectory(fstext)
add_subdirectory(decoder)
......
add_library(kaldi-mfcc
feature-mfcc.cc
)
target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
add_library(kaldi-fbank
feature-fbank.cc
)
target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
add_library(kaldi-feat-common
wave-reader.cc
signal.cc
feature-functions.cc
feature-window.cc
resample.cc
mel-computations.cc
cmvn.cc
)
target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
// transform/cmvn.cc
// Copyright 2009-2013 Microsoft Corporation
// Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/cmvn.h"
namespace kaldi {
void InitCmvnStats(int32 dim, Matrix<double> *stats) {
KALDI_ASSERT(dim > 0);
stats->Resize(2, dim+1);
}
void AccCmvnStats(const VectorBase<BaseFloat> &feats, BaseFloat weight, MatrixBase<double> *stats) {
int32 dim = feats.Dim();
KALDI_ASSERT(stats != NULL);
KALDI_ASSERT(stats->NumRows() == 2 && stats->NumCols() == dim + 1);
// Remove these __restrict__ modifiers if they cause compilation problems.
// It's just an optimization.
double *__restrict__ mean_ptr = stats->RowData(0),
*__restrict__ var_ptr = stats->RowData(1),
*__restrict__ count_ptr = mean_ptr + dim;
const BaseFloat * __restrict__ feats_ptr = feats.Data();
*count_ptr += weight;
// Careful-- if we change the format of the matrix, the "mean_ptr < count_ptr"
// statement below might become wrong.
for (; mean_ptr < count_ptr; mean_ptr++, var_ptr++, feats_ptr++) {
*mean_ptr += *feats_ptr * weight;
*var_ptr += *feats_ptr * *feats_ptr * weight;
}
}
void AccCmvnStats(const MatrixBase<BaseFloat> &feats,
const VectorBase<BaseFloat> *weights,
MatrixBase<double> *stats) {
int32 num_frames = feats.NumRows();
if (weights != NULL) {
KALDI_ASSERT(weights->Dim() == num_frames);
}
for (int32 i = 0; i < num_frames; i++) {
SubVector<BaseFloat> this_frame = feats.Row(i);
BaseFloat weight = (weights == NULL ? 1.0 : (*weights)(i));
if (weight != 0.0)
AccCmvnStats(this_frame, weight, stats);
}
}
void ApplyCmvn(const MatrixBase<double> &stats,
bool var_norm,
MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(feats != NULL);
int32 dim = stats.NumCols() - 1;
if (stats.NumRows() > 2 || stats.NumRows() < 1 || feats->NumCols() != dim) {
KALDI_ERR << "Dim mismatch: cmvn "
<< stats.NumRows() << 'x' << stats.NumCols()
<< ", feats " << feats->NumRows() << 'x' << feats->NumCols();
}
if (stats.NumRows() == 1 && var_norm)
KALDI_ERR << "You requested variance normalization but no variance stats "
<< "are supplied.";
double count = stats(0, dim);
// Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
// computing an offset and representing it as stats, we use a count of one.
if (count < 1.0)
KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: "
<< "count = " << count;
if (!var_norm) {
Vector<BaseFloat> offset(dim);
SubVector<double> mean_stats(stats.RowData(0), dim);
offset.AddVec(-1.0 / count, mean_stats);
feats->AddVecToRows(1.0, offset);
return;
}
// norm(0, d) = mean offset;
// norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
Matrix<BaseFloat> norm(2, dim);
for (int32 d = 0; d < dim; d++) {
double mean, offset, scale;
mean = stats(0, d)/count;
double var = (stats(1, d)/count) - mean*mean,
floor = 1.0e-20;
if (var < floor) {
KALDI_WARN << "Flooring cepstral variance from " << var << " to "
<< floor;
var = floor;
}
scale = 1.0 / sqrt(var);
if (scale != scale || 1/scale == 0.0)
KALDI_ERR << "NaN or infinity in cepstral mean/variance computation";
offset = -(mean*scale);
norm(0, d) = offset;
norm(1, d) = scale;
}
// Apply the normalization.
feats->MulColsVec(norm.Row(1));
feats->AddVecToRows(1.0, norm.Row(0));
}
void ApplyCmvnReverse(const MatrixBase<double> &stats,
bool var_norm,
MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(feats != NULL);
int32 dim = stats.NumCols() - 1;
if (stats.NumRows() > 2 || stats.NumRows() < 1 || feats->NumCols() != dim) {
KALDI_ERR << "Dim mismatch: cmvn "
<< stats.NumRows() << 'x' << stats.NumCols()
<< ", feats " << feats->NumRows() << 'x' << feats->NumCols();
}
if (stats.NumRows() == 1 && var_norm)
KALDI_ERR << "You requested variance normalization but no variance stats "
<< "are supplied.";
double count = stats(0, dim);
// Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
// computing an offset and representing it as stats, we use a count of one.
if (count < 1.0)
KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: "
<< "count = " << count;
Matrix<BaseFloat> norm(2, dim); // norm(0, d) = mean offset
// norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
for (int32 d = 0; d < dim; d++) {
double mean, offset, scale;
mean = stats(0, d) / count;
if (!var_norm) {
scale = 1.0;
offset = mean;
} else {
double var = (stats(1, d)/count) - mean*mean,
floor = 1.0e-20;
if (var < floor) {
KALDI_WARN << "Flooring cepstral variance from " << var << " to "
<< floor;
var = floor;
}
// we aim to transform zero-mean, unit-variance input into data
// with the given mean and variance.
scale = sqrt(var);
offset = mean;
}
norm(0, d) = offset;
norm(1, d) = scale;
}
if (var_norm)
feats->MulColsVec(norm.Row(1));
feats->AddVecToRows(1.0, norm.Row(0));
}
void FakeStatsForSomeDims(const std::vector<int32> &dims,
MatrixBase<double> *stats) {
KALDI_ASSERT(stats->NumRows() == 2 && stats->NumCols() > 1);
int32 dim = stats->NumCols() - 1;
double count = (*stats)(0, dim);
for (size_t i = 0; i < dims.size(); i++) {
int32 d = dims[i];
KALDI_ASSERT(d >= 0 && d < dim);
(*stats)(0, d) = 0.0;
(*stats)(1, d) = count;
}
}
} // namespace kaldi
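For orientation, a minimal end-to-end sketch (a hypothetical helper, not from this diff) of the accumulate-then-apply flow defined above; `feats` is assumed to be a populated num_frames x dim feature matrix:

#include "feat/cmvn.h"

void NormalizeUtterance(kaldi::Matrix<kaldi::BaseFloat> *feats) {
  kaldi::Matrix<double> stats;
  kaldi::InitCmvnStats(feats->NumCols(), &stats);  // 2 x (dim+1), zeroed
  kaldi::AccCmvnStats(*feats, NULL, &stats);       // NULL weights: every frame weight 1.0
  kaldi::ApplyCmvn(stats, false /*var_norm*/, feats);  // mean normalization only
}

Passing NULL for the weights accumulates every frame with weight 1.0, matching the loop in the matrix overload of AccCmvnStats.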
// transform/cmvn.h
// Copyright 2009-2013 Microsoft Corporation
// Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_TRANSFORM_CMVN_H_
#define KALDI_TRANSFORM_CMVN_H_
#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
namespace kaldi {
/// This function initializes the matrix to dimension 2 by (dim+1);
/// 1st "dim" elements of 1st row are mean stats, 1st "dim" elements
/// of 2nd row are var stats, last element of 1st row is count,
/// last element of 2nd row is zero.
void InitCmvnStats(int32 dim, Matrix<double> *stats);
/// Accumulation from a single frame (weighted).
void AccCmvnStats(const VectorBase<BaseFloat> &feat,
BaseFloat weight,
MatrixBase<double> *stats);
/// Accumulation from a feature file (possibly weighted-- useful in excluding silence).
void AccCmvnStats(const MatrixBase<BaseFloat> &feats,
const VectorBase<BaseFloat> *weights, // or NULL
MatrixBase<double> *stats);
/// Apply cepstral mean and variance normalization to a matrix of features.
/// If norm_vars == true, expects stats to be of dimension 2 by (dim+1), but
/// if norm_vars == false, will accept stats of dimension 1 by (dim+1); these
/// are produced by the balanced-cmvn code when it computes an offset and
/// represents it as "fake stats".
void ApplyCmvn(const MatrixBase<double> &stats,
bool norm_vars,
MatrixBase<BaseFloat> *feats);
/// This is as ApplyCmvn, but does so in the reverse sense, i.e. applies a transform
/// that would take zero-mean, unit-variance input and turn it into output with the
/// stats of "stats". This can be useful if you trained without CMVN but later want
/// to correct a mismatch, so you would first apply CMVN and then do the "reverse"
/// CMVN with the summed stats of your training data.
void ApplyCmvnReverse(const MatrixBase<double> &stats,
bool norm_vars,
MatrixBase<BaseFloat> *feats);
/// Modify the stats so that for some dimensions (specified in "dims"), we
/// replace them with "fake" stats that have zero mean and unit variance; this
/// is done to disable CMVN for those dimensions.
void FakeStatsForSomeDims(const std::vector<int32> &dims,
MatrixBase<double> *stats);
} // namespace kaldi
#endif // KALDI_TRANSFORM_CMVN_H_
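The ApplyCmvnReverse comment above describes correcting a train/test CMVN mismatch; a hedged sketch of that recipe (hypothetical helper, not from the source), assuming valid 2 x (dim+1) stats for both the utterance and the training set:

void MatchTrainingStats(const kaldi::MatrixBase<double> &utt_stats,
                        const kaldi::MatrixBase<double> &train_stats,
                        kaldi::MatrixBase<kaldi::BaseFloat> *feats) {
  kaldi::ApplyCmvn(utt_stats, true, feats);           // to zero mean, unit variance
  kaldi::ApplyCmvnReverse(train_stats, true, feats);  // re-impose training-set stats
}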
// feat/feature-common-inl.h
// Copyright 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_COMMON_INL_H_
#define KALDI_FEAT_FEATURE_COMMON_INL_H_
#include "feat/resample.h"
// Do not include this file directly. It is included by feat/feature-common.h
namespace kaldi {
template <class F>
void OfflineFeatureTpl<F>::ComputeFeatures(
const VectorBase<BaseFloat> &wave,
BaseFloat sample_freq,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output) {
KALDI_ASSERT(output != NULL);
BaseFloat new_sample_freq = computer_.GetFrameOptions().samp_freq;
if (sample_freq == new_sample_freq) {
Compute(wave, vtln_warp, output);
} else {
if (new_sample_freq < sample_freq &&
! computer_.GetFrameOptions().allow_downsample)
KALDI_ERR << "Waveform and config sample Frequency mismatch: "
<< sample_freq << " vs. " << new_sample_freq
<< " (use --allow-downsample=true to allow "
<< " downsampling the waveform).";
else if (new_sample_freq > sample_freq &&
! computer_.GetFrameOptions().allow_upsample)
KALDI_ERR << "Waveform and config sample Frequency mismatch: "
<< sample_freq << " vs. " << new_sample_freq
<< " (use --allow-upsample=true option to allow "
<< " upsampling the waveform).";
// Resample the waveform.
Vector<BaseFloat> resampled_wave(wave);
ResampleWaveform(sample_freq, wave,
new_sample_freq, &resampled_wave);
Compute(resampled_wave, vtln_warp, output);
}
}
template <class F>
void OfflineFeatureTpl<F>::Compute(
const VectorBase<BaseFloat> &wave,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output) {
KALDI_ASSERT(output != NULL);
int32 rows_out = NumFrames(wave.Dim(), computer_.GetFrameOptions()),
cols_out = computer_.Dim();
if (rows_out == 0) {
output->Resize(0, 0);
return;
}
output->Resize(rows_out, cols_out);
Vector<BaseFloat> window; // windowed waveform.
bool use_raw_log_energy = computer_.NeedRawLogEnergy();
for (int32 r = 0; r < rows_out; r++) { // r is frame index.
BaseFloat raw_log_energy = 0.0;
ExtractWindow(0, wave, r, computer_.GetFrameOptions(),
feature_window_function_, &window,
(use_raw_log_energy ? &raw_log_energy : NULL));
SubVector<BaseFloat> output_row(*output, r);
computer_.Compute(raw_log_energy, vtln_warp, &window, &output_row);
}
}
template <class F>
void OfflineFeatureTpl<F>::Compute(
const VectorBase<BaseFloat> &wave,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output) const {
OfflineFeatureTpl<F> temp(*this);
// call the non-const version of Compute() on a temporary copy of this object.
// This is a workaround for const-ness that may sometimes be useful in
// multi-threaded code, although it's not optimally efficient.
temp.Compute(wave, vtln_warp, output);
}
} // end namespace kaldi
#endif
// feat/feature-common.h
// Copyright 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_COMMON_H_
#define KALDI_FEAT_FEATURE_COMMON_H_
#include <map>
#include <string>
#include "feat/feature-window.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// This class is only added for documentation, it is not intended to ever be
/// used.
struct ExampleFeatureComputerOptions {
FrameExtractionOptions frame_opts;
// .. more would go here.
};
/// This class is only added for documentation, it is not intended to ever be
/// used. It documents the interface of the *Computer classes which wrap the
/// low-level feature extraction. The template argument F of OfflineFeatureTpl must
/// follow this interface. This interface is intended for features such as
/// MFCCs and PLPs which can be computed frame by frame.
class ExampleFeatureComputer {
public:
typedef ExampleFeatureComputerOptions Options;
/// Returns a reference to the frame-extraction options class, which
/// will be part of our own options class.
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
/// Returns the feature dimension
int32 Dim() const;
/// Returns true if this function may inspect the raw log-energy of the signal
/// (before windowing and pre-emphasis); it's safe to always return true, but
/// setting it to false enables an optimization.
bool NeedRawLogEnergy() const { return true; }
/// constructor from options class; it should not store a reference or pointer
/// to the options class but should copy it.
explicit ExampleFeatureComputer(const ExampleFeatureComputerOptions &opts):
opts_(opts) { }
/// Copy constructor; all of these classes must have one.
ExampleFeatureComputer(const ExampleFeatureComputer &other);
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
private:
// disallow assignment.
ExampleFeatureComputer &operator = (const ExampleFeatureComputer &in);
Options opts_;
};
/// This templated class is intended for offline feature extraction, i.e. where
/// you have access to the entire signal at the start. It exists mainly to be
/// drop-in replacement for the old (pre-2016) classes Mfcc, Plp and so on, for
/// use in the offline case. In April 2016 we reorganized the online
/// feature-computation code for greater modularity and to have correct support
/// for the snip-edges=false option.
template <class F>
class OfflineFeatureTpl {
public:
typedef typename F::Options Options;
// Note: feature_window_function_ is the windowing function, which initialized
// using the options class, that we cache at this level.
OfflineFeatureTpl(const Options &opts):
computer_(opts),
feature_window_function_(computer_.GetFrameOptions()) { }
// Internal (and back-compatibility) interface for computing features, which
// requires that the user has already checked that the sampling frequency
// of the waveform is equal to the sampling frequency specified in
// the frame-extraction options.
void Compute(const VectorBase<BaseFloat> &wave,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output);
// This const version of Compute() is a wrapper that
// calls the non-const version on a temporary object.
// It's less efficient than the non-const version.
void Compute(const VectorBase<BaseFloat> &wave,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output) const;
/**
Computes the features for one file (one sequence of features).
This is the newer interface where you specify the sample frequency
of the input waveform.
@param [in] wave The input waveform
@param [in] sample_freq The sampling frequency with which
'wave' was sampled.
if sample_freq is higher than the frequency
specified in the config, we will downsample
the waveform, but if lower, it's an error.
@param [in] vtln_warp The VTLN warping factor (will normally
be 1.0)
@param [out] output The matrix of features, where the row-index
is the frame index.
*/
void ComputeFeatures(const VectorBase<BaseFloat> &wave,
BaseFloat sample_freq,
BaseFloat vtln_warp,
Matrix<BaseFloat> *output);
int32 Dim() const { return computer_.Dim(); }
// Copy constructor.
OfflineFeatureTpl(const OfflineFeatureTpl<F> &other):
computer_(other.computer_),
feature_window_function_(other.feature_window_function_) { }
private:
// Disallow assignment.
OfflineFeatureTpl<F> &operator =(const OfflineFeatureTpl<F> &other);
F computer_;
FeatureWindowFunction feature_window_function_;
};
/// @} End of "addtogroup feat"
} // namespace kaldi
#include "feat/feature-common-inl.h"
#endif // KALDI_FEAT_FEATURE_COMMON_H_
// feat/feature-fbank.cc
// Copyright 2009-2012 Karel Vesely
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-fbank.h"
namespace kaldi {
FbankComputer::FbankComputer(const FbankOptions &opts):
opts_(opts), srfft_(NULL) {
if (opts.energy_floor > 0.0)
log_energy_floor_ = Log(opts.energy_floor);
int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two...
srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
// We'll definitely need the filterbanks info for VTLN warping factor 1.0.
// [note: this call caches it.]
GetMelBanks(1.0);
}
FbankComputer::FbankComputer(const FbankComputer &other):
opts_(other.opts_), log_energy_floor_(other.log_energy_floor_),
mel_banks_(other.mel_banks_), srfft_(NULL) {
for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
iter != mel_banks_.end();
++iter)
iter->second = new MelBanks(*(iter->second));
if (other.srfft_)
srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
}
FbankComputer::~FbankComputer() {
for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
iter != mel_banks_.end(); ++iter)
delete iter->second;
delete srfft_;
}
const MelBanks* FbankComputer::GetMelBanks(BaseFloat vtln_warp) {
MelBanks *this_mel_banks = NULL;
std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.find(vtln_warp);
if (iter == mel_banks_.end()) {
this_mel_banks = new MelBanks(opts_.mel_opts,
opts_.frame_opts,
vtln_warp);
mel_banks_[vtln_warp] = this_mel_banks;
} else {
this_mel_banks = iter->second;
}
return this_mel_banks;
}
void FbankComputer::Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature) {
const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
feature->Dim() == this->Dim());
// Compute energy after window function (not the raw one).
if (opts_.use_energy && !opts_.raw_energy)
signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
std::numeric_limits<float>::epsilon()));
if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
srfft_->Compute(signal_frame->Data(), true);
else // An alternative algorithm that works for non-powers-of-two.
RealFft(signal_frame, true);
// Convert the FFT into a power spectrum.
ComputePowerSpectrum(signal_frame);
SubVector<BaseFloat> power_spectrum(*signal_frame, 0,
signal_frame->Dim() / 2 + 1);
// Use magnitude instead of power if requested.
if (!opts_.use_power)
power_spectrum.ApplyPow(0.5);
int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
SubVector<BaseFloat> mel_energies(*feature,
mel_offset,
opts_.mel_opts.num_bins);
// Sum with mel filterbanks over the power spectrum
mel_banks.Compute(power_spectrum, &mel_energies);
if (opts_.use_log_fbank) {
// Avoid log of zero (which should be prevented anyway by dithering).
mel_energies.ApplyFloor(std::numeric_limits<float>::epsilon());
mel_energies.ApplyLog(); // take the log.
}
// Copy energy as first value (or the last, if htk_compat == true).
if (opts_.use_energy) {
if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) {
signal_raw_log_energy = log_energy_floor_;
}
int32 energy_index = opts_.htk_compat ? opts_.mel_opts.num_bins : 0;
(*feature)(energy_index) = signal_raw_log_energy;
}
}
} // namespace kaldi
// feat/feature-fbank.h
// Copyright 2009-2012 Karel Vesely
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_FBANK_H_
#define KALDI_FEAT_FEATURE_FBANK_H_
#include <map>
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// FbankOptions contains basic options for computing filterbank features.
/// It only includes things that can be done in a "stateless" way, i.e.
/// it does not include energy max-normalization.
/// It does not include delta computation.
struct FbankOptions {
FrameExtractionOptions frame_opts;
MelBanksOptions mel_opts;
bool use_energy; // append an extra dimension with energy to the filter banks
BaseFloat energy_floor;
bool raw_energy; // If true, compute energy before preemphasis and windowing
bool htk_compat; // If true, put energy last (if using energy)
bool use_log_fbank; // if true (default), produce log-filterbank, else linear
bool use_power; // if true (default), use power in filterbank analysis, else magnitude.
FbankOptions(): mel_opts(23),
// defaults the #mel-banks to 23 for the FBANK computations.
// this seems to be common for 16khz-sampled data,
// but for 8khz-sampled data, 15 may be better.
use_energy(false),
energy_floor(0.0),
raw_energy(true),
htk_compat(false),
use_log_fbank(true),
use_power(true) {}
void Register(OptionsItf *opts) {
frame_opts.Register(opts);
mel_opts.Register(opts);
opts->Register("use-energy", &use_energy,
"Add an extra dimension with energy to the FBANK output.");
opts->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in FBANK computation. "
"Only makes a difference if --use-energy=true; only necessary if "
"--dither=0.0. Suggested values: 0.1 or 1.0");
opts->Register("raw-energy", &raw_energy,
"If true, compute energy before preemphasis and windowing");
opts->Register("htk-compat", &htk_compat, "If true, put energy last. "
"Warning: not sufficient to get HTK compatible features (need "
"to change other parameters).");
opts->Register("use-log-fbank", &use_log_fbank,
"If true, produce log-filterbank, else produce linear.");
opts->Register("use-power", &use_power,
"If true, use power, else use magnitude.");
}
};
/// Class for computing mel-filterbank features; see \ref feat_mfcc for more
/// information.
class FbankComputer {
public:
typedef FbankOptions Options;
explicit FbankComputer(const FbankOptions &opts);
FbankComputer(const FbankComputer &other);
int32 Dim() const {
return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
}
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
~FbankComputer();
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
private:
FbankOptions opts_;
BaseFloat log_energy_floor_;
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
SplitRadixRealFft<BaseFloat> *srfft_;
// Disallow assignment.
FbankComputer &operator =(const FbankComputer &other);
};
typedef OfflineFeatureTpl<FbankComputer> Fbank;
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_FBANK_H_
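A minimal usage sketch of the Fbank typedef above (hypothetical helper, not from this diff); the wave is assumed to be mono and 16 kHz, and the 80-bin setting is an illustrative choice rather than the 23-bin default:

#include "feat/feature-fbank.h"

kaldi::Matrix<kaldi::BaseFloat> ComputeFbankFeats(
    const kaldi::VectorBase<kaldi::BaseFloat> &wave) {
  kaldi::FbankOptions opts;
  opts.frame_opts.samp_freq = 16000;  // must match (or be resampled to) the wave
  opts.mel_opts.num_bins = 80;        // illustrative; default is 23
  kaldi::Fbank fbank(opts);
  kaldi::Matrix<kaldi::BaseFloat> feats;
  fbank.ComputeFeatures(wave, 16000.0, 1.0 /*vtln_warp*/, &feats);
  return feats;  // one row of log-mel energies per frame
}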
// feat/feature-functions.cc
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
// 2013 Johns Hopkins University (author: Daniel Povey)
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-functions.h"
#include "matrix/matrix-functions.h"
namespace kaldi {
void ComputePowerSpectrum(VectorBase<BaseFloat> *waveform) {
int32 dim = waveform->Dim();
// no, letting it be non-power-of-two for now.
// KALDI_ASSERT(dim > 0 && (dim & (dim-1) == 0)); // make sure a power of two.. actually my FFT code
// does not require this (dan) but this is better in case we use different code [dan].
// RealFft(waveform, true); // true == forward (not inverse) FFT; makes no difference here,
// as we just want power spectrum.
// now we have in waveform, first half of complex spectrum
// it's stored as [real0, realN/2, real1, im1, real2, im2, ...]
int32 half_dim = dim/2;
BaseFloat first_energy = (*waveform)(0) * (*waveform)(0),
last_energy = (*waveform)(1) * (*waveform)(1); // handle this special case
for (int32 i = 1; i < half_dim; i++) {
BaseFloat real = (*waveform)(i*2), im = (*waveform)(i*2 + 1);
(*waveform)(i) = real*real + im*im;
}
(*waveform)(0) = first_energy;
(*waveform)(half_dim) = last_energy; // Will actually never be used, and anyway
// if the signal has been bandlimited sensibly this should be zero.
}
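The packed layout described in the comments above ([real0, realN/2, real1, im1, real2, im2, ...]) can be read back per bin; a small sketch (hypothetical helper, not from the source):

kaldi::BaseFloat PackedBinEnergy(const kaldi::VectorBase<kaldi::BaseFloat> &fft,
                                 kaldi::int32 k) {
  kaldi::int32 n = fft.Dim();                // the (even) FFT size
  if (k == 0) return fft(0) * fft(0);        // DC term, purely real, slot 0
  if (k == n / 2) return fft(1) * fft(1);    // Nyquist term, purely real, slot 1
  kaldi::BaseFloat re = fft(2 * k), im = fft(2 * k + 1);
  return re * re + im * im;                  // interior bins: (real, imag) pair
}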
DeltaFeatures::DeltaFeatures(const DeltaFeaturesOptions &opts): opts_(opts) {
KALDI_ASSERT(opts.order >= 0 && opts.order < 1000); // just make sure we don't get binary junk.
// opts will normally be 2 or 3.
KALDI_ASSERT(opts.window > 0 && opts.window < 1000); // again, basic sanity check.
// normally the window size will be two.
scales_.resize(opts.order+1);
scales_[0].Resize(1);
scales_[0](0) = 1.0; // trivial window for 0th order delta [i.e. baseline feats]
for (int32 i = 1; i <= opts.order; i++) {
Vector<BaseFloat> &prev_scales = scales_[i-1],
&cur_scales = scales_[i];
int32 window = opts.window; // this code is designed to still
// work if instead we later make it an array and do opts.window[i-1],
// or something like that. "window" is a parameter specifying delta-window
// width which is actually 2*window + 1.
KALDI_ASSERT(window != 0);
int32 prev_offset = (static_cast<int32>(prev_scales.Dim()-1))/2,
cur_offset = prev_offset + window;
cur_scales.Resize(prev_scales.Dim() + 2*window); // also zeros it.
BaseFloat normalizer = 0.0;
for (int32 j = -window; j <= window; j++) {
normalizer += j*j;
for (int32 k = -prev_offset; k <= prev_offset; k++) {
cur_scales(j+k+cur_offset) +=
static_cast<BaseFloat>(j) * prev_scales(k+prev_offset);
}
}
cur_scales.Scale(1.0 / normalizer);
}
}
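Concretely, with the default DeltaFeaturesOptions (order = 2, window = 2), the normalizer is the sum of j^2 for j in [-2, 2], i.e. 10, so the first-order window works out to [-0.2, -0.1, 0, 0.1, 0.2]; the second-order window is that 5-tap filter convolved with [-2, -1, 0, 1, 2] and scaled by 1/10 again, yielding a 9-tap delta-delta filter.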
void DeltaFeatures::Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
VectorBase<BaseFloat> *output_frame) const {
KALDI_ASSERT(frame < input_feats.NumRows());
int32 num_frames = input_feats.NumRows(),
feat_dim = input_feats.NumCols();
KALDI_ASSERT(static_cast<int32>(output_frame->Dim()) == feat_dim * (opts_.order+1));
output_frame->SetZero();
for (int32 i = 0; i <= opts_.order; i++) {
const Vector<BaseFloat> &scales = scales_[i];
int32 max_offset = (scales.Dim() - 1) / 2;
SubVector<BaseFloat> output(*output_frame, i*feat_dim, feat_dim);
for (int32 j = -max_offset; j <= max_offset; j++) {
// if asked to read a frame outside the matrix, replicate the first/last frame.
int32 offset_frame = frame + j;
if (offset_frame < 0) offset_frame = 0;
else if (offset_frame >= num_frames)
offset_frame = num_frames - 1;
BaseFloat scale = scales(j + max_offset);
if (scale != 0.0)
output.AddVec(scale, input_feats.Row(offset_frame));
}
}
}
ShiftedDeltaFeatures::ShiftedDeltaFeatures(
const ShiftedDeltaFeaturesOptions &opts): opts_(opts) {
KALDI_ASSERT(opts.window > 0 && opts.window < 1000);
// Default window is 1.
int32 window = opts.window;
KALDI_ASSERT(window != 0);
scales_.Resize(1 + 2*window); // also zeros it.
BaseFloat normalizer = 0.0;
for (int32 j = -window; j <= window; j++) {
normalizer += j*j;
scales_(j + window) += static_cast<BaseFloat>(j);
}
scales_.Scale(1.0 / normalizer);
}
void ShiftedDeltaFeatures::Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
SubVector<BaseFloat> *output_frame) const {
KALDI_ASSERT(frame < input_feats.NumRows());
int32 num_frames = input_feats.NumRows(),
feat_dim = input_feats.NumCols();
KALDI_ASSERT(static_cast<int32>(output_frame->Dim())
== feat_dim * (opts_.num_blocks + 1));
output_frame->SetZero();
// The original features
SubVector<BaseFloat> output(*output_frame, 0, feat_dim);
output.AddVec(1.0, input_feats.Row(frame));
// Concatenate the delta-blocks. Each block is block_shift
// (usually 3) frames apart.
for (int32 i = 0; i < opts_.num_blocks; i++) {
int32 max_offset = (scales_.Dim() - 1) / 2;
SubVector<BaseFloat> output(*output_frame, (i + 1) * feat_dim, feat_dim);
for (int32 j = -max_offset; j <= max_offset; j++) {
int32 offset_frame = frame + j + i * opts_.block_shift;
if (offset_frame < 0) offset_frame = 0;
else if (offset_frame >= num_frames)
offset_frame = num_frames - 1;
BaseFloat scale = scales_(j + max_offset);
if (scale != 0.0)
output.AddVec(scale, input_feats.Row(offset_frame));
}
}
}
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features) {
output_features->Resize(input_features.NumRows(),
input_features.NumCols()
*(delta_opts.order + 1));
DeltaFeatures delta(delta_opts);
for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
SubVector<BaseFloat> row(*output_features, r);
delta.Process(input_features, r, &row);
}
}
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features) {
output_features->Resize(input_features.NumRows(),
input_features.NumCols()
* (delta_opts.num_blocks + 1));
ShiftedDeltaFeatures delta(delta_opts);
for (int32 r = 0; r < static_cast<int32>(input_features.NumRows()); r++) {
SubVector<BaseFloat> row(*output_features, r);
delta.Process(input_features, r, &row);
}
}
void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out) {
BaseFloat angle = M_PI / static_cast<BaseFloat>(dimension - 1);
BaseFloat scale = 1.0f / (2.0 * static_cast<BaseFloat>(dimension - 1));
mat_out->Resize(n_bases, dimension);
for (int32 i = 0; i < n_bases; i++) {
(*mat_out)(i, 0) = 1.0 * scale;
BaseFloat i_fl = static_cast<BaseFloat>(i);
for (int32 j = 1; j < dimension - 1; j++) {
BaseFloat j_fl = static_cast<BaseFloat>(j);
(*mat_out)(i, j) = 2.0 * scale * cos(angle * i_fl * j_fl);
}
(*mat_out)(i, dimension -1)
= scale * cos(angle * i_fl * static_cast<BaseFloat>(dimension-1));
}
}
void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
int32 left_context,
int32 right_context,
Matrix<BaseFloat> *output_features) {
int32 T = input_features.NumRows(), D = input_features.NumCols();
if (T == 0 || D == 0)
KALDI_ERR << "SpliceFrames: empty input";
KALDI_ASSERT(left_context >= 0 && right_context >= 0);
int32 N = 1 + left_context + right_context;
output_features->Resize(T, D*N);
for (int32 t = 0; t < T; t++) {
SubVector<BaseFloat> dst_row(*output_features, t);
for (int32 j = 0; j < N; j++) {
int32 t2 = t + j - left_context;
if (t2 < 0) t2 = 0;
if (t2 >= T) t2 = T-1;
SubVector<BaseFloat> dst(dst_row, j*D, D),
src(input_features, t2);
dst.CopyFromVec(src);
}
}
}
void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features) {
int32 T = input_features.NumRows(), D = input_features.NumCols();
if (T == 0 || D == 0)
KALDI_ERR << "ReverseFrames: empty input";
output_features->Resize(T, D);
for (int32 t = 0; t < T; t++) {
SubVector<BaseFloat> dst_row(*output_features, t);
SubVector<BaseFloat> src_row(input_features, T-1-t);
dst_row.CopyFromVec(src_row);
}
}
void SlidingWindowCmnOptions::Check() const {
KALDI_ASSERT(cmn_window > 0);
if (center)
KALDI_ASSERT(min_window > 0 && min_window <= cmn_window);
// else ignored so value doesn't matter.
}
// Internal version of SlidingWindowCmn with double-precision arguments.
void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts,
const MatrixBase<double> &input,
MatrixBase<double> *output) {
opts.Check();
int32 num_frames = input.NumRows(), dim = input.NumCols(),
last_window_start = -1, last_window_end = -1,
warning_count = 0;
Vector<double> cur_sum(dim), cur_sumsq(dim);
for (int32 t = 0; t < num_frames; t++) {
int32 window_start, window_end; // note: window_end will be one
// past the end of the window we use for normalization.
if (opts.center) {
window_start = t - (opts.cmn_window / 2);
window_end = window_start + opts.cmn_window;
} else {
window_start = t - opts.cmn_window;
window_end = t + 1;
}
if (window_start < 0) { // shift window right if starts <0.
window_end -= window_start;
window_start = 0; // or: window_start -= window_start
}
if (!opts.center) {
if (window_end > t)
window_end = std::max(t + 1, opts.min_window);
}
if (window_end > num_frames) {
window_start -= (window_end - num_frames);
window_end = num_frames;
if (window_start < 0) window_start = 0;
}
if (last_window_start == -1) {
SubMatrix<double> input_part(input,
window_start, window_end - window_start,
0, dim);
cur_sum.AddRowSumMat(1.0, input_part, 0.0);
if (opts.normalize_variance)
cur_sumsq.AddDiagMat2(1.0, input_part, kTrans, 0.0);
} else {
if (window_start > last_window_start) {
KALDI_ASSERT(window_start == last_window_start + 1);
SubVector<double> frame_to_remove(input, last_window_start);
cur_sum.AddVec(-1.0, frame_to_remove);
if (opts.normalize_variance)
cur_sumsq.AddVec2(-1.0, frame_to_remove);
}
if (window_end > last_window_end) {
KALDI_ASSERT(window_end == last_window_end + 1);
SubVector<double> frame_to_add(input, last_window_end);
cur_sum.AddVec(1.0, frame_to_add);
if (opts.normalize_variance)
cur_sumsq.AddVec2(1.0, frame_to_add);
}
}
int32 window_frames = window_end - window_start;
last_window_start = window_start;
last_window_end = window_end;
KALDI_ASSERT(window_frames > 0);
SubVector<double> input_frame(input, t),
output_frame(*output, t);
output_frame.CopyFromVec(input_frame);
output_frame.AddVec(-1.0 / window_frames, cur_sum);
if (opts.normalize_variance) {
if (window_frames == 1) {
output_frame.Set(0.0);
} else {
Vector<double> variance(cur_sumsq);
variance.Scale(1.0 / window_frames);
variance.AddVec2(-1.0 / (window_frames * window_frames), cur_sum);
// now "variance" is the variance of the features in the window,
// around their own mean.
int32 num_floored;
variance.ApplyFloor(1.0e-10, &num_floored);
if (num_floored > 0 && num_frames > 1) {
if (opts.max_warnings == warning_count) {
KALDI_WARN << "Suppressing the remaining variance flooring "
<< "warnings. Run program with --max-warnings=-1 to "
<< "see all warnings.";
}
// If opts.max_warnings is a negative number, we won't restrict the
// number of times that the warning is printed out.
else if (opts.max_warnings < 0
|| opts.max_warnings > warning_count) {
KALDI_WARN << "Flooring when normalizing variance, floored "
<< num_floored << " elements; num-frames was "
<< window_frames;
}
warning_count++;
}
variance.ApplyPow(-0.5); // get inverse standard deviation.
output_frame.MulElements(variance);
}
}
}
}
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts,
const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output) {
KALDI_ASSERT(SameDim(input, *output) && input.NumRows() > 0);
Matrix<double> input_dbl(input), output_dbl(input.NumRows(), input.NumCols());
// call double-precision version
SlidingWindowCmnInternal(opts, input_dbl, &output_dbl);
output->CopyFromMat(output_dbl);
}
} // namespace kaldi
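A minimal sketch (hypothetical helper, not from this diff) wiring SlidingWindowCmn into a frontend; the output matrix must be pre-sized to match the input, since SlidingWindowCmn asserts SameDim:

#include "feat/feature-functions.h"

void ApplySlidingCmn(const kaldi::MatrixBase<kaldi::BaseFloat> &in,
                     kaldi::Matrix<kaldi::BaseFloat> *out) {
  kaldi::SlidingWindowCmnOptions opts;  // defaults: 600-frame window, left-only
  opts.normalize_variance = false;      // mean subtraction only
  out->Resize(in.NumRows(), in.NumCols());
  kaldi::SlidingWindowCmn(opts, in, out);
}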
// feat/feature-functions.h
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_FUNCTIONS_H_
#define KALDI_FEAT_FEATURE_FUNCTIONS_H_
#include <string>
#include <vector>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
// ComputePowerSpectrum converts a complex FFT (as produced by the FFT
// functions in matrix/matrix-functions.h), and converts it into
// a power spectrum. If the complex FFT is a vector of size n (representing
// half the complex FFT of a real signal of size n, as described there),
// this function computes in the first (n/2) + 1 elements of it, the
// energies of the fft bins from zero to the Nyquist frequency. Contents of the
// remaining (n/2) - 1 elements are undefined at output.
void ComputePowerSpectrum(VectorBase<BaseFloat> *complex_fft);
struct DeltaFeaturesOptions {
int32 order;
int32 window; // e.g. 2; controls window size (window size is 2*window + 1)
// the behavior at the edges is to replicate the first or last frame.
// this is not configurable.
DeltaFeaturesOptions(int32 order = 2, int32 window = 2):
order(order), window(window) { }
void Register(OptionsItf *opts) {
opts->Register("delta-order", &order, "Order of delta computation");
opts->Register("delta-window", &window,
"Parameter controlling window for delta computation (actual window"
" size for each delta order is 1 + 2*delta-window-size)");
}
};
class DeltaFeatures {
public:
// This class provides a low-level function to compute delta features.
// The function takes as input a matrix of features and a frame index
// that it should compute the deltas on. It puts its output in an object
// of type VectorBase, of size (original-feature-dimension) * (opts.order+1).
// This is not the most efficient way to do the computation, but it's
// state-free and thus easier to understand
explicit DeltaFeatures(const DeltaFeaturesOptions &opts);
void Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
VectorBase<BaseFloat> *output_frame) const;
private:
DeltaFeaturesOptions opts_;
std::vector<Vector<BaseFloat> > scales_; // a scaling window for each
// of the orders, including zero: multiply the features for each
// dimension by this window.
};
struct ShiftedDeltaFeaturesOptions {
int32 window, // The time delay and advance
num_blocks,
block_shift; // Distance between consecutive blocks
ShiftedDeltaFeaturesOptions():
window(1), num_blocks(7), block_shift(3) { }
void Register(OptionsItf *opts) {
opts->Register("delta-window", &window, "Size of delta advance and delay.");
opts->Register("num-blocks", &num_blocks, "Number of delta blocks in advance"
" of each frame to be concatenated");
opts->Register("block-shift", &block_shift, "Distance between each block");
}
};
class ShiftedDeltaFeatures {
public:
// This class provides a low-level function to compute shifted
// delta cesptra (SDC).
// The function takes as input a matrix of features and a frame index
// that it should compute the deltas on. It puts its output in an object
// of type VectorBase, of size (original-feature-dimension) * (num_blocks + 1).
explicit ShiftedDeltaFeatures(const ShiftedDeltaFeaturesOptions &opts);
void Process(const MatrixBase<BaseFloat> &input_feats,
int32 frame,
SubVector<BaseFloat> *output_frame) const;
private:
ShiftedDeltaFeaturesOptions opts_;
Vector<BaseFloat> scales_; // the single scaling window shared by all delta blocks.
};
// ComputeDeltas is a convenience function that computes deltas on a feature
// file. If you want to deal with features coming in bit by bit you would have
// to use the DeltaFeatures class directly, and do the computation frame by
// frame. Later we will have to come up with a nice mechanism to do this for
// features coming in.
void ComputeDeltas(const DeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// ComputeShiftedDeltas computes deltas from a feature file by applying
// ShiftedDeltaFeatures over the frames. This function is provided for
// convenience, however, ShiftedDeltaFeatures can be used directly.
void ComputeShiftedDeltas(const ShiftedDeltaFeaturesOptions &delta_opts,
const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
// SpliceFrames will normally be used together with LDA.
// It splices frames together to make a window. At the
// start and end of an utterance, it duplicates the first
// and last frames.
// Will throw if input features are empty.
// left_context and right_context must be nonnegative.
// these both represent a number of frames (e.g. 4, 4 is
// a good choice).
void SpliceFrames(const MatrixBase<BaseFloat> &input_features,
int32 left_context,
int32 right_context,
Matrix<BaseFloat> *output_features);
// ReverseFrames reverses the frames in time (used for backwards decoding)
void ReverseFrames(const MatrixBase<BaseFloat> &input_features,
Matrix<BaseFloat> *output_features);
void InitIdftBases(int32 n_bases, int32 dimension, Matrix<BaseFloat> *mat_out);
// This is used for speaker-id. Also see OnlineCmnOptions in ../online2/, which
// is online CMN with no latency, for online speech recognition.
struct SlidingWindowCmnOptions {
int32 cmn_window;
int32 min_window;
int32 max_warnings;
bool normalize_variance;
bool center;
SlidingWindowCmnOptions():
cmn_window(600),
min_window(100),
max_warnings(5),
normalize_variance(false),
center(false) { }
void Register(OptionsItf *opts) {
opts->Register("cmn-window", &cmn_window, "Window in frames for running "
"average CMN computation");
opts->Register("min-cmn-window", &min_window, "Minimum CMN window "
"used at start of decoding (adds latency only at start). "
"Only applicable if center == false, ignored if center==true");
opts->Register("max-warnings", &max_warnings, "Maximum warnings to report "
"per utterance. 0 to disable, -1 to show all.");
opts->Register("norm-vars", &normalize_variance, "If true, normalize "
"variance to one."); // naming this as in apply-cmvn.cc
opts->Register("center", &center, "If true, use a window centered on the "
"current frame (to the extent possible, modulo end effects). "
"If false, window is to the left.");
}
void Check() const;
};
/// Applies sliding-window cepstral mean and/or variance normalization. See the
/// strings registering the options in the options class for information on how
/// this works and what the options are. input and output must have the same
/// dimension.
void SlidingWindowCmn(const SlidingWindowCmnOptions &opts,
const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output);
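// Example (illustrative sketch): sliding-window CMN with the defaults above;
// 'feats' is assumed given, and the output matrix must be pre-sized to match.
//
//   SlidingWindowCmnOptions cmvn_opts;  // cmn_window = 600, center = false
//   cmvn_opts.Check();
//   Matrix<BaseFloat> normalized(feats.NumRows(), feats.NumCols());
//   SlidingWindowCmn(cmvn_opts, feats, &normalized);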
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_FUNCTIONS_H_
// feat/feature-mfcc.cc
// Copyright 2009-2011 Karel Vesely; Petr Motlicek
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-mfcc.h"
namespace kaldi {
void MfccComputer::Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature) {
KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
feature->Dim() == this->Dim());
const MelBanks &mel_banks = *(GetMelBanks(vtln_warp));
if (opts_.use_energy && !opts_.raw_energy)
signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
std::numeric_limits<float>::epsilon()));
if (srfft_ != NULL) // Compute FFT using the split-radix algorithm.
srfft_->Compute(signal_frame->Data(), true);
else // An alternative algorithm that works for non-powers-of-two.
RealFft(signal_frame, true);
// Convert the FFT into a power spectrum.
ComputePowerSpectrum(signal_frame);
SubVector<BaseFloat> power_spectrum(*signal_frame, 0,
signal_frame->Dim() / 2 + 1);
mel_banks.Compute(power_spectrum, &mel_energies_);
// avoid log of zero (which should be prevented anyway by dithering).
mel_energies_.ApplyFloor(std::numeric_limits<float>::epsilon());
mel_energies_.ApplyLog(); // take the log.
feature->SetZero(); // in case there were NaNs.
// feature = dct_matrix_ * mel_energies [which now have log]
feature->AddMatVec(1.0, dct_matrix_, kNoTrans, mel_energies_, 0.0);
if (opts_.cepstral_lifter != 0.0)
feature->MulElements(lifter_coeffs_);
if (opts_.use_energy) {
if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_)
signal_raw_log_energy = log_energy_floor_;
(*feature)(0) = signal_raw_log_energy;
}
if (opts_.htk_compat) {
BaseFloat energy = (*feature)(0);
for (int32 i = 0; i < opts_.num_ceps - 1; i++)
(*feature)(i) = (*feature)(i+1);
if (!opts_.use_energy)
energy *= M_SQRT2; // scale on C0 (actually removing a scale
// we previously added that's part of one common definition of
// the cosine transform.)
(*feature)(opts_.num_ceps - 1) = energy;
}
}
MfccComputer::MfccComputer(const MfccOptions &opts):
opts_(opts), srfft_(NULL),
mel_energies_(opts.mel_opts.num_bins) {
int32 num_bins = opts.mel_opts.num_bins;
if (opts.num_ceps > num_bins)
KALDI_ERR << "num-ceps cannot be larger than num-mel-bins."
<< " It should be smaller or equal. You provided num-ceps: "
<< opts.num_ceps << " and num-mel-bins: "
<< num_bins;
Matrix<BaseFloat> dct_matrix(num_bins, num_bins);
ComputeDctMatrix(&dct_matrix);
// Note that we include zeroth dct in either case. If using the
// energy we replace this with the energy. This means a different
// ordering of features than HTK.
SubMatrix<BaseFloat> dct_rows(dct_matrix, 0, opts.num_ceps, 0, num_bins);
dct_matrix_.Resize(opts.num_ceps, num_bins);
dct_matrix_.CopyFromMat(dct_rows); // subset of rows.
if (opts.cepstral_lifter != 0.0) {
lifter_coeffs_.Resize(opts.num_ceps);
ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
}
if (opts.energy_floor > 0.0)
log_energy_floor_ = Log(opts.energy_floor);
int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two...
srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
// We'll definitely need the filterbanks info for VTLN warping factor 1.0.
// [note: this call caches it.]
GetMelBanks(1.0);
}
MfccComputer::MfccComputer(const MfccComputer &other):
opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_),
dct_matrix_(other.dct_matrix_),
log_energy_floor_(other.log_energy_floor_),
mel_banks_(other.mel_banks_),
srfft_(NULL),
mel_energies_(other.mel_energies_.Dim(), kUndefined) {
for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
iter != mel_banks_.end(); ++iter)
iter->second = new MelBanks(*(iter->second));
if (other.srfft_ != NULL)
srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
}
MfccComputer::~MfccComputer() {
for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
iter != mel_banks_.end();
++iter)
delete iter->second;
delete srfft_;
}
const MelBanks *MfccComputer::GetMelBanks(BaseFloat vtln_warp) {
MelBanks *this_mel_banks = NULL;
std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.find(vtln_warp);
if (iter == mel_banks_.end()) {
this_mel_banks = new MelBanks(opts_.mel_opts,
opts_.frame_opts,
vtln_warp);
mel_banks_[vtln_warp] = this_mel_banks;
} else {
this_mel_banks = iter->second;
}
return this_mel_banks;
}
} // namespace kaldi
// feat/feature-mfcc.h
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University
// 2014-2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_MFCC_H_
#define KALDI_FEAT_FEATURE_MFCC_H_
#include <map>
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// MfccOptions contains basic options for computing MFCC features.
struct MfccOptions {
FrameExtractionOptions frame_opts;
MelBanksOptions mel_opts;
int32 num_ceps; // e.g. 13: num cepstral coeffs, counting zero.
bool use_energy; // use energy; else C0
BaseFloat energy_floor; // 0 by default; set to a value like 1.0 or 0.1 if
// you disable dithering.
bool raw_energy; // If true, compute energy before preemphasis and windowing
BaseFloat cepstral_lifter; // Scaling factor on cepstra for HTK compatibility.
// if 0.0, no liftering is done.
bool htk_compat; // if true, put energy/C0 last and introduce a factor of
// sqrt(2) on C0 to be the same as HTK.
MfccOptions() : mel_opts(23),
// defaults the #mel-banks to 23 for the MFCC computations.
// This seems to be common for 16khz-sampled data,
// but for 8khz-sampled data, 15 may be better.
num_ceps(13),
use_energy(true),
energy_floor(0.0),
raw_energy(true),
cepstral_lifter(22.0),
htk_compat(false) {}
void Register(OptionsItf *opts) {
frame_opts.Register(opts);
mel_opts.Register(opts);
opts->Register("num-ceps", &num_ceps,
"Number of cepstra in MFCC computation (including C0)");
opts->Register("use-energy", &use_energy,
"Use energy (not C0) in MFCC computation");
opts->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in MFCC computation. "
"Only makes a difference if --use-energy=true; only necessary if "
"--dither=0.0. Suggested values: 0.1 or 1.0");
opts->Register("raw-energy", &raw_energy,
"If true, compute energy before preemphasis and windowing");
opts->Register("cepstral-lifter", &cepstral_lifter,
"Constant that controls scaling of MFCCs");
opts->Register("htk-compat", &htk_compat,
"If true, put energy or C0 last and use a factor of sqrt(2) on "
"C0. Warning: not sufficient to get HTK compatible features "
"(need to change other parameters).");
}
};
// This is the new-style interface to the MFCC computation.
class MfccComputer {
public:
typedef MfccOptions Options;
explicit MfccComputer(const MfccOptions &opts);
MfccComputer(const MfccComputer &other);
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
int32 Dim() const { return opts_.num_ceps; }
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
~MfccComputer();
private:
// disallow assignment.
MfccComputer &operator = (const MfccComputer &in);
protected:
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
MfccOptions opts_;
Vector<BaseFloat> lifter_coeffs_;
Matrix<BaseFloat> dct_matrix_; // matrix we left-multiply by to perform DCT.
BaseFloat log_energy_floor_;
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
SplitRadixRealFft<BaseFloat> *srfft_;
// note: mel_energies_ is specific to the frame we're processing, it's
// just a temporary workspace.
Vector<BaseFloat> mel_energies_;
};
typedef OfflineFeatureTpl<MfccComputer> Mfcc;
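// Example (illustrative sketch): offline MFCC extraction through the Mfcc
// typedef above; OfflineFeatureTpl<>::Compute() runs MfccComputer::Compute()
// over every frame. 'waveform' (a Vector<BaseFloat> of samples at
// frame_opts.samp_freq) is assumed given.
//
//   MfccOptions mfcc_opts;  // defaults: 13 cepstra, 23 mel bins
//   Mfcc mfcc(mfcc_opts);
//   Matrix<BaseFloat> features;
//   mfcc.Compute(waveform, 1.0 /* vtln_warp */, &features);
//   // features: one row per frame, mfcc.Dim() == num_ceps columns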
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_MFCC_H_
// feat/feature-plp.cc
// Copyright 2009-2011 Petr Motlicek; Karel Vesely
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-plp.h"
namespace kaldi {
PlpComputer::PlpComputer(const PlpOptions &opts):
opts_(opts), srfft_(NULL),
mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
lpc_coeffs_(opts_.lpc_order, kUndefined),
raw_cepstrum_(opts_.lpc_order, kUndefined) {
if (opts.cepstral_lifter != 0.0) {
lifter_coeffs_.Resize(opts.num_ceps);
ComputeLifterCoeffs(opts.cepstral_lifter, &lifter_coeffs_);
}
InitIdftBases(opts_.lpc_order + 1, opts_.mel_opts.num_bins + 2,
&idft_bases_);
if (opts.energy_floor > 0.0)
log_energy_floor_ = Log(opts.energy_floor);
int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two...
srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
// We'll definitely need the filterbanks info for VTLN warping factor 1.0.
// [note: this call caches it.]
GetMelBanks(1.0);
}
PlpComputer::PlpComputer(const PlpComputer &other):
opts_(other.opts_), lifter_coeffs_(other.lifter_coeffs_),
idft_bases_(other.idft_bases_), log_energy_floor_(other.log_energy_floor_),
mel_banks_(other.mel_banks_), equal_loudness_(other.equal_loudness_),
srfft_(NULL),
mel_energies_duplicated_(opts_.mel_opts.num_bins + 2, kUndefined),
autocorr_coeffs_(opts_.lpc_order + 1, kUndefined),
lpc_coeffs_(opts_.lpc_order, kUndefined),
raw_cepstrum_(opts_.lpc_order, kUndefined) {
for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
iter != mel_banks_.end(); ++iter)
iter->second = new MelBanks(*(iter->second));
for (std::map<BaseFloat, Vector<BaseFloat>*>::iterator
iter = equal_loudness_.begin();
iter != equal_loudness_.end(); ++iter)
iter->second = new Vector<BaseFloat>(*(iter->second));
if (other.srfft_ != NULL)
srfft_ = new SplitRadixRealFft<BaseFloat>(*(other.srfft_));
}
PlpComputer::~PlpComputer() {
for (std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.begin();
iter != mel_banks_.end(); ++iter)
delete iter->second;
for (std::map<BaseFloat, Vector<BaseFloat>* >::iterator
iter = equal_loudness_.begin();
iter != equal_loudness_.end(); ++iter)
delete iter->second;
delete srfft_;
}
const MelBanks *PlpComputer::GetMelBanks(BaseFloat vtln_warp) {
MelBanks *this_mel_banks = NULL;
std::map<BaseFloat, MelBanks*>::iterator iter = mel_banks_.find(vtln_warp);
if (iter == mel_banks_.end()) {
this_mel_banks = new MelBanks(opts_.mel_opts,
opts_.frame_opts,
vtln_warp);
mel_banks_[vtln_warp] = this_mel_banks;
} else {
this_mel_banks = iter->second;
}
return this_mel_banks;
}
const Vector<BaseFloat> *PlpComputer::GetEqualLoudness(BaseFloat vtln_warp) {
const MelBanks *this_mel_banks = GetMelBanks(vtln_warp);
Vector<BaseFloat> *ans = NULL;
std::map<BaseFloat, Vector<BaseFloat>*>::iterator iter
= equal_loudness_.find(vtln_warp);
if (iter == equal_loudness_.end()) {
ans = new Vector<BaseFloat>;
GetEqualLoudnessVector(*this_mel_banks, ans);
equal_loudness_[vtln_warp] = ans;
} else {
ans = iter->second;
}
return ans;
}
void PlpComputer::Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature) {
KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
feature->Dim() == this->Dim());
const MelBanks &mel_banks = *GetMelBanks(vtln_warp);
const Vector<BaseFloat> &equal_loudness = *GetEqualLoudness(vtln_warp);
KALDI_ASSERT(opts_.num_ceps <= opts_.lpc_order+1); // our num-ceps includes C0.
if (opts_.use_energy && !opts_.raw_energy)
signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
std::numeric_limits<float>::min()));
if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
srfft_->Compute(signal_frame->Data(), true);
else // An alternative algorithm that works for non-powers-of-two.
RealFft(signal_frame, true);
// Convert the FFT into a power spectrum.
ComputePowerSpectrum(signal_frame); // elements 0 ... signal_frame->Dim()/2
SubVector<BaseFloat> power_spectrum(*signal_frame,
0, signal_frame->Dim() / 2 + 1);
int32 num_mel_bins = opts_.mel_opts.num_bins;
SubVector<BaseFloat> mel_energies(mel_energies_duplicated_, 1, num_mel_bins);
mel_banks.Compute(power_spectrum, &mel_energies);
mel_energies.MulElements(equal_loudness);
mel_energies.ApplyPow(opts_.compress_factor);
// duplicate first and last elements
mel_energies_duplicated_(0) = mel_energies_duplicated_(1);
mel_energies_duplicated_(num_mel_bins + 1) =
mel_energies_duplicated_(num_mel_bins);
autocorr_coeffs_.SetZero(); // In case of NaNs or infs
autocorr_coeffs_.AddMatVec(1.0, idft_bases_, kNoTrans,
mel_energies_duplicated_, 0.0);
BaseFloat residual_log_energy = ComputeLpc(autocorr_coeffs_, &lpc_coeffs_);
residual_log_energy = std::max<BaseFloat>(residual_log_energy,
std::numeric_limits<float>::min());
Lpc2Cepstrum(opts_.lpc_order, lpc_coeffs_.Data(), raw_cepstrum_.Data());
feature->Range(1, opts_.num_ceps - 1).CopyFromVec(
raw_cepstrum_.Range(0, opts_.num_ceps - 1));
(*feature)(0) = residual_log_energy;
if (opts_.cepstral_lifter != 0.0)
feature->MulElements(lifter_coeffs_);
if (opts_.cepstral_scale != 1.0)
feature->Scale(opts_.cepstral_scale);
if (opts_.use_energy) {
if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_)
signal_raw_log_energy = log_energy_floor_;
(*feature)(0) = signal_raw_log_energy;
}
if (opts_.htk_compat) { // reorder the features.
BaseFloat log_energy = (*feature)(0);
for (int32 i = 0; i < opts_.num_ceps-1; i++)
(*feature)(i) = (*feature)(i+1);
(*feature)(opts_.num_ceps-1) = log_energy;
}
}
} // namespace kaldi
// feat/feature-plp.h
// Copyright 2009-2011 Petr Motlicek; Karel Vesely
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_PLP_H_
#define KALDI_FEAT_FEATURE_PLP_H_
#include <map>
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
#include "util/options-itf.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// PlpOptions contains basic options for computing PLP features.
/// It only includes things that can be done in a "stateless" way, i.e.
/// it does not include energy max-normalization.
/// It does not include delta computation.
struct PlpOptions {
FrameExtractionOptions frame_opts;
MelBanksOptions mel_opts;
int32 lpc_order;
int32 num_ceps; // num cepstra including zero
bool use_energy; // use energy; else C0
BaseFloat energy_floor;
bool raw_energy; // If true, compute energy before preemphasis and windowing
BaseFloat compress_factor;
int32 cepstral_lifter;
BaseFloat cepstral_scale;
bool htk_compat; // if true, put energy/C0 last and introduce a factor of
// sqrt(2) on C0 to be the same as HTK.
PlpOptions() : mel_opts(23),
// default number of mel-banks for the PLP computation; this
// seems to be common for 16kHz-sampled data. For 8kHz-sampled
// data, 15 may be better.
lpc_order(12),
num_ceps(13),
use_energy(true),
energy_floor(0.0),
raw_energy(true),
compress_factor(0.33333),
cepstral_lifter(22),
cepstral_scale(1.0),
htk_compat(false) {}
void Register(OptionsItf *opts) {
frame_opts.Register(opts);
mel_opts.Register(opts);
opts->Register("lpc-order", &lpc_order,
"Order of LPC analysis in PLP computation");
opts->Register("num-ceps", &num_ceps,
"Number of cepstra in PLP computation (including C0)");
opts->Register("use-energy", &use_energy,
"Use energy (not C0) for zeroth PLP feature");
opts->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in PLP computation. "
"Only makes a difference if --use-energy=true; only necessary if "
"--dither=0.0. Suggested values: 0.1 or 1.0");
opts->Register("raw-energy", &raw_energy,
"If true, compute energy before preemphasis and windowing");
opts->Register("compress-factor", &compress_factor,
"Compression factor in PLP computation");
opts->Register("cepstral-lifter", &cepstral_lifter,
"Constant that controls scaling of PLPs");
opts->Register("cepstral-scale", &cepstral_scale,
"Scaling constant in PLP computation");
opts->Register("htk-compat", &htk_compat,
"If true, put energy or C0 last. Warning: not sufficient "
"to get HTK compatible features (need to change other "
"parameters).");
}
};
/// This is the new-style interface to the PLP computation.
class PlpComputer {
public:
typedef PlpOptions Options;
explicit PlpComputer(const PlpOptions &opts);
PlpComputer(const PlpComputer &other);
const FrameExtractionOptions &GetFrameOptions() const {
return opts_.frame_opts;
}
int32 Dim() const { return opts_.num_ceps; }
bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; }
/**
Function that computes one frame of features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp The VTLN warping factor that the user wants
to be applied when computing features for this utterance. Will
normally be 1.0, meaning no warping is to be done. The value will
be ignored for feature types that don't support VTLN, such as
spectrogram features.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
~PlpComputer();
private:
const MelBanks *GetMelBanks(BaseFloat vtln_warp);
const Vector<BaseFloat> *GetEqualLoudness(BaseFloat vtln_warp);
PlpOptions opts_;
Vector<BaseFloat> lifter_coeffs_;
Matrix<BaseFloat> idft_bases_;
BaseFloat log_energy_floor_;
std::map<BaseFloat, MelBanks*> mel_banks_; // BaseFloat is VTLN coefficient.
std::map<BaseFloat, Vector<BaseFloat>* > equal_loudness_;
SplitRadixRealFft<BaseFloat> *srfft_;
// temporary vector used inside Compute; size is opts_.mel_opts.num_bins + 2
Vector<BaseFloat> mel_energies_duplicated_;
// temporary vector used inside Compute; size is opts_.lpc_order + 1
Vector<BaseFloat> autocorr_coeffs_;
// temporary vector used inside Compute; size is opts_.lpc_order
Vector<BaseFloat> lpc_coeffs_;
// temporary vector used inside Compute; size is opts_.lpc_order
Vector<BaseFloat> raw_cepstrum_;
// Disallow assignment.
PlpComputer &operator =(const PlpComputer &other);
};
typedef OfflineFeatureTpl<PlpComputer> Plp;
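// Example (illustrative sketch): PLP extraction mirrors the MFCC usage, only
// the options type differs; note the constraint checked inside Compute():
// num_ceps <= lpc_order + 1.
//
//   PlpOptions plp_opts;  // defaults: lpc_order = 12, num_ceps = 13
//   Plp plp(plp_opts);
//   Matrix<BaseFloat> features;
//   plp.Compute(waveform, 1.0 /* vtln_warp */, &features);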
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_PLP_H_
// feat/feature-spectrogram.cc
// Copyright 2009-2012 Karel Vesely
// Copyright 2012 Navdeep Jaitly
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-spectrogram.h"
namespace kaldi {
SpectrogramComputer::SpectrogramComputer(const SpectrogramOptions &opts)
: opts_(opts), srfft_(NULL) {
if (opts.energy_floor > 0.0)
log_energy_floor_ = Log(opts.energy_floor);
int32 padded_window_size = opts.frame_opts.PaddedWindowSize();
if ((padded_window_size & (padded_window_size-1)) == 0) // Is a power of two
srfft_ = new SplitRadixRealFft<BaseFloat>(padded_window_size);
}
SpectrogramComputer::SpectrogramComputer(const SpectrogramComputer &other):
opts_(other.opts_), log_energy_floor_(other.log_energy_floor_), srfft_(NULL) {
if (other.srfft_ != NULL)
srfft_ = new SplitRadixRealFft<BaseFloat>(*other.srfft_);
}
SpectrogramComputer::~SpectrogramComputer() {
delete srfft_;
}
void SpectrogramComputer::Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature) {
KALDI_ASSERT(signal_frame->Dim() == opts_.frame_opts.PaddedWindowSize() &&
feature->Dim() == this->Dim());
// Compute energy after window function (not the raw one)
if (!opts_.raw_energy)
signal_raw_log_energy = Log(std::max<BaseFloat>(VecVec(*signal_frame, *signal_frame),
std::numeric_limits<float>::epsilon()));
if (srfft_ != NULL) // Compute FFT using split-radix algorithm.
srfft_->Compute(signal_frame->Data(), true);
else // An alternative algorithm that works for non-powers-of-two
RealFft(signal_frame, true);
// Convert the FFT into a power spectrum.
ComputePowerSpectrum(signal_frame);
SubVector<BaseFloat> power_spectrum(*signal_frame,
0, signal_frame->Dim() / 2 + 1);
power_spectrum.ApplyFloor(std::numeric_limits<float>::epsilon());
power_spectrum.ApplyLog();
feature->CopyFromVec(power_spectrum);
if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_)
signal_raw_log_energy = log_energy_floor_;
// The zeroth spectrogram component is always set to the signal energy,
// instead of the square of the constant component of the signal.
(*feature)(0) = signal_raw_log_energy;
}
} // namespace kaldi
// feat/feature-spectrogram.h
// Copyright 2009-2012 Karel Vesely
// Copyright 2012 Navdeep Jaitly
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_SPECTROGRAM_H_
#define KALDI_FEAT_FEATURE_SPECTROGRAM_H_
#include <string>
#include "feat/feature-common.h"
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/// SpectrogramOptions contains basic options for computing spectrogram
/// features.
struct SpectrogramOptions {
FrameExtractionOptions frame_opts;
BaseFloat energy_floor;
bool raw_energy; // If true, compute energy before preemphasis and windowing
SpectrogramOptions() :
energy_floor(0.0),
raw_energy(true) {}
void Register(OptionsItf *opts) {
frame_opts.Register(opts);
opts->Register("energy-floor", &energy_floor,
"Floor on energy (absolute, not relative) in Spectrogram "
"computation. Caution: this floor is applied to the zeroth "
"component, representing the total signal energy. The "
"floor on the individual spectrogram elements is fixed at "
"std::numeric_limits<float>::epsilon().");
opts->Register("raw-energy", &raw_energy,
"If true, compute energy before preemphasis and windowing");
}
};
/// Class for computing spectrogram features.
class SpectrogramComputer {
public:
typedef SpectrogramOptions Options;
explicit SpectrogramComputer(const SpectrogramOptions &opts);
SpectrogramComputer(const SpectrogramComputer &other);
const FrameExtractionOptions& GetFrameOptions() const {
return opts_.frame_opts;
}
int32 Dim() const { return opts_.frame_opts.PaddedWindowSize() / 2 + 1; }
bool NeedRawLogEnergy() const { return opts_.raw_energy; }
/**
Function that computes one frame of spectrogram features from
one frame of signal.
@param [in] signal_raw_log_energy The log-energy of the frame of the signal
prior to windowing and pre-emphasis, or
log(numeric_limits<float>::min()), whichever is greater. Must be
ignored by this function if this class returns false from
this->NeedRawLogEnergy().
@param [in] vtln_warp This is ignored by this function; it's only
needed for interface compatibility.
@param [in] signal_frame One frame of the signal,
as extracted using the function ExtractWindow() using the options
returned by this->GetFrameOptions(). The function will use the
vector as a workspace, which is why it's a non-const pointer.
@param [out] feature Pointer to a vector of size this->Dim(), to which
the computed feature will be written.
*/
void Compute(BaseFloat signal_raw_log_energy,
BaseFloat vtln_warp,
VectorBase<BaseFloat> *signal_frame,
VectorBase<BaseFloat> *feature);
~SpectrogramComputer();
private:
SpectrogramOptions opts_;
BaseFloat log_energy_floor_;
SplitRadixRealFft<BaseFloat> *srfft_;
// Disallow assignment.
SpectrogramComputer &operator=(const SpectrogramComputer &other);
};
typedef OfflineFeatureTpl<SpectrogramComputer> Spectrogram;
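// Worked numbers: with the default frame options (16 kHz sampling, 25 ms
// frames, rounded up to a 512-sample padded window), SpectrogramComputer::Dim()
// is 512 / 2 + 1 == 257 log-spectrum values per frame, with component 0
// holding the (possibly floored) log energy.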
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_SPECTROGRAM_H_
// feat/feature-window.cc
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Microsoft Corporation
// 2013-2016 Johns Hopkins University (author: Daniel Povey)
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/feature-window.h"
#include "matrix/matrix-functions.h"
namespace kaldi {
int64 FirstSampleOfFrame(int32 frame,
const FrameExtractionOptions &opts) {
int64 frame_shift = opts.WindowShift();
if (opts.snip_edges) {
return frame * frame_shift;
} else {
int64 midpoint_of_frame = frame_shift * frame + frame_shift / 2,
beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2;
return beginning_of_frame;
}
}
int32 NumFrames(int64 num_samples,
const FrameExtractionOptions &opts,
bool flush) {
int64 frame_shift = opts.WindowShift();
int64 frame_length = opts.WindowSize();
if (opts.snip_edges) {
// with --snip-edges=true (the default), we use a HTK-like approach to
// determining the number of frames-- all frames have to fit completely into
// the waveform, and the first frame begins at sample zero.
if (num_samples < frame_length)
return 0;
else
return (1 + ((num_samples - frame_length) / frame_shift));
// You can understand the expression above as follows: 'num_samples -
// frame_length' is how much room we have to shift the frame within the
// waveform; 'frame_shift' is how much we shift it each time; and the ratio
// is how many times we can shift it (integer arithmetic rounds down).
} else {
// if --snip-edges=false, the number of frames is determined by rounding the
// (file-length / frame-shift) to the nearest integer. The point of this
// formula is to make the number of frames an obvious and predictable
// function of the frame shift and signal length, which makes many
// segmentation-related questions simpler.
//
// Because integer division in C++ rounds toward zero, we add (half the
// frame-shift minus epsilon) before dividing, to have the effect of
// rounding towards the closest integer.
int32 num_frames = (num_samples + (frame_shift / 2)) / frame_shift;
if (flush)
return num_frames;
// note: 'end' always means the last plus one, i.e. one past the last.
int64 end_sample_of_last_frame = FirstSampleOfFrame(num_frames - 1, opts)
+ frame_length;
// the following code is optimized more for clarity than efficiency.
// If flush == false, we can't output frames that extend past the end
// of the signal.
while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
num_frames--;
end_sample_of_last_frame -= frame_shift;
}
return num_frames;
}
}
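// Worked example for NumFrames(): at 16 kHz with 25 ms frames and 10 ms shift,
// frame_length == 400 and frame_shift == 160 samples. For num_samples == 16000:
//   snip_edges == true:  1 + (16000 - 400) / 160 == 98 frames;
//   snip_edges == false, flush == true:  (16000 + 80) / 160 == 100 frames.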
void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value) {
if (dither_value == 0.0)
return;
int32 dim = waveform->Dim();
BaseFloat *data = waveform->Data();
RandomState rstate;
for (int32 i = 0; i < dim; i++)
data[i] += RandGauss(&rstate) * dither_value;
}
void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff) {
if (preemph_coeff == 0.0) return;
KALDI_ASSERT(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
for (int32 i = waveform->Dim()-1; i > 0; i--)
(*waveform)(i) -= preemph_coeff * (*waveform)(i-1);
(*waveform)(0) -= preemph_coeff * (*waveform)(0);
}
FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts) {
int32 frame_length = opts.WindowSize();
KALDI_ASSERT(frame_length > 0);
window.Resize(frame_length);
double a = M_2PI / (frame_length-1);
for (int32 i = 0; i < frame_length; i++) {
double i_fl = static_cast<double>(i);
if (opts.window_type == "hanning") {
window(i) = 0.5 - 0.5*cos(a * i_fl);
} else if (opts.window_type == "hamming") {
window(i) = 0.54 - 0.46*cos(a * i_fl);
} else if (opts.window_type == "povey") { // like hamming but goes to zero at edges.
window(i) = pow(0.5 - 0.5*cos(a * i_fl), 0.85);
} else if (opts.window_type == "rectangular") {
window(i) = 1.0;
} else if (opts.window_type == "blackman") {
window(i) = opts.blackman_coeff - 0.5*cos(a * i_fl) +
(0.5 - opts.blackman_coeff) * cos(2 * a * i_fl);
} else {
KALDI_ERR << "Invalid window type " << opts.window_type;
}
}
}
void ProcessWindow(const FrameExtractionOptions &opts,
const FeatureWindowFunction &window_function,
VectorBase<BaseFloat> *window,
BaseFloat *log_energy_pre_window) {
int32 frame_length = opts.WindowSize();
KALDI_ASSERT(window->Dim() == frame_length);
if (opts.dither != 0.0)
Dither(window, opts.dither);
if (opts.remove_dc_offset)
window->Add(-window->Sum() / frame_length);
if (log_energy_pre_window != NULL) {
BaseFloat energy = std::max<BaseFloat>(VecVec(*window, *window),
std::numeric_limits<float>::epsilon());
*log_energy_pre_window = Log(energy);
}
if (opts.preemph_coeff != 0.0)
Preemphasize(window, opts.preemph_coeff);
window->MulElements(window_function.window);
}
// ExtractWindow extracts a windowed frame of waveform with a power-of-two,
// padded size. It does mean subtraction, pre-emphasis and dithering as
// requested.
void ExtractWindow(int64 sample_offset,
const VectorBase<BaseFloat> &wave,
int32 f, // with 0 <= f < NumFrames(sample_offset + wave.Dim(), opts)
const FrameExtractionOptions &opts,
const FeatureWindowFunction &window_function,
Vector<BaseFloat> *window,
BaseFloat *log_energy_pre_window) {
KALDI_ASSERT(sample_offset >= 0 && wave.Dim() != 0);
int32 frame_length = opts.WindowSize(),
frame_length_padded = opts.PaddedWindowSize();
int64 num_samples = sample_offset + wave.Dim(),
start_sample = FirstSampleOfFrame(f, opts),
end_sample = start_sample + frame_length;
if (opts.snip_edges) {
KALDI_ASSERT(start_sample >= sample_offset &&
end_sample <= num_samples);
} else {
KALDI_ASSERT(sample_offset == 0 || start_sample >= sample_offset);
}
if (window->Dim() != frame_length_padded)
window->Resize(frame_length_padded, kUndefined);
// wave_start and wave_end are start and end indexes into 'wave', for the
// piece of wave that we're trying to extract.
int32 wave_start = int32(start_sample - sample_offset),
wave_end = wave_start + frame_length;
if (wave_start >= 0 && wave_end <= wave.Dim()) {
// the normal case-- no edge effects to consider.
window->Range(0, frame_length).CopyFromVec(
wave.Range(wave_start, frame_length));
} else {
// Deal with any end effects by reflection, if needed. This code will only
// be reached for about two frames per utterance, so we don't concern
// ourselves excessively with efficiency.
int32 wave_dim = wave.Dim();
for (int32 s = 0; s < frame_length; s++) {
int32 s_in_wave = s + wave_start;
while (s_in_wave < 0 || s_in_wave >= wave_dim) {
// reflect around the beginning or end of the wave.
// e.g. -1 -> 0, -2 -> 1.
// dim -> dim - 1, dim + 1 -> dim - 2.
// the code supports repeated reflections, although this
// would only be needed in pathological cases.
if (s_in_wave < 0) s_in_wave = - s_in_wave - 1;
else s_in_wave = 2 * wave_dim - 1 - s_in_wave;
}
(*window)(s) = wave(s_in_wave);
}
}
if (frame_length_padded > frame_length)
window->Range(frame_length, frame_length_padded - frame_length).SetZero();
SubVector<BaseFloat> frame(*window, 0, frame_length);
ProcessWindow(opts, window_function, &frame, log_energy_pre_window);
}
} // namespace kaldi
// feat/feature-window.h
// Copyright 2009-2011 Karel Vesely; Petr Motlicek; Saarland University
// 2014-2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_FEATURE_WINDOW_H_
#define KALDI_FEAT_FEATURE_WINDOW_H_
#include <map>
#include <string>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
struct FrameExtractionOptions {
BaseFloat samp_freq;
BaseFloat frame_shift_ms; // in milliseconds.
BaseFloat frame_length_ms; // in milliseconds.
BaseFloat dither; // Amount of dithering, 0.0 means no dither.
BaseFloat preemph_coeff; // Preemphasis coefficient.
bool remove_dc_offset; // Subtract mean of wave before FFT.
std::string window_type; // e.g. Hamming window
// May be "hamming", "rectangular", "povey", "hanning", "blackman"
// "povey" is a window I made to be similar to Hamming but to go to zero at the
// edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85)
// I just don't think the Hamming window makes sense as a windowing function.
bool round_to_power_of_two;
BaseFloat blackman_coeff;
bool snip_edges;
bool allow_downsample;
bool allow_upsample;
int max_feature_vectors;
FrameExtractionOptions():
samp_freq(16000),
frame_shift_ms(10.0),
frame_length_ms(25.0),
dither(1.0),
preemph_coeff(0.97),
remove_dc_offset(true),
window_type("povey"),
round_to_power_of_two(true),
blackman_coeff(0.42),
snip_edges(true),
allow_downsample(false),
allow_upsample(false),
max_feature_vectors(-1)
{ }
void Register(OptionsItf *opts) {
opts->Register("sample-frequency", &samp_freq,
"Waveform data sample frequency (must match the waveform file, "
"if specified there)");
opts->Register("frame-length", &frame_length_ms, "Frame length in milliseconds");
opts->Register("frame-shift", &frame_shift_ms, "Frame shift in milliseconds");
opts->Register("preemphasis-coefficient", &preemph_coeff,
"Coefficient for use in signal preemphasis");
opts->Register("remove-dc-offset", &remove_dc_offset,
"Subtract mean from waveform on each frame");
opts->Register("dither", &dither, "Dithering constant (0.0 means no dither). "
"If you turn this off, you should set the --energy-floor "
"option, e.g. to 1.0 or 0.1");
opts->Register("window-type", &window_type, "Type of window "
"(\"hamming\"|\"hanning\"|\"povey\"|\"rectangular\""
"|\"blackmann\")");
opts->Register("blackman-coeff", &blackman_coeff,
"Constant coefficient for generalized Blackman window.");
opts->Register("round-to-power-of-two", &round_to_power_of_two,
"If true, round window size to power of two by zero-padding "
"input to FFT.");
opts->Register("snip-edges", &snip_edges,
"If true, end effects will be handled by outputting only frames that "
"completely fit in the file, and the number of frames depends on the "
"frame-length. If false, the number of frames depends only on the "
"frame-shift, and we reflect the data at the ends.");
opts->Register("allow-downsample", &allow_downsample,
"If true, allow the input waveform to have a higher frequency than "
"the specified --sample-frequency (and we'll downsample).");
opts->Register("max-feature-vectors", &max_feature_vectors,
"Memory optimization. If larger than 0, periodically remove feature "
"vectors so that only this number of the latest feature vectors is "
"retained.");
opts->Register("allow-upsample", &allow_upsample,
"If true, allow the input waveform to have a lower frequency than "
"the specified --sample-frequency (and we'll upsample).");
}
int32 WindowShift() const {
return static_cast<int32>(samp_freq * 0.001 * frame_shift_ms);
}
int32 WindowSize() const {
return static_cast<int32>(samp_freq * 0.001 * frame_length_ms);
}
int32 PaddedWindowSize() const {
return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize()) :
WindowSize());
}
};
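// Worked numbers for the accessors above: with the defaults (samp_freq 16000,
// frame_shift_ms 10, frame_length_ms 25, round_to_power_of_two true),
// WindowShift() == 160, WindowSize() == 400 and PaddedWindowSize() == 512.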
struct FeatureWindowFunction {
FeatureWindowFunction() {}
explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
FeatureWindowFunction(const FeatureWindowFunction &other):
window(other.window) { }
Vector<BaseFloat> window;
};
/**
This function returns the number of frames that we can extract from a wave
file with the given number of samples in it (assumed to have the same
sampling rate as specified in 'opts').
@param [in] num_samples The number of samples in the wave file.
@param [in] opts The frame-extraction options class
@param [in] flush True if we are asserting that this number of samples is
'all there is', false if we are expecting more data to possibly come
in. This only makes a difference to the answer if opts.snip_edges
== false. For offline feature extraction you always want flush ==
true. In an online-decoding context, once you know (or decide) that
no more data is coming in, you'd call it with flush == true at the
end to flush out any remaining data.
*/
int32 NumFrames(int64 num_samples,
const FrameExtractionOptions &opts,
bool flush = true);
/*
This function returns the index of the first sample of the frame indexed
'frame'. If snip-edges=true, it just returns frame * opts.WindowShift(); if
snip-edges=false, the formula is a little more complicated and the result may
be negative.
*/
int64 FirstSampleOfFrame(int32 frame,
const FrameExtractionOptions &opts);
void Dither(VectorBase<BaseFloat> *waveform, BaseFloat dither_value);
void Preemphasize(VectorBase<BaseFloat> *waveform, BaseFloat preemph_coeff);
/**
This function does all the windowing steps after actually
extracting the windowed signal: depending on the
configuration, it does dithering, dc offset removal,
preemphasis, and multiplication by the windowing function.
@param [in] opts The options class to be used
@param [in] window_function The windowing function-- should have
been initialized using 'opts'.
@param [in,out] window A vector of size opts.WindowSize(). Note:
it will typically be a sub-vector of a larger vector of size
opts.PaddedWindowSize(), with the remaining samples zero,
as the FFT code is more efficient if it operates on data with
power-of-two size.
@param [out] log_energy_pre_window If non-NULL, then after dithering and
DC offset removal, this function will write to this pointer the log of
the total energy (i.e. sum-squared) of the frame.
*/
void ProcessWindow(const FrameExtractionOptions &opts,
const FeatureWindowFunction &window_function,
VectorBase<BaseFloat> *window,
BaseFloat *log_energy_pre_window = NULL);
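// Example (illustrative sketch): windowing a single frame by hand; normally
// ExtractWindow() below does this for you. The raw samples are assumed to have
// been copied into the first WindowSize() entries of 'window' already.
//
//   FrameExtractionOptions frame_opts;
//   FeatureWindowFunction window_fn(frame_opts);
//   Vector<BaseFloat> window(frame_opts.PaddedWindowSize());
//   SubVector<BaseFloat> frame(window, 0, frame_opts.WindowSize());
//   BaseFloat raw_log_energy;
//   ProcessWindow(frame_opts, window_fn, &frame, &raw_log_energy);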
/*
ExtractWindow() extracts a windowed frame of waveform (possibly with a
power-of-two, padded size, depending on the config), including all the
processing done by ProcessWindow().
@param [in] sample_offset If 'wave' is not the entire waveform, but
part of it to the left has been discarded, then the
number of samples prior to 'wave' that we have
already discarded. Set this to zero if you are
processing the entire waveform in one piece, or
if you get 'no matching function' compilation
errors when updating the code.
@param [in] wave The waveform
@param [in] f The frame index to be extracted, with
0 <= f < NumFrames(sample_offset + wave.Dim(), opts, true)
@param [in] opts The options class to be used
@param [in] window_function The windowing function, as derived from the
options class.
@param [out] window The windowed, possibly-padded waveform to be
extracted. Will be resized as needed.
@param [out] log_energy_pre_window If non-NULL, the log-energy of
the signal prior to pre-emphasis and multiplying by
the windowing function will be written to here.
*/
void ExtractWindow(int64 sample_offset,
const VectorBase<BaseFloat> &wave,
int32 f,
const FrameExtractionOptions &opts,
const FeatureWindowFunction &window_function,
Vector<BaseFloat> *window,
BaseFloat *log_energy_pre_window = NULL);
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_FEATURE_WINDOW_H_
// feat/mel-computations.cc
// Copyright 2009-2011 Phonexia s.r.o.; Karel Vesely; Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <algorithm>
#include <iostream>
#include "feat/feature-functions.h"
#include "feat/feature-window.h"
#include "feat/mel-computations.h"
namespace kaldi {
MelBanks::MelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
BaseFloat vtln_warp_factor):
htk_mode_(opts.htk_mode) {
int32 num_bins = opts.num_bins;
if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins";
BaseFloat sample_freq = frame_opts.samp_freq;
int32 window_length_padded = frame_opts.PaddedWindowSize();
KALDI_ASSERT(window_length_padded % 2 == 0);
int32 num_fft_bins = window_length_padded / 2;
BaseFloat nyquist = 0.5 * sample_freq;
BaseFloat low_freq = opts.low_freq, high_freq;
if (opts.high_freq > 0.0)
high_freq = opts.high_freq;
else
high_freq = nyquist + opts.high_freq;
if (low_freq < 0.0 || low_freq >= nyquist
|| high_freq <= 0.0 || high_freq > nyquist
|| high_freq <= low_freq)
KALDI_ERR << "Bad values in options: low-freq " << low_freq
<< " and high-freq " << high_freq << " vs. nyquist "
<< nyquist;
BaseFloat fft_bin_width = sample_freq / window_length_padded;
// fft-bin width [think of it as Nyquist-freq / half-window-length]
BaseFloat mel_low_freq = MelScale(low_freq);
BaseFloat mel_high_freq = MelScale(high_freq);
debug_ = opts.debug_mel;
// divide by num_bins+1 in next line because of end-effects where the bins
// spread out to the sides.
BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1);
BaseFloat vtln_low = opts.vtln_low,
vtln_high = opts.vtln_high;
if (vtln_high < 0.0) {
vtln_high += nyquist;
}
if (vtln_warp_factor != 1.0 &&
(vtln_low < 0.0 || vtln_low <= low_freq
|| vtln_low >= high_freq
|| vtln_high <= 0.0 || vtln_high >= high_freq
|| vtln_high <= vtln_low))
KALDI_ERR << "Bad values in options: vtln-low " << vtln_low
<< " and vtln-high " << vtln_high << ", versus "
<< "low-freq " << low_freq << " and high-freq "
<< high_freq;
bins_.resize(num_bins);
center_freqs_.Resize(num_bins);
for (int32 bin = 0; bin < num_bins; bin++) {
BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta,
center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;
if (vtln_warp_factor != 1.0) {
left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
vtln_warp_factor, left_mel);
center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
vtln_warp_factor, center_mel);
right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
vtln_warp_factor, right_mel);
}
center_freqs_(bin) = InverseMelScale(center_mel);
// this_bin will be a vector of coefficients that is only
// nonzero where this mel bin is active.
Vector<BaseFloat> this_bin(num_fft_bins);
int32 first_index = -1, last_index = -1;
for (int32 i = 0; i < num_fft_bins; i++) {
BaseFloat freq = (fft_bin_width * i); // Center frequency of this fft
// bin.
BaseFloat mel = MelScale(freq);
if (mel > left_mel && mel < right_mel) {
BaseFloat weight;
if (mel <= center_mel)
weight = (mel - left_mel) / (center_mel - left_mel);
else
weight = (right_mel-mel) / (right_mel-center_mel);
this_bin(i) = weight;
if (first_index == -1)
first_index = i;
last_index = i;
}
}
//KALDI_ASSERT(first_index != -1 && last_index >= first_index
// && "You may have set --num-mel-bins too large.");
bins_[bin].first = first_index;
int32 size = last_index + 1 - first_index;
bins_[bin].second.Resize(size);
bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size));
// Replicate a bug in HTK, for testing purposes.
if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0)
bins_[bin].second(0) = 0.0;
}
if (debug_) {
for (size_t i = 0; i < bins_.size(); i++) {
KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first
<< ", vec = " << bins_[i].second;
}
}
}
MelBanks::MelBanks(const MelBanks &other):
center_freqs_(other.center_freqs_),
bins_(other.bins_),
debug_(other.debug_),
htk_mode_(other.htk_mode_) { }
BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
BaseFloat vtln_high_cutoff,
BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation
BaseFloat high_freq,
BaseFloat vtln_warp_factor,
BaseFloat freq) {
/// This computes a VTLN warping function that is not the same as HTK's one,
/// but has similar inputs (this function has the advantage of never producing
/// empty bins).
/// This function computes a warp function F(freq), defined between low_freq and
/// high_freq inclusive, with the following properties:
/// F(low_freq) == low_freq
/// F(high_freq) == high_freq
/// The function is continuous and piecewise linear with two inflection
/// points.
/// The lower inflection point (measured in terms of the unwarped
/// frequency) is at frequency l, determined as described below.
/// The higher inflection point is at a frequency h, determined as
/// described below.
/// If l <= f <= h, then F(f) = f/vtln_warp_factor.
/// If the higher inflection point (measured in terms of the unwarped
/// frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
/// Since (by the last point) F(h) == h/vtln_warp_factor, then
/// max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
/// h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
/// = vtln_high_cutoff * min(1, vtln_warp_factor).
/// If the lower inflection point (measured in terms of the unwarped
/// frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
/// This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
/// = vtln_low_cutoff * max(1, vtln_warp_factor)
if (freq < low_freq || freq > high_freq) return freq; // in case this gets called
// for out-of-range frequencies, just return the freq.
KALDI_ASSERT(vtln_low_cutoff > low_freq &&
"be sure to set the --vtln-low option higher than --low-freq");
KALDI_ASSERT(vtln_high_cutoff < high_freq &&
"be sure to set the --vtln-high option lower than --high-freq [or negative]");
BaseFloat one = 1.0;
BaseFloat l = vtln_low_cutoff * std::max(one, vtln_warp_factor);
BaseFloat h = vtln_high_cutoff * std::min(one, vtln_warp_factor);
BaseFloat scale = 1.0 / vtln_warp_factor;
BaseFloat Fl = scale * l; // F(l);
BaseFloat Fh = scale * h; // F(h);
KALDI_ASSERT(l > low_freq && h < high_freq);
// slope of left part of the 3-piece linear function
BaseFloat scale_left = (Fl - low_freq) / (l - low_freq);
// [slope of center part is just "scale"]
// slope of right part of the 3-piece linear function
BaseFloat scale_right = (high_freq - Fh) / (high_freq - h);
if (freq < l) {
return low_freq + scale_left * (freq - low_freq);
} else if (freq < h) {
return scale * freq;
} else { // freq >= h
return high_freq + scale_right * (freq - high_freq);
}
}
BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
BaseFloat vtln_high_cutoff,
BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation
BaseFloat high_freq,
BaseFloat vtln_warp_factor,
BaseFloat mel_freq) {
return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
low_freq, high_freq,
vtln_warp_factor, InverseMelScale(mel_freq)));
}
// "power_spectrum" contains fft energies.
void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
VectorBase<BaseFloat> *mel_energies_out) const {
int32 num_bins = bins_.size();
KALDI_ASSERT(mel_energies_out->Dim() == num_bins);
for (int32 i = 0; i < num_bins; i++) {
int32 offset = bins_[i].first;
const Vector<BaseFloat> &v(bins_[i].second);
BaseFloat energy = VecVec(v, power_spectrum.Range(offset, v.Dim()));
// HTK-like flooring- for testing purposes (we prefer dither)
if (htk_mode_ && energy < 1.0) energy = 1.0;
(*mel_energies_out)(i) = energy;
// The following assert was added due to a problem with OpenBlas that
// we had at one point (it was a bug in that library). Just to detect
// it early.
KALDI_ASSERT(!KALDI_ISNAN((*mel_energies_out)(i)));
}
if (debug_) {
fprintf(stderr, "MEL BANKS:\n");
for (int32 i = 0; i < num_bins; i++)
fprintf(stderr, " %f", (*mel_energies_out)(i));
fprintf(stderr, "\n");
}
}
void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs) {
// Compute liftering coefficients (scaling on cepstral coeffs)
// coeffs are numbered slightly differently from HTK: the zeroth
// index is C0, which is not affected.
for (int32 i = 0; i < coeffs->Dim(); i++)
(*coeffs)(i) = 1.0 + 0.5 * Q * sin (M_PI * i / Q);
}
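// Worked numbers for the liftering formula above: with the default Q == 22
// (--cepstral-lifter), coefficient 0 is 1.0 (so C0 is unaffected) and the
// peak is at i == 11, where 1 + 0.5 * 22 * sin(M_PI / 2) == 12.0.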
// Durbin's recursion - converts autocorrelation coefficients to LPC coefficients
// pTmp - temporary workspace [n]
// pAC - autocorrelation coefficients [n + 1]
// pLP - linear prediction coefficients [n] (predicted s_n = sum_{i=1}^P a[i-1] * s[n-i])
// F(z) = 1 / (1 - A(z)); the leading 1 is not stored in the denominator
BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp) {
BaseFloat ki; // reflection coefficient
int i;
int j;
BaseFloat E = pAC[0];
for (i = 0; i < n; i++) {
// next reflection coefficient
ki = pAC[i + 1];
for (j = 0; j < i; j++)
ki += pLP[j] * pAC[i - j];
ki = ki / E;
// new error
BaseFloat c = 1 - ki * ki;
if (c < 1.0e-5) // guard against NaNs for a constant signal
c = 1.0e-5;
E *= c;
// new LP coefficients
pTmp[i] = -ki;
for (j = 0; j < i; j++)
pTmp[j] = pLP[j] - ki * pLP[i - j - 1];
for (j = 0; j <= i; j++)
pLP[j] = pTmp[j];
}
return E;
}
void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst) {
for (int32 i = 0; i < n; i++) {
double sum = 0.0;
int j;
for (j = 0; j < i; j++) {
sum += static_cast<BaseFloat>(i - j) * pLPC[j] * pCepst[i - j - 1];
}
pCepst[i] = -pLPC[i] - sum / static_cast<BaseFloat>(i + 1);
}
}
void GetEqualLoudnessVector(const MelBanks &mel_banks,
Vector<BaseFloat> *ans) {
int32 n = mel_banks.NumBins();
// Central frequency of each mel bin.
const Vector<BaseFloat> &f0 = mel_banks.GetCenterFreqs();
ans->Resize(n);
for (int32 i = 0; i < n; i++) {
BaseFloat fsq = f0(i) * f0(i);
BaseFloat fsub = fsq / (fsq + 1.6e5);
(*ans)(i) = fsub * fsub * ((fsq + 1.44e6) / (fsq + 9.61e6));
}
}
// Compute LP coefficients from autocorrelation coefficients.
BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
Vector<BaseFloat> *lpc_out) {
int32 n = autocorr_in.Dim() - 1;
KALDI_ASSERT(lpc_out->Dim() == n);
Vector<BaseFloat> tmp(n);
BaseFloat ans = Durbin(n, autocorr_in.Data(),
lpc_out->Data(),
tmp.Data());
if (ans <= 0.0)
KALDI_WARN << "Zero energy in LPC computation";
return -Log(1.0 / ans); // forms the C0 value
}
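// A minimal usage sketch (illustrative; "autocorr" is assumed to hold the
// p + 1 autocorrelation coefficients r[0..p] of one frame):
//
//   int32 p = autocorr.Dim() - 1;
//   Vector<BaseFloat> lpc(p), cepstrum(p);
//   BaseFloat c0 = ComputeLpc(autocorr, &lpc);     // residual log-energy -> C0.
//   Lpc2Cepstrum(p, lpc.Data(), cepstrum.Data());  // LPC -> LPC-cepstra.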
} // namespace kaldi
// feat/mel-computations.h
// Copyright 2009-2011 Phonexia s.r.o.; Microsoft Corporation
// 2016 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_MEL_COMPUTATIONS_H_
#define KALDI_FEAT_MEL_COMPUTATIONS_H_
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <utility>
#include <vector>
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "matrix/matrix-lib.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
struct FrameExtractionOptions; // defined in feature-window.h
struct MelBanksOptions {
int32 num_bins; // e.g. 25; number of triangular bins
BaseFloat low_freq; // e.g. 20; lower frequency cutoff
BaseFloat high_freq; // an upper frequency cutoff; 0 -> no cutoff, negative
// ->added to the Nyquist frequency to get the cutoff.
BaseFloat vtln_low; // vtln lower cutoff of warping function.
BaseFloat vtln_high; // vtln upper cutoff of warping function: if negative, added
// to the Nyquist frequency to get the cutoff.
bool debug_mel;
// htk_mode is a "hidden" config, it does not show up on command line.
// Enables more exact compatibility with HTK, for testing purposes. Affects
// mel-energy flooring and reproduces a bug in HTK.
bool htk_mode;
explicit MelBanksOptions(int num_bins = 25)
: num_bins(num_bins), low_freq(20), high_freq(0), vtln_low(100),
vtln_high(-500), debug_mel(false), htk_mode(false) {}
void Register(OptionsItf *opts) {
opts->Register("num-mel-bins", &num_bins,
"Number of triangular mel-frequency bins");
opts->Register("low-freq", &low_freq,
"Low cutoff frequency for mel bins");
opts->Register("high-freq", &high_freq,
"High cutoff frequency for mel bins (if <= 0, offset from Nyquist)");
opts->Register("vtln-low", &vtln_low,
"Low inflection point in piecewise linear VTLN warping function");
opts->Register("vtln-high", &vtln_high,
"High inflection point in piecewise linear VTLN warping function"
" (if negative, offset from high-mel-freq");
opts->Register("debug-mel", &debug_mel,
"Print out debugging information for mel bin computation");
}
};
class MelBanks {
public:
static inline BaseFloat InverseMelScale(BaseFloat mel_freq) {
return 700.0f * (expf (mel_freq / 1127.0f) - 1.0f);
}
static inline BaseFloat MelScale(BaseFloat freq) {
return 1127.0f * logf (1.0f + freq / 700.0f);
}
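// Worked numbers for the two functions above (approximate):
// MelScale(1000.0f) = 1127 * log(1 + 1000/700) ~= 1000.0 mel, and
// InverseMelScale(MelScale(f)) recovers f, so the two are inverses
// up to rounding.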
static BaseFloat VtlnWarpFreq(BaseFloat vtln_low_cutoff,
BaseFloat vtln_high_cutoff, // discontinuities in warp func
BaseFloat low_freq,
BaseFloat high_freq, // upper+lower frequency cutoffs in
// the mel computation
BaseFloat vtln_warp_factor,
BaseFloat freq);
static BaseFloat VtlnWarpMelFreq(BaseFloat vtln_low_cutoff,
BaseFloat vtln_high_cutoff,
BaseFloat low_freq,
BaseFloat high_freq,
BaseFloat vtln_warp_factor,
BaseFloat mel_freq);
MelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
BaseFloat vtln_warp_factor);
/// Compute Mel energies (note: not log energies).
/// At input, "fft_energies" contains the FFT energies (not log).
void Compute(const VectorBase<BaseFloat> &fft_energies,
VectorBase<BaseFloat> *mel_energies_out) const;
int32 NumBins() const { return bins_.size(); }
// returns vector of central freq of each bin; needed by plp code.
const Vector<BaseFloat> &GetCenterFreqs() const { return center_freqs_; }
const std::vector<std::pair<int32, Vector<BaseFloat> > >& GetBins() const {
return bins_;
}
// Copy constructor
MelBanks(const MelBanks &other);
private:
// Disallow assignment
MelBanks &operator = (const MelBanks &other);
// center frequencies of bins, numbered from 0 ... num_bins-1.
// Needed by GetCenterFreqs().
Vector<BaseFloat> center_freqs_;
// the "bins_" vector is a vector, one for each bin, of a pair:
// (the first nonzero fft-bin), (the vector of weights).
std::vector<std::pair<int32, Vector<BaseFloat> > > bins_;
bool debug_;
bool htk_mode_;
};
// Compute liftering coefficients (scaling on cepstral coeffs)
// coeffs are numbered slightly differently from HTK: the zeroth
// index is C0, which is not affected.
void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs);
// Durbin's recursion - converts autocorrelation coefficients to the LPC
// pTmp - temporary workspace [n]
// pAC - autocorrelation coefficients [n + 1]
// pLP - linear prediction coefficients [n] (predicted_sn = sum_1^P{a[i-1] * s[n-i]})
// F(z) = 1 / (1 - A(z)), 1 is not stored in the denominator
// Returns log energy of residual (I think)
BaseFloat Durbin(int n, const BaseFloat *pAC, BaseFloat *pLP, BaseFloat *pTmp);
// Compute LP coefficients from autocorrelation coefficients.
// Returns log energy of residual (I think)
BaseFloat ComputeLpc(const VectorBase<BaseFloat> &autocorr_in,
Vector<BaseFloat> *lpc_out);
void Lpc2Cepstrum(int n, const BaseFloat *pLPC, BaseFloat *pCepst);
void GetEqualLoudnessVector(const MelBanks &mel_banks,
Vector<BaseFloat> *ans);
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_MEL_COMPUTATIONS_H_
// feat/online-feature-itf.h
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_ONLINE_FEATURE_ITF_H_
#define KALDI_FEAT_ONLINE_FEATURE_ITF_H_ 1
#include "base/kaldi-common.h"
#include "matrix/matrix-lib.h"
namespace kaldi {
/// @ingroup Interfaces
/// @{
/**
OnlineFeatureInterface is an interface for online feature processing (it is
also usable in the offline setting, but currently we're not using it for
that). This is for use in the online2/ directory, and it supersedes the
interface in ../online/online-feat-input.h. We have a slightly different
model that puts more control in the hands of the calling thread, and won't
involve waiting on semaphores in the decoding thread.
This interface only specifies how the object *outputs* the features.
How it obtains the features, e.g. from a previous object or objects of type
OnlineFeatureInterface, is not specified in the interface and you will
likely define new constructors or methods in the derived type to do that.
You should appreciate that this interface is designed to allow random
access to features, as long as they are ready. That is, the user
can call GetFrame for any frame less than NumFramesReady(), and when
implementing a child class you must not make assumptions about the
order in which the user makes these calls.
*/
class OnlineFeatureInterface {
public:
virtual int32 Dim() const = 0; /// returns the feature dimension.
/// Returns the total number of frames, since the start of the utterance, that
/// are now available. In an online-decoding context, this will likely
/// increase with time as more data becomes available.
virtual int32 NumFramesReady() const = 0;
/// Returns true if this is the last frame. Frame indices are zero-based, so the
/// first frame is zero. IsLastFrame(-1) will return false, unless the file
/// is empty (which is a case that I'm not sure all the code will handle, so
/// be careful). This function may return false for some frame if
/// we haven't yet decided to terminate decoding, but later true if we decide
/// to terminate decoding. This function exists mainly to correctly handle
/// end effects in feature extraction, and is not a mechanism to determine how
/// many frames are in the decodable object (as it used to be, and for backward
/// compatibility, still is, in the Decodable interface).
virtual bool IsLastFrame(int32 frame) const = 0;
/// Gets the feature vector for this frame. Before calling this for a given
/// frame, it is assumed that you called NumFramesReady() and it returned a
/// number greater than "frame". Otherwise this call will likely crash with
/// an assert failure. This function is not declared const, in case there is
/// some kind of caching going on, but most of the time it shouldn't modify
/// the class.
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) = 0;
/// This is like GetFrame() but for a collection of frames. There is a
/// default implementation that just gets the frames one by one, but it
/// may be overridden for efficiency by child classes (since sometimes
/// it's more efficient to do things in a batch).
virtual void GetFrames(const std::vector<int32> &frames,
MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(static_cast<int32>(frames.size()) == feats->NumRows());
for (size_t i = 0; i < frames.size(); i++) {
SubVector<BaseFloat> feat(*feats, i);
GetFrame(frames[i], &feat);
}
}
// Returns frame shift in seconds. Helps to estimate duration from frame
// counts.
virtual BaseFloat FrameShiftInSeconds() const = 0;
/// Virtual destructor. Note: constructors that take another member of
/// type OnlineFeatureInterface are not expected to take ownership of
/// that pointer; the caller needs to keep track of that manually.
virtual ~OnlineFeatureInterface() { }
};
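/// A minimal consumption-loop sketch (illustrative; "feats" stands for any
/// OnlineFeatureInterface implementation):
/// \code
///   Vector<BaseFloat> frame(feats->Dim());
///   for (int32 t = 0; t < feats->NumFramesReady(); t++) {
///     feats->GetFrame(t, &frame);
///     // ... consume "frame" ...
///   }
/// \endcode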
/// Add a virtual class for "source" features such as MFCC or PLP or pitch
/// features.
class OnlineBaseFeature: public OnlineFeatureInterface {
public:
/// This would be called from the application, when you get more wave data.
/// Note: the sampling_rate is typically only provided so the code can assert
/// that it matches the sampling rate expected in the options.
virtual void AcceptWaveform(BaseFloat sampling_rate,
const VectorBase<BaseFloat> &waveform) = 0;
/// InputFinished() tells the class you won't be providing any
/// more waveform. This will help flush out the last few frames
/// of delta or LDA features (it will typically affect the return value
/// of IsLastFrame()).
virtual void InputFinished() = 0;
};
/// @}
} // namespace kaldi
#endif // KALDI_FEAT_ONLINE_FEATURE_ITF_H_
// feat/online-feature.cc
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// 2014 Yanqing Sun, Junjie Wang,
// Daniel Povey, Korbinian Riedhammer
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "feat/online-feature.h"
#include "transform/cmvn.h"
namespace kaldi {
RecyclingVector::RecyclingVector(int items_to_hold):
items_to_hold_(items_to_hold == 0 ? -1 : items_to_hold),
first_available_index_(0) {
}
RecyclingVector::~RecyclingVector() {
for (auto *item : items_) {
delete item;
}
}
Vector<BaseFloat> *RecyclingVector::At(int index) const {
if (index < first_available_index_) {
KALDI_ERR << "Attempted to retrieve feature vector that was "
"already removed by the RecyclingVector (index = "
<< index << "; "
<< "first_available_index = " << first_available_index_ << "; "
<< "size = " << Size() << ")";
}
// 'at' does size checking.
return items_.at(index - first_available_index_);
}
void RecyclingVector::PushBack(Vector<BaseFloat> *item) {
if (items_.size() == items_to_hold_) {
delete items_.front();
items_.pop_front();
++first_available_index_;
}
items_.push_back(item);
}
int RecyclingVector::Size() const {
return first_available_index_ + items_.size();
}
template <class C>
void OnlineGenericBaseFeature<C>::GetFrame(int32 frame,
VectorBase<BaseFloat> *feat) {
feat->CopyFromVec(*(features_.At(frame)));
}
template <class C>
OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
const typename C::Options &opts):
computer_(opts), window_function_(computer_.GetFrameOptions()),
features_(opts.frame_opts.max_feature_vectors),
input_finished_(false), waveform_offset_(0) {
// RE the following assert: search for ONLINE_IVECTOR_LIMIT in
// online-ivector-feature.cc.
// Casting to uint32, an unsigned type, means that -1 would be treated
// as `very large`.
KALDI_ASSERT(static_cast<uint32>(opts.frame_opts.max_feature_vectors) > 200);
}
template <class C>
void OnlineGenericBaseFeature<C>::MaybeCreateResampler(
BaseFloat sampling_rate) {
BaseFloat expected_sampling_rate = computer_.GetFrameOptions().samp_freq;
if (resampler_ != nullptr) {
KALDI_ASSERT(resampler_->GetInputSamplingRate() == sampling_rate);
KALDI_ASSERT(resampler_->GetOutputSamplingRate() == expected_sampling_rate);
} else if (((sampling_rate < expected_sampling_rate) &&
computer_.GetFrameOptions().allow_downsample) ||
((sampling_rate > expected_sampling_rate) &&
computer_.GetFrameOptions().allow_upsample)) {
resampler_.reset(new LinearResample(
sampling_rate, expected_sampling_rate,
std::min(sampling_rate / 2, expected_sampling_rate / 2), 6));
} else if (sampling_rate != expected_sampling_rate) {
KALDI_ERR << "Sampling frequency mismatch, expected "
<< expected_sampling_rate << ", got " << sampling_rate
<< "\nPerhaps you want to use the options "
"--allow_{upsample,downsample}";
}
}
template <class C>
void OnlineGenericBaseFeature<C>::InputFinished() {
if (resampler_ != nullptr) {
// There may be a few samples left once we flush the resampler_ object, telling it
// that the file has finished. This should rarely make any difference.
Vector<BaseFloat> appended_wave;
Vector<BaseFloat> resampled_wave;
resampler_->Resample(appended_wave, true, &resampled_wave);
if (resampled_wave.Dim() != 0) {
appended_wave.Resize(waveform_remainder_.Dim() +
resampled_wave.Dim());
if (waveform_remainder_.Dim() != 0)
appended_wave.Range(0, waveform_remainder_.Dim())
.CopyFromVec(waveform_remainder_);
appended_wave.Range(waveform_remainder_.Dim(), resampled_wave.Dim())
.CopyFromVec(resampled_wave);
waveform_remainder_.Swap(&appended_wave);
}
}
input_finished_ = true;
ComputeFeatures();
}
template <class C>
void OnlineGenericBaseFeature<C>::AcceptWaveform(
BaseFloat sampling_rate, const VectorBase<BaseFloat> &original_waveform) {
if (original_waveform.Dim() == 0)
return; // Nothing to do.
if (input_finished_)
KALDI_ERR << "AcceptWaveform called after InputFinished() was called.";
Vector<BaseFloat> appended_wave;
Vector<BaseFloat> resampled_wave;
const VectorBase<BaseFloat> *waveform;
MaybeCreateResampler(sampling_rate);
if (resampler_ == nullptr) {
waveform = &original_waveform;
} else {
resampler_->Resample(original_waveform, false, &resampled_wave);
waveform = &resampled_wave;
}
appended_wave.Resize(waveform_remainder_.Dim() + waveform->Dim());
if (waveform_remainder_.Dim() != 0)
appended_wave.Range(0, waveform_remainder_.Dim())
.CopyFromVec(waveform_remainder_);
appended_wave.Range(waveform_remainder_.Dim(), waveform->Dim())
.CopyFromVec(*waveform);
waveform_remainder_.Swap(&appended_wave);
ComputeFeatures();
}
template <class C>
void OnlineGenericBaseFeature<C>::ComputeFeatures() {
const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions();
int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim();
int32 num_frames_old = features_.Size(),
num_frames_new = NumFrames(num_samples_total, frame_opts,
input_finished_);
KALDI_ASSERT(num_frames_new >= num_frames_old);
Vector<BaseFloat> window;
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
for (int32 frame = num_frames_old; frame < num_frames_new; frame++) {
BaseFloat raw_log_energy = 0.0;
ExtractWindow(waveform_offset_, waveform_remainder_, frame,
frame_opts, window_function_, &window,
need_raw_log_energy ? &raw_log_energy : NULL);
Vector<BaseFloat> *this_feature = new Vector<BaseFloat>(computer_.Dim(),
kUndefined);
// note: this online feature-extraction code does not support VTLN.
BaseFloat vtln_warp = 1.0;
computer_.Compute(raw_log_energy, vtln_warp, &window, this_feature);
features_.PushBack(this_feature);
}
// OK, we will now discard any portion of the signal that will not be
// necessary to compute frames in the future.
int64 first_sample_of_next_frame = FirstSampleOfFrame(num_frames_new,
frame_opts);
int32 samples_to_discard = first_sample_of_next_frame - waveform_offset_;
if (samples_to_discard > 0) {
// discard the leftmost part of the waveform that we no longer need.
int32 new_num_samples = waveform_remainder_.Dim() - samples_to_discard;
if (new_num_samples <= 0) {
// odd, but we'll try to handle it.
waveform_offset_ += waveform_remainder_.Dim();
waveform_remainder_.Resize(0);
} else {
Vector<BaseFloat> new_remainder(new_num_samples);
new_remainder.CopyFromVec(waveform_remainder_.Range(samples_to_discard,
new_num_samples));
waveform_offset_ += samples_to_discard;
waveform_remainder_.Swap(&new_remainder);
}
}
}
// instantiate the templates defined here for MFCC, PLP and filterbank classes.
template class OnlineGenericBaseFeature<MfccComputer>;
template class OnlineGenericBaseFeature<PlpComputer>;
template class OnlineGenericBaseFeature<FbankComputer>;
OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other):
speaker_cmvn_stats(other.speaker_cmvn_stats),
global_cmvn_stats(other.global_cmvn_stats),
frozen_state(other.frozen_state) { }
void OnlineCmvnState::Write(std::ostream &os, bool binary) const {
WriteToken(os, binary, "<OnlineCmvnState>"); // magic string.
WriteToken(os, binary, "<SpeakerCmvnStats>");
speaker_cmvn_stats.Write(os, binary);
WriteToken(os, binary, "<GlobalCmvnStats>");
global_cmvn_stats.Write(os, binary);
WriteToken(os, binary, "<FrozenState>");
frozen_state.Write(os, binary);
WriteToken(os, binary, "</OnlineCmvnState>");
}
void OnlineCmvnState::Read(std::istream &is, bool binary) {
ExpectToken(is, binary, "<OnlineCmvnState>"); // magic string.
ExpectToken(is, binary, "<SpeakerCmvnStats>");
speaker_cmvn_stats.Read(is, binary);
ExpectToken(is, binary, "<GlobalCmvnStats>");
global_cmvn_stats.Read(is, binary);
ExpectToken(is, binary, "<FrozenState>");
frozen_state.Read(is, binary);
ExpectToken(is, binary, "</OnlineCmvnState>");
}
OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts,
const OnlineCmvnState &cmvn_state,
OnlineFeatureInterface *src):
opts_(opts), temp_stats_(2, src->Dim() + 1),
temp_feats_(src->Dim()), temp_feats_dbl_(src->Dim()),
src_(src) {
SetState(cmvn_state);
if (!SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_))
KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of "
<< "integers)";
}
OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts,
OnlineFeatureInterface *src):
opts_(opts), temp_stats_(2, src->Dim() + 1),
temp_feats_(src->Dim()), temp_feats_dbl_(src->Dim()),
src_(src) {
if (!SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_))
KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of "
<< "integers)";
}
void OnlineCmvn::GetMostRecentCachedFrame(int32 frame,
int32 *cached_frame,
MatrixBase<double> *stats) {
KALDI_ASSERT(frame >= 0);
InitRingBufferIfNeeded();
// look for a cached frame on a previous frame as close as possible in time
// to "frame". Return if we get one.
for (int32 t = frame; t >= 0 && t >= frame - opts_.ring_buffer_size; t--) {
if (t % opts_.modulus == 0) {
// if this frame should be cached in cached_stats_modulo_, then
// we'll look there, and we won't go back any further in time.
break;
}
int32 index = t % opts_.ring_buffer_size;
if (cached_stats_ring_[index].first == t) {
*cached_frame = t;
stats->CopyFromMat(cached_stats_ring_[index].second);
return;
}
}
int32 n = frame / opts_.modulus;
if (n >= cached_stats_modulo_.size()) {
if (cached_stats_modulo_.size() == 0) {
*cached_frame = -1;
stats->SetZero();
return;
} else {
n = static_cast<int32>(cached_stats_modulo_.size() - 1);
}
}
*cached_frame = n * opts_.modulus;
KALDI_ASSERT(cached_stats_modulo_[n] != NULL);
stats->CopyFromMat(*(cached_stats_modulo_[n]));
}
// Initialize ring buffer for caching stats.
void OnlineCmvn::InitRingBufferIfNeeded() {
if (cached_stats_ring_.empty() && opts_.ring_buffer_size > 0) {
Matrix<double> temp(2, this->Dim() + 1);
cached_stats_ring_.resize(opts_.ring_buffer_size,
std::pair<int32, Matrix<double> >(-1, temp));
}
}
void OnlineCmvn::CacheFrame(int32 frame, const MatrixBase<double> &stats) {
KALDI_ASSERT(frame >= 0);
if (frame % opts_.modulus == 0) { // store in cached_stats_modulo_.
int32 n = frame / opts_.modulus;
if (n >= cached_stats_modulo_.size()) {
// The following assert restricts the order in which you can call
// CacheFrame. Fortunately the calling code always calls it in sequence,
// which it has to, because computing the current frame's stats requires
// the previous frame's stats.
KALDI_ASSERT(n == cached_stats_modulo_.size());
cached_stats_modulo_.push_back(new Matrix<double>(stats));
} else {
KALDI_WARN << "Did not expect to reach this part of code.";
// do what seems right, but we shouldn't get here.
cached_stats_modulo_[n]->CopyFromMat(stats);
}
} else { // store in the ring buffer.
InitRingBufferIfNeeded();
if (!cached_stats_ring_.empty()) {
int32 index = frame % cached_stats_ring_.size();
cached_stats_ring_[index].first = frame;
cached_stats_ring_[index].second.CopyFromMat(stats);
}
}
}
OnlineCmvn::~OnlineCmvn() {
for (size_t i = 0; i < cached_stats_modulo_.size(); i++)
delete cached_stats_modulo_[i];
cached_stats_modulo_.clear();
}
void OnlineCmvn::ComputeStatsForFrame(int32 frame,
MatrixBase<double> *stats_out) {
KALDI_ASSERT(frame >= 0 && frame < src_->NumFramesReady());
int32 dim = this->Dim(), cur_frame;
GetMostRecentCachedFrame(frame, &cur_frame, stats_out);
Vector<BaseFloat> &feats(temp_feats_);
Vector<double> &feats_dbl(temp_feats_dbl_);
while (cur_frame < frame) {
cur_frame++;
src_->GetFrame(cur_frame, &feats);
feats_dbl.CopyFromVec(feats);
stats_out->Row(0).Range(0, dim).AddVec(1.0, feats_dbl);
if (opts_.normalize_variance)
stats_out->Row(1).Range(0, dim).AddVec2(1.0, feats_dbl);
(*stats_out)(0, dim) += 1.0;
// it's a sliding buffer; a frame at the back may be
// leaving the buffer so we have to subtract that.
int32 prev_frame = cur_frame - opts_.cmn_window;
if (prev_frame >= 0) {
// we need to subtract frame prev_frame from the stats.
src_->GetFrame(prev_frame, &feats);
feats_dbl.CopyFromVec(feats);
stats_out->Row(0).Range(0, dim).AddVec(-1.0, feats_dbl);
if (opts_.normalize_variance)
stats_out->Row(1).Range(0, dim).AddVec2(-1.0, feats_dbl);
(*stats_out)(0, dim) -= 1.0;
}
CacheFrame(cur_frame, (*stats_out));
}
}
// static
void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
const MatrixBase<double> &global_stats,
const OnlineCmvnOptions &opts,
MatrixBase<double> *stats) {
if (speaker_stats.NumRows() == 2 && !opts.normalize_variance) {
// this is just for efficiency: don't operate on the variance if it's not
// needed.
int32 cols = speaker_stats.NumCols(); // dim + 1
SubMatrix<double> stats_temp(*stats, 0, 1, 0, cols);
SmoothOnlineCmvnStats(speaker_stats.RowRange(0, 1),
global_stats.RowRange(0, 1),
opts, &stats_temp);
return;
}
int32 dim = stats->NumCols() - 1;
double cur_count = (*stats)(0, dim);
// If count exceeded cmn_window it would be an error in how "window_stats"
// was accumulated.
KALDI_ASSERT(cur_count <= 1.001 * opts.cmn_window);
if (cur_count >= opts.cmn_window)
return;
if (speaker_stats.NumRows() != 0) { // if we have speaker stats..
double count_from_speaker = opts.cmn_window - cur_count,
speaker_count = speaker_stats(0, dim);
if (count_from_speaker > opts.speaker_frames)
count_from_speaker = opts.speaker_frames;
if (count_from_speaker > speaker_count)
count_from_speaker = speaker_count;
if (count_from_speaker > 0.0)
stats->AddMat(count_from_speaker / speaker_count,
speaker_stats);
cur_count = (*stats)(0, dim);
}
if (cur_count >= opts.cmn_window)
return;
if (global_stats.NumRows() != 0) {
double count_from_global = opts.cmn_window - cur_count,
global_count = global_stats(0, dim);
KALDI_ASSERT(global_count > 0.0);
if (count_from_global > opts.global_frames)
count_from_global = opts.global_frames;
if (count_from_global > 0.0)
stats->AddMat(count_from_global / global_count,
global_stats);
} else {
KALDI_ERR << "Global CMN stats are required";
}
}
void OnlineCmvn::GetFrame(int32 frame,
VectorBase<BaseFloat> *feat) {
src_->GetFrame(frame, feat);
KALDI_ASSERT(feat->Dim() == this->Dim());
int32 dim = feat->Dim();
Matrix<double> &stats(temp_stats_);
stats.Resize(2, dim + 1, kUndefined); // Will do nothing if size was correct.
if (frozen_state_.NumRows() != 0) { // the CMVN state has been frozen.
stats.CopyFromMat(frozen_state_);
} else {
// first get the raw CMVN stats (this involves caching..)
this->ComputeStatsForFrame(frame, &stats);
// now smooth them.
SmoothOnlineCmvnStats(orig_state_.speaker_cmvn_stats,
orig_state_.global_cmvn_stats,
opts_,
&stats);
}
if (!skip_dims_.empty())
FakeStatsForSomeDims(skip_dims_, &stats);
// The function ApplyCmvn, declared in ../transform/cmvn.h, takes a
// matrix, so wrap the feature vector as a one-row matrix to pass to it:
// 1 row; num-cols == dim; stride == dim.
SubMatrix<BaseFloat> feat_mat(feat->Data(), 1, dim, dim);
if (opts_.normalize_mean)
ApplyCmvn(stats, opts_.normalize_variance, &feat_mat);
else
KALDI_ASSERT(!opts_.normalize_variance);
}
void OnlineCmvn::Freeze(int32 cur_frame) {
int32 dim = this->Dim();
Matrix<double> stats(2, dim + 1);
// get the raw CMVN stats
this->ComputeStatsForFrame(cur_frame, &stats);
// now smooth them.
SmoothOnlineCmvnStats(orig_state_.speaker_cmvn_stats,
orig_state_.global_cmvn_stats,
opts_,
&stats);
this->frozen_state_ = stats;
}
void OnlineCmvn::GetState(int32 cur_frame,
OnlineCmvnState *state_out) {
*state_out = this->orig_state_;
{ // This block updates state_out->speaker_cmvn_stats
int32 dim = this->Dim();
if (state_out->speaker_cmvn_stats.NumRows() == 0)
state_out->speaker_cmvn_stats.Resize(2, dim + 1);
Vector<BaseFloat> feat(dim);
Vector<double> feat_dbl(dim);
for (int32 t = 0; t <= cur_frame; t++) {
src_->GetFrame(t, &feat);
feat_dbl.CopyFromVec(feat);
state_out->speaker_cmvn_stats(0, dim) += 1.0;
state_out->speaker_cmvn_stats.Row(0).Range(0, dim).AddVec(1.0, feat_dbl);
state_out->speaker_cmvn_stats.Row(1).Range(0, dim).AddVec2(1.0, feat_dbl);
}
}
// Store any frozen state (the effect of the user possibly
// having called Freeze()).
state_out->frozen_state = frozen_state_;
}
void OnlineCmvn::SetState(const OnlineCmvnState &cmvn_state) {
KALDI_ASSERT(cached_stats_modulo_.empty() &&
"You cannot call SetState() after processing data.");
orig_state_ = cmvn_state;
frozen_state_ = cmvn_state.frozen_state;
}
int32 OnlineSpliceFrames::NumFramesReady() const {
int32 num_frames = src_->NumFramesReady();
if (num_frames > 0 && src_->IsLastFrame(num_frames - 1))
return num_frames;
else
return std::max<int32>(0, num_frames - right_context_);
}
void OnlineSpliceFrames::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
KALDI_ASSERT(left_context_ >= 0 && right_context_ >= 0);
KALDI_ASSERT(frame >= 0 && frame < NumFramesReady());
int32 dim_in = src_->Dim();
KALDI_ASSERT(feat->Dim() == dim_in * (1 + left_context_ + right_context_));
int32 T = src_->NumFramesReady();
for (int32 t2 = frame - left_context_; t2 <= frame + right_context_; t2++) {
int32 t2_limited = t2;
if (t2_limited < 0) t2_limited = 0;
if (t2_limited >= T) t2_limited = T - 1;
int32 n = t2 - (frame - left_context_); // 0 for left-most frame,
// increases to the right.
SubVector<BaseFloat> part(*feat, n * dim_in, dim_in);
src_->GetFrame(t2_limited, &part);
}
}
OnlineTransform::OnlineTransform(const MatrixBase<BaseFloat> &transform,
OnlineFeatureInterface *src):
src_(src) {
int32 src_dim = src_->Dim();
if (transform.NumCols() == src_dim) { // Linear transform
linear_term_ = transform;
offset_.Resize(transform.NumRows()); // Resize() will zero it.
} else if (transform.NumCols() == src_dim + 1) { // Affine transform
linear_term_ = transform.Range(0, transform.NumRows(), 0, src_dim);
offset_.Resize(transform.NumRows());
offset_.CopyColFromMat(transform, src_dim);
} else {
KALDI_ERR << "Dimension mismatch: source features have dimension "
<< src_dim << " and LDA #cols is " << transform.NumCols();
}
}
void OnlineTransform::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
Vector<BaseFloat> input_feat(linear_term_.NumCols());
src_->GetFrame(frame, &input_feat);
feat->CopyFromVec(offset_);
feat->AddMatVec(1.0, linear_term_, kNoTrans, input_feat, 1.0);
}
void OnlineTransform::GetFrames(
const std::vector<int32> &frames, MatrixBase<BaseFloat> *feats) {
KALDI_ASSERT(static_cast<int32>(frames.size()) == feats->NumRows());
int32 num_frames = feats->NumRows(),
input_dim = linear_term_.NumCols();
Matrix<BaseFloat> input_feats(num_frames, input_dim, kUndefined);
src_->GetFrames(frames, &input_feats);
feats->CopyRowsFromVec(offset_);
feats->AddMatMat(1.0, input_feats, kNoTrans, linear_term_, kTrans, 1.0);
}
int32 OnlineDeltaFeature::Dim() const {
int32 src_dim = src_->Dim();
return src_dim * (1 + opts_.order);
}
int32 OnlineDeltaFeature::NumFramesReady() const {
int32 num_frames = src_->NumFramesReady(),
context = opts_.order * opts_.window;
// "context" is the number of frames on the left or (more relevant
// here) right which we need in order to produce the output.
if (num_frames > 0 && src_->IsLastFrame(num_frames-1))
return num_frames;
else
return std::max<int32>(0, num_frames - context);
}
void OnlineDeltaFeature::GetFrame(int32 frame,
VectorBase<BaseFloat> *feat) {
KALDI_ASSERT(frame >= 0 && frame < NumFramesReady());
KALDI_ASSERT(feat->Dim() == Dim());
// We'll produce a temporary matrix containing the features we want to
// compute deltas on, but truncated to the necessary context.
int32 context = opts_.order * opts_.window;
int32 left_frame = frame - context,
right_frame = frame + context,
src_frames_ready = src_->NumFramesReady();
if (left_frame < 0) left_frame = 0;
if (right_frame >= src_frames_ready)
right_frame = src_frames_ready - 1;
KALDI_ASSERT(right_frame >= left_frame);
int32 temp_num_frames = right_frame + 1 - left_frame,
src_dim = src_->Dim();
Matrix<BaseFloat> temp_src(temp_num_frames, src_dim);
for (int32 t = left_frame; t <= right_frame; t++) {
SubVector<BaseFloat> temp_row(temp_src, t - left_frame);
src_->GetFrame(t, &temp_row);
}
int32 temp_t = frame - left_frame; // temp_t is the offset of frame "frame"
// within temp_src
delta_features_.Process(temp_src, temp_t, feat);
}
OnlineDeltaFeature::OnlineDeltaFeature(const DeltaFeaturesOptions &opts,
OnlineFeatureInterface *src):
src_(src), opts_(opts), delta_features_(opts) { }
void OnlineCacheFeature::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
KALDI_ASSERT(frame >= 0);
if (static_cast<size_t>(frame) < cache_.size() && cache_[frame] != NULL) {
feat->CopyFromVec(*(cache_[frame]));
} else {
if (static_cast<size_t>(frame) >= cache_.size())
cache_.resize(frame + 1, NULL);
int32 dim = this->Dim();
cache_[frame] = new Vector<BaseFloat>(dim);
// The following call will crash if frame "frame" is not ready.
src_->GetFrame(frame, cache_[frame]);
feat->CopyFromVec(*(cache_[frame]));
}
}
void OnlineCacheFeature::GetFrames(
const std::vector<int32> &frames, MatrixBase<BaseFloat> *feats) {
int32 num_frames = frames.size();
// non_cached_frames will be the subset of 't' values in 'frames' which were
// not previously cached, which we therefore need to get from src_.
std::vector<int32> non_cached_frames;
// 'non_cached_indexes' stores, for each entry of 'non_cached_frames',
// the index 'i' into 'frames' that it came from.
std::vector<int32> non_cached_indexes;
non_cached_frames.reserve(frames.size());
non_cached_indexes.reserve(frames.size());
for (int32 i = 0; i < num_frames; i++) {
int32 t = frames[i];
if (static_cast<size_t>(t) < cache_.size() && cache_[t] != NULL) {
feats->Row(i).CopyFromVec(*(cache_[t]));
} else {
non_cached_frames.push_back(t);
non_cached_indexes.push_back(i);
}
}
if (non_cached_frames.empty())
return;
int32 num_non_cached_frames = non_cached_frames.size(),
dim = this->Dim();
Matrix<BaseFloat> non_cached_feats(num_non_cached_frames, dim,
kUndefined);
src_->GetFrames(non_cached_frames, &non_cached_feats);
for (int32 i = 0; i < num_non_cached_frames; i++) {
int32 t = non_cached_frames[i];
if (static_cast<size_t>(t) < cache_.size() && cache_[t] != NULL) {
// We can reach this point due to repeat indexes in 'non_cached_frames'.
feats->Row(non_cached_indexes[i]).CopyFromVec(*(cache_[t]));
} else {
SubVector<BaseFloat> this_feat(non_cached_feats, i);
feats->Row(non_cached_indexes[i]).CopyFromVec(this_feat);
if (static_cast<size_t>(t) >= cache_.size())
cache_.resize(t + 1, NULL);
cache_[t] = new Vector<BaseFloat>(this_feat);
}
}
}
void OnlineCacheFeature::ClearCache() {
for (size_t i = 0; i < cache_.size(); i++)
delete cache_[i];
cache_.resize(0);
}
void OnlineAppendFeature::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
KALDI_ASSERT(feat->Dim() == Dim());
SubVector<BaseFloat> feat1(*feat, 0, src1_->Dim());
SubVector<BaseFloat> feat2(*feat, src1_->Dim(), src2_->Dim());
src1_->GetFrame(frame, &feat1);
src2_->GetFrame(frame, &feat2);
}
} // namespace kaldi
// feat/online-feature.h
// Copyright 2013 Johns Hopkins University (author: Daniel Povey)
// 2014 Yanqing Sun, Junjie Wang,
// Daniel Povey, Korbinian Riedhammer
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_ONLINE_FEATURE_H_
#define KALDI_FEAT_ONLINE_FEATURE_H_
#include <string>
#include <vector>
#include <deque>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
#include "feat/feature-functions.h"
#include "feat/feature-mfcc.h"
#include "feat/feature-plp.h"
#include "feat/feature-fbank.h"
#include "feat/online-feature-itf.h"
namespace kaldi {
/// @addtogroup onlinefeat OnlineFeatureExtraction
/// @{
/// This class serves as storage for feature vectors, with an option to limit
/// memory usage by removing old elements. The indices of deleted frames are
/// "remembered", so that regardless of the items_to_hold setting, the user
/// always addresses frames as if no deletion had been performed.
/// This is useful when processing very long recordings, whose features would
/// otherwise eventually exhaust memory if they were never removed.
class RecyclingVector {
public:
/// By default it does not remove any elements.
RecyclingVector(int items_to_hold = -1);
/// The ownership is being retained by this collection - do not delete the item.
Vector<BaseFloat> *At(int index) const;
/// The ownership of the item is passed to this collection - do not delete the item.
void PushBack(Vector<BaseFloat> *item);
/// This method returns the size as if no "recycling" had happened,
/// i.e. equivalent to the number of times the PushBack method has been called.
int Size() const;
~RecyclingVector();
private:
std::deque<Vector<BaseFloat>*> items_;
int items_to_hold_;
int first_available_index_;
};
/// This is a templated class for online feature extraction;
/// it's templated on a class like MfccComputer or PlpComputer
/// that does the basic feature extraction.
template<class C>
class OnlineGenericBaseFeature: public OnlineBaseFeature {
public:
//
// First, functions that are present in the interface:
//
virtual int32 Dim() const { return computer_.Dim(); }
// Note: IsLastFrame() will only ever return true if you have called
// InputFinished() (and this frame is the last frame).
virtual bool IsLastFrame(int32 frame) const {
return input_finished_ && frame == NumFramesReady() - 1;
}
virtual BaseFloat FrameShiftInSeconds() const {
return computer_.GetFrameOptions().frame_shift_ms / 1000.0f;
}
virtual int32 NumFramesReady() const { return features_.Size(); }
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
// Next, functions that are not in the interface.
// Constructor from options class
explicit OnlineGenericBaseFeature(const typename C::Options &opts);
// This would be called from the application, when you get
// more wave data. Note: the sampling_rate is only provided so
// the code can assert that it matches the sampling rate
// expected in the options.
virtual void AcceptWaveform(BaseFloat sampling_rate,
const VectorBase<BaseFloat> &waveform);
// InputFinished() tells the class you won't be providing any
// more waveform. This will help flush out the last frame or two
// of features, in the case where snip-edges == false; it also
// affects the return value of IsLastFrame().
virtual void InputFinished();
private:
// This function computes any additional feature frames that it is possible to
// compute from 'waveform_remainder_', which at this point may contain more
// than just a remainder-sized quantity (because AcceptWaveform() appends to
// waveform_remainder_ before calling this function). It adds these feature
// frames to features_, and shifts off any now-unneeded samples of input from
// waveform_remainder_ while incrementing waveform_offset_ by the same amount.
void ComputeFeatures();
void MaybeCreateResampler(BaseFloat sampling_rate);
C computer_; // class that does the MFCC or PLP or filterbank computation
// resampler in cases when the input sampling frequency is not equal to
// the expected sampling rate
std::unique_ptr<LinearResample> resampler_;
FeatureWindowFunction window_function_;
// features_ is the Mfcc or Plp or Fbank features that we have already computed.
RecyclingVector features_;
// True if the user has called "InputFinished()"
bool input_finished_;
// The sampling frequency, extracted from the config. Should
// match that of the supplied waveform.
BaseFloat sampling_frequency_;
// waveform_offset_ is the number of samples of waveform that we have
// already discarded, i.e. that were prior to 'waveform_remainder_'.
int64 waveform_offset_;
// waveform_remainder_ is a short piece of waveform that we may need to keep
// after extracting all the whole frames we can (whatever length of feature
// will be required for the next phase of computation).
Vector<BaseFloat> waveform_remainder_;
};
typedef OnlineGenericBaseFeature<MfccComputer> OnlineMfcc;
typedef OnlineGenericBaseFeature<PlpComputer> OnlinePlp;
typedef OnlineGenericBaseFeature<FbankComputer> OnlineFbank;
/// This class takes a Matrix<BaseFloat> and wraps it as an
/// OnlineFeatureInterface: this can be useful where some earlier stage of
/// feature processing has been done offline but you want to use part of the
/// online pipeline.
class OnlineMatrixFeature: public OnlineFeatureInterface {
public:
/// Caution: this class maintains the const reference from the constructor, so
/// don't let it go out of scope while this object exists.
explicit OnlineMatrixFeature(const MatrixBase<BaseFloat> &mat): mat_(mat) { }
virtual int32 Dim() const { return mat_.NumCols(); }
virtual BaseFloat FrameShiftInSeconds() const {
return 0.01f;
}
virtual int32 NumFramesReady() const { return mat_.NumRows(); }
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
feat->CopyFromVec(mat_.Row(frame));
}
virtual bool IsLastFrame(int32 frame) const {
return (frame + 1 == mat_.NumRows());
}
private:
const MatrixBase<BaseFloat> &mat_;
};
// Note the similarity with SlidingWindowCmnOptions, but there
// are also differences. One which doesn't appear in the config
// itself, because it's a difference between the setups, is that
// in OnlineCmvn we carry over data from the previous utterance,
// or, if no previous utterance is available, from global stats,
// or, if previous utterances are available but the total amount
// of data is less than "speaker_frames", we pad with up to
// "global_frames" frames from the global stats.
struct OnlineCmvnOptions {
int32 cmn_window;
int32 speaker_frames; // must be <= cmn_window
int32 global_frames; // must be <= speaker_frames.
bool normalize_mean; // Must be true if normalize_variance==true.
bool normalize_variance;
int32 modulus; // not configurable from command line, relates to how the
// class computes the cmvn internally. smaller->more
// time-efficient but less memory-efficient. Must be >= 1.
int32 ring_buffer_size; // not configurable from command line; size of ring
// buffer used for caching CMVN stats. Must be >=
// modulus.
std::string skip_dims; // Colon-separated list of dimensions to skip normalization
// of, e.g. 13:14:15.
OnlineCmvnOptions():
cmn_window(600),
speaker_frames(600),
global_frames(200),
normalize_mean(true),
normalize_variance(false),
modulus(20),
ring_buffer_size(20),
skip_dims("") { }
void Check() const {
KALDI_ASSERT(speaker_frames <= cmn_window && global_frames <= speaker_frames
&& modulus > 0);
}
void Register(ParseOptions *po) {
po->Register("cmn-window", &cmn_window, "Number of frames of sliding "
"context for cepstral mean normalization.");
po->Register("global-frames", &global_frames, "Number of frames of "
"global-average cepstral mean normalization stats to use for "
"first utterance of a speaker");
po->Register("speaker-frames", &speaker_frames, "Number of frames of "
"previous utterance(s) from this speaker to use in cepstral "
"mean normalization");
// we name the config string "norm-vars" for compatibility with
// ../featbin/apply-cmvn.cc
po->Register("norm-vars", &normalize_variance, "If true, do "
"cepstral variance normalization in addition to cepstral mean "
"normalization ");
po->Register("norm-means", &normalize_mean, "If true, do mean normalization "
"(note: you cannot normalize the variance but not the mean)");
po->Register("skip-dims", &skip_dims, "Dimensions to skip normalization of "
"(colon-separated list of integers)");}
};
/** Struct OnlineCmvnState stores the state of CMVN adaptation between
utterances (but not the state of the computation within an utterance). It
stores the global CMVN stats and the stats of the current speaker (if we
have seen previous utterances for this speaker), and possibly will have a
member "frozen_state": if the user has called the function Freeze() of class
OnlineCmvn, to fix the CMVN so we can estimate fMLLR on top of the fixed
value of cmvn. If nonempty, "frozen_state" will reflect how we were
normalizing the mean and (if applicable) variance at the time when that
function was called.
*/
struct OnlineCmvnState {
// The following is the total CMVN stats for this speaker (up till now),
// in the same format as global_cmvn_stats below.
Matrix<double> speaker_cmvn_stats;
// The following is the global CMVN stats, in the usual
// format, of dimension 2 x (dim+1), as [ sum-stats count
// sum-squared-stats 0 ]
Matrix<double> global_cmvn_stats;
// If nonempty, contains CMVN stats representing the "frozen" state
// of CMVN that reflects how we were normalizing the data when the
// user called the Freeze() function in class OnlineCmvn.
Matrix<double> frozen_state;
OnlineCmvnState() { }
explicit OnlineCmvnState(const Matrix<double> &global_stats):
global_cmvn_stats(global_stats) { }
// Copy constructor
OnlineCmvnState(const OnlineCmvnState &other);
void Write(std::ostream &os, bool binary) const;
void Read(std::istream &is, bool binary);
// Use the default assignment operator.
};
/**
This class does an online version of cepstral mean and [optionally]
variance normalization, but note that this is not equivalent to the offline
version. This
is necessarily so, as the offline computation involves looking into the
future. If you plan to use features normalized with this type of CMVN then
you need to train in a `matched' way, i.e. with the same type of features.
We normally only do so in the "online" GMM-based decoding, e.g. in
online2bin/online2-wav-gmm-latgen-faster.cc; see also the script
steps/online/prepare_online_decoding.sh and steps/online/decode.sh.
In the steady state (in the middle of a long utterance), this class
accumulates CMVN statistics from the previous "cmn_window" frames (default 600
frames, or 6 seconds), and uses these to normalize the mean and possibly
variance of the current frame.
The config variables "speaker_frames" and "global_frames" relate to what
happens at the beginning of the utterance when we have seen fewer than
"cmn_window" frames of context, and so might not have very good stats to
normalize with. Basically, we first augment any existing stats with up
to "speaker_frames" frames of stats from previous utterances of the current
speaker, and if this doesn't take us up to the required "cmn_window" frame
count, we further augment with up to "global_frames" frames of global
stats. The global stats are CMVN stats accumulated from training or testing
data, that give us a reasonable source of mean and variance for "typical"
data.
*/
class OnlineCmvn: public OnlineFeatureInterface {
public:
//
// First, functions that are present in the interface:
//
virtual int32 Dim() const { return src_->Dim(); }
virtual bool IsLastFrame(int32 frame) const {
return src_->IsLastFrame(frame);
}
virtual BaseFloat FrameShiftInSeconds() const {
return src_->FrameShiftInSeconds();
}
// The online cmvn does not introduce any additional latency.
virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
//
// Next, functions that are not in the interface.
//
/// Initializer that sets the cmvn state. If you don't have previous
/// utterances from the same speaker you are supposed to initialize the CMVN
/// state from some global CMVN stats, which you can get from summing all cmvn
/// stats you have in your training data using "sum-matrix". This just gives
/// it a reasonable starting point at the start of the file.
/// If you do have previous utterances from the same speaker or at least a
/// similar environment, you are supposed to initialize it by calling GetState
/// from the previous utterance.
OnlineCmvn(const OnlineCmvnOptions &opts,
const OnlineCmvnState &cmvn_state,
OnlineFeatureInterface *src);
/// Initializer that does not set the cmvn state:
/// after calling this, you should call SetState().
OnlineCmvn(const OnlineCmvnOptions &opts,
OnlineFeatureInterface *src);
// Outputs any state information from this utterance to "cmvn_state".
// The value of "cmvn_state" before the call does not matter: the output
// depends on the value of OnlineCmvnState the class was initialized
// with, the input feature values up to cur_frame, and the effects
// of the user possibly having called Freeze().
// If cur_frame is -1, it will just output the unmodified original
// state that was supplied to this object.
void GetState(int32 cur_frame,
OnlineCmvnState *cmvn_state);
// This function can be used to modify the state of the CMVN computation
// from outside, but must only be called before you have processed any data
// (otherwise it will crash). This "state" is really just the information
// that is propagated between utterances, not the state of the computation
// inside an utterance.
void SetState(const OnlineCmvnState &cmvn_state);
// This will freeze the CMVN to what it would have been if measured at
// frame "cur_frame", and will stop it from changing further. This also
// applies retroactively for this utterance, so if you call GetFrame() on
// previous frames, it will use the CMVN stats from cur_frame; and it
// applies in the future too, if you then call GetState() and use this
// state to initialize the next utterance's CMVN object.
void Freeze(int32 cur_frame);
virtual ~OnlineCmvn();
private:
/// Smooth the CMVN stats "stats" (which are stored in the normal format as a
/// 2 x (dim+1) matrix), by possibly adding some stats from "global_stats"
/// and/or "speaker_stats", controlled by the config. The best way to
/// understand the smoothing rule we use is just to look at the code.
static void SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
const MatrixBase<double> &global_stats,
const OnlineCmvnOptions &opts,
MatrixBase<double> *stats);
/// Get the most recent cached frame of CMVN stats. [If no frames
/// were cached, sets up empty stats for frame zero and returns that].
void GetMostRecentCachedFrame(int32 frame,
int32 *cached_frame,
MatrixBase<double> *stats);
/// Cache this frame of stats.
void CacheFrame(int32 frame, const MatrixBase<double> &stats);
/// Initialize ring buffer for caching stats.
inline void InitRingBufferIfNeeded();
/// Computes the raw CMVN stats for this frame, making use of (and updating if
/// necessary) the cached statistics in cached_stats_modulo_ and
/// cached_stats_ring_. This means the (x, x^2, count) stats for the last
/// up to opts_.cmn_window frames.
void ComputeStatsForFrame(int32 frame,
MatrixBase<double> *stats);
OnlineCmvnOptions opts_;
std::vector<int32> skip_dims_; // Skip CMVN for these dimensions. Derived from opts_.
OnlineCmvnState orig_state_; // reflects the state before we saw this
// utterance.
Matrix<double> frozen_state_; // If the user called Freeze(), this variable
// will reflect the CMVN state that we froze
// at.
// The variable below reflects the raw (count, x, x^2) statistics of the
// input, computed every opts_.modulus frames. cached_stats_modulo_[n / opts_.modulus]
// contains the (count, x, x^2) statistics for the frames from
// std::max(0, n - opts_.cmn_window) through n.
std::vector<Matrix<double>*> cached_stats_modulo_;
// the variable below is a ring-buffer of cached stats. the int32 is the
// frame index.
std::vector<std::pair<int32, Matrix<double> > > cached_stats_ring_;
// Some temporary variables used inside functions of this class, which
// are kept here to avoid repeated reallocation.
Matrix<double> temp_stats_;
Vector<BaseFloat> temp_feats_;
Vector<double> temp_feats_dbl_;
OnlineFeatureInterface *src_; // Not owned here
};
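/// A minimal chaining sketch (illustrative): apply online CMVN on top of a
/// base feature stream ("mfcc" here) and carry the state to the speaker's
/// next utterance.
/// \code
///   OnlineCmvnOptions cmvn_opts;
///   OnlineCmvnState cmvn_state(global_cmvn_stats);  // stats from training data.
///   OnlineCmvn cmvn(cmvn_opts, cmvn_state, &mfcc);
///   // ... use "cmvn" as the feature source for decoding ...
///   cmvn.GetState(cur_frame, &cmvn_state);  // save for the next utterance.
/// \endcode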
struct OnlineSpliceOptions {
int32 left_context;
int32 right_context;
OnlineSpliceOptions(): left_context(4), right_context(4) { }
void Register(ParseOptions *po) {
po->Register("left-context", &left_context, "Left-context for frame "
"splicing prior to LDA");
po->Register("right-context", &right_context, "Right-context for frame "
"splicing prior to LDA");
}
};
class OnlineSpliceFrames: public OnlineFeatureInterface {
public:
//
// First, functions that are present in the interface:
//
virtual int32 Dim() const {
return src_->Dim() * (1 + left_context_ + right_context_);
}
virtual bool IsLastFrame(int32 frame) const {
return src_->IsLastFrame(frame);
}
virtual BaseFloat FrameShiftInSeconds() const {
return src_->FrameShiftInSeconds();
}
virtual int32 NumFramesReady() const;
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
//
// Next, functions that are not in the interface.
//
OnlineSpliceFrames(const OnlineSpliceOptions &opts,
OnlineFeatureInterface *src):
left_context_(opts.left_context), right_context_(opts.right_context),
src_(src) { }
private:
int32 left_context_;
int32 right_context_;
OnlineFeatureInterface *src_; // Not owned here
};
/// This online-feature class implements any affine or linear transform.
class OnlineTransform: public OnlineFeatureInterface {
public:
//
// First, functions that are present in the interface:
//
virtual int32 Dim() const { return offset_.Dim(); }
virtual bool IsLastFrame(int32 frame) const {
return src_->IsLastFrame(frame);
}
virtual BaseFloat FrameShiftInSeconds() const {
return src_->FrameShiftInSeconds();
}
virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
virtual void GetFrames(const std::vector<int32> &frames,
MatrixBase<BaseFloat> *feats);
//
// Next, functions that are not in the interface.
//
/// The transform can be a linear transform, or an affine transform
/// where the last column is the offset.
OnlineTransform(const MatrixBase<BaseFloat> &transform,
OnlineFeatureInterface *src);
private:
OnlineFeatureInterface *src_; // Not owned here
Matrix<BaseFloat> linear_term_;
Vector<BaseFloat> offset_;
};
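/// A minimal usage sketch (illustrative; "lda_rxfilename" is assumed to name
/// an LDA+MLLT or similar matrix on disk, with num-cols equal to the input
/// dim, or input dim + 1 for the affine case):
/// \code
///   Matrix<BaseFloat> lda_mat;
///   ReadKaldiObject(lda_rxfilename, &lda_mat);
///   OnlineTransform transform(lda_mat, &spliced_feats);
/// \endcode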
class OnlineDeltaFeature: public OnlineFeatureInterface {
public:
//
// First, functions that are present in the interface:
//
virtual int32 Dim() const;
virtual bool IsLastFrame(int32 frame) const {
return src_->IsLastFrame(frame);
}
virtual BaseFloat FrameShiftInSeconds() const {
return src_->FrameShiftInSeconds();
}
virtual int32 NumFramesReady() const;
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
//
// Next, functions that are not in the interface.
//
OnlineDeltaFeature(const DeltaFeaturesOptions &opts,
OnlineFeatureInterface *src);
private:
OnlineFeatureInterface *src_; // Not owned here
DeltaFeaturesOptions opts_;
DeltaFeatures delta_features_; // This class contains just a few
// coefficients.
};
/// This feature type can be used to cache its input, to avoid
/// repetition of computation in a multi-pass decoding context.
class OnlineCacheFeature: public OnlineFeatureInterface {
public:
virtual int32 Dim() const { return src_->Dim(); }
virtual bool IsLastFrame(int32 frame) const {
return src_->IsLastFrame(frame);
}
virtual BaseFloat FrameShiftInSeconds() const {
return src_->FrameShiftInSeconds();
}
virtual int32 NumFramesReady() const { return src_->NumFramesReady(); }
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
virtual void GetFrames(const std::vector<int32> &frames,
MatrixBase<BaseFloat> *feats);
virtual ~OnlineCacheFeature() { ClearCache(); }
// Things that are not in the shared interface:
void ClearCache(); // this should be called if you change the underlying
// features in some way.
explicit OnlineCacheFeature(OnlineFeatureInterface *src): src_(src) { }
private:
OnlineFeatureInterface *src_; // Not owned here
std::vector<Vector<BaseFloat>* > cache_;
};
/// This online-feature class implements the combination of two feature
/// streams (such as pitch and PLP) into one stream.
class OnlineAppendFeature: public OnlineFeatureInterface {
public:
virtual int32 Dim() const { return src1_->Dim() + src2_->Dim(); }
virtual bool IsLastFrame(int32 frame) const {
return (src1_->IsLastFrame(frame) || src2_->IsLastFrame(frame));
}
// We assume both sources have the same frame shift.
virtual BaseFloat FrameShiftInSeconds() const {
return src1_->FrameShiftInSeconds();
}
virtual int32 NumFramesReady() const {
return std::min(src1_->NumFramesReady(), src2_->NumFramesReady());
}
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
virtual ~OnlineAppendFeature() { }
OnlineAppendFeature(OnlineFeatureInterface *src1,
OnlineFeatureInterface *src2): src1_(src1), src2_(src2) { }
private:
OnlineFeatureInterface *src1_;
OnlineFeatureInterface *src2_;
};
/// @} End of "addtogroup onlinefeat"
} // namespace kaldi
#endif // KALDI_FEAT_ONLINE_FEATURE_H_
// feat/pitch-functions.cc
// Copyright 2013 Pegah Ghahremani
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2014 Yanqing Sun, Junjie Wang,
// Daniel Povey, Korbinian Riedhammer
// Xin Lei
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <limits>
#include "feat/feature-functions.h"
#include "feat/mel-computations.h"
#include "feat/online-feature.h"
#include "feat/pitch-functions.h"
#include "feat/resample.h"
#include "matrix/matrix-functions.h"
namespace kaldi {
/**
This function processes the NCCF n to a POV feature f by applying the formula
f = (1.0001 - n)^0.15 - 1.0
This is a nonlinear function designed to make the output reasonably Gaussian
distributed. Before doing this, the NCCF distribution is in the range [-1,
1] but has a strong peak just before 1.0, which this function smooths out.
*/
BaseFloat NccfToPovFeature(BaseFloat n) {
if (n > 1.0) {
n = 1.0;
} else if (n < -1.0) {
n = -1.0;
}
BaseFloat f = pow((1.0001 - n), 0.15) - 1.0;
KALDI_ASSERT(f - f == 0); // check for NaN,inf.
return f;
}
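// A worked sense of the formula above (illustrative; approximate values, not
// part of the original code): f(-1.0) = 2.0001^0.15 - 1 ~= +0.11,
// f(0.0) ~= 1.5e-5, and f(1.0) = 0.0001^0.15 - 1 ~= -0.75; so the feature
// lies roughly in [-0.75, +0.11], with strongly voiced frames (NCCF near 1)
// mapped to the most negative values and the peak near 1.0 spread out.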
/**
This function processes the NCCF n to a reasonably accurate probability
of voicing p by applying the formula:
n' = fabs(n)
r = -5.2 + 5.4 * exp(7.5 * (n' - 1.0)) +
4.8 * n' - 2.0 * exp(-10.0 * n') + 4.2 * exp(20.0 * (n' - 1.0));
p = 1.0 / (1 + exp(-1.0 * r));
How did we get this formula? We plotted the empirical log-prob-ratio of voicing
r = log( p[voiced] / p[not-voiced] )
[on the Keele database where voicing is marked], as a function of the NCCF at
the delay picked by our algorithm. This was done on intervals of the NCCF, so
we had enough statistics to get that ratio. The NCCF covers [-1, 1]; almost
all of the probability mass is on [0, 1] but the empirical POV seems fairly
symmetric with a minimum near zero, so we chose to make it a function of n' = fabs(n).
Then we manually tuned a function (the one you see above) that approximated
the log-prob-ratio of voicing fairly well as a function of the absolute-value
NCCF n'; however, it wasn't a very exact match since we were also trying to make
the transformed NCCF fairly Gaussian distributed, with a view to using it as
a feature-- an idea we later abandoned after a simpler formula worked better.
*/
BaseFloat NccfToPov(BaseFloat n) {
BaseFloat ndash = fabs(n);
if (ndash > 1.0) ndash = 1.0; // just in case it was slightly outside [-1, 1]
BaseFloat r = -5.2 + 5.4 * Exp(7.5 * (ndash - 1.0)) + 4.8 * ndash -
2.0 * Exp(-10.0 * ndash) + 4.2 * Exp(20.0 * (ndash - 1.0));
// r is the approximate log-prob-ratio of voicing, log(p/(1-p)).
BaseFloat p = 1.0 / (1 + Exp(-1.0 * r));
KALDI_ASSERT(p - p == 0); // Check for NaN/inf
return p;
}
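// Illustrative worked values for the formula above (approximate; not part of
// the original code): for n = 0.0, r ~= -7.2 and p ~= 7.5e-4 (confidently
// unvoiced); for n = 0.9, r ~= 2.2 and p ~= 0.90; for n = 1.0, r ~= 9.2 and
// p ~= 0.9999 (confidently voiced). The sigmoid thus maps the NCCF onto an
// approximately calibrated [0, 1] probability-of-voicing scale.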
/**
This function computes some dot products that are required
while computing the NCCF.
For each integer lag from first_lag to last_lag (inclusive), this function
outputs to (*inner_prod)(lag - first_lag) the dot-product
of a window starting at 0 with a window starting at
lag. All windows are of length nccf_window_size. It
outputs to (*norm_prod)(lag - first_lag), e1 * e2, where
e1 is the dot-product of the un-shifted window with itself,
and e2 is the dot-product of the window shifted by "lag"
with itself.
*/
void ComputeCorrelation(const VectorBase<BaseFloat> &wave,
int32 first_lag, int32 last_lag,
int32 nccf_window_size,
VectorBase<BaseFloat> *inner_prod,
VectorBase<BaseFloat> *norm_prod) {
Vector<BaseFloat> zero_mean_wave(wave);
// TODO: possibly fix this, the mean normalization is done in a strange way.
SubVector<BaseFloat> wave_part(wave, 0, nccf_window_size);
// subtract mean-frame from wave
zero_mean_wave.Add(-wave_part.Sum() / nccf_window_size);
BaseFloat e1, e2, sum;
SubVector<BaseFloat> sub_vec1(zero_mean_wave, 0, nccf_window_size);
e1 = VecVec(sub_vec1, sub_vec1);
for (int32 lag = first_lag; lag <= last_lag; lag++) {
SubVector<BaseFloat> sub_vec2(zero_mean_wave, lag, nccf_window_size);
e2 = VecVec(sub_vec2, sub_vec2);
sum = VecVec(sub_vec1, sub_vec2);
(*inner_prod)(lag - first_lag) = sum;
(*norm_prod)(lag - first_lag) = e1 * e2;
}
}
/**
Computes the NCCF as a fraction of the numerator term (a dot product between
two vectors) and a denominator term which equals sqrt(e1*e2 + nccf_ballast)
where e1 and e2 are both dot-products of bits of the wave with themselves,
and e1*e2 is supplied as "norm_prod". These quantities are computed by
"ComputeCorrelation".
*/
void ComputeNccf(const VectorBase<BaseFloat> &inner_prod,
const VectorBase<BaseFloat> &norm_prod,
BaseFloat nccf_ballast,
VectorBase<BaseFloat> *nccf_vec) {
KALDI_ASSERT(inner_prod.Dim() == norm_prod.Dim() &&
inner_prod.Dim() == nccf_vec->Dim());
for (int32 lag = 0; lag < inner_prod.Dim(); lag++) {
BaseFloat numerator = inner_prod(lag),
denominator = pow(norm_prod(lag) + nccf_ballast, 0.5),
nccf;
if (denominator != 0.0) {
nccf = numerator / denominator;
} else {
KALDI_ASSERT(numerator == 0.0);
nccf = 0.0;
}
KALDI_ASSERT(nccf < 1.01 && nccf > -1.01);
(*nccf_vec)(lag) = nccf;
}
}
/**
This function selects the lags at which we measure the NCCF: we need
to select lags from 1/max_f0 to 1/min_f0, in a geometric progression
with ratio 1 + d.
*/
void SelectLags(const PitchExtractionOptions &opts,
Vector<BaseFloat> *lags) {
// choose lags relative to acceptable pitch tolerance
BaseFloat min_lag = 1.0 / opts.max_f0, max_lag = 1.0 / opts.min_f0;
std::vector<BaseFloat> tmp_lags;
for (BaseFloat lag = min_lag; lag <= max_lag; lag *= 1.0 + opts.delta_pitch)
tmp_lags.push_back(lag);
lags->Resize(tmp_lags.size());
std::copy(tmp_lags.begin(), tmp_lags.end(), lags->Data());
}
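// Worked example (illustrative; assumes the default options, which to our
// knowledge are min_f0 = 50, max_f0 = 400, delta_pitch = 0.005): the lags run
// from 1/400 = 2.5 ms up to 1/50 = 20 ms in a geometric progression with
// ratio 1.005, giving floor(log(8) / log(1.005)) + 1 = 417 lags, i.e. one
// Viterbi state per lag.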
/**
This function computes the local-cost for the Viterbi computation,
see eq. (5) in the paper.
@param opts The options as provided by the user
@param nccf_pitch The nccf as computed for the pitch computation (with ballast).
@param lags The log-spaced lags at which nccf_pitch is sampled.
@param local_cost We output the local-cost to here.
*/
void ComputeLocalCost(const VectorBase<BaseFloat> &nccf_pitch,
const VectorBase<BaseFloat> &lags,
const PitchExtractionOptions &opts,
VectorBase<BaseFloat> *local_cost) {
// from the paper, eq. 5, local_cost = 1 - Phi(t,i)(1 - soft_min_f0 L_i)
// nccf is the nccf on this frame measured at the lags in "lags".
KALDI_ASSERT(nccf_pitch.Dim() == local_cost->Dim() &&
nccf_pitch.Dim() == lags.Dim());
local_cost->Set(1.0);
// add the term -Phi(t,i):
local_cost->AddVec(-1.0, nccf_pitch);
// add the term soft_min_f0 Phi(t,i) L_i
local_cost->AddVecVec(opts.soft_min_f0, lags, nccf_pitch, 1.0);
}
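// Worked instance of eq. (5) (illustrative; not part of the original code):
// expanding local_cost = 1 - Phi(t,i) * (1 - soft_min_f0 * L_i) gives
// 1 - Phi(t,i) + soft_min_f0 * L_i * Phi(t,i), which is exactly what the
// three vector operations above (Set, AddVec, AddVecVec) compute. E.g. with
// Phi = 0.8, L_i = 0.005 s and soft_min_f0 = 10:
// local_cost = 1 - 0.8 * (1 - 0.05) = 0.24, versus 1.0 for a completely
// unvoiced state (Phi = 0).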
// class PitchFrameInfo is used inside class OnlinePitchFeatureImpl.
// It stores the information we need to keep around for a single frame
// of the pitch computation.
class PitchFrameInfo {
public:
/// This function resizes the arrays for this object and updates the reference
/// counts for the previous object (by decrementing those reference counts
/// when we destroy a StateInfo object). A StateInfo object is considered to
/// be destroyed when we delete it, not when its reference count goes to
/// zero.
void Cleanup(PitchFrameInfo *prev_frame);
/// This function may be called for the last (most recent) PitchFrameInfo
/// object with the best state (obtained from the externally held
/// forward-costs). It traces back as far as needed to set the
/// cur_best_state_, and as it's going it sets the lag-index and pov_nccf in
/// pitch_pov_iter, which when it's called is an iterator to where to put the
/// info for the final state; the iterator will be decremented inside this
/// function.
void SetBestState(int32 best_state,
std::vector<std::pair<int32, BaseFloat> > &lag_nccf);
/// This function may be called on the last (most recent) PitchFrameInfo
/// object; it computes how many frames of latency there is because the
/// traceback has not yet settled on a single value for frames in the past.
/// It actually returns the minimum of max_latency and the actual latency,
/// which is an optimization because we won't care about latency past
/// a user-specified maximum latency.
int32 ComputeLatency(int32 max_latency);
/// This function updates the best-state info of the previous frame.
bool UpdatePreviousBestState(PitchFrameInfo *prev_frame);
/// This constructor is used for frame -1; it sets the costs to be all zeros,
/// the pov_nccf's to zero, and the backpointers to -1.
explicit PitchFrameInfo(int32 num_states);
/// This constructor is used for subsequent frames (not -1).
PitchFrameInfo(PitchFrameInfo *prev);
/// Record the nccf_pov value.
/// @param nccf_pov The nccf as computed for the POV computation (without ballast).
void SetNccfPov(const VectorBase<BaseFloat> &nccf_pov);
/// This constructor is used for frames apart from frame -1; the bulk of
/// the Viterbi computation takes place inside this constructor.
/// @param opts The options as provided by the user
/// @param nccf_pitch The nccf as computed for the pitch computation
/// (with ballast).
/// @param nccf_pov The nccf as computed for the POV computation
/// (without ballast).
/// @param lags The log-spaced lags at which nccf_pitch and
/// nccf_pov are sampled.
/// @param prev_frame_forward_cost The forward-cost vector for the
/// previous frame.
/// @param index_info A pointer to a temporary vector used by this function
/// @param this_forward_cost The forward-cost vector for this frame
/// (to be computed).
void ComputeBacktraces(const PitchExtractionOptions &opts,
const VectorBase<BaseFloat> &nccf_pitch,
const VectorBase<BaseFloat> &lags,
const VectorBase<BaseFloat> &prev_forward_cost,
std::vector<std::pair<int32, int32> > *index_info,
VectorBase<BaseFloat> *this_forward_cost);
private:
// struct StateInfo is the information we keep for a single one of the
// log-spaced lags, for a single frame. This is a state in the Viterbi
// computation.
struct StateInfo {
/// The state index on the previous frame that is the best preceding state
/// for this state.
int32 backpointer;
/// the version of the NCCF we keep for the POV computation (without the
/// ballast term).
BaseFloat pov_nccf;
StateInfo(): backpointer(0), pov_nccf(0.0) { }
};
std::vector<StateInfo> state_info_;
/// the state index of the first entry in "state_info"; this will initially be
/// zero, but after cleanup might be nonzero.
int32 state_offset_;
/// The current best state in the backtrace from the end.
int32 cur_best_state_;
/// The structure for the previous frame.
PitchFrameInfo *prev_info_;
};
// This constructor is used for frame -1; it sets the costs to be all zeros,
// the pov_nccf's to zero, and the backpointers to -1.
PitchFrameInfo::PitchFrameInfo(int32 num_states)
:state_info_(num_states), state_offset_(0),
cur_best_state_(-1), prev_info_(NULL) { }
bool pitch_use_naive_search = false; // This is used in unit-tests.
PitchFrameInfo::PitchFrameInfo(PitchFrameInfo *prev_info):
state_info_(prev_info->state_info_.size()), state_offset_(0),
cur_best_state_(-1), prev_info_(prev_info) { }
void PitchFrameInfo::SetNccfPov(const VectorBase<BaseFloat> &nccf_pov) {
int32 num_states = nccf_pov.Dim();
KALDI_ASSERT(num_states == state_info_.size());
for (int32 i = 0; i < num_states; i++)
state_info_[i].pov_nccf = nccf_pov(i);
}
void PitchFrameInfo::ComputeBacktraces(
const PitchExtractionOptions &opts,
const VectorBase<BaseFloat> &nccf_pitch,
const VectorBase<BaseFloat> &lags,
const VectorBase<BaseFloat> &prev_forward_cost_vec,
std::vector<std::pair<int32, int32> > *index_info,
VectorBase<BaseFloat> *this_forward_cost_vec) {
int32 num_states = nccf_pitch.Dim();
Vector<BaseFloat> local_cost(num_states, kUndefined);
ComputeLocalCost(nccf_pitch, lags, opts, &local_cost);
const BaseFloat delta_pitch_sq = pow(Log(1.0 + opts.delta_pitch), 2.0),
inter_frame_factor = delta_pitch_sq * opts.penalty_factor;
// index local_cost, prev_forward_cost and this_forward_cost using raw pointer
// indexing not operator (), since this is the very inner loop and a lot of
// time is taken here.
const BaseFloat *prev_forward_cost = prev_forward_cost_vec.Data();
BaseFloat *this_forward_cost = this_forward_cost_vec->Data();
if (index_info->empty())
index_info->resize(num_states);
// make it a reference for more concise indexing.
std::vector<std::pair<int32, int32> > &bounds = *index_info;
/* bounds[i].first will be a lower bound on the backpointer for state i,
bounds[i].second will be an upper bound on it. We progressively tighten
these bounds till we know the backpointers exactly.
*/
if (pitch_use_naive_search) {
// This branch is only taken in unit-testing code.
for (int32 i = 0; i < num_states; i++) {
BaseFloat best_cost = std::numeric_limits<BaseFloat>::infinity();
int32 best_j = -1;
for (int32 j = 0; j < num_states; j++) {
BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor
+ prev_forward_cost[j];
if (this_cost < best_cost) {
best_cost = this_cost;
best_j = j;
}
}
this_forward_cost[i] = best_cost;
state_info_[i].backpointer = best_j;
}
} else {
int32 last_backpointer = 0;
for (int32 i = 0; i < num_states; i++) {
int32 start_j = last_backpointer;
BaseFloat best_cost = (start_j - i) * (start_j - i) * inter_frame_factor
+ prev_forward_cost[start_j];
int32 best_j = start_j;
for (int32 j = start_j + 1; j < num_states; j++) {
BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor
+ prev_forward_cost[j];
if (this_cost < best_cost) {
best_cost = this_cost;
best_j = j;
} else { // as soon as the costs stop improving, we stop searching.
break; // this is a loose lower bound we're getting.
}
}
state_info_[i].backpointer = best_j;
this_forward_cost[i] = best_cost;
bounds[i].first = best_j; // this is now a lower bound on the
// backpointer.
bounds[i].second = num_states - 1; // we have no meaningful upper bound
// yet.
last_backpointer = best_j;
}
// We iterate, progressively refining the upper and lower bounds until they
// meet and we know that the resulting backtraces are optimal. Each
// iteration takes time linear in num_states. We won't normally iterate as
// far as num_states; normally we only do two iterations; when printing out
// the number of iterations, it's rarely more than that (once I saw seven
// iterations). Anyway, this part of the computation does not dominate.
for (int32 iter = 0; iter < num_states; iter++) {
bool changed = false;
if (iter % 2 == 0) { // go backwards through the states
last_backpointer = num_states - 1;
for (int32 i = num_states - 1; i >= 0; i--) {
int32 lower_bound = bounds[i].first,
upper_bound = std::min(last_backpointer, bounds[i].second);
if (upper_bound == lower_bound) {
last_backpointer = lower_bound;
continue;
}
BaseFloat best_cost = this_forward_cost[i];
int32 best_j = state_info_[i].backpointer, initial_best_j = best_j;
if (best_j == upper_bound) {
// if best_j already equals upper bound, don't bother tightening the
// upper bound, we'll tighten the lower bound when the time comes.
last_backpointer = best_j;
continue;
}
// Below, we have j > lower_bound + 1 because we know we've already
// evaluated lower_bound and lower_bound + 1 [via knowledge of
// this algorithm.]
for (int32 j = upper_bound; j > lower_bound + 1; j--) {
BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor
+ prev_forward_cost[j];
if (this_cost < best_cost) {
best_cost = this_cost;
best_j = j;
} else { // as soon as the costs stop improving, we stop searching,
// unless the best j is still lower than j, in which case
// we obviously need to keep moving.
if (best_j > j)
break; // this is a loose lower bound we're getting.
}
}
// our "best_j" is now an upper bound on the backpointer.
bounds[i].second = best_j;
if (best_j != initial_best_j) {
this_forward_cost[i] = best_cost;
state_info_[i].backpointer = best_j;
changed = true;
}
last_backpointer = best_j;
}
} else { // go forwards through the states.
last_backpointer = 0;
for (int32 i = 0; i < num_states; i++) {
int32 lower_bound = std::max(last_backpointer, bounds[i].first),
upper_bound = bounds[i].second;
if (upper_bound == lower_bound) {
last_backpointer = lower_bound;
continue;
}
BaseFloat best_cost = this_forward_cost[i];
int32 best_j = state_info_[i].backpointer, initial_best_j = best_j;
if (best_j == lower_bound) {
// if best_j already equals lower bound, we don't bother tightening
// the lower bound, we'll tighten the upper bound when the time
// comes.
last_backpointer = best_j;
continue;
}
// Below, we have j < upper_bound because we know we've already
// evaluated that point.
for (int32 j = lower_bound; j < upper_bound - 1; j++) {
BaseFloat this_cost = (j - i) * (j - i) * inter_frame_factor
+ prev_forward_cost[j];
if (this_cost < best_cost) {
best_cost = this_cost;
best_j = j;
} else { // as soon as the costs stop improving, we stop searching,
// unless the best j is still higher than j, in which case
// we obviously need to keep moving.
if (best_j < j)
break; // this is a loose lower bound we're getting.
}
}
// our "best_j" is now a lower bound on the backpointer.
bounds[i].first = best_j;
if (best_j != initial_best_j) {
this_forward_cost[i] = best_cost;
state_info_[i].backpointer = best_j;
changed = true;
}
last_backpointer = best_j;
}
}
if (!changed)
break;
}
}
// The next statement is needed due to RecomputeBacktraces: we have to
// invalidate the previously computed best-state info.
cur_best_state_ = -1;
this_forward_cost_vec->AddVec(1.0, local_cost);
}
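// A note on why the pruned search above is valid (our reading; not part of
// the original comments): the transition cost (j - i)^2 * inter_frame_factor
// satisfies the quadrangle (Monge) inequality, so the optimal backpointer
// best_j(i) is nondecreasing in i. The first pass can therefore start each
// search at the previous state's backpointer and stop as soon as costs stop
// improving, and the alternating backward/forward passes tighten each
// [lower_bound, upper_bound] interval until every backpointer is pinned
// down, giving roughly linear work per iteration instead of the naive
// O(num_states^2) search.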
void PitchFrameInfo::SetBestState(
int32 best_state,
std::vector<std::pair<int32, BaseFloat> > &lag_nccf) {
// This function would naturally be recursive, but we have coded this to avoid
// recursion, which would otherwise eat up the stack. Think of it as a static
// member function, except we do use "this" right at the beginning.
std::vector<std::pair<int32, BaseFloat> >::reverse_iterator iter = lag_nccf.rbegin();
PitchFrameInfo *this_info = this; // it will change in the loop.
while (this_info != NULL) {
PitchFrameInfo *prev_info = this_info->prev_info_;
if (best_state == this_info->cur_best_state_)
return; // no change
if (prev_info != NULL) // don't write anything for frame -1.
iter->first = best_state;
size_t state_info_index = best_state - this_info->state_offset_;
KALDI_ASSERT(state_info_index < this_info->state_info_.size());
this_info->cur_best_state_ = best_state;
best_state = this_info->state_info_[state_info_index].backpointer;
if (prev_info != NULL) // don't write anything for frame -1.
iter->second = this_info->state_info_[state_info_index].pov_nccf;
this_info = prev_info;
if (this_info != NULL) ++iter;
}
}
int32 PitchFrameInfo::ComputeLatency(int32 max_latency) {
if (max_latency <= 0) return 0;
int32 latency = 0;
// This function would naturally be recursive, but we have coded this to avoid
// recursion, which would otherwise eat up the stack. Think of it as a static
// member function, except we do use "this" right at the beginning.
// This function is called only on the most recent PitchFrameInfo object.
int32 num_states = state_info_.size();
int32 min_living_state = 0, max_living_state = num_states - 1;
PitchFrameInfo *this_info = this; // it will change in the loop.
for (; this_info != NULL && latency < max_latency;) {
int32 offset = this_info->state_offset_;
KALDI_ASSERT(min_living_state >= offset &&
max_living_state - offset < this_info->state_info_.size());
min_living_state =
this_info->state_info_[min_living_state - offset].backpointer;
max_living_state =
this_info->state_info_[max_living_state - offset].backpointer;
if (min_living_state == max_living_state) {
return latency;
}
this_info = this_info->prev_info_;
if (this_info != NULL) // avoid incrementing latency for frame -1,
latency++; // as it's not a real frame.
}
return latency;
}
void PitchFrameInfo::Cleanup(PitchFrameInfo *prev_frame) {
KALDI_ERR << "Cleanup not implemented.";
}
// struct NccfInfo is used to cache certain quantities that we need for online
// operation, for the first "recompute_frame" frames of the file (e.g. 300);
// after that many frames, or after the user calls InputFinished(), we redo the
// initial backtraces, as we'll then have a better estimate of the average signal
// energy.
struct NccfInfo {
Vector<BaseFloat> nccf_pitch_resampled; // resampled nccf_pitch
BaseFloat avg_norm_prod; // average value of e1 * e2.
BaseFloat mean_square_energy; // mean_square energy we used when computing the
// original ballast term for
// "nccf_pitch_resampled".
NccfInfo(BaseFloat avg_norm_prod,
BaseFloat mean_square_energy):
avg_norm_prod(avg_norm_prod),
mean_square_energy(mean_square_energy) { }
};
// We could inherit from OnlineBaseFeature as we have the same interface,
// but this would unnecessarily force a lot of our functions to be virtual.
class OnlinePitchFeatureImpl {
public:
explicit OnlinePitchFeatureImpl(const PitchExtractionOptions &opts);
int32 Dim() const { return 2; }
BaseFloat FrameShiftInSeconds() const;
int32 NumFramesReady() const;
bool IsLastFrame(int32 frame) const;
void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
void AcceptWaveform(BaseFloat sampling_rate,
const VectorBase<BaseFloat> &waveform);
void InputFinished();
~OnlinePitchFeatureImpl();
// Copy constructor; it can be used to obtain a new copy of this object,
// including any state from this utterance.
OnlinePitchFeatureImpl(const OnlinePitchFeatureImpl &other);
private:
/// This function works out from the signal how many frames are currently
/// available to process (this is called from inside AcceptWaveform()).
/// Note: the number of frames differs slightly from the number the
/// old pitch code gave.
/// Note: the number this returns depends on whether input_finished_ == true;
/// if it is, it will "force out" a final frame or two.
int32 NumFramesAvailable(int64 num_downsampled_samples, bool snip_edges) const;
/// This function extracts from the signal the samples numbered from
/// "sample_index" (numbered in the full downsampled signal, not just this
/// part), and of length equal to window->Dim(). It uses the data members
/// downsampled_samples_discarded_ and downsampled_signal_remainder_, as well
/// as the more recent part of the downsampled wave "downsampled_wave_part"
/// which is provided.
///
/// @param downsampled_wave_part One chunk of the downsampled wave,
/// starting from sample-index downsampled_samples_discarded_.
/// @param sample_index The desired starting sample index (measured from
/// the start of the whole signal, not just this part).
/// @param window The part of the signal is output to here.
void ExtractFrame(const VectorBase<BaseFloat> &downsampled_wave_part,
int64 sample_index,
VectorBase<BaseFloat> *window);
/// This function is called after we reach frame "recompute_frame", or when
/// InputFinished() is called, whichever comes sooner. It recomputes the
/// backtraces for frames zero through recompute_frame, if needed because the
/// average energy of the signal has changed, affecting the nccf ballast term.
/// It works out the average signal energy from
/// downsampled_samples_processed_, signal_sum_ and signal_sumsq_ (which, if
/// you see the calling code, might include more frames than just
/// "recompute_frame", it might include up to the end of the current chunk).
void RecomputeBacktraces();
/// This function updates downsampled_signal_remainder_,
/// downsampled_samples_processed_, signal_sum_ and signal_sumsq_; it's called
/// from AcceptWaveform().
void UpdateRemainder(const VectorBase<BaseFloat> &downsampled_wave_part);
// The following variables don't change throughout the lifetime
// of this object.
PitchExtractionOptions opts_;
// the first lag of the downsampled signal at which we measure NCCF
int32 nccf_first_lag_;
// the last lag of the downsampled signal at which we measure NCCF
int32 nccf_last_lag_;
// The log-spaced lags at which we will resample the NCCF
Vector<BaseFloat> lags_;
// This object is used to resample from evenly spaced to log-evenly-spaced
// nccf values. It's a pointer for convenience of initialization, so we don't
// have to use the initializer from the constructor.
ArbitraryResample *nccf_resampler_;
// The following objects may change during the lifetime of this object.
// This object is used to resample the signal.
LinearResample *signal_resampler_;
// frame_info_ is indexed by [frame-index + 1]. frame_info_[0] is an object
// that corresponds to frame -1, which is not a real frame.
std::vector<PitchFrameInfo*> frame_info_;
// nccf_info_ is indexed by frame-index, from frame 0 to at most
// opts_.recompute_frame - 1. It contains some information we'll
// need to recompute the tracebacks after getting a better estimate
// of the average energy of the signal.
std::vector<NccfInfo*> nccf_info_;
// Current number of frames which we can't output because Viterbi has not
// converged for them, or opts_.max_frames_latency if we have reached that
// limit.
int32 frames_latency_;
// The forward-cost at the current frame (the last frame in frame_info_);
// this has the same dimension as lags_. We normalize each time so
// the lowest cost is zero, for numerical accuracy and so we can use float.
Vector<BaseFloat> forward_cost_;
// stores the constant part of forward_cost_.
double forward_cost_remainder_;
// The resampled-lag index and the NCCF (as computed for POV, without ballast
// term) for each frame, as determined by Viterbi traceback from the best
// final state.
std::vector<std::pair<int32, BaseFloat> > lag_nccf_;
bool input_finished_;
/// sum-squared of previously processed parts of signal; used to get NCCF
/// ballast term. Denominator is downsampled_samples_processed_.
double signal_sumsq_;
/// sum of previously processed parts of signal; used to do mean-subtraction
/// when getting sum-squared, along with signal_sumsq_.
double signal_sum_;
/// downsampled_samples_processed is the number of samples (after
/// downsampling) that we got in previous calls to AcceptWaveform().
int64 downsampled_samples_processed_;
/// This is a small remainder of the previous downsampled signal;
/// it's used by ExtractFrame for frames near the boundary of two
/// waveforms supplied to AcceptWaveform().
Vector<BaseFloat> downsampled_signal_remainder_;
};
OnlinePitchFeatureImpl::OnlinePitchFeatureImpl(
const PitchExtractionOptions &opts):
opts_(opts), forward_cost_remainder_(0.0), input_finished_(false),
signal_sumsq_(0.0), signal_sum_(0.0), downsampled_samples_processed_(0) {
signal_resampler_ = new LinearResample(opts.samp_freq, opts.resample_freq,
opts.lowpass_cutoff,
opts.lowpass_filter_width);
double outer_min_lag = 1.0 / opts.max_f0 -
(opts.upsample_filter_width/(2.0 * opts.resample_freq));
double outer_max_lag = 1.0 / opts.min_f0 +
(opts.upsample_filter_width/(2.0 * opts.resample_freq));
nccf_first_lag_ = ceil(opts.resample_freq * outer_min_lag);
nccf_last_lag_ = floor(opts.resample_freq * outer_max_lag);
frames_latency_ = 0; // will be set in AcceptWaveform()
// Choose the lags at which we resample the NCCF.
SelectLags(opts, &lags_);
// upsample_cutoff is the filter cutoff for upsampling the NCCF, which is the
// Nyquist of the resampling frequency. The NCCF is (almost completely)
// bandlimited to around "lowpass_cutoff" (1000 by default), and when the
// spectrum of this bandlimited signal is convolved with the spectrum of an
// impulse train with frequency "resample_freq" (so the spectral images are
// separated by 4kHz with the defaults), we get energy at
// -5000,-3000, -1000...1000, 3000..5000, etc. Filtering with a cutoff at
// the Nyquist frequency (2000 by default) is sufficient to keep only the
// first repetition.
BaseFloat upsample_cutoff = opts.resample_freq * 0.5;
Vector<BaseFloat> lags_offset(lags_);
// lags_offset equals lags_ (which are the log-spaced lag values we want to
// measure the NCCF at) with nccf_first_lag_ / opts.resample_freq subtracted
// from each element, so we can treat the measured NCCF values as starting
// from sample zero in a signal that starts at the time
// nccf_first_lag_ / opts.resample_freq. This is necessary because the
// ArbitraryResample code assumes that the input signal starts from sample
// zero.
lags_offset.Add(-nccf_first_lag_ / opts.resample_freq);
int32 num_measured_lags = nccf_last_lag_ + 1 - nccf_first_lag_;
nccf_resampler_ = new ArbitraryResample(num_measured_lags, opts.resample_freq,
upsample_cutoff, lags_offset,
opts.upsample_filter_width);
// add a PitchInfo object for frame -1 (not a real frame).
frame_info_.push_back(new PitchFrameInfo(lags_.Dim()));
// zeroes forward_cost_; this is what we want for the fake frame -1.
forward_cost_.Resize(lags_.Dim());
}
int32 OnlinePitchFeatureImpl::NumFramesAvailable(
int64 num_downsampled_samples, bool snip_edges) const {
int32 frame_shift = opts_.NccfWindowShift(),
frame_length = opts_.NccfWindowSize();
// Use the "full frame length" to compute the number
// of frames only if the input is not finished.
if (!input_finished_)
frame_length += nccf_last_lag_;
if (num_downsampled_samples < frame_length) {
return 0;
} else {
if (!snip_edges) {
if (input_finished_) {
return static_cast<int32>(num_downsampled_samples * 1.0f /
frame_shift + 0.5f);
} else {
return static_cast<int32>((num_downsampled_samples - frame_length / 2) *
1.0f / frame_shift + 0.5f);
}
} else {
return static_cast<int32>((num_downsampled_samples - frame_length) /
frame_shift + 1);
}
}
}
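// Worked example (illustrative; assumes the default options, which to our
// knowledge are resample_freq = 4000, frame_length_ms = 25, frame_shift_ms =
// 10, min_f0 = 50, upsample_filter_width = 5): frame_length = 100 samples,
// frame_shift = 40 samples and nccf_last_lag_ = floor(4000 * (1/50 +
// 5/8000)) = 82, so before InputFinished() the first frame only becomes
// available once 100 + 82 = 182 downsampled samples (about 45 ms of signal)
// have been accumulated.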
void OnlinePitchFeatureImpl::UpdateRemainder(
const VectorBase<BaseFloat> &downsampled_wave_part) {
// frame_info_ has an extra element at frame-1, so subtract
// one from the length.
int64 num_frames = static_cast<int64>(frame_info_.size()) - 1,
next_frame = num_frames,
frame_shift = opts_.NccfWindowShift(),
next_frame_sample = frame_shift * next_frame;
signal_sumsq_ += VecVec(downsampled_wave_part, downsampled_wave_part);
signal_sum_ += downsampled_wave_part.Sum();
// next_frame_sample is the first sample index we'll need for the
// next frame.
int64 next_downsampled_samples_processed =
downsampled_samples_processed_ + downsampled_wave_part.Dim();
if (next_frame_sample > next_downsampled_samples_processed) {
// this could only happen in the weird situation that the full frame length
// is less than the frame shift.
int32 full_frame_length = opts_.NccfWindowSize() + nccf_last_lag_;
KALDI_ASSERT(full_frame_length < frame_shift && "Code error");
downsampled_signal_remainder_.Resize(0);
} else {
Vector<BaseFloat> new_remainder(next_downsampled_samples_processed -
next_frame_sample);
// note: next_frame_sample is the index into the entire signal, of
// new_remainder(0).
// i is the absolute index of the signal.
for (int64 i = next_frame_sample;
i < next_downsampled_samples_processed; i++) {
if (i >= downsampled_samples_processed_) { // in current signal.
new_remainder(i - next_frame_sample) =
downsampled_wave_part(i - downsampled_samples_processed_);
} else { // in old remainder; only reach here if waveform supplied is
new_remainder(i - next_frame_sample) = // tiny.
downsampled_signal_remainder_(i - downsampled_samples_processed_ +
downsampled_signal_remainder_.Dim());
}
}
downsampled_signal_remainder_.Swap(&new_remainder);
}
downsampled_samples_processed_ = next_downsampled_samples_processed;
}
void OnlinePitchFeatureImpl::ExtractFrame(
const VectorBase<BaseFloat> &downsampled_wave_part,
int64 sample_index,
VectorBase<BaseFloat> *window) {
int32 full_frame_length = window->Dim();
int32 offset = static_cast<int32>(sample_index -
downsampled_samples_processed_);
// Treat edge cases first
if (sample_index < 0) {
// Part of the frame is before the beginning of the signal. This
// should only happen if opts_.snip_edges == false, when we are
// processing the first few frames of signal. In this case
// we pad with zeros.
KALDI_ASSERT(opts_.snip_edges == false);
int32 sub_frame_length = sample_index + full_frame_length;
int32 sub_frame_index = full_frame_length - sub_frame_length;
KALDI_ASSERT(sub_frame_length > 0 && sub_frame_index > 0);
window->SetZero();
SubVector<BaseFloat> sub_window(*window, sub_frame_index, sub_frame_length);
ExtractFrame(downsampled_wave_part, 0, &sub_window);
return;
}
if (offset + full_frame_length > downsampled_wave_part.Dim()) {
// Requested frame is past end of the signal. This should only happen if
// input_finished_ == true, when we're flushing out the last couple of
// frames of signal. In this case we pad with zeros.
KALDI_ASSERT(input_finished_);
int32 sub_frame_length = downsampled_wave_part.Dim() - offset;
KALDI_ASSERT(sub_frame_length > 0);
window->SetZero();
SubVector<BaseFloat> sub_window(*window, 0, sub_frame_length);
ExtractFrame(downsampled_wave_part, sample_index, &sub_window);
return;
}
// "offset" is the offset of the start of the frame, into this
// signal.
if (offset >= 0) {
// frame is full inside the new part of the signal.
window->CopyFromVec(downsampled_wave_part.Range(offset, full_frame_length));
} else {
// frame is partly in the remainder and partly in the new part.
int32 remainder_offset = downsampled_signal_remainder_.Dim() + offset;
KALDI_ASSERT(remainder_offset >= 0); // or we didn't keep enough remainder.
KALDI_ASSERT(offset + full_frame_length > 0); // or we should have
// processed this frame last
// time.
int32 old_length = -offset, new_length = offset + full_frame_length;
window->Range(0, old_length).CopyFromVec(
downsampled_signal_remainder_.Range(remainder_offset, old_length));
window->Range(old_length, new_length).CopyFromVec(
downsampled_wave_part.Range(0, new_length));
}
if (opts_.preemph_coeff != 0.0) {
BaseFloat preemph_coeff = opts_.preemph_coeff;
for (int32 i = window->Dim() - 1; i > 0; i--)
(*window)(i) -= preemph_coeff * (*window)(i-1);
(*window)(0) *= (1.0 - preemph_coeff);
}
}
bool OnlinePitchFeatureImpl::IsLastFrame(int32 frame) const {
int32 T = NumFramesReady();
KALDI_ASSERT(frame < T);
return (input_finished_ && frame + 1 == T);
}
BaseFloat OnlinePitchFeatureImpl::FrameShiftInSeconds() const {
return opts_.frame_shift_ms / 1000.0f;
}
int32 OnlinePitchFeatureImpl::NumFramesReady() const {
int32 num_frames = lag_nccf_.size(),
latency = frames_latency_;
KALDI_ASSERT(latency <= num_frames);
return num_frames - latency;
}
void OnlinePitchFeatureImpl::GetFrame(int32 frame,
VectorBase<BaseFloat> *feat) {
KALDI_ASSERT(frame < NumFramesReady() && feat->Dim() == 2);
(*feat)(0) = lag_nccf_[frame].second;
(*feat)(1) = 1.0 / lags_(lag_nccf_[frame].first);
}
void OnlinePitchFeatureImpl::InputFinished() {
input_finished_ = true;
// Process an empty waveform; this has an effect because
// after setting input_finished_ to true, NumFramesAvailable()
// will return a slightly larger number.
AcceptWaveform(opts_.samp_freq, Vector<BaseFloat>());
int32 num_frames = static_cast<int32>(frame_info_.size()) - 1;
if (num_frames < opts_.recompute_frame && !opts_.nccf_ballast_online)
RecomputeBacktraces();
frames_latency_ = 0;
KALDI_VLOG(3) << "Pitch-tracking Viterbi cost is "
<< (forward_cost_remainder_ / num_frames)
<< " per frame, over " << num_frames << " frames.";
}
// see comment with declaration. This is only relevant for online
// operation (it also gets called in non-online mode, but is then a no-op).
void OnlinePitchFeatureImpl::RecomputeBacktraces() {
KALDI_ASSERT(!opts_.nccf_ballast_online);
int32 num_frames = static_cast<int32>(frame_info_.size()) - 1;
// The assertion reflects how we believe this function will be called.
KALDI_ASSERT(num_frames <= opts_.recompute_frame);
KALDI_ASSERT(nccf_info_.size() == static_cast<size_t>(num_frames));
if (num_frames == 0)
return;
double num_samp = downsampled_samples_processed_, sum = signal_sum_,
sumsq = signal_sumsq_, mean = sum / num_samp;
BaseFloat mean_square = sumsq / num_samp - mean * mean;
bool must_recompute = false;
BaseFloat threshold = 0.01;
for (int32 frame = 0; frame < num_frames; frame++)
if (!ApproxEqual(nccf_info_[frame]->mean_square_energy,
mean_square, threshold))
must_recompute = true;
if (!must_recompute) {
// Nothing to do. We'll reach here, for instance, if everything was in one
// chunk and opts_.nccf_ballast_online == false. This is the case for
// offline processing.
for (size_t i = 0; i < nccf_info_.size(); i++)
delete nccf_info_[i];
nccf_info_.clear();
return;
}
int32 num_states = forward_cost_.Dim(),
basic_frame_length = opts_.NccfWindowSize();
BaseFloat new_nccf_ballast = pow(mean_square * basic_frame_length, 2) *
opts_.nccf_ballast;
double forward_cost_remainder = 0.0;
Vector<BaseFloat> forward_cost(num_states), // start off at zero.
next_forward_cost(forward_cost);
std::vector<std::pair<int32, int32 > > index_info;
for (int32 frame = 0; frame < num_frames; frame++) {
NccfInfo &nccf_info = *nccf_info_[frame];
BaseFloat old_mean_square = nccf_info_[frame]->mean_square_energy,
avg_norm_prod = nccf_info_[frame]->avg_norm_prod,
old_nccf_ballast = pow(old_mean_square * basic_frame_length, 2) *
opts_.nccf_ballast,
nccf_scale = pow((old_nccf_ballast + avg_norm_prod) /
(new_nccf_ballast + avg_norm_prod),
static_cast<BaseFloat>(0.5));
// The "nccf_scale" is an estimate of the scaling factor by which the NCCF
// would change on this frame, on average, by changing the ballast term from
// "old_nccf_ballast" to "new_nccf_ballast". It's not exact because the
// "avg_norm_prod" is just an average of the product e1 * e2 of frame
// energies of the (frame, shifted-frame), but these won't change that much
// within a frame, and even if they do, the inaccuracy of the scaled NCCF
// will still be very small if the ballast term didn't change much, or if
// it's much larger or smaller than e1*e2. By doing it as a simple scaling,
// we save the overhead of the NCCF resampling, which is a considerable part
// of the whole computation.
nccf_info.nccf_pitch_resampled.Scale(nccf_scale);
frame_info_[frame + 1]->ComputeBacktraces(
opts_, nccf_info.nccf_pitch_resampled, lags_,
forward_cost, &index_info, &next_forward_cost);
forward_cost.Swap(&next_forward_cost);
BaseFloat remainder = forward_cost.Min();
forward_cost_remainder += remainder;
forward_cost.Add(-remainder);
}
KALDI_VLOG(3) << "Forward-cost per frame changed from "
<< (forward_cost_remainder_ / num_frames) << " to "
<< (forward_cost_remainder / num_frames);
forward_cost_remainder_ = forward_cost_remainder;
forward_cost_.Swap(&forward_cost);
int32 best_final_state;
forward_cost_.Min(&best_final_state);
if (lag_nccf_.size() != static_cast<size_t>(num_frames))
lag_nccf_.resize(num_frames);
frame_info_.back()->SetBestState(best_final_state, lag_nccf_);
frames_latency_ =
frame_info_.back()->ComputeLatency(opts_.max_frames_latency);
for (size_t i = 0; i < nccf_info_.size(); i++)
delete nccf_info_[i];
nccf_info_.clear();
}
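// Where nccf_scale above comes from (our derivation; not part of the original
// comments): since nccf = numerator / sqrt(e1 * e2 + ballast), changing only
// the ballast term rescales each value by
//   sqrt((e1 * e2 + old_ballast) / (e1 * e2 + new_ballast)),
// and the code substitutes the per-frame average avg_norm_prod for e1 * e2,
// which is what makes this a cheap approximation rather than a full NCCF
// recomputation.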
OnlinePitchFeatureImpl::~OnlinePitchFeatureImpl() {
delete nccf_resampler_;
delete signal_resampler_;
for (size_t i = 0; i < frame_info_.size(); i++)
delete frame_info_[i];
for (size_t i = 0; i < nccf_info_.size(); i++)
delete nccf_info_[i];
}
void OnlinePitchFeatureImpl::AcceptWaveform(
BaseFloat sampling_rate,
const VectorBase<BaseFloat> &wave) {
// flush out the last few samples of input waveform only if input_finished_ ==
// true.
const bool flush = input_finished_;
Vector<BaseFloat> downsampled_wave;
signal_resampler_->Resample(wave, flush, &downsampled_wave);
// these variables will be used to compute the root-mean-square value of the
// signal for the ballast term.
double cur_sumsq = signal_sumsq_, cur_sum = signal_sum_;
int64 cur_num_samp = downsampled_samples_processed_,
prev_frame_end_sample = 0;
if (!opts_.nccf_ballast_online) {
cur_sumsq += VecVec(downsampled_wave, downsampled_wave);
cur_sum += downsampled_wave.Sum();
cur_num_samp += downsampled_wave.Dim();
}
// end_frame is the total number of frames we can now process, including
// previously processed ones.
int32 end_frame = NumFramesAvailable(
downsampled_samples_processed_ + downsampled_wave.Dim(), opts_.snip_edges);
// "start_frame" is the first frame-index we process
int32 start_frame = frame_info_.size() - 1,
num_new_frames = end_frame - start_frame;
if (num_new_frames == 0) {
UpdateRemainder(downsampled_wave);
return;
// continuing to the rest of the code would generate
// an error when sizing matrices with zero rows, and
// anyway is a waste of time.
}
int32 num_measured_lags = nccf_last_lag_ + 1 - nccf_first_lag_,
num_resampled_lags = lags_.Dim(),
frame_shift = opts_.NccfWindowShift(),
basic_frame_length = opts_.NccfWindowSize(),
full_frame_length = basic_frame_length + nccf_last_lag_;
Vector<BaseFloat> window(full_frame_length),
inner_prod(num_measured_lags),
norm_prod(num_measured_lags);
Matrix<BaseFloat> nccf_pitch(num_new_frames, num_measured_lags),
nccf_pov(num_new_frames, num_measured_lags);
Vector<BaseFloat> cur_forward_cost(num_resampled_lags);
// Because the resampling of the NCCF is more efficient when grouped together,
// we first compute the NCCF for all frames, then resample as a matrix, then
// do the Viterbi [that happens inside the constructor of PitchFrameInfo].
for (int32 frame = start_frame; frame < end_frame; frame++) {
// start_sample is index into the whole wave, not just this part.
int64 start_sample;
if (opts_.snip_edges) {
// Usual case: offset starts at 0
start_sample = static_cast<int64>(frame) * frame_shift;
} else {
// When we are not snipping the edges, the first offsets may be
// negative. In this case we will pad with zeros, it should not impact
// the pitch tracker.
start_sample =
static_cast<int64>((frame + 0.5) * frame_shift) - full_frame_length / 2;
}
ExtractFrame(downsampled_wave, start_sample, &window);
if (opts_.nccf_ballast_online) {
// use only up to end of current frame to compute root-mean-square value.
// end_sample will be the sample-index into "downsampled_wave", so
// not really comparable to start_sample.
int64 end_sample = start_sample + full_frame_length -
downsampled_samples_processed_;
KALDI_ASSERT(end_sample > 0); // or should have processed this frame last
// time. Note: end_sample is one past last
// sample.
if (end_sample > downsampled_wave.Dim()) {
KALDI_ASSERT(input_finished_);
end_sample = downsampled_wave.Dim();
}
SubVector<BaseFloat> new_part(downsampled_wave, prev_frame_end_sample,
end_sample - prev_frame_end_sample);
cur_num_samp += new_part.Dim();
cur_sumsq += VecVec(new_part, new_part);
cur_sum += new_part.Sum();
prev_frame_end_sample = end_sample;
}
double mean_square = cur_sumsq / cur_num_samp -
pow(cur_sum / cur_num_samp, 2.0);
ComputeCorrelation(window, nccf_first_lag_, nccf_last_lag_,
basic_frame_length, &inner_prod, &norm_prod);
double nccf_ballast_pov = 0.0,
nccf_ballast_pitch = pow(mean_square * basic_frame_length, 2) *
opts_.nccf_ballast,
avg_norm_prod = norm_prod.Sum() / norm_prod.Dim();
SubVector<BaseFloat> nccf_pitch_row(nccf_pitch, frame - start_frame);
ComputeNccf(inner_prod, norm_prod, nccf_ballast_pitch,
&nccf_pitch_row);
SubVector<BaseFloat> nccf_pov_row(nccf_pov, frame - start_frame);
ComputeNccf(inner_prod, norm_prod, nccf_ballast_pov,
&nccf_pov_row);
if (frame < opts_.recompute_frame)
nccf_info_.push_back(new NccfInfo(avg_norm_prod, mean_square));
}
Matrix<BaseFloat> nccf_pitch_resampled(num_new_frames, num_resampled_lags);
nccf_resampler_->Resample(nccf_pitch, &nccf_pitch_resampled);
nccf_pitch.Resize(0, 0); // no longer needed.
Matrix<BaseFloat> nccf_pov_resampled(num_new_frames, num_resampled_lags);
nccf_resampler_->Resample(nccf_pov, &nccf_pov_resampled);
nccf_pov.Resize(0, 0); // no longer needed.
// We've finished dealing with the waveform so we can call UpdateRemainder
// now; we need to call it before we possibly call RecomputeBacktraces()
// below, which is why we don't do it at the very end.
UpdateRemainder(downsampled_wave);
std::vector<std::pair<int32, int32 > > index_info;
for (int32 frame = start_frame; frame < end_frame; frame++) {
int32 frame_idx = frame - start_frame;
PitchFrameInfo *prev_info = frame_info_.back(),
*cur_info = new PitchFrameInfo(prev_info);
cur_info->SetNccfPov(nccf_pov_resampled.Row(frame_idx));
cur_info->ComputeBacktraces(opts_, nccf_pitch_resampled.Row(frame_idx),
lags_, forward_cost_, &index_info,
&cur_forward_cost);
forward_cost_.Swap(&cur_forward_cost);
// Renormalize forward_cost so smallest element is zero.
BaseFloat remainder = forward_cost_.Min();
forward_cost_remainder_ += remainder;
forward_cost_.Add(-remainder);
frame_info_.push_back(cur_info);
if (frame < opts_.recompute_frame)
nccf_info_[frame]->nccf_pitch_resampled =
nccf_pitch_resampled.Row(frame_idx);
if (frame == opts_.recompute_frame - 1 && !opts_.nccf_ballast_online)
RecomputeBacktraces();
}
// Trace back the best-path.
int32 best_final_state;
forward_cost_.Min(&best_final_state);
lag_nccf_.resize(frame_info_.size() - 1); // will keep any existing data.
frame_info_.back()->SetBestState(best_final_state, lag_nccf_);
frames_latency_ =
frame_info_.back()->ComputeLatency(opts_.max_frames_latency);
KALDI_VLOG(4) << "Latency is " << frames_latency_;
}
// Some functions that forward from OnlinePitchFeature to
// OnlinePitchFeatureImpl.
int32 OnlinePitchFeature::NumFramesReady() const {
return impl_->NumFramesReady();
}
OnlinePitchFeature::OnlinePitchFeature(const PitchExtractionOptions &opts)
:impl_(new OnlinePitchFeatureImpl(opts)) { }
bool OnlinePitchFeature::IsLastFrame(int32 frame) const {
return impl_->IsLastFrame(frame);
}
BaseFloat OnlinePitchFeature::FrameShiftInSeconds() const {
return impl_->FrameShiftInSeconds();
}
void OnlinePitchFeature::GetFrame(int32 frame, VectorBase<BaseFloat> *feat) {
impl_->GetFrame(frame, feat);
}
void OnlinePitchFeature::AcceptWaveform(
BaseFloat sampling_rate,
const VectorBase<BaseFloat> &waveform) {
impl_->AcceptWaveform(sampling_rate, waveform);
}
void OnlinePitchFeature::InputFinished() {
impl_->InputFinished();
}
OnlinePitchFeature::~OnlinePitchFeature() {
delete impl_;
}
/**
This function is called from ComputeKaldiPitch when the user
specifies opts.simulate_first_pass_online == true. It gives
the "first-pass" version of the features, which you would get
on the first decoding pass in an online setting. These may
differ slightly from the final features due to both the
way the Viterbi traceback works (this is affected by
opts.max_frames_latency), and the online way we compute
the average signal energy.
*/
void ComputeKaldiPitchFirstPass(
const PitchExtractionOptions &opts,
const VectorBase<BaseFloat> &wave,
Matrix<BaseFloat> *output) {
int32 cur_rows = 100;
Matrix<BaseFloat> feats(cur_rows, 2);
OnlinePitchFeature pitch_extractor(opts);
KALDI_ASSERT(opts.frames_per_chunk > 0 &&
"--simulate-first-pass-online option does not make sense "
"unless you specify --frames-per-chunk");
int32 cur_offset = 0, cur_frame = 0, samp_per_chunk =
opts.frames_per_chunk * opts.samp_freq * opts.frame_shift_ms / 1000.0f;
while (cur_offset < wave.Dim()) {
int32 num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset);
SubVector<BaseFloat> wave_chunk(wave, cur_offset, num_samp);
pitch_extractor.AcceptWaveform(opts.samp_freq, wave_chunk);
cur_offset += num_samp;
if (cur_offset == wave.Dim())
pitch_extractor.InputFinished();
// Get each frame as soon as it is ready.
for (; cur_frame < pitch_extractor.NumFramesReady(); cur_frame++) {
if (cur_frame >= cur_rows) {
cur_rows *= 2;
feats.Resize(cur_rows, 2, kCopyData);
}
SubVector<BaseFloat> row(feats, cur_frame);
pitch_extractor.GetFrame(cur_frame, &row);
}
}
if (cur_frame == 0) {
KALDI_WARN << "No features output since wave file too short";
output->Resize(0, 0);
} else {
*output = feats.RowRange(0, cur_frame);
}
}
void ComputeKaldiPitch(const PitchExtractionOptions &opts,
const VectorBase<BaseFloat> &wave,
Matrix<BaseFloat> *output) {
if (opts.simulate_first_pass_online) {
ComputeKaldiPitchFirstPass(opts, wave, output);
return;
}
OnlinePitchFeature pitch_extractor(opts);
if (opts.frames_per_chunk == 0) {
pitch_extractor.AcceptWaveform(opts.samp_freq, wave);
} else {
// the user may set opts.frames_per_chunk for better compatibility with
// online operation.
KALDI_ASSERT(opts.frames_per_chunk > 0);
int32 cur_offset = 0, samp_per_chunk =
opts.frames_per_chunk * opts.samp_freq * opts.frame_shift_ms / 1000.0f;
while (cur_offset < wave.Dim()) {
int32 num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset);
SubVector<BaseFloat> wave_chunk(wave, cur_offset, num_samp);
pitch_extractor.AcceptWaveform(opts.samp_freq, wave_chunk);
cur_offset += num_samp;
}
}
pitch_extractor.InputFinished();
int32 num_frames = pitch_extractor.NumFramesReady();
if (num_frames == 0) {
KALDI_WARN << "No frames output in pitch extraction";
output->Resize(0, 0);
return;
}
output->Resize(num_frames, 2);
for (int32 frame = 0; frame < num_frames; frame++) {
SubVector<BaseFloat> row(*output, frame);
pitch_extractor.GetFrame(frame, &row);
}
}
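// Illustrative usage sketch (not part of the original code; assumes "wave"
// already holds a mono waveform sampled at 16 kHz):
//
//   PitchExtractionOptions opts;
//   opts.samp_freq = 16000;
//   Matrix<BaseFloat> features;
//   ComputeKaldiPitch(opts, wave, &features);
//   // features has one row per frame and 2 columns:
//   // column 0 = NCCF (voicing evidence), column 1 = pitch in Hz.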
/*
This comment describes our investigation of how much latency the
online-processing algorithm introduces, i.e. how many frames you would
typically have to wait until the traceback converges, if you were to set the
--max-frames-latency to a very large value.
This was done on a couple of files of language-id data.
/home/dpovey/kaldi-online/src/featbin/compute-kaldi-pitch-feats --frames-per-chunk=10 --max-frames-latency=100 --verbose=4 --sample-frequency=8000 --resample-frequency=2600 "scp:head -n 2 data/train/wav.scp |" ark:/dev/null 2>&1 | grep Latency | wc
4871 24355 443991
/home/dpovey/kaldi-online/src/featbin/compute-kaldi-pitch-feats --frames-per-chunk=10 --max-frames-latency=100 --verbose=4 --sample-frequency=8000 --resample-frequency=2600 "scp:head -n 2 data/train/wav.scp |" ark:/dev/null 2>&1 | grep Latency | grep 100 | wc
1534 7670 141128
# as above, but with 50 instead of 100 in the --max-frames-latency and grep statements.
2070 10350 188370
# as above, but with 10 instead of 50.
4067 20335 370097
This says that out of 4871 selected frames [we measured the latency every 10
frames, since --frames-per-chunk=10], in 1534 frames (31%), the latency was
>= 100 frames, i.e. >= 1 second. Including the other numbers, we can see
that
31% of frames had latency >= 1 second
42% of frames had latency >= 0.5 second
83% of frames had latency >= 0.1 second.
This doesn't necessarily mean that we actually have a latency of >= 1 second 31% of
the time when using these features, since by using the --max-frames-latency option
(default: 30 frames), it will limit the latency to, say, 0.3 seconds, and trace back
from the best current pitch. Most of the time this will probably cause no change in
the pitch traceback since the best current pitch is probably the "right" point to
trace back from. And anyway, in the online-decoding, we will most likely rescore
the features at the end anyway, and the traceback gets recomputed, so there will
be no inaccuracy (assuming the first-pass lattice had everything we needed).
Probably the greater source of inaccuracy due to the online algorithm is the
online energy-normalization, which affects the NCCF-ballast term, and which,
for reasons of efficiency, we don't attempt to "correct" in a later rescoring
pass. This will make the most difference in the first few frames of the file,
before the first voicing, where it will tend to produce more pitch movement
than the offline version of the algorithm.
*/
// Function to do data accumulation for on-line usage
template<typename Real>
inline void AppendVector(const VectorBase<Real> &src, Vector<Real> *dst) {
if (src.Dim() == 0) return;
dst->Resize(dst->Dim() + src.Dim(), kCopyData);
dst->Range(dst->Dim() - src.Dim(), src.Dim()).CopyFromVec(src);
}
/**
Note on the implementation of OnlineProcessPitch: the
OnlineFeatureInterface allows random access to features (i.e. not necessarily
sequential order), so we need to support that. But we don't need to support
it very efficiently, and our implementation is most efficient if frames are
accessed in sequential order.
Also note: we have to be a bit careful in this implementation because
the input features may change. That is: if we call
src_->GetFrame(t, &vec) from GetFrame(), we can't guarantee that a later
call to src_->GetFrame(t, &vec) from another GetFrame() will return the
same value. In fact, while designing this class we used some knowledge
of how the OnlinePitchFeature class works to minimize the amount of
re-querying we had to do.
*/
OnlineProcessPitch::OnlineProcessPitch(
const ProcessPitchOptions &opts,
OnlineFeatureInterface *src):
opts_(opts), src_(src),
dim_ ((opts.add_pov_feature ? 1 : 0)
+ (opts.add_normalized_log_pitch ? 1 : 0)
+ (opts.add_delta_pitch ? 1 : 0)
+ (opts.add_raw_log_pitch ? 1 : 0)) {
KALDI_ASSERT(dim_ > 0 &&
" At least one of the pitch features should be chosen. "
"Check your post-process-pitch options.");
KALDI_ASSERT(src->Dim() == kRawFeatureDim &&
"Input feature must be pitch feature (should have dimension 2)");
}
void OnlineProcessPitch::GetFrame(int32 frame,
VectorBase<BaseFloat> *feat) {
int32 frame_delayed = frame < opts_.delay ? 0 : frame - opts_.delay;
KALDI_ASSERT(feat->Dim() == dim_ &&
frame_delayed < NumFramesReady());
int32 index = 0;
if (opts_.add_pov_feature)
(*feat)(index++) = GetPovFeature(frame_delayed);
if (opts_.add_normalized_log_pitch)
(*feat)(index++) = GetNormalizedLogPitchFeature(frame_delayed);
if (opts_.add_delta_pitch)
(*feat)(index++) = GetDeltaPitchFeature(frame_delayed);
if (opts_.add_raw_log_pitch)
(*feat)(index++) = GetRawLogPitchFeature(frame_delayed);
KALDI_ASSERT(index == dim_);
}
BaseFloat OnlineProcessPitch::GetPovFeature(int32 frame) const {
Vector<BaseFloat> tmp(kRawFeatureDim);
src_->GetFrame(frame, &tmp); // (NCCF, pitch) from pitch extractor
BaseFloat nccf = tmp(0);
return opts_.pov_scale * NccfToPovFeature(nccf)
+ opts_.pov_offset;
}
BaseFloat OnlineProcessPitch::GetDeltaPitchFeature(int32 frame) {
// Rather than computing the delta pitch directly in code here,
// which might seem easier, we accumulate a small window of features
// and call ComputeDeltas. This might seem like overkill; the reason
// we do it this way is to ensure that the end effects (at file
// beginning and end) are handled in a consistent way.
int32 context = opts_.delta_window;
int32 start_frame = std::max(0, frame - context),
end_frame = std::min(frame + context + 1, src_->NumFramesReady()),
frames_in_window = end_frame - start_frame;
Matrix<BaseFloat> feats(frames_in_window, 1),
delta_feats;
for (int32 f = start_frame; f < end_frame; f++)
feats(f - start_frame, 0) = GetRawLogPitchFeature(f);
DeltaFeaturesOptions delta_opts;
delta_opts.order = 1;
delta_opts.window = opts_.delta_window;
ComputeDeltas(delta_opts, feats, &delta_feats);
while (delta_feature_noise_.size() <= static_cast<size_t>(frame)) {
delta_feature_noise_.push_back(RandGauss() *
opts_.delta_pitch_noise_stddev);
}
// note: delta_feats will have two columns; the second contains the deltas.
return (delta_feats(frame - start_frame, 1) + delta_feature_noise_[frame]) *
opts_.delta_pitch_scale;
}
BaseFloat OnlineProcessPitch::GetRawLogPitchFeature(int32 frame) const {
Vector<BaseFloat> tmp(kRawFeatureDim);
src_->GetFrame(frame, &tmp);
BaseFloat pitch = tmp(1);
KALDI_ASSERT(pitch > 0);
return Log(pitch);
}
BaseFloat OnlineProcessPitch::GetNormalizedLogPitchFeature(int32 frame) {
UpdateNormalizationStats(frame);
BaseFloat log_pitch = GetRawLogPitchFeature(frame),
avg_log_pitch = normalization_stats_[frame].sum_log_pitch_pov /
normalization_stats_[frame].sum_pov,
normalized_log_pitch = log_pitch - avg_log_pitch;
return normalized_log_pitch * opts_.pitch_scale;
}
// inline
void OnlineProcessPitch::GetNormalizationWindow(int32 t,
int32 src_frames_ready,
int32 *window_begin,
int32 *window_end) const {
int32 left_context = opts_.normalization_left_context;
int32 right_context = opts_.normalization_right_context;
*window_begin = std::max(0, t - left_context);
*window_end = std::min(t + right_context + 1, src_frames_ready);
}
// Makes sure the entry in normalization_stats_ for this frame is up to date;
// called from GetNormalizedLogPitchFeature.
// the cur_num_frames and input_finished variables are needed because the
// pitch features for a given frame may change as we see more data.
void OnlineProcessPitch::UpdateNormalizationStats(int32 frame) {
KALDI_ASSERT(frame >= 0);
if (normalization_stats_.size() <= frame)
normalization_stats_.resize(frame + 1);
int32 cur_num_frames = src_->NumFramesReady();
bool input_finished = src_->IsLastFrame(cur_num_frames - 1);
NormalizationStats &this_stats = normalization_stats_[frame];
if (this_stats.cur_num_frames == cur_num_frames &&
this_stats.input_finished == input_finished) {
// Stats are fully up-to-date.
return;
}
int32 this_window_begin, this_window_end;
GetNormalizationWindow(frame, cur_num_frames,
&this_window_begin, &this_window_end);
if (frame > 0) {
const NormalizationStats &prev_stats = normalization_stats_[frame - 1];
if (prev_stats.cur_num_frames == cur_num_frames &&
prev_stats.input_finished == input_finished) {
// we'll derive this_stats efficiently from prev_stats.
// Checking that cur_num_frames and input_finished have not changed
// ensures that the underlying features will not have changed.
this_stats = prev_stats;
int32 prev_window_begin, prev_window_end;
GetNormalizationWindow(frame - 1, cur_num_frames,
&prev_window_begin, &prev_window_end);
if (this_window_begin != prev_window_begin) {
KALDI_ASSERT(this_window_begin == prev_window_begin + 1);
Vector<BaseFloat> tmp(kRawFeatureDim);
src_->GetFrame(prev_window_begin, &tmp);
BaseFloat accurate_pov = NccfToPov(tmp(0)),
log_pitch = Log(tmp(1));
this_stats.sum_pov -= accurate_pov;
this_stats.sum_log_pitch_pov -= accurate_pov * log_pitch;
}
if (this_window_end != prev_window_end) {
KALDI_ASSERT(this_window_end == prev_window_end + 1);
Vector<BaseFloat> tmp(kRawFeatureDim);
src_->GetFrame(prev_window_end, &tmp);
BaseFloat accurate_pov = NccfToPov(tmp(0)),
log_pitch = Log(tmp(1));
this_stats.sum_pov += accurate_pov;
this_stats.sum_log_pitch_pov += accurate_pov * log_pitch;
}
return;
}
}
// The way we do it here is not the most efficient way to do it;
// we'll see if it becomes a problem. The issue is we have to redo
// this computation from scratch each time we process a new chunk, which
// may be a little inefficient if the chunk-size is very small.
this_stats.cur_num_frames = cur_num_frames;
this_stats.input_finished = input_finished;
this_stats.sum_pov = 0.0;
this_stats.sum_log_pitch_pov = 0.0;
Vector<BaseFloat> tmp(kRawFeatureDim);
for (int32 f = this_window_begin; f < this_window_end; f++) {
src_->GetFrame(f, &tmp);
BaseFloat accurate_pov = NccfToPov(tmp(0)),
log_pitch = Log(tmp(1));
this_stats.sum_pov += accurate_pov;
this_stats.sum_log_pitch_pov += accurate_pov * log_pitch;
}
}
int32 OnlineProcessPitch::NumFramesReady() const {
int32 src_frames_ready = src_->NumFramesReady();
if (src_frames_ready == 0) {
return 0;
} else if (src_->IsLastFrame(src_frames_ready - 1)) {
return src_frames_ready + opts_.delay;
} else {
return std::max(0, src_frames_ready -
opts_.normalization_right_context + opts_.delay);
}
}
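// Illustrative worked example of the logic above (not in the original
// source): with the default ProcessPitchOptions (normalization_right_context
// = 75, delay = 0), if the source has 100 frames ready and is not yet
// finished, we report std::max(0, 100 - 75 + 0) = 25 frames ready; once the
// source signals its last frame, all 100 frames (plus the delay) are ready.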
void ProcessPitch(const ProcessPitchOptions &opts,
const MatrixBase<BaseFloat> &input,
Matrix<BaseFloat> *output) {
OnlineMatrixFeature pitch_feat(input);
OnlineProcessPitch online_process_pitch(opts, &pitch_feat);
output->Resize(online_process_pitch.NumFramesReady(),
online_process_pitch.Dim());
for (int32 t = 0; t < online_process_pitch.NumFramesReady(); t++) {
SubVector<BaseFloat> row(*output, t);
online_process_pitch.GetFrame(t, &row);
}
}
void ComputeAndProcessKaldiPitch(
const PitchExtractionOptions &pitch_opts,
const ProcessPitchOptions &process_opts,
const VectorBase<BaseFloat> &wave,
Matrix<BaseFloat> *output) {
OnlinePitchFeature pitch_extractor(pitch_opts);
if (pitch_opts.simulate_first_pass_online) {
KALDI_ASSERT(pitch_opts.frames_per_chunk > 0 &&
"--simulate-first-pass-online option does not make sense "
"unless you specify --frames-per-chunk");
}
OnlineProcessPitch post_process(process_opts, &pitch_extractor);
int32 cur_rows = 100;
Matrix<BaseFloat> feats(cur_rows, post_process.Dim());
int32 cur_offset = 0, cur_frame = 0,
samp_per_chunk = pitch_opts.frames_per_chunk *
pitch_opts.samp_freq * pitch_opts.frame_shift_ms / 1000.0f;
// We request the first-pass features as soon as they are available,
// regardless of whether opts.simulate_first_pass_online == true. If
// opts.simulate_first_pass_online == true this should
// not affect the features generated, but it helps us to test the code
// in a way that's closer to what online decoding would see.
while (cur_offset < wave.Dim()) {
int32 num_samp;
if (samp_per_chunk > 0)
num_samp = std::min(samp_per_chunk, wave.Dim() - cur_offset);
else // user left opts.frames_per_chunk at zero.
num_samp = wave.Dim();
SubVector<BaseFloat> wave_chunk(wave, cur_offset, num_samp);
pitch_extractor.AcceptWaveform(pitch_opts.samp_freq, wave_chunk);
cur_offset += num_samp;
if (cur_offset == wave.Dim())
pitch_extractor.InputFinished();
// Get each frame as soon as it is ready.
for (; cur_frame < post_process.NumFramesReady(); cur_frame++) {
if (cur_frame >= cur_rows) {
cur_rows *= 2;
feats.Resize(cur_rows, post_process.Dim(), kCopyData);
}
SubVector<BaseFloat> row(feats, cur_frame);
post_process.GetFrame(cur_frame, &row);
}
}
if (pitch_opts.simulate_first_pass_online) {
if (cur_frame == 0) {
KALDI_WARN << "No features output since wave file too short";
output->Resize(0, 0);
} else {
*output = feats.RowRange(0, cur_frame);
}
} else {
// want the "final" features for second pass, so get them again.
output->Resize(post_process.NumFramesReady(), post_process.Dim());
for (int32 frame = 0; frame < post_process.NumFramesReady(); frame++) {
SubVector<BaseFloat> row(*output, frame);
post_process.GetFrame(frame, &row);
}
}
}
} // namespace kaldi
// feat/pitch-functions.h
// Copyright 2013 Pegah Ghahremani
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2014 Yanqing Sun, Junjie Wang,
// Daniel Povey, Korbinian Riedhammer
// Xin Lei
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_PITCH_FUNCTIONS_H_
#define KALDI_FEAT_PITCH_FUNCTIONS_H_
#include <cassert>
#include <cstdlib>
#include <string>
#include <vector>
#include "base/kaldi-error.h"
#include "feat/mel-computations.h"
#include "feat/online-feature-itf.h"
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
struct PitchExtractionOptions {
// FrameExtractionOptions frame_opts;
BaseFloat samp_freq; // sample frequency in hertz
BaseFloat frame_shift_ms; // in milliseconds.
BaseFloat frame_length_ms; // in milliseconds.
BaseFloat preemph_coeff; // Preemphasis coefficient. [use is deprecated.]
BaseFloat min_f0; // min f0 to search (Hz)
BaseFloat max_f0; // max f0 to search (Hz)
BaseFloat soft_min_f0; // Minimum f0, applied in soft way, must not
// exceed min-f0
BaseFloat penalty_factor; // cost factor for F0 change
BaseFloat lowpass_cutoff; // cutoff frequency for Low pass filter
BaseFloat resample_freq; // Frequency that we down-sample the signal to;
// must be more than twice lowpass_cutoff
BaseFloat delta_pitch; // the pitch tolerance in pruning lags
BaseFloat nccf_ballast; // Increasing this factor reduces NCCF for
// quiet frames, helping ensure pitch
// continuity in unvoiced regions
int32 lowpass_filter_width; // Integer that determines filter width of
// lowpass filter
int32 upsample_filter_width; // Integer that determines filter width when
// upsampling NCCF
// Below are newer config variables, not present in the original paper,
// that relate to the online pitch extraction algorithm.
// The maximum number of frames of latency that we allow the pitch-processing
// to introduce, for online operation. If you set this to a large value,
// there would be no inaccuracy from the Viterbi traceback (but it might make
// you wait to see the pitch). This is not very relevant for the online
// operation: normalization-right-context is more relevant, you
// can just leave this value at zero.
int32 max_frames_latency;
// Only relevant for the function ComputeKaldiPitch which is called by
// compute-kaldi-pitch-feats. If nonzero, we provide the input as chunks of
// this size. This affects the energy normalization which has a small effect
// on the resulting features, especially at the beginning of a file. For best
// compatibility with online operation (e.g. if you plan to train models for
// the online-decoding setup), you might want to set this to a small value,
// like one frame.
int32 frames_per_chunk;
// Only relevant for the function ComputeKaldiPitch which is called by
// compute-kaldi-pitch-feats, and only relevant if frames_per_chunk is
// nonzero. If true, it will query the features as soon as they are
// available, which simulates the first-pass features you would get in online
// decoding. If false, the features you will get will be the same as those
// available at the end of the utterance, after InputFinished() has been
// called: e.g. during lattice rescoring.
bool simulate_first_pass_online;
// Only relevant for online operation or when emulating online operation
// (e.g. when setting frames_per_chunk). This is the frame-index on which we
// recompute the NCCF (e.g. frame-index 500 = after 5 seconds); if the
// segment ends before this we do it when the segment ends. We do this by
// re-computing the signal average energy, which affects the NCCF via the
// "ballast term", scaling the resampled NCCF by a factor derived from the
// average change in the "ballast term", and re-doing the backtrace
// computation. Making this infinity would be the most exact, but would
// introduce unwanted latency at the end of long utterances, for little
// benefit.
int32 recompute_frame;
// This is a "hidden config" used only for testing the online pitch
// extraction. If true, we compute the signal root-mean-squared for the
// ballast term, only up to the current frame, rather than the end of the
// current chunk of signal. This makes the output insensitive to the
// chunking, which is useful for testing purposes.
bool nccf_ballast_online;
bool snip_edges;
PitchExtractionOptions():
samp_freq(16000),
frame_shift_ms(10.0),
frame_length_ms(25.0),
preemph_coeff(0.0),
min_f0(50),
max_f0(400),
soft_min_f0(10.0),
penalty_factor(0.1),
lowpass_cutoff(1000),
resample_freq(4000),
delta_pitch(0.005),
nccf_ballast(7000),
lowpass_filter_width(1),
upsample_filter_width(5),
max_frames_latency(0),
frames_per_chunk(0),
simulate_first_pass_online(false),
recompute_frame(500),
nccf_ballast_online(false),
snip_edges(true) { }
void Register(OptionsItf *opts) {
opts->Register("sample-frequency", &samp_freq,
"Waveform data sample frequency (must match the waveform "
"file, if specified there)");
opts->Register("frame-length", &frame_length_ms, "Frame length in "
"milliseconds");
opts->Register("frame-shift", &frame_shift_ms, "Frame shift in "
"milliseconds");
opts->Register("preemphasis-coefficient", &preemph_coeff,
"Coefficient for use in signal preemphasis (deprecated)");
opts->Register("min-f0", &min_f0,
"min. F0 to search for (Hz)");
opts->Register("max-f0", &max_f0,
"max. F0 to search for (Hz)");
opts->Register("soft-min-f0", &soft_min_f0,
"Minimum f0, applied in soft way, must not exceed min-f0");
opts->Register("penalty-factor", &penalty_factor,
"cost factor for FO change.");
opts->Register("lowpass-cutoff", &lowpass_cutoff,
"cutoff frequency for LowPass filter (Hz) ");
opts->Register("resample-frequency", &resample_freq,
"Frequency that we down-sample the signal to. Must be "
"more than twice lowpass-cutoff");
opts->Register("delta-pitch", &delta_pitch,
"Smallest relative change in pitch that our algorithm "
"measures");
opts->Register("nccf-ballast", &nccf_ballast,
"Increasing this factor reduces NCCF for quiet frames");
opts->Register("nccf-ballast-online", &nccf_ballast_online,
"This is useful mainly for debug; it affects how the NCCF "
"ballast is computed.");
opts->Register("lowpass-filter-width", &lowpass_filter_width,
"Integer that determines filter width of "
"lowpass filter, more gives sharper filter");
opts->Register("upsample-filter-width", &upsample_filter_width,
"Integer that determines filter width when upsampling NCCF");
opts->Register("frames-per-chunk", &frames_per_chunk, "Only relevant for "
"offline pitch extraction (e.g. compute-kaldi-pitch-feats), "
"you can set it to a small nonzero value, such as 10, for "
"better feature compatibility with online decoding (affects "
"energy normalization in the algorithm)");
opts->Register("simulate-first-pass-online", &simulate_first_pass_online,
"If true, compute-kaldi-pitch-feats will output features "
"that correspond to what an online decoder would see in the "
"first pass of decoding-- not the final version of the "
"features, which is the default. Relevant if "
"--frames-per-chunk > 0");
opts->Register("recompute-frame", &recompute_frame, "Only relevant for "
"online pitch extraction, or for compatibility with online "
"pitch extraction. A non-critical parameter; the frame at "
"which we recompute some of the forward pointers, after "
"revising our estimate of the signal energy. Relevant if"
"--frames-per-chunk > 0");
opts->Register("max-frames-latency", &max_frames_latency, "Maximum number "
"of frames of latency that we allow pitch tracking to "
"introduce into the feature processing (affects output only "
"if --frames-per-chunk > 0 and "
"--simulate-first-pass-online=true");
opts->Register("snip-edges", &snip_edges, "If this is set to false, the "
"incomplete frames near the ending edge won't be snipped, "
"so that the number of frames is the file size divided by "
"the frame-shift. This makes different types of features "
"give the same number of frames.");
}
/// Returns the window-size in samples, after resampling. This is the
/// "basic window size", not the full window size after extending by max-lag.
// Because of floating point representation, it is more reliable to divide
// by 1000 instead of multiplying by 0.001, but it is a bit slower.
int32 NccfWindowSize() const {
return static_cast<int32>(resample_freq * frame_length_ms / 1000.0);
}
/// Returns the window-shift in samples, after resampling.
int32 NccfWindowShift() const {
return static_cast<int32>(resample_freq * frame_shift_ms / 1000.0);
}
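// Worked example with the defaults above (resample_freq = 4000,
// frame_length_ms = 25.0, frame_shift_ms = 10.0):
// NccfWindowSize() = 4000 * 25 / 1000 = 100 samples, and
// NccfWindowShift() = 4000 * 10 / 1000 = 40 samples.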
};
struct ProcessPitchOptions {
BaseFloat pitch_scale; // the final normalized-log-pitch feature is scaled
// with this value
BaseFloat pov_scale; // the final POV feature is scaled with this value
BaseFloat pov_offset; // An offset that can be added to the final POV
// feature (useful for online-decoding, where we don't
// do CMN to the pitch-derived features).
BaseFloat delta_pitch_scale;
BaseFloat delta_pitch_noise_stddev; // stddev of noise we add to delta-pitch
int32 normalization_left_context; // left-context used for sliding-window
// normalization
int32 normalization_right_context; // this should be reduced in online
// decoding to reduce latency
int32 delta_window;
int32 delay;
bool add_pov_feature;
bool add_normalized_log_pitch;
bool add_delta_pitch;
bool add_raw_log_pitch;
ProcessPitchOptions() :
pitch_scale(2.0),
pov_scale(2.0),
pov_offset(0.0),
delta_pitch_scale(10.0),
delta_pitch_noise_stddev(0.005),
normalization_left_context(75),
normalization_right_context(75),
delta_window(2),
delay(0),
add_pov_feature(true),
add_normalized_log_pitch(true),
add_delta_pitch(true),
add_raw_log_pitch(false) { }
void Register(ParseOptions *opts) {
opts->Register("pitch-scale", &pitch_scale,
"Scaling factor for the final normalized log-pitch value");
opts->Register("pov-scale", &pov_scale,
"Scaling factor for final POV (probability of voicing) "
"feature");
opts->Register("pov-offset", &pov_offset,
"This can be used to add an offset to the POV feature. "
"Intended for use in online decoding as a substitute for "
" CMN.");
opts->Register("delta-pitch-scale", &delta_pitch_scale,
"Term to scale the final delta log-pitch feature");
opts->Register("delta-pitch-noise-stddev", &delta_pitch_noise_stddev,
"Standard deviation for noise we add to the delta log-pitch "
"(before scaling); should be about the same as delta-pitch "
"option to pitch creation. The purpose is to get rid of "
"peaks in the delta-pitch caused by discretization of pitch "
"values.");
opts->Register("normalization-left-context", &normalization_left_context,
"Left-context (in frames) for moving window normalization");
opts->Register("normalization-right-context", &normalization_right_context,
"Right-context (in frames) for moving window normalization");
opts->Register("delta-window", &delta_window,
"Number of frames on each side of central frame, to use for "
"delta window.");
opts->Register("delay", &delay,
"Number of frames by which the pitch information is "
"delayed.");
opts->Register("add-pov-feature", &add_pov_feature,
"If true, the warped NCCF is added to output features");
opts->Register("add-normalized-log-pitch", &add_normalized_log_pitch,
"If true, the log-pitch with POV-weighted mean subtraction "
"over 1.5 second window is added to output features");
opts->Register("add-delta-pitch", &add_delta_pitch,
"If true, time derivative of log-pitch is added to output "
"features");
opts->Register("add-raw-log-pitch", &add_raw_log_pitch,
"If true, log(pitch) is added to output features");
}
};
// We don't want to expose the pitch-extraction internals here as it's
// quite complex, so we use a private implementation.
class OnlinePitchFeatureImpl;
// Note: to start on a new waveform, just construct a new version
// of this object.
class OnlinePitchFeature: public OnlineBaseFeature {
public:
explicit OnlinePitchFeature(const PitchExtractionOptions &opts);
virtual int32 Dim() const { return 2; /* (NCCF, pitch) */ }
virtual int32 NumFramesReady() const;
virtual BaseFloat FrameShiftInSeconds() const;
virtual bool IsLastFrame(int32 frame) const;
/// Outputs the two-dimensional feature consisting of (pitch, NCCF). You
/// should probably post-process this using class OnlineProcessPitch.
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
virtual void AcceptWaveform(BaseFloat sampling_rate,
const VectorBase<BaseFloat> &waveform);
virtual void InputFinished();
virtual ~OnlinePitchFeature();
private:
OnlinePitchFeatureImpl *impl_;
};
/// This online-feature class implements post processing of pitch features.
/// Inputs are original 2 dims (nccf, pitch). It can produce various
/// kinds of outputs, using the default options it will be (pov-feature,
/// normalized-log-pitch, delta-log-pitch).
class OnlineProcessPitch: public OnlineFeatureInterface {
public:
virtual int32 Dim() const { return dim_; }
virtual bool IsLastFrame(int32 frame) const {
if (frame <= -1)
return src_->IsLastFrame(-1);
else if (frame < opts_.delay)
return src_->IsLastFrame(-1) == true ? false : src_->IsLastFrame(0);
else
return src_->IsLastFrame(frame - opts_.delay);
}
virtual BaseFloat FrameShiftInSeconds() const {
return src_->FrameShiftInSeconds();
}
virtual int32 NumFramesReady() const;
virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
virtual ~OnlineProcessPitch() { }
// Does not take ownership of "src".
OnlineProcessPitch(const ProcessPitchOptions &opts,
OnlineFeatureInterface *src);
private:
enum { kRawFeatureDim = 2}; // anonymous enum to define a constant.
// kRawFeatureDim defines the dimension
// of the input: (nccf, pitch)
ProcessPitchOptions opts_;
OnlineFeatureInterface *src_;
int32 dim_; // Output feature dimension, set in initializer.
struct NormalizationStats {
int32 cur_num_frames; // value of src_->NumFramesReady() when
// "mean_pitch" was set.
bool input_finished; // true if input data was finished when
// "mean_pitch" was computed.
double sum_pov; // sum of pov over relevant range
double sum_log_pitch_pov; // sum of log(pitch) * pov over relevant range
NormalizationStats(): cur_num_frames(-1), input_finished(false),
sum_pov(0.0), sum_log_pitch_pov(0.0) { }
};
std::vector<BaseFloat> delta_feature_noise_;
std::vector<NormalizationStats> normalization_stats_;
/// Computes and returns the POV feature for this frame.
/// Called from GetFrame().
inline BaseFloat GetPovFeature(int32 frame) const;
/// Computes and returns the delta-log-pitch feature for this frame.
/// Called from GetFrame().
inline BaseFloat GetDeltaPitchFeature(int32 frame);
/// Computes and returns the raw log-pitch feature for this frame.
/// Called from GetFrame().
inline BaseFloat GetRawLogPitchFeature(int32 frame) const;
/// Computes and returns the mean-subtracted log-pitch feature for this frame.
/// Called from GetFrame().
inline BaseFloat GetNormalizedLogPitchFeature(int32 frame);
/// Computes the normalization window sizes.
inline void GetNormalizationWindow(int32 frame,
int32 src_frames_ready,
int32 *window_begin,
int32 *window_end) const;
/// Makes sure the entry in normalization_stats_ for this frame is up to date;
/// called from GetNormalizedLogPitchFeature.
inline void UpdateNormalizationStats(int32 frame);
};
/// This function extracts (pitch, NCCF) per frame, using the pitch extraction
/// method described in "A Pitch Extraction Algorithm Tuned for Automatic Speech
/// Recognition", Pegah Ghahremani, Bagher BabaAli, Daniel Povey, Korbinian
/// Riedhammer, Jan Trmal and Sanjeev Khudanpur, ICASSP 2014. The output will
/// have as many rows as there are frames, and two columns corresponding to
/// (NCCF, pitch)
void ComputeKaldiPitch(const PitchExtractionOptions &opts,
const VectorBase<BaseFloat> &wave,
Matrix<BaseFloat> *output);
/// This function processes the raw (NCCF, pitch) quantities computed by
/// ComputeKaldiPitch, and processes them into features. By default it will
/// output three-dimensional features, (POV-feature, mean-subtracted-log-pitch,
/// delta-of-raw-pitch), but this is configurable in the options. The number of
/// rows of "output" will be the number of frames (rows) in "input", and the
/// number of columns will be the number of different types of features
/// requested (by default, 3; 4 is the max). The four config variables
/// --add-pov-feature, --add-normalized-log-pitch, --add-delta-pitch,
/// --add-raw-log-pitch determine which features we create; by default we create
/// the first three.
void ProcessPitch(const ProcessPitchOptions &opts,
const MatrixBase<BaseFloat> &input,
Matrix<BaseFloat> *output);
/// This function combines ComputeKaldiPitch and ProcessPitch. The reason
/// why we need a separate function to do this is in order to be able to
/// accurately simulate the online pitch-processing, for testing and for
/// training models matched to the "first-pass" features. It is sensitive to
/// the variables in pitch_opts that relate to online processing,
/// i.e. max_frames_latency, frames_per_chunk, simulate_first_pass_online,
/// recompute_frame.
void ComputeAndProcessKaldiPitch(const PitchExtractionOptions &pitch_opts,
const ProcessPitchOptions &process_opts,
const VectorBase<BaseFloat> &wave,
Matrix<BaseFloat> *output);
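// A minimal usage sketch of the functions declared above (illustrative only;
// it assumes the waveform samples are already in a Vector<BaseFloat>, e.g.
// read via the wave-reading utilities):
//
// PitchExtractionOptions pitch_opts; // defaults assume 16 kHz input
// ProcessPitchOptions process_opts; // defaults give 3-dimensional output
// Vector<BaseFloat> wave; // ... filled with waveform samples ...
// Matrix<BaseFloat> features;
// ComputeAndProcessKaldiPitch(pitch_opts, process_opts, wave, &features);
// // "features" has one row per frame and, with default options, columns
// // (pov-feature, normalized-log-pitch, delta-log-pitch).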
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_PITCH_FUNCTIONS_H_
// feat/resample.cc
// Copyright 2013 Pegah Ghahremani
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2014 Yanqing Sun, Junjie Wang
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <limits>
#include "feat/feature-functions.h"
#include "matrix/matrix-functions.h"
#include "feat/resample.h"
namespace kaldi {
LinearResample::LinearResample(int32 samp_rate_in_hz,
int32 samp_rate_out_hz,
BaseFloat filter_cutoff_hz,
int32 num_zeros):
samp_rate_in_(samp_rate_in_hz),
samp_rate_out_(samp_rate_out_hz),
filter_cutoff_(filter_cutoff_hz),
num_zeros_(num_zeros) {
KALDI_ASSERT(samp_rate_in_hz > 0.0 &&
samp_rate_out_hz > 0.0 &&
filter_cutoff_hz > 0.0 &&
filter_cutoff_hz*2 <= samp_rate_in_hz &&
filter_cutoff_hz*2 <= samp_rate_out_hz &&
num_zeros > 0);
// base_freq is the frequency of the repeating unit, which is the gcd
// of the input frequencies.
int32 base_freq = Gcd(samp_rate_in_, samp_rate_out_);
input_samples_in_unit_ = samp_rate_in_ / base_freq;
output_samples_in_unit_ = samp_rate_out_ / base_freq;
SetIndexesAndWeights();
Reset();
}
int64 LinearResample::GetNumOutputSamples(int64 input_num_samp,
bool flush) const {
// For exact computation, we measure time in "ticks" of 1.0 / tick_freq,
// where tick_freq is the least common multiple of samp_rate_in_ and
// samp_rate_out_.
int32 tick_freq = Lcm(samp_rate_in_, samp_rate_out_);
int32 ticks_per_input_period = tick_freq / samp_rate_in_;
// work out the number of ticks in the time interval
// [ 0, input_num_samp/samp_rate_in_ ).
int64 interval_length_in_ticks = input_num_samp * ticks_per_input_period;
if (!flush) {
BaseFloat window_width = num_zeros_ / (2.0 * filter_cutoff_);
// To count the window-width in ticks we take the floor. This
// is because since we're looking for the largest integer num-out-samp
// that fits in the interval, which is open on the right, a reduction
// in interval length of less than a tick will never make a difference.
// For example, the largest integer in the interval [ 0, 2 ) and the
// largest integer in the interval [ 0, 2 - 0.9 ) are the same (both one).
// So when we're subtracting the window-width we can ignore the fractional
// part.
int32 window_width_ticks = floor(window_width * tick_freq);
// The time-period of the output that we can sample gets reduced
// by the window-width (which is actually the distance from the
// center to the edge of the windowing function) if we're not
// "flushing the output".
interval_length_in_ticks -= window_width_ticks;
}
if (interval_length_in_ticks <= 0)
return 0;
int32 ticks_per_output_period = tick_freq / samp_rate_out_;
// Get the last output-sample in the closed interval, i.e. replacing [ ) with
// [ ]. Note: integer division rounds down. See
// http://en.wikipedia.org/wiki/Interval_(mathematics) for an explanation of
// the notation.
int64 last_output_samp = interval_length_in_ticks / ticks_per_output_period;
// We need the last output-sample in the open interval, so if it takes us to
// the end of the interval exactly, subtract one.
if (last_output_samp * ticks_per_output_period == interval_length_in_ticks)
last_output_samp--;
// First output-sample index is zero, so the number of output samples
// is the last output-sample plus one.
int64 num_output_samp = last_output_samp + 1;
return num_output_samp;
}
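// Worked example of the tick arithmetic above: for samp_rate_in_ = 16000 and
// samp_rate_out_ = 8000, tick_freq = Lcm(16000, 8000) = 16000, so
// ticks_per_input_period = 1 and ticks_per_output_period = 2. With
// input_num_samp = 100 and flush == true, interval_length_in_ticks = 100 and
// last_output_samp = 100 / 2 = 50; this lands exactly on the (open) interval
// end, so it is decremented to 49 and we return 49 + 1 = 50 output samples.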
void LinearResample::SetIndexesAndWeights() {
first_index_.resize(output_samples_in_unit_);
weights_.resize(output_samples_in_unit_);
double window_width = num_zeros_ / (2.0 * filter_cutoff_);
for (int32 i = 0; i < output_samples_in_unit_; i++) {
double output_t = i / static_cast<double>(samp_rate_out_);
double min_t = output_t - window_width, max_t = output_t + window_width;
// we do ceil on the min and floor on the max, because if we did it
// the other way around we would unnecessarily include indexes just
// outside the window, with zero coefficients. It's possible
// if the arguments to the ceil and floor expressions are integers
// (e.g. if filter_cutoff_ has an exact ratio with the sample rates),
// that we unnecessarily include something with a zero coefficient,
// but this is only a slight efficiency issue.
int32 min_input_index = ceil(min_t * samp_rate_in_),
max_input_index = floor(max_t * samp_rate_in_),
num_indices = max_input_index - min_input_index + 1;
first_index_[i] = min_input_index;
weights_[i].Resize(num_indices);
for (int32 j = 0; j < num_indices; j++) {
int32 input_index = min_input_index + j;
double input_t = input_index / static_cast<double>(samp_rate_in_),
delta_t = input_t - output_t;
// sign of delta_t doesn't matter.
weights_[i](j) = FilterFunc(delta_t) / samp_rate_in_;
}
}
}
// inline
void LinearResample::GetIndexes(int64 samp_out,
int64 *first_samp_in,
int32 *samp_out_wrapped) const {
// A unit is the smallest nonzero amount of time that is an exact
// multiple of the input and output sample periods. The unit index
// is the answer to "which numbered unit we are in".
int64 unit_index = samp_out / output_samples_in_unit_;
// samp_out_wrapped is equal to samp_out % output_samples_in_unit_
*samp_out_wrapped = static_cast<int32>(samp_out -
unit_index * output_samples_in_unit_);
*first_samp_in = first_index_[*samp_out_wrapped] +
unit_index * input_samples_in_unit_;
}
void LinearResample::Resample(const VectorBase<BaseFloat> &input,
bool flush,
Vector<BaseFloat> *output) {
int32 input_dim = input.Dim();
int64 tot_input_samp = input_sample_offset_ + input_dim,
tot_output_samp = GetNumOutputSamples(tot_input_samp, flush);
KALDI_ASSERT(tot_output_samp >= output_sample_offset_);
output->Resize(tot_output_samp - output_sample_offset_);
// samp_out is the index into the total output signal, not just the part
// of it we are producing here.
for (int64 samp_out = output_sample_offset_;
samp_out < tot_output_samp;
samp_out++) {
int64 first_samp_in;
int32 samp_out_wrapped;
GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped);
const Vector<BaseFloat> &weights = weights_[samp_out_wrapped];
// first_input_index is the first index into "input" that we have a weight
// for.
int32 first_input_index = static_cast<int32>(first_samp_in -
input_sample_offset_);
BaseFloat this_output;
if (first_input_index >= 0 &&
first_input_index + weights.Dim() <= input_dim) {
SubVector<BaseFloat> input_part(input, first_input_index, weights.Dim());
this_output = VecVec(input_part, weights);
} else { // Handle edge cases.
this_output = 0.0;
for (int32 i = 0; i < weights.Dim(); i++) {
BaseFloat weight = weights(i);
int32 input_index = first_input_index + i;
if (input_index < 0 && input_remainder_.Dim() + input_index >= 0) {
this_output += weight *
input_remainder_(input_remainder_.Dim() + input_index);
} else if (input_index >= 0 && input_index < input_dim) {
this_output += weight * input(input_index);
} else if (input_index >= input_dim) {
// We're past the end of the input and are adding zero; should only
// happen if the user specified flush == true, or else we would not
// be trying to output this sample.
KALDI_ASSERT(flush);
}
}
}
int32 output_index = static_cast<int32>(samp_out - output_sample_offset_);
(*output)(output_index) = this_output;
}
if (flush) {
Reset(); // Reset the internal state.
} else {
SetRemainder(input);
input_sample_offset_ = tot_input_samp;
output_sample_offset_ = tot_output_samp;
}
}
void LinearResample::SetRemainder(const VectorBase<BaseFloat> &input) {
Vector<BaseFloat> old_remainder(input_remainder_);
// max_remainder_needed is the width of the filter from side to side,
// measured in input samples. you might think it should be half that,
// but you have to consider that you might be wanting to output samples
// that are "in the past" relative to the beginning of the latest
// input... anyway, storing more remainder than needed is not harmful.
int32 max_remainder_needed = ceil(samp_rate_in_ * num_zeros_ /
filter_cutoff_);
input_remainder_.Resize(max_remainder_needed);
for (int32 index = - input_remainder_.Dim(); index < 0; index++) {
// we interpret "index" as an offset from the end of "input" and
// from the end of input_remainder_.
int32 input_index = index + input.Dim();
if (input_index >= 0)
input_remainder_(index + input_remainder_.Dim()) = input(input_index);
else if (input_index + old_remainder.Dim() >= 0)
input_remainder_(index + input_remainder_.Dim()) =
old_remainder(input_index + old_remainder.Dim());
// else leave it at zero.
}
}
void LinearResample::Reset() {
input_sample_offset_ = 0;
output_sample_offset_ = 0;
input_remainder_.Resize(0);
}
/** Here, t is a time in seconds representing an offset from
the center of the windowed filter function, and FilterFunction(t)
returns the windowed filter function, described
in the header as h(t) = f(t)g(t), evaluated at t.
*/
BaseFloat LinearResample::FilterFunc(BaseFloat t) const {
BaseFloat window, // raised-cosine (Hanning) window of width
// num_zeros_ / (2 * filter_cutoff_)
filter; // sinc filter function
if (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_))
window = 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t));
else
window = 0.0; // outside support of window function
if (t != 0)
filter = sin(M_2PI * filter_cutoff_ * t) / (M_PI * t);
else
filter = 2 * filter_cutoff_; // limit of the function at t = 0
return filter * window;
}
ArbitraryResample::ArbitraryResample(
int32 num_samples_in, BaseFloat samp_rate_in,
BaseFloat filter_cutoff, const Vector<BaseFloat> &sample_points,
int32 num_zeros):
num_samples_in_(num_samples_in),
samp_rate_in_(samp_rate_in),
filter_cutoff_(filter_cutoff),
num_zeros_(num_zeros) {
KALDI_ASSERT(num_samples_in > 0 && samp_rate_in > 0.0 &&
filter_cutoff > 0.0 &&
filter_cutoff * 2.0 <= samp_rate_in
&& num_zeros > 0);
// Set up weights_ and indices_.
SetIndexes(sample_points);
SetWeights(sample_points);
}
void ArbitraryResample::Resample(const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output) const {
// each row of "input" corresponds to the data to resample;
// the corresponding row of "output" is the resampled data.
KALDI_ASSERT(input.NumRows() == output->NumRows() &&
input.NumCols() == num_samples_in_ &&
output->NumCols() == weights_.size());
Vector<BaseFloat> output_col(output->NumRows());
for (int32 i = 0; i < NumSamplesOut(); i++) {
SubMatrix<BaseFloat> input_part(input, 0, input.NumRows(),
first_index_[i],
weights_[i].Dim());
const Vector<BaseFloat> &weight_vec(weights_[i]);
output_col.AddMatVec(1.0, input_part,
kNoTrans, weight_vec, 0.0);
output->CopyColFromVec(output_col, i);
}
}
void ArbitraryResample::Resample(const VectorBase<BaseFloat> &input,
VectorBase<BaseFloat> *output) const {
KALDI_ASSERT(input.Dim() == num_samples_in_ &&
output->Dim() == weights_.size());
int32 output_dim = output->Dim();
for (int32 i = 0; i < output_dim; i++) {
SubVector<BaseFloat> input_part(input, first_index_[i], weights_[i].Dim());
(*output)(i) = VecVec(input_part, weights_[i]);
}
}
void ArbitraryResample::SetIndexes(const Vector<BaseFloat> &sample_points) {
int32 num_samples = sample_points.Dim();
first_index_.resize(num_samples);
weights_.resize(num_samples);
BaseFloat filter_width = num_zeros_ / (2.0 * filter_cutoff_);
for (int32 i = 0; i < num_samples; i++) {
// the t values are in seconds.
BaseFloat t = sample_points(i),
t_min = t - filter_width, t_max = t + filter_width;
int32 index_min = ceil(samp_rate_in_ * t_min),
index_max = floor(samp_rate_in_ * t_max);
// the ceil on index min and the floor on index_max are because there
// is no point using indices just outside the window (coeffs would be zero).
if (index_min < 0)
index_min = 0;
if (index_max >= num_samples_in_)
index_max = num_samples_in_ - 1;
first_index_[i] = index_min;
weights_[i].Resize(index_max - index_min + 1);
}
}
void ArbitraryResample::SetWeights(const Vector<BaseFloat> &sample_points) {
int32 num_samples_out = NumSamplesOut();
for (int32 i = 0; i < num_samples_out; i++) {
for (int32 j = 0 ; j < weights_[i].Dim(); j++) {
BaseFloat delta_t = sample_points(i) -
(first_index_[i] + j) / samp_rate_in_;
// Include at this point the factor of 1.0 / samp_rate_in_ which
// appears in the math.
weights_[i](j) = FilterFunc(delta_t) / samp_rate_in_;
}
}
}
/** Here, t is a time in seconds representing an offset from
the center of the windowed filter function, and FilterFunction(t)
returns the windowed filter function, described
in the header as h(t) = f(t)g(t), evaluated at t.
*/
BaseFloat ArbitraryResample::FilterFunc(BaseFloat t) const {
BaseFloat window, // raised-cosine (Hanning) window of width
// num_zeros_ / (2 * filter_cutoff_)
filter; // sinc filter function
if (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_))
window = 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t));
else
window = 0.0; // outside support of window function
if (t != 0.0)
filter = sin(M_2PI * filter_cutoff_ * t) / (M_PI * t);
else
filter = 2.0 * filter_cutoff_; // limit of the function at zero.
return filter * window;
}
void ResampleWaveform(BaseFloat orig_freq, const VectorBase<BaseFloat> &wave,
BaseFloat new_freq, Vector<BaseFloat> *new_wave) {
BaseFloat min_freq = std::min(orig_freq, new_freq);
BaseFloat lowpass_cutoff = 0.99 * 0.5 * min_freq;
int32 lowpass_filter_width = 6;
LinearResample resampler(orig_freq, new_freq,
lowpass_cutoff, lowpass_filter_width);
resampler.Resample(wave, true, new_wave);
}
} // namespace kaldi
// feat/resample.h
// Copyright 2013 Pegah Ghahremani
// 2014 IMSL, PKU-HKUST (author: Wei Shi)
// 2014 Yanqing Sun, Junjie Wang
// 2014 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_RESAMPLE_H_
#define KALDI_FEAT_RESAMPLE_H_
#include <cassert>
#include <cstdlib>
#include <string>
#include <vector>
#include "matrix/matrix-lib.h"
#include "util/common-utils.h"
#include "base/kaldi-error.h"
namespace kaldi {
/// @addtogroup feat FeatureExtraction
/// @{
/**
\file resample.h
This header contains declarations of classes for resampling signals. The
normal cases of resampling a signal are upsampling and downsampling
(increasing and decreasing the sample rate of a signal, respectively),
although the ArbitraryResample class allows a more generic case where
we want to get samples of a signal at uneven intervals (for instance,
log-spaced).
The input signal is always evenly spaced, say sampled with frequency S, and
we assume the original signal was band-limited to S/2 or lower. The n'th
input sample x_n (with n = 0, 1, ...) is interpreted as the original
signal's value at time n/S.
For resampling, it is convenient to view the input signal as a
continuous function x(t) of t, where each sample x_n becomes a delta function
with magnitude x_n/S, at time n/S. If we band limit this to the Nyquist
frequency S/2, we can show that this is the same as the original signal
that was sampled. [assuming the original signal was periodic and band
limited.] In general we want to bandlimit to lower than S/2, because
we don't have a perfect filter and also because if we want to resample
at a lower frequency than S, we need to bandlimit to below half of that.
Anyway, suppose we want to bandlimit to C, with 0 < C < S/2. The perfect
rectangular filter with cutoff C is the sinc function,
\f[ f(t) = 2C sinc(2Ct), \f]
where sinc is the normalized sinc function \f$ sinc(t) = sin(pi t) / (pi t) \f$, with
\f$ sinc(0) = 1 \f$. This is not a practical filter, though, because it has
infinite support. At the cost of less-than-perfect rolloff, we can choose
a suitable windowing function g(t), and use f(t) g(t) as the filter. For
a windowing function we choose raised-cosine (Hanning) window with support
on [-w/2C, w/2C], where w >= 2 is an integer chosen by the user. w = 1
means we window the sinc function out to its first zero on the left and right,
w = 2 means the second zero, and so on; we normally choose w to be at least two.
We call this num_zeros, not w, in the code.
Convolving the signal x(t) with this windowed filter h(t) = f(t)g(t) and evaluating the resulting
signal s(t) at an arbitrary time t is easy: we have
\f[ s(t) = 1/S \sum_n x_n h(t - n/S) \f].
(note: the sign of t - n/S might be wrong, but it doesn't matter as the filter
and window are symmetric).
This is true for arbitrary values of t. What the class ArbitraryResample does
is to allow you to evaluate the signal for specified values of t.
*/
/**
Class ArbitraryResample allows you to resample a signal (assumed zero outside
the sample region, not periodic) at arbitrary specified time values, which
don't have to be linearly spaced. The low-pass filter cutoff
"filter_cutoff_hz" should be less than half the sample rate;
"num_zeros" should probably be at least two preferably more; higher numbers give
sharper filters but will be less efficient.
*/
class ArbitraryResample {
public:
ArbitraryResample(int32 num_samples_in,
BaseFloat samp_rate_hz,
BaseFloat filter_cutoff_hz,
const Vector<BaseFloat> &sample_points_secs,
int32 num_zeros);
int32 NumSamplesIn() const { return num_samples_in_; }
int32 NumSamplesOut() const { return weights_.size(); }
/// This function does the resampling.
/// input.NumRows() and output.NumRows() should be equal
/// and nonzero.
/// input.NumCols() should equal NumSamplesIn()
/// and output.NumCols() should equal NumSamplesOut().
void Resample(const MatrixBase<BaseFloat> &input,
MatrixBase<BaseFloat> *output) const;
/// This version of the Resample function processes just
/// one vector.
void Resample(const VectorBase<BaseFloat> &input,
VectorBase<BaseFloat> *output) const;
private:
void SetIndexes(const Vector<BaseFloat> &sample_points);
void SetWeights(const Vector<BaseFloat> &sample_points);
BaseFloat FilterFunc(BaseFloat t) const;
int32 num_samples_in_;
BaseFloat samp_rate_in_;
BaseFloat filter_cutoff_;
int32 num_zeros_;
std::vector<int32> first_index_; // The first input-sample index that we sum
// over, for this output-sample index.
std::vector<Vector<BaseFloat> > weights_;
};
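// A minimal usage sketch (illustrative, not part of the original header):
// evaluating a 1-second, 16 kHz signal at three arbitrary time points.
//
// Vector<BaseFloat> times(3);
// times(0) = 0.10; times(1) = 0.25; times(2) = 0.90; // seconds
// ArbitraryResample resampler(16000, 16000.0, 7000.0, times, 5);
// Vector<BaseFloat> input(16000); // ... filled with samples ...
// Vector<BaseFloat> output(resampler.NumSamplesOut());
// resampler.Resample(input, &output);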
/**
LinearResample is a special case of ArbitraryResample, where we want to
resample a signal at linearly spaced intervals (this means we want to
upsample or downsample the signal). It is more efficient than
ArbitraryResample because we can construct it just once.
We require that the input and output sampling rate be specified as
integers, as this is an easy way to specify that their ratio be rational.
*/
class LinearResample {
public:
/// Constructor. We make the input and output sample rates integers, because
/// we are going to need to find a common divisor. This should just remind
/// you that they need to be integers. The filter cutoff needs to be less
/// than samp_rate_in_hz/2 and less than samp_rate_out_hz/2. num_zeros
/// controls the sharpness of the filter, more == sharper but less efficient.
/// We suggest around 4 to 10 for normal use.
LinearResample(int32 samp_rate_in_hz,
int32 samp_rate_out_hz,
BaseFloat filter_cutoff_hz,
int32 num_zeros);
/// This function does the resampling. If you call it with flush == true and
/// you have never called it with flush == false, it just resamples the input
/// signal (it resizes the output to a suitable number of samples).
///
/// You can also use this function to process a signal a piece at a time.
/// Suppose you break it into piece1, piece2, ... pieceN. You can call
/// \code{.cc}
/// Resample(piece1, false, &output1);
/// Resample(piece2, false, &output2);
/// Resample(piece3, true, &output3);
/// \endcode
/// If you call it with flush == false, it won't output the last few samples
/// but will remember them, so that if you later give it a second piece of
/// the input signal it can process it correctly.
/// If your most recent call to the object was with flush == false, it will
/// have internal state; you can remove this by calling Reset().
/// Empty input is acceptable.
void Resample(const VectorBase<BaseFloat> &input,
bool flush,
Vector<BaseFloat> *output);
/// Calling the function Reset() resets the state of the object prior to
/// processing a new signal; it is only necessary if you have called
/// Resample(x, y, false) for some signal, leading to a remainder of the
/// signal being stored, but then abandon processing the signal before calling
/// Resample(x, y, true) for the last piece. Calling it unnecessarily between
/// signals will not do any harm.
void Reset();
/// Return the input and output sampling rates (for checks, for example)
inline int32 GetInputSamplingRate() { return samp_rate_in_; }
inline int32 GetOutputSamplingRate() { return samp_rate_out_; }
private:
/// This function outputs the number of output samples we will output
/// for a signal with "input_num_samp" input samples. If flush == true,
/// we return the largest n such that
/// (n/samp_rate_out_) is in the interval [ 0, input_num_samp/samp_rate_in_ ),
/// and note that the interval is half-open. If flush == false,
/// define window_width as num_zeros / (2.0 * filter_cutoff_);
/// we return the largest n such that (n/samp_rate_out_) is in the interval
/// [ 0, input_num_samp/samp_rate_in_ - window_width ).
int64 GetNumOutputSamples(int64 input_num_samp, bool flush) const;
/// Given an output-sample index, this function outputs to *first_samp_in the
/// first input-sample index that we have a weight on (may be negative),
/// and to *samp_out_wrapped the index into weights_ where we can get the
/// corresponding weights on the input.
inline void GetIndexes(int64 samp_out,
int64 *first_samp_in,
int32 *samp_out_wrapped) const;
void SetRemainder(const VectorBase<BaseFloat> &input);
void SetIndexesAndWeights();
BaseFloat FilterFunc(BaseFloat) const;
// The following variables are provided by the user.
int32 samp_rate_in_;
int32 samp_rate_out_;
BaseFloat filter_cutoff_;
int32 num_zeros_;
int32 input_samples_in_unit_; ///< The number of input samples in the
///< smallest repeating unit: num_samp_in_ =
///< samp_rate_in_hz / Gcd(samp_rate_in_hz,
///< samp_rate_out_hz)
int32 output_samples_in_unit_; ///< The number of output samples in the
///< smallest repeating unit: num_samp_out_ =
///< samp_rate_out_hz / Gcd(samp_rate_in_hz,
///< samp_rate_out_hz)
/// The first input-sample index that we sum over, for this output-sample
/// index. May be negative; any truncation at the beginning is handled
/// separately. This is just for the first few output samples, but we can
/// extrapolate the correct input-sample index for arbitrary output samples.
std::vector<int32> first_index_;
/// Weights on the input samples, for this output-sample index.
std::vector<Vector<BaseFloat> > weights_;
// the following variables keep track of where we are in a particular signal,
// if it is being provided over multiple calls to Resample().
int64 input_sample_offset_; ///< The number of input samples we have
///< already received for this signal
///< (including anything in remainder_)
int64 output_sample_offset_; ///< The number of samples we have already
///< output for this signal.
Vector<BaseFloat> input_remainder_; ///< A small trailing part of the
///< previously seen input signal.
};
/**
Downsample or upsample a waveform. This is a convenience wrapper for the
class 'LinearResample'.
The low-pass filter cutoff used in 'LinearResample' is 0.99 of the Nyquist,
where the Nyquist is half of the minimum of (orig_freq, new_freq). The
resampling is done with a symmetric FIR filter with N_z (number of zeros)
as 6.
We compared the downsampling results with those from the sox resampling
toolkit.
Sox's design is inspired by Laurent De Soras' paper,
https://ccrma.stanford.edu/~jos/resample/Implementation.html
Note: we expect that while orig_freq and new_freq are of type BaseFloat, they
are actually required to have exact integer values (like 16000 or 8000) with
a ratio between them that can be expressed as a rational number with
reasonably small integer factors.
*/
void ResampleWaveform(BaseFloat orig_freq, const VectorBase<BaseFloat> &wave,
BaseFloat new_freq, Vector<BaseFloat> *new_wave);
/// This function is deprecated. It is provided for backward compatibility, to avoid
/// breaking older code.
inline void DownsampleWaveForm(BaseFloat orig_freq, const VectorBase<BaseFloat> &wave,
BaseFloat new_freq, Vector<BaseFloat> *new_wave) {
ResampleWaveform(orig_freq, wave, new_freq, new_wave);
}
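// A minimal usage sketch (illustrative): downsampling a waveform from
// 16 kHz to 8 kHz with the convenience wrapper above.
//
// Vector<BaseFloat> wave16k; // ... samples at 16000 Hz ...
// Vector<BaseFloat> wave8k;
// ResampleWaveform(16000.0, wave16k, 8000.0, &wave8k);
// // wave8k is resized internally and now holds roughly half as many
// // samples, low-pass filtered at 0.99 * 4000 = 3960 Hz.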
/// @} End of "addtogroup feat"
} // namespace kaldi
#endif // KALDI_FEAT_RESAMPLE_H_
// feat/signal.cc
// Copyright 2015 Tom Ko
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "base/kaldi-common.h"
#include "util/common-utils.h"
#include "feat/signal.h"
namespace kaldi {
void ElementwiseProductOfFft(const Vector<BaseFloat> &a, Vector<BaseFloat> *b) {
int32 num_fft_bins = a.Dim() / 2;
for (int32 i = 0; i < num_fft_bins; i++) {
// do complex multiplication
ComplexMul(a(2*i), a(2*i + 1), &((*b)(2*i)), &((*b)(2*i + 1)));
}
}
void ConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
int32 signal_length = signal->Dim();
int32 filter_length = filter.Dim();
int32 output_length = signal_length + filter_length - 1;
Vector<BaseFloat> signal_padded(output_length);
signal_padded.SetZero();
for (int32 i = 0; i < signal_length; i++) {
for (int32 j = 0; j < filter_length; j++) {
signal_padded(i + j) += (*signal)(i) * filter(j);
}
}
signal->Resize(output_length);
signal->CopyFromVec(signal_padded);
}
void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
int32 signal_length = signal->Dim();
int32 filter_length = filter.Dim();
int32 output_length = signal_length + filter_length - 1;
int32 fft_length = RoundUpToNearestPowerOfTwo(output_length);
KALDI_VLOG(1) << "fft_length for full signal convolution is " << fft_length;
SplitRadixRealFft<BaseFloat> srfft(fft_length);
Vector<BaseFloat> filter_padded(fft_length);
filter_padded.Range(0, filter_length).CopyFromVec(filter);
srfft.Compute(filter_padded.Data(), true);
Vector<BaseFloat> signal_padded(fft_length);
signal_padded.Range(0, signal_length).CopyFromVec(*signal);
srfft.Compute(signal_padded.Data(), true);
ElementwiseProductOfFft(filter_padded, &signal_padded);
srfft.Compute(signal_padded.Data(), false);
signal_padded.Scale(1.0 / fft_length);
signal->Resize(output_length);
signal->CopyFromVec(signal_padded.Range(0, output_length));
}
void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
int32 signal_length = signal->Dim();
int32 filter_length = filter.Dim();
int32 output_length = signal_length + filter_length - 1;
signal->Resize(output_length, kCopyData);
KALDI_VLOG(1) << "Length of the filter is " << filter_length;
int32 fft_length = RoundUpToNearestPowerOfTwo(4 * filter_length);
KALDI_VLOG(1) << "Best FFT length is " << fft_length;
int32 block_length = fft_length - filter_length + 1;
KALDI_VLOG(1) << "Block size is " << block_length;
SplitRadixRealFft<BaseFloat> srfft(fft_length);
Vector<BaseFloat> filter_padded(fft_length);
filter_padded.Range(0, filter_length).CopyFromVec(filter);
srfft.Compute(filter_padded.Data(), true);
Vector<BaseFloat> temp_pad(filter_length - 1);
temp_pad.SetZero();
Vector<BaseFloat> signal_block_padded(fft_length);
for (int32 po = 0; po < output_length; po += block_length) {
// get a block of the signal
int32 process_length = std::min(block_length, output_length - po);
signal_block_padded.SetZero();
signal_block_padded.Range(0, process_length).CopyFromVec(signal->Range(po, process_length));
srfft.Compute(signal_block_padded.Data(), true);
ElementwiseProductOfFft(filter_padded, &signal_block_padded);
srfft.Compute(signal_block_padded.Data(), false);
signal_block_padded.Scale(1.0 / fft_length);
// combine the block
if (po + block_length < output_length) { // current block is not the last block
signal->Range(po, block_length).CopyFromVec(signal_block_padded.Range(0, block_length));
signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad);
temp_pad.CopyFromVec(signal_block_padded.Range(block_length, filter_length - 1));
} else {
signal->Range(po, output_length - po).CopyFromVec(
signal_block_padded.Range(0, output_length - po));
if (filter_length - 1 < output_length - po)
signal->Range(po, filter_length - 1).AddVec(1.0, temp_pad);
else
signal->Range(po, output_length - po).AddVec(1.0, temp_pad.Range(0, output_length - po));
}
}
}
} // namespace kaldi
// feat/signal.h
// Copyright 2015 Tom Ko
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_FEAT_SIGNAL_H_
#define KALDI_FEAT_SIGNAL_H_
#include "base/kaldi-common.h"
#include "util/common-utils.h"
namespace kaldi {
/*
The following three functions provide the same functionality, but with
implementations that differ in efficiency. After the convolution,
the length of the signal will be extended to (original signal length +
filter length - 1).
*/
/*
This function implements a simple non-FFT-based convolution of two signals.
It is suggested to use one of the FFT-based convolution functions below,
which are more efficient.
*/
void ConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
/*
This function implements FFT-based convolution of two signals.
However, it is generally less efficient than FFTbasedBlockConvolveSignals(),
as it processes the entire signal with a single FFT.
*/
void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
/*
This function implements FFT-based block convolution of two signals using
overlap-add method. This is an efficient way to evaluate the discrete
convolution of a long signal with a finite impulse response filter.
*/
void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
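// A minimal usage sketch (illustrative): applying a FIR filter to a signal
// in place with the overlap-add implementation above.
//
// Vector<BaseFloat> filter(64); // impulse response, e.g. a room response
// Vector<BaseFloat> signal(16000);
// // ... fill both vectors ...
// FFTbasedBlockConvolveSignals(filter, &signal);
// // "signal" is extended to 16000 + 64 - 1 samples holding the convolution.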
} // namespace kaldi
#endif // KALDI_FEAT_SIGNAL_H_
add_library(kaldi-matrix
compressed-matrix.cc
kaldi-matrix.cc
kaldi-vector.cc
matrix-functions.cc
optimization.cc
packed-matrix.cc
qr.cc
sparse-matrix.cc
sp-matrix.cc
srfft.cc
tp-matrix.cc
)
target_link_libraries(kaldi-matrix gfortran kaldi-base libopenblas.a)
// matrix/cblas-wrappers.h
// Copyright 2012 Johns Hopkins University (author: Daniel Povey);
// Haihua Xu; Wei Shi
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_CBLAS_WRAPPERS_H_
#define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1
#include <limits>
#include "matrix/sp-matrix.h"
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
#include "matrix/matrix-functions.h"
#include "matrix/kaldi-blas.h"
// Do not include this file directly. It is to be included
// by .cc files in this directory.
namespace kaldi {
inline void cblas_Xcopy(const int N, const float *X, const int incX, float *Y,
const int incY) {
cblas_scopy(N, X, incX, Y, incY);
}
inline void cblas_Xcopy(const int N, const double *X, const int incX, double *Y,
const int incY) {
cblas_dcopy(N, X, incX, Y, incY);
}
inline float cblas_Xasum(const int N, const float *X, const int incX) {
return cblas_sasum(N, X, incX);
}
inline double cblas_Xasum(const int N, const double *X, const int incX) {
return cblas_dasum(N, X, incX);
}
inline void cblas_Xrot(const int N, float *X, const int incX, float *Y,
const int incY, const float c, const float s) {
cblas_srot(N, X, incX, Y, incY, c, s);
}
inline void cblas_Xrot(const int N, double *X, const int incX, double *Y,
const int incY, const double c, const double s) {
cblas_drot(N, X, incX, Y, incY, c, s);
}
inline float cblas_Xdot(const int N, const float *const X,
const int incX, const float *const Y,
const int incY) {
return cblas_sdot(N, X, incX, Y, incY);
}
inline double cblas_Xdot(const int N, const double *const X,
const int incX, const double *const Y,
const int incY) {
return cblas_ddot(N, X, incX, Y, incY);
}
inline void cblas_Xaxpy(const int N, const float alpha, const float *X,
const int incX, float *Y, const int incY) {
cblas_saxpy(N, alpha, X, incX, Y, incY);
}
inline void cblas_Xaxpy(const int N, const double alpha, const double *X,
const int incX, double *Y, const int incY) {
cblas_daxpy(N, alpha, X, incX, Y, incY);
}
inline void cblas_Xscal(const int N, const float alpha, float *data,
const int inc) {
cblas_sscal(N, alpha, data, inc);
}
inline void cblas_Xscal(const int N, const double alpha, double *data,
const int inc) {
cblas_dscal(N, alpha, data, inc);
}
inline void cblas_Xspmv(const float alpha, const int num_rows, const float *Mdata,
const float *v, const int v_inc,
const float beta, float *y, const int y_inc) {
cblas_sspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc);
}
inline void cblas_Xspmv(const double alpha, const int num_rows, const double *Mdata,
const double *v, const int v_inc,
const double beta, double *y, const int y_inc) {
cblas_dspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc);
}
inline void cblas_Xtpmv(MatrixTransposeType trans, const float *Mdata,
const int num_rows, float *y, const int y_inc) {
cblas_stpmv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
CblasNonUnit, num_rows, Mdata, y, y_inc);
}
inline void cblas_Xtpmv(MatrixTransposeType trans, const double *Mdata,
const int num_rows, double *y, const int y_inc) {
cblas_dtpmv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
CblasNonUnit, num_rows, Mdata, y, y_inc);
}
inline void cblas_Xtpsv(MatrixTransposeType trans, const float *Mdata,
const int num_rows, float *y, const int y_inc) {
cblas_stpsv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
CblasNonUnit, num_rows, Mdata, y, y_inc);
}
inline void cblas_Xtpsv(MatrixTransposeType trans, const double *Mdata,
const int num_rows, double *y, const int y_inc) {
cblas_dtpsv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
CblasNonUnit, num_rows, Mdata, y, y_inc);
}
// x = alpha * M * y + beta * x
inline void cblas_Xspmv(MatrixIndexT dim, float alpha, const float *Mdata,
const float *ydata, MatrixIndexT ystride,
float beta, float *xdata, MatrixIndexT xstride) {
cblas_sspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
ydata, ystride, beta, xdata, xstride);
}
inline void cblas_Xspmv(MatrixIndexT dim, double alpha, const double *Mdata,
const double *ydata, MatrixIndexT ystride,
double beta, double *xdata, MatrixIndexT xstride) {
cblas_dspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
ydata, ystride, beta, xdata, xstride);
}
// Implements A += alpha * (x y' + y x'); A is symmetric matrix.
inline void cblas_Xspr2(MatrixIndexT dim, float alpha, const float *Xdata,
MatrixIndexT incX, const float *Ydata, MatrixIndexT incY,
float *Adata) {
cblas_sspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata,
incX, Ydata, incY, Adata);
}
inline void cblas_Xspr2(MatrixIndexT dim, double alpha, const double *Xdata,
MatrixIndexT incX, const double *Ydata, MatrixIndexT incY,
double *Adata) {
cblas_dspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata,
incX, Ydata, incY, Adata);
}
// Implements A += alpha * (x x'); A is symmetric matrix.
inline void cblas_Xspr(MatrixIndexT dim, float alpha, const float *Xdata,
MatrixIndexT incX, float *Adata) {
cblas_sspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
}
inline void cblas_Xspr(MatrixIndexT dim, double alpha, const double *Xdata,
MatrixIndexT incX, double *Adata) {
cblas_dspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
}
// sgemv,dgemv: y = alpha M x + beta y.
inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
MatrixIndexT num_cols, float alpha, const float *Mdata,
MatrixIndexT stride, const float *xdata,
MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) {
cblas_sgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
}
inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
MatrixIndexT num_cols, double alpha, const double *Mdata,
MatrixIndexT stride, const double *xdata,
MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) {
cblas_dgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
}
// sgbmv, dgbmv: y = alpha M x + beta y.
inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
MatrixIndexT num_cols, MatrixIndexT num_below,
MatrixIndexT num_above, float alpha, const float *Mdata,
MatrixIndexT stride, const float *xdata,
MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) {
cblas_sgbmv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
num_cols, num_below, num_above, alpha, Mdata, stride, xdata,
incX, beta, ydata, incY);
}
inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
MatrixIndexT num_cols, MatrixIndexT num_below,
MatrixIndexT num_above, double alpha, const double *Mdata,
MatrixIndexT stride, const double *xdata,
MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) {
cblas_dgbmv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
num_cols, num_below, num_above, alpha, Mdata, stride, xdata,
incX, beta, ydata, incY);
}
template<typename Real>
inline void Xgemv_sparsevec(MatrixTransposeType trans, MatrixIndexT num_rows,
MatrixIndexT num_cols, Real alpha, const Real *Mdata,
MatrixIndexT stride, const Real *xdata,
MatrixIndexT incX, Real beta, Real *ydata,
MatrixIndexT incY) {
if (trans == kNoTrans) {
if (beta != 1.0) cblas_Xscal(num_rows, beta, ydata, incY);
for (MatrixIndexT i = 0; i < num_cols; i++) {
Real x_i = xdata[i * incX];
if (x_i == 0.0) continue;
// Add to ydata, the i'th column of M, times alpha * x_i
cblas_Xaxpy(num_rows, x_i * alpha, Mdata + i, stride, ydata, incY);
}
} else {
if (beta != 1.0) cblas_Xscal(num_cols, beta, ydata, incY);
for (MatrixIndexT i = 0; i < num_rows; i++) {
Real x_i = xdata[i * incX];
if (x_i == 0.0) continue;
// Add to ydata, the i'th row of M, times alpha * x_i
cblas_Xaxpy(num_cols, x_i * alpha,
Mdata + (i * stride), 1, ydata, incY);
}
}
}
inline void cblas_Xgemm(const float alpha,
MatrixTransposeType transA,
const float *Adata,
MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
MatrixTransposeType transB,
const float *Bdata, MatrixIndexT b_stride,
const float beta,
float *Mdata,
MatrixIndexT num_rows, MatrixIndexT num_cols, MatrixIndexT stride) {
cblas_sgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
static_cast<CBLAS_TRANSPOSE>(transB),
num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
alpha, Adata, a_stride, Bdata, b_stride,
beta, Mdata, stride);
}
inline void cblas_Xgemm(const double alpha,
MatrixTransposeType transA,
const double *Adata,
MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
MatrixTransposeType transB,
const double *Bdata, MatrixIndexT b_stride,
const double beta,
double *Mdata,
MatrixIndexT num_rows, MatrixIndexT num_cols, MatrixIndexT stride) {
cblas_dgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA),
static_cast<CBLAS_TRANSPOSE>(transB),
num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
alpha, Adata, a_stride, Bdata, b_stride,
beta, Mdata, stride);
}
inline void cblas_Xsymm(const float alpha,
MatrixIndexT sz,
const float *Adata, MatrixIndexT a_stride,
const float *Bdata, MatrixIndexT b_stride,
const float beta,
float *Mdata, MatrixIndexT stride) {
cblas_ssymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata,
a_stride, Bdata, b_stride, beta, Mdata, stride);
}
inline void cblas_Xsymm(const double alpha,
MatrixIndexT sz,
const double *Adata, MatrixIndexT a_stride,
const double *Bdata, MatrixIndexT b_stride,
const double beta,
double *Mdata, MatrixIndexT stride) {
cblas_dsymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata,
a_stride, Bdata, b_stride, beta, Mdata, stride);
}
// ger: M += alpha x y^T.
inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, float alpha,
const float *xdata, MatrixIndexT incX, const float *ydata,
MatrixIndexT incY, float *Mdata, MatrixIndexT stride) {
// Pass incX/incY through so the wrapper honors its stride arguments.
cblas_sger(CblasRowMajor, num_rows, num_cols, alpha, xdata, incX, ydata,
incY, Mdata, stride);
}
inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, double alpha,
const double *xdata, MatrixIndexT incX, const double *ydata,
MatrixIndexT incY, double *Mdata, MatrixIndexT stride) {
cblas_dger(CblasRowMajor, num_rows, num_cols, alpha, xdata, incX, ydata,
incY, Mdata, stride);
}
// syrk: symmetric rank-k update.
// if trans==kNoTrans, then C = alpha A A^T + beta C
// else C = alpha A^T A + beta C.
// note: dim_c is dim(C), other_dim_a is the "other" dimension of A, i.e.
// num-cols(A) if kNoTrans, or num-rows(A) if kTrans.
// We only need the row-major and lower-triangular option of this, and this
// is hard-coded.
inline void cblas_Xsyrk (
const MatrixTransposeType trans, const MatrixIndexT dim_c,
const MatrixIndexT other_dim_a, const float alpha, const float *A,
const MatrixIndexT a_stride, const float beta, float *C,
const MatrixIndexT c_stride) {
cblas_ssyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
}
inline void cblas_Xsyrk(
const MatrixTransposeType trans, const MatrixIndexT dim_c,
const MatrixIndexT other_dim_a, const double alpha, const double *A,
const MatrixIndexT a_stride, const double beta, double *C,
const MatrixIndexT c_stride) {
cblas_dsyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
}
/// Matrix-vector multiply using a banded matrix; we always call this with
/// zero super-diagonals (a bandwidth of one, i.e. a diagonal matrix), so it
/// implements elementwise multiplication. Some of the cblas arguments are
/// omitted from this wrapper.
inline void cblas_Xsbmv1(
const MatrixIndexT dim,
const double *A,
const double alpha,
const double *x,
const double beta,
double *y) {
cblas_dsbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
1, x, 1, beta, y, 1);
}
inline void cblas_Xsbmv1(
const MatrixIndexT dim,
const float *A,
const float alpha,
const float *x,
const float beta,
float *y) {
cblas_ssbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
1, x, 1, beta, y, 1);
}
/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could
/// extend this somehow.
inline void mul_elements(
const MatrixIndexT dim,
const double *a,
double *b) { // does b *= a, elementwise.
double c1, c2, c3, c4;
MatrixIndexT i;
for (i = 0; i + 4 <= dim; i += 4) {
c1 = a[i] * b[i];
c2 = a[i+1] * b[i+1];
c3 = a[i+2] * b[i+2];
c4 = a[i+3] * b[i+3];
b[i] = c1;
b[i+1] = c2;
b[i+2] = c3;
b[i+3] = c4;
}
for (; i < dim; i++)
b[i] *= a[i];
}
inline void mul_elements(
const MatrixIndexT dim,
const float *a,
float *b) { // does b *= a, elementwise.
float c1, c2, c3, c4;
MatrixIndexT i;
for (i = 0; i + 4 <= dim; i += 4) {
c1 = a[i] * b[i];
c2 = a[i+1] * b[i+1];
c3 = a[i+2] * b[i+2];
c4 = a[i+3] * b[i+3];
b[i] = c1;
b[i+1] = c2;
b[i+2] = c3;
b[i+3] = c4;
}
for (; i < dim; i++)
b[i] *= a[i];
}
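// Note on the manual 4-way unrolling above: the four products are computed
// into temporaries c1..c4 before any store back to b, which decouples the
// loads from the stores within each block and can help the compiler keep the
// four multiplies in flight; the trailing scalar loop handles the dim % 4
// leftover elements.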
// add clapack here
#if !defined(HAVE_ATLAS)
inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *result) {
stptri_(const_cast<char *>("U"), const_cast<char *>("N"), num_rows, Mdata, result);
}
inline void clapack_Xtptri(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *result) {
dtptri_(const_cast<char *>("U"), const_cast<char *>("N"), num_rows, Mdata, result);
}
//
inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols,
float *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot,
KaldiBlasInt *result) {
sgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
}
inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols,
double *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot,
KaldiBlasInt *result) {
dgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
}
//
inline void clapack_Xgetri2(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride,
KaldiBlasInt *pivot, float *p_work,
KaldiBlasInt *l_work, KaldiBlasInt *result) {
sgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
}
inline void clapack_Xgetri2(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride,
KaldiBlasInt *pivot, double *p_work,
KaldiBlasInt *l_work, KaldiBlasInt *result) {
dgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
}
//
inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols,
KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride,
float *sv, float *Vdata, KaldiBlasInt *vstride,
float *Udata, KaldiBlasInt *ustride, float *p_work,
KaldiBlasInt *l_work, KaldiBlasInt *result) {
sgesvd_(v, u,
num_cols, num_rows, Mdata, stride,
sv, Vdata, vstride, Udata, ustride,
p_work, l_work, result);
}
inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols,
KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride,
double *sv, double *Vdata, KaldiBlasInt *vstride,
double *Udata, KaldiBlasInt *ustride, double *p_work,
KaldiBlasInt *l_work, KaldiBlasInt *result) {
dgesvd_(v, u,
num_cols, num_rows, Mdata, stride,
sv, Vdata, vstride, Udata, ustride,
p_work, l_work, result);
}
//
inline void clapack_Xsptri(KaldiBlasInt *num_rows, float *Mdata,
KaldiBlasInt *ipiv, float *work, KaldiBlasInt *result) {
ssptri_(const_cast<char *>("U"), num_rows, Mdata, ipiv, work, result);
}
inline void clapack_Xsptri(KaldiBlasInt *num_rows, double *Mdata,
KaldiBlasInt *ipiv, double *work, KaldiBlasInt *result) {
dsptri_(const_cast<char *>("U"), num_rows, Mdata, ipiv, work, result);
}
//
inline void clapack_Xsptrf(KaldiBlasInt *num_rows, float *Mdata,
KaldiBlasInt *ipiv, KaldiBlasInt *result) {
ssptrf_(const_cast<char *>("U"), num_rows, Mdata, ipiv, result);
}
inline void clapack_Xsptrf(KaldiBlasInt *num_rows, double *Mdata,
KaldiBlasInt *ipiv, KaldiBlasInt *result) {
dsptrf_(const_cast<char *>("U"), num_rows, Mdata, ipiv, result);
}
#else
inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
float *Mdata, MatrixIndexT stride,
int *pivot, int *result) {
*result = clapack_sgetrf(CblasColMajor, num_rows, num_cols,
Mdata, stride, pivot);
}
inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
double *Mdata, MatrixIndexT stride,
int *pivot, int *result) {
*result = clapack_dgetrf(CblasColMajor, num_rows, num_cols,
Mdata, stride, pivot);
}
//
inline int clapack_Xtrtri(int num_rows, float *Mdata, MatrixIndexT stride) {
return clapack_strtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows,
Mdata, stride);
}
inline int clapack_Xtrtri(int num_rows, double *Mdata, MatrixIndexT stride) {
return clapack_dtrtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows,
Mdata, stride);
}
//
inline void clapack_Xgetri(MatrixIndexT num_rows, float *Mdata, MatrixIndexT stride,
int *pivot, int *result) {
*result = clapack_sgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
}
inline void clapack_Xgetri(MatrixIndexT num_rows, double *Mdata, MatrixIndexT stride,
int *pivot, int *result) {
*result = clapack_dgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
}
#endif
}  // namespace kaldi
#endif
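// Sketch of how these overloads are used from templated matrix code (an
// assumed caller, not part of this file): overload resolution on the element
// type selects the single- or double-precision BLAS routine, so the template
// itself stays precision-agnostic.
//
//   namespace kaldi {
//   template <typename Real>
//   void AxpyThenScale(MatrixIndexT dim, Real alpha, const Real *x, Real *y) {
//     cblas_Xaxpy(dim, alpha, x, 1, y, 1);             // y += alpha * x
//     cblas_Xscal(dim, static_cast<Real>(2.0), y, 1);  // y *= 2
//   }
//   }  // namespace kaldi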
// matrix/compressed-matrix.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Frantisek Skala, Wei Shi
// 2015 Tom Ko
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "matrix/compressed-matrix.h"
#include <algorithm>
namespace kaldi {
// static
MatrixIndexT CompressedMatrix::DataSize(const GlobalHeader &header) {
// Returns size in bytes of the data.
DataFormat format = static_cast<DataFormat>(header.format);
if (format == kOneByteWithColHeaders) {
return sizeof(GlobalHeader) +
header.num_cols * (sizeof(PerColHeader) + header.num_rows);
} else if (format == kTwoByte) {
return sizeof(GlobalHeader) +
2 * header.num_rows * header.num_cols;
} else {
KALDI_ASSERT(format == kOneByte);
return sizeof(GlobalHeader) +
header.num_rows * header.num_cols;
}
}
// Scales all elements of the matrix by alpha, by scaling the floats
// (min_value and range) in the GlobalHeader.
void CompressedMatrix::Scale(float alpha) {
if (data_ != NULL) {
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
// Scale the floating-point values in the GlobalHeader and leave all the
// integer data the same; each decompressed value has the form
// min_value + range * fraction, so scaling both floats scales every element.
h->min_value *= alpha;
h->range *= alpha;
}
}
template<typename Real> // static inline
void CompressedMatrix::ComputeGlobalHeader(
const MatrixBase<Real> &mat, CompressionMethod method,
GlobalHeader *header) {
if (method == kAutomaticMethod) {
if (mat.NumRows() > 8) method = kSpeechFeature;
else method = kTwoByteAuto;
}
switch (method) {
case kSpeechFeature:
header->format = static_cast<int32>(kOneByteWithColHeaders); // 1.
break;
case kTwoByteAuto: case kTwoByteSignedInteger:
header->format = static_cast<int32>(kTwoByte); // 2.
break;
case kOneByteAuto: case kOneByteUnsignedInteger: case kOneByteZeroOne:
header->format = static_cast<int32>(kOneByte); // 3.
break;
default:
KALDI_ERR << "Invalid compression type: "
<< static_cast<int32>(method);
}
header->num_rows = mat.NumRows();
header->num_cols = mat.NumCols();
// Now compute 'min_value' and 'range'.
switch (method) {
case kSpeechFeature: case kTwoByteAuto: case kOneByteAuto: {
float min_value = mat.Min(), max_value = mat.Max();
// ensure that max_value is strictly greater than min_value, even if the
// matrix is constant; this avoids crashes in ComputeColHeader when
// compressing speech features.
if (max_value == min_value)
max_value = min_value + (1.0 + fabs(min_value));
KALDI_ASSERT(min_value - min_value == 0 &&
max_value - max_value == 0 &&  // x - x is NaN (!= 0) for NaN or +-Inf x.
"Cannot compress a matrix with NaNs or Infs");
header->min_value = min_value;
header->range = max_value - min_value;
// we previously checked that max_value != min_value, so their
// difference should be nonzero.
KALDI_ASSERT(header->range > 0.0);
break;
}
case kTwoByteSignedInteger: {
header->min_value = -32768.0;
header->range = 65535.0;
break;
}
case kOneByteUnsignedInteger: {
header->min_value = 0.0;
header->range = 255.0;
break;
}
case kOneByteZeroOne: {
header->min_value = 0.0;
header->range = 1.0;
break;
}
default:
KALDI_ERR << "Unknown compression method = "
<< static_cast<int32>(method);
}
KALDI_COMPILE_TIME_ASSERT(sizeof(*header) == 20); // otherwise
// something weird is happening and our code probably won't work or
// won't be robust across platforms.
}
template<typename Real>
void CompressedMatrix::CopyFromMat(
const MatrixBase<Real> &mat, CompressionMethod method) {
if (data_ != NULL) {
delete [] static_cast<float*>(data_);  // delete [] because it was allocated with new float[].
data_ = NULL;
}
if (mat.NumRows() == 0) { return; } // Zero-size matrix stored as zero pointer.
GlobalHeader global_header;
ComputeGlobalHeader(mat, method, &global_header);
int32 data_size = DataSize(global_header);
data_ = AllocateData(data_size);
*(reinterpret_cast<GlobalHeader*>(data_)) = global_header;
DataFormat format = static_cast<DataFormat>(global_header.format);
if (format == kOneByteWithColHeaders) {
PerColHeader *header_data =
reinterpret_cast<PerColHeader*>(static_cast<char*>(data_) +
sizeof(GlobalHeader));
uint8 *byte_data =
reinterpret_cast<uint8*>(header_data + global_header.num_cols);
const Real *matrix_data = mat.Data();
for (int32 col = 0; col < global_header.num_cols; col++) {
CompressColumn(global_header,
matrix_data + col, mat.Stride(),
global_header.num_rows,
header_data, byte_data);
header_data++;
byte_data += global_header.num_rows;
}
} else if (format == kTwoByte) {
uint16 *data = reinterpret_cast<uint16*>(static_cast<char*>(data_) +
sizeof(GlobalHeader));
int32 num_rows = mat.NumRows(), num_cols = mat.NumCols();
for (int32 r = 0; r < num_rows; r++) {
const Real *row_data = mat.RowData(r);
for (int32 c = 0; c < num_cols; c++)
data[c] = FloatToUint16(global_header, row_data[c]);
data += num_cols;
}
} else {
KALDI_ASSERT(format == kOneByte);
uint8 *data = reinterpret_cast<uint8*>(static_cast<char*>(data_) +
sizeof(GlobalHeader));
int32 num_rows = mat.NumRows(), num_cols = mat.NumCols();
for (int32 r = 0; r < num_rows; r++) {
const Real *row_data = mat.RowData(r);
for (int32 c = 0; c < num_cols; c++)
data[c] = FloatToUint8(global_header, row_data[c]);
data += num_cols;
}
}
}
// Instantiate the template for float and double.
template
void CompressedMatrix::CopyFromMat(const MatrixBase<float> &mat,
CompressionMethod method);
template
void CompressedMatrix::CopyFromMat(const MatrixBase<double> &mat,
CompressionMethod method);
CompressedMatrix::CompressedMatrix(
const CompressedMatrix &cmat,
const MatrixIndexT row_offset,
const MatrixIndexT num_rows,
const MatrixIndexT col_offset,
const MatrixIndexT num_cols,
bool allow_padding): data_(NULL) {
int32 old_num_rows = cmat.NumRows(), old_num_cols = cmat.NumCols();
if (old_num_rows == 0) {
KALDI_ASSERT(num_rows == 0 && num_cols == 0);
// The empty matrix is stored as a zero pointer.
return;
}
KALDI_ASSERT(row_offset < old_num_rows);
KALDI_ASSERT(col_offset < old_num_cols);
KALDI_ASSERT(row_offset >= 0 || allow_padding);
KALDI_ASSERT(col_offset >= 0);
KALDI_ASSERT(row_offset + num_rows <= old_num_rows || allow_padding);
KALDI_ASSERT(col_offset + num_cols <= old_num_cols);
if (num_rows == 0 || num_cols == 0) { return; }
bool padding_is_used = (row_offset < 0 ||
row_offset + num_rows > old_num_rows);
GlobalHeader new_global_header;
KALDI_COMPILE_TIME_ASSERT(sizeof(new_global_header) == 20);
GlobalHeader *old_global_header = reinterpret_cast<GlobalHeader*>(cmat.Data());
new_global_header = *old_global_header;
new_global_header.num_cols = num_cols;
new_global_header.num_rows = num_rows;
// We don't switch format from 1 -> 2 (in case of size reduction) yet; if this
// is needed, we will do this below by creating a temporary Matrix.
new_global_header.format = old_global_header->format;
data_ = AllocateData(DataSize(new_global_header)); // allocate memory
*(reinterpret_cast<GlobalHeader*>(data_)) = new_global_header;
DataFormat format = static_cast<DataFormat>(old_global_header->format);
if (format == kOneByteWithColHeaders) {
PerColHeader *old_per_col_header =
reinterpret_cast<PerColHeader*>(old_global_header + 1);
uint8 *old_byte_data =
reinterpret_cast<uint8*>(old_per_col_header +
old_global_header->num_cols);
PerColHeader *new_per_col_header =
reinterpret_cast<PerColHeader*>(
reinterpret_cast<GlobalHeader*>(data_) + 1);
memcpy(new_per_col_header, old_per_col_header + col_offset,
sizeof(PerColHeader) * num_cols);
uint8 *new_byte_data =
reinterpret_cast<uint8*>(new_per_col_header + num_cols);
if (!padding_is_used) {
uint8 *old_start_of_subcol =
old_byte_data + row_offset + (col_offset * old_num_rows),
*new_start_of_col = new_byte_data;
for (int32 i = 0; i < num_cols; i++) {
memcpy(new_start_of_col, old_start_of_subcol, num_rows);
new_start_of_col += num_rows;
old_start_of_subcol += old_num_rows;
}
} else {
uint8 *old_start_of_col =
old_byte_data + (col_offset * old_num_rows),
*new_start_of_col = new_byte_data;
for (int32 i = 0; i < num_cols; i++) {
for (int32 j = 0; j < num_rows; j++) {
int32 old_j = j + row_offset;
if (old_j < 0) old_j = 0;
else if (old_j >= old_num_rows) old_j = old_num_rows - 1;
new_start_of_col[j] = old_start_of_col[old_j];
}
new_start_of_col += num_rows;
old_start_of_col += old_num_rows;
}
}
} else if (format == kTwoByte) {
const uint16 *old_data =
reinterpret_cast<const uint16*>(old_global_header + 1);
uint16 *new_row_data =
reinterpret_cast<uint16*>(reinterpret_cast<GlobalHeader*>(data_) + 1);
for (int32 row = 0; row < num_rows; row++) {
int32 old_row = row + row_offset;
// The next two lines are only relevant if padding_is_used.
if (old_row < 0) old_row = 0;
else if (old_row >= old_num_rows) old_row = old_num_rows - 1;
const uint16 *old_row_data =
old_data + col_offset + (old_num_cols * old_row);
memcpy(new_row_data, old_row_data, sizeof(uint16) * num_cols);
new_row_data += num_cols;
}
} else {
KALDI_ASSERT(format == kOneByte);
const uint8 *old_data =
reinterpret_cast<const uint8*>(old_global_header + 1);
uint8 *new_row_data =
reinterpret_cast<uint8*>(reinterpret_cast<GlobalHeader*>(data_) + 1);
for (int32 row = 0; row < num_rows; row++) {
int32 old_row = row + row_offset;
// The next two lines are only relevant if padding_is_used.
if (old_row < 0) old_row = 0;
else if (old_row >= old_num_rows) old_row = old_num_rows - 1;
const uint8 *old_row_data =
old_data + col_offset + (old_num_cols * old_row);
memcpy(new_row_data, old_row_data, sizeof(uint8) * num_cols);
new_row_data += num_cols;
}
}
if (num_rows < 8 && format == kOneByteWithColHeaders) {
// format was 1 but we want it to be 2 -> create a temporary
// Matrix (uncompress), re-compress, and swap.
// This gives us almost exact reconstruction while saving
// memory (the elements take more space but there will be
// no per-column headers).
Matrix<float> temp(this->NumRows(), this->NumCols(),
kUndefined);
this->CopyToMat(&temp);
CompressedMatrix temp_cmat(temp, kTwoByteAuto);
this->Swap(&temp_cmat);
}
}
template<typename Real>
CompressedMatrix &CompressedMatrix::operator =(const MatrixBase<Real> &mat) {
this->CopyFromMat(mat);
return *this;
}
// Instantiate the template for float and double.
template
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<float> &mat);
template
CompressedMatrix& CompressedMatrix::operator =(const MatrixBase<double> &mat);
inline uint16 CompressedMatrix::FloatToUint16(
const GlobalHeader &global_header,
float value) {
float f = (value - global_header.min_value) /
global_header.range;
if (f > 1.0) f = 1.0; // Note: this should not happen.
if (f < 0.0) f = 0.0; // Note: this should not happen.
return static_cast<int>(f * 65535 + 0.499); // + 0.499 is to
// round to closest int; avoids bias.
}
inline uint8 CompressedMatrix::FloatToUint8(
const GlobalHeader &global_header,
float value) {
float f = (value - global_header.min_value) /
global_header.range;
if (f > 1.0) f = 1.0; // Note: this should not happen.
if (f < 0.0) f = 0.0; // Note: this should not happen.
return static_cast<int>(f * 255 + 0.499); // + 0.499 is to
// round to closest int; avoids bias.
}
inline float CompressedMatrix::Uint16ToFloat(
const GlobalHeader &global_header,
uint16 value) {
// the constant 1.52590218966964e-05 is 1/65535.
return global_header.min_value
+ global_header.range * 1.52590218966964e-05F * value;
}
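// Worked round-trip example for the two quantizers above: with
// min_value = -2.0 and range = 4.0, the value 1.0 gives
// f = (1.0 - (-2.0)) / 4.0 = 0.75, which FloatToUint16 quantizes to
// static_cast<int>(0.75 * 65535 + 0.499) = 49151; Uint16ToFloat then
// reconstructs -2.0 + 4.0 * (49151 / 65535) = 0.999985, an error of about
// 1.5e-5, i.e. under half a quantization step (range / 65535 / 2 ~= 3.1e-5).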
template<typename Real> // static
void CompressedMatrix::ComputeColHeader(
const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
int32 num_rows, CompressedMatrix::PerColHeader *header) {
KALDI_ASSERT(num_rows > 0);
std::vector<Real> sdata(num_rows); // the sorted data.
for (size_t i = 0, size = sdata.size(); i < size; i++)
sdata[i] = data[i*stride];
if (num_rows >= 5) {
int quarter_nr = num_rows/4;
// std::sort(sdata.begin(), sdata.end());
// The elements at positions 0, quarter_nr,
// 3*quarter_nr, and num_rows-1 need to be in sorted order.
std::nth_element(sdata.begin(), sdata.begin() + quarter_nr, sdata.end());
// Now, sdata.begin() + quarter_nr contains the element that would appear
// in sorted order, in that position.
std::nth_element(sdata.begin(), sdata.begin(), sdata.begin() + quarter_nr);
// Now, sdata.begin() and sdata.begin() + quarter_nr contain the elements
// that would appear at those positions in sorted order.
std::nth_element(sdata.begin() + quarter_nr + 1,
sdata.begin() + (3*quarter_nr), sdata.end());
// Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() +
// 3*quarter_nr, contain the elements that would appear at those positions
// in sorted order.
std::nth_element(sdata.begin() + (3*quarter_nr) + 1, sdata.end() - 1,
sdata.end());
// Now, sdata.begin(), sdata.begin() + quarter_nr, and sdata.begin() +
// 3*quarter_nr, and sdata.end() - 1, contain the elements that would appear
// at those positions in sorted order.
header->percentile_0 =
std::min<uint16>(FloatToUint16(global_header, sdata[0]), 65532);
header->percentile_25 =
std::min<uint16>(
std::max<uint16>(
FloatToUint16(global_header, sdata[quarter_nr]),
header->percentile_0 + static_cast<uint16>(1)), 65533);
header->percentile_75 =
std::min<uint16>(
std::max<uint16>(
FloatToUint16(global_header, sdata[3*quarter_nr]),
header->percentile_25 + static_cast<uint16>(1)), 65534);
header->percentile_100 = std::max<uint16>(
FloatToUint16(global_header, sdata[num_rows-1]),
header->percentile_75 + static_cast<uint16>(1));
} else { // handle this pathological case.
std::sort(sdata.begin(), sdata.end());
// Note: we know num_rows is at least 1.
header->percentile_0 =
std::min<uint16>(FloatToUint16(global_header, sdata[0]),
65532);
if (num_rows > 1)
header->percentile_25 =
std::min<uint16>(
std::max<uint16>(FloatToUint16(global_header, sdata[1]),
header->percentile_0 + 1), 65533);
else
header->percentile_25 = header->percentile_0 + 1;
if (num_rows > 2)
header->percentile_75 =
std::min<uint16>(
std::max<uint16>(FloatToUint16(global_header, sdata[2]),
header->percentile_25 + 1), 65534);
else
header->percentile_75 = header->percentile_25 + 1;
if (num_rows > 3)
header->percentile_100 =
std::max<uint16>(FloatToUint16(global_header, sdata[3]),
header->percentile_75 + 1);
else
header->percentile_100 = header->percentile_75 + 1;
}
}
// static
inline uint8 CompressedMatrix::FloatToChar(
float p0, float p25, float p75, float p100,
float value) {
int ans;
if (value < p25) { // range [ p0, p25 ) covered by
// characters 0 .. 64. We round to the closest int.
float f = (value - p0) / (p25 - p0);
ans = static_cast<int>(f * 64 + 0.5);
// Note: the checks on the next two lines
// are necessary in pathological cases when all the elements in a row
// are the same and the percentile_* values are separated by one.
if (ans < 0) ans = 0;
if (ans > 64) ans = 64;
} else if (value < p75) {  // range [ p25, p75 ) covered
// by characters 64 .. 192. We round to the closest int.
float f = (value - p25) / (p75 - p25);
ans = 64 + static_cast<int>(f * 128 + 0.5);
if (ans < 64) ans = 64;
if (ans > 192) ans = 192;
} else { // range [ p75, p100 ] covered by
// characters 192 .. 255. Note: this last range
// has fewer characters than the left range, because
// we go up to 255, not 256.
float f = (value - p75) / (p100 - p75);
ans = 192 + static_cast<int>(f * 63 + 0.5);
if (ans < 192) ans = 192;
if (ans > 255) ans = 255;
}
return static_cast<uint8>(ans);
}
// static
inline float CompressedMatrix::CharToFloat(
float p0, float p25, float p75, float p100,
uint8 value) {
if (value <= 64) {
return p0 + (p25 - p0) * value * (1/64.0);
} else if (value <= 192) {
return p25 + (p75 - p25) * (value - 64) * (1/128.0);
} else {
return p75 + (p100 - p75) * (value - 192) * (1/63.0);
}
}
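// Worked example of the piecewise 3-range mapping above: with p0 = 0.0,
// p25 = 1.0, p75 = 3.0, p100 = 4.0, the value 2.0 lies in [p25, p75), so
// FloatToChar computes f = (2.0 - 1.0) / (3.0 - 1.0) = 0.5 and returns
// 64 + static_cast<int>(0.5 * 128 + 0.5) = 128; CharToFloat(128) gives back
// 1.0 + 2.0 * 64 * (1/128.0) = 2.0 exactly. The middle half of the data gets
// 128 of the 256 codes, i.e. double resolution where (for roughly Gaussian
// speech features) most of the probability mass lies.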
template<typename Real> // static
void CompressedMatrix::CompressColumn(
const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
int32 num_rows, CompressedMatrix::PerColHeader *header,
uint8 *byte_data) {
ComputeColHeader(global_header, data, stride,
num_rows, header);
float p0 = Uint16ToFloat(global_header, header->percentile_0),
p25 = Uint16ToFloat(global_header, header->percentile_25),
p75 = Uint16ToFloat(global_header, header->percentile_75),
p100 = Uint16ToFloat(global_header, header->percentile_100);
for (int32 i = 0; i < num_rows; i++) {
Real this_data = data[i * stride];
byte_data[i] = FloatToChar(p0, p25, p75, p100, this_data);
}
}
// static
void* CompressedMatrix::AllocateData(int32 num_bytes) {
KALDI_ASSERT(num_bytes > 0);
KALDI_COMPILE_TIME_ASSERT(sizeof(float) == 4);
// Allocate a whole number of floats; (num_bytes/3) + 4 floats is always
// more than num_bytes bytes, so this over-allocates slightly to be safe.
return reinterpret_cast<void*>(new float[(num_bytes/3) + 4]);
}
void CompressedMatrix::Write(std::ostream &os, bool binary) const {
if (binary) { // Binary-mode write:
if (data_ != NULL) {
GlobalHeader &h = *reinterpret_cast<GlobalHeader*>(data_);
DataFormat format = static_cast<DataFormat>(h.format);
if (format == kOneByteWithColHeaders) {
WriteToken(os, binary, "CM");
} else if (format == kTwoByte) {
WriteToken(os, binary, "CM2");
} else if (format == kOneByte) {
WriteToken(os, binary, "CM3");
}
MatrixIndexT size = DataSize(h); // total size of data in data_
// We don't write out the "int32 format", hence the + 4, - 4.
os.write(reinterpret_cast<const char*>(data_) + 4, size - 4);
} else { // special case: where data_ == NULL, we treat it as an empty
// matrix.
WriteToken(os, binary, "CM");
GlobalHeader h;
h.range = h.min_value = 0.0;
h.num_rows = h.num_cols = 0;
os.write(reinterpret_cast<const char*>(&h), sizeof(h));
}
} else {
// In text mode, just use the same format as a regular matrix.
// This is not compressed.
Matrix<BaseFloat> temp_mat(this->NumRows(), this->NumCols(),
kUndefined);
this->CopyToMat(&temp_mat);
temp_mat.Write(os, binary);
}
if (os.fail())
KALDI_ERR << "Error writing compressed matrix to stream.";
}
void CompressedMatrix::Read(std::istream &is, bool binary) {
if (data_ != NULL) {
delete [] (static_cast<float*>(data_));
data_ = NULL;
}
if (binary) {
int peekval = Peek(is, binary);
if (peekval == 'C') {
std::string tok;  // Should be CM (format 1), CM2 (format 2) or CM3 (format 3).
ReadToken(is, binary, &tok);
GlobalHeader h;
if (tok == "CM") { h.format = 1; } // kOneByteWithColHeaders
else if (tok == "CM2") { h.format = 2; } // kTwoByte
else if (tok == "CM3") { h.format = 3; } // kOneByte
else {
KALDI_ERR << "Unexpected token " << tok << ", expecting CM, CM2 or CM3";
}
// don't read the "format" -> hence + 4, - 4.
is.read(reinterpret_cast<char*>(&h) + 4, sizeof(h) - 4);
if (is.fail())
KALDI_ERR << "Failed to read header";
if (h.num_cols == 0) // empty matrix.
return;
int32 size = DataSize(h), remaining_size = size - sizeof(GlobalHeader);
data_ = AllocateData(size);
*(reinterpret_cast<GlobalHeader*>(data_)) = h;
is.read(reinterpret_cast<char*>(data_) + sizeof(GlobalHeader),
remaining_size);
} else {
// Assume that what we're reading is a regular Matrix. This might be the
// case if you changed your code, making a Matrix into a CompressedMatrix,
// and you want back-compatibility for reading.
Matrix<BaseFloat> M;
M.Read(is, binary); // This will crash if it was not a Matrix.
this->CopyFromMat(M);
}
} else { // Text-mode read. In this case you don't get to
// choose the compression type. Anyway this branch would only
// be taken when debugging.
Matrix<BaseFloat> temp;
temp.Read(is, binary);
this->CopyFromMat(temp);
}
if (is.fail())
KALDI_ERR << "Failed to read data.";
}
template<typename Real>
void CompressedMatrix::CopyToMat(MatrixBase<Real> *mat,
MatrixTransposeType trans) const {
if (trans == kTrans) {
Matrix<Real> temp(this->NumCols(), this->NumRows());
CopyToMat(&temp, kNoTrans);
mat->CopyFromMat(temp, kTrans);
return;
}
if (data_ == NULL) {
KALDI_ASSERT(mat->NumRows() == 0);
KALDI_ASSERT(mat->NumCols() == 0);
return;
}
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
int32 num_cols = h->num_cols, num_rows = h->num_rows;
KALDI_ASSERT(mat->NumRows() == num_rows);
KALDI_ASSERT(mat->NumCols() == num_cols);
DataFormat format = static_cast<DataFormat>(h->format);
if (format == kOneByteWithColHeaders) {
PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
h->num_cols);
for (int32 i = 0; i < num_cols; i++, per_col_header++) {
float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
for (int32 j = 0; j < num_rows; j++, byte_data++) {
float f = CharToFloat(p0, p25, p75, p100, *byte_data);
(*mat)(j, i) = f;
}
}
} else if (format == kTwoByte) {
const uint16 *data = reinterpret_cast<const uint16*>(h + 1);
float min_value = h->min_value,
increment = h->range * (1.0 / 65535.0);
for (int32 i = 0; i < num_rows; i++) {
Real *row_data = mat->RowData(i);
for (int32 j = 0; j < num_cols; j++)
row_data[j] = min_value + data[j] * increment;
data += num_cols;
}
} else {
KALDI_ASSERT(format == kOneByte);
float min_value = h->min_value, increment = h->range * (1.0 / 255.0);
const uint8 *data = reinterpret_cast<const uint8*>(h + 1);
for (int32 i = 0; i < num_rows; i++) {
Real *row_data = mat->RowData(i);
for (int32 j = 0; j < num_cols; j++)
row_data[j] = min_value + data[j] * increment;
data += num_cols;
}
}
}
// Instantiate the template for float and double.
template
void CompressedMatrix::CopyToMat(MatrixBase<float> *mat,
MatrixTransposeType trans) const;
template
void CompressedMatrix::CopyToMat(MatrixBase<double> *mat,
MatrixTransposeType trans) const;
template<typename Real>
void CompressedMatrix::CopyRowToVec(MatrixIndexT row,
VectorBase<Real> *v) const {
KALDI_ASSERT(row < this->NumRows());
KALDI_ASSERT(row >= 0);
KALDI_ASSERT(v->Dim() == this->NumCols());
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
DataFormat format = static_cast<DataFormat>(h->format);
if (format == kOneByteWithColHeaders) {
PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
h->num_cols);
byte_data += row; // point to first value we are interested in
for (int32 i = 0; i < h->num_cols;
i++, per_col_header++, byte_data += h->num_rows) {
float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
float f = CharToFloat(p0, p25, p75, p100, *byte_data);
(*v)(i) = f;
}
} else if (format == kTwoByte) {
int32 num_cols = h->num_cols;
float min_value = h->min_value,
increment = h->range * (1.0 / 65535.0);
const uint16 *row_data = reinterpret_cast<uint16*>(h + 1) + (num_cols * row);
Real *v_data = v->Data();
for (int32 c = 0; c < num_cols; c++)
v_data[c] = min_value + row_data[c] * increment;
} else {
KALDI_ASSERT(format == kOneByte);
int32 num_cols = h->num_cols;
float min_value = h->min_value,
increment = h->range * (1.0 / 255.0);
const uint8 *row_data = reinterpret_cast<uint8*>(h + 1) + (num_cols * row);
Real *v_data = v->Data();
for (int32 c = 0; c < num_cols; c++)
v_data[c] = min_value + row_data[c] * increment;
}
}
template<typename Real>
void CompressedMatrix::CopyColToVec(MatrixIndexT col,
VectorBase<Real> *v) const {
KALDI_ASSERT(col < this->NumCols());
KALDI_ASSERT(col >= 0);
KALDI_ASSERT(v->Dim() == this->NumRows());
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
DataFormat format = static_cast<DataFormat>(h->format);
if (format == kOneByteWithColHeaders) {
PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
h->num_cols);
byte_data += col*h->num_rows; // point to first value in the column we want
per_col_header += col;
float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
for (int32 i = 0; i < h->num_rows; i++, byte_data++) {
float f = CharToFloat(p0, p25, p75, p100, *byte_data);
(*v)(i) = f;
}
} else if (format == kTwoByte) {
int32 num_rows = h->num_rows, num_cols = h->num_cols;
float min_value = h->min_value,
increment = h->range * (1.0 / 65535.0);
const uint16 *col_data = reinterpret_cast<uint16*>(h + 1) + col;
Real *v_data = v->Data();
for (int32 r = 0; r < num_rows; r++)
v_data[r] = min_value + increment * col_data[r * num_cols];
} else {
KALDI_ASSERT(format == kOneByte);
int32 num_rows = h->num_rows, num_cols = h->num_cols;
float min_value = h->min_value,
increment = h->range * (1.0 / 255.0);
const uint8 *col_data = reinterpret_cast<uint8*>(h + 1) + col;
Real *v_data = v->Data();
for (int32 r = 0; r < num_rows; r++)
v_data[r] = min_value + increment * col_data[r * num_cols];
}
}
// instantiate the templates.
template void
CompressedMatrix::CopyColToVec(MatrixIndexT, VectorBase<double> *) const;
template void
CompressedMatrix::CopyColToVec(MatrixIndexT, VectorBase<float> *) const;
template void
CompressedMatrix::CopyRowToVec(MatrixIndexT, VectorBase<double> *) const;
template void
CompressedMatrix::CopyRowToVec(MatrixIndexT, VectorBase<float> *) const;
template<typename Real>
void CompressedMatrix::CopyToMat(int32 row_offset,
int32 col_offset,
MatrixBase<Real> *dest) const {
KALDI_PARANOID_ASSERT(row_offset < this->NumRows());
KALDI_PARANOID_ASSERT(col_offset < this->NumCols());
KALDI_PARANOID_ASSERT(row_offset >= 0);
KALDI_PARANOID_ASSERT(col_offset >= 0);
KALDI_ASSERT(row_offset+dest->NumRows() <= this->NumRows());
KALDI_ASSERT(col_offset+dest->NumCols() <= this->NumCols());
// everything is OK
GlobalHeader *h = reinterpret_cast<GlobalHeader*>(data_);
int32 num_rows = h->num_rows, num_cols = h->num_cols,
tgt_cols = dest->NumCols(), tgt_rows = dest->NumRows();
DataFormat format = static_cast<DataFormat>(h->format);
if (format == kOneByteWithColHeaders) {
PerColHeader *per_col_header = reinterpret_cast<PerColHeader*>(h+1);
uint8 *byte_data = reinterpret_cast<uint8*>(per_col_header +
h->num_cols);
uint8 *start_of_subcol = byte_data+row_offset; // skip appropriate
// number of columns
start_of_subcol += col_offset*num_rows; // skip appropriate number of rows
per_col_header += col_offset; // skip the appropriate number of headers
for (int32 i = 0;
i < tgt_cols;
i++, per_col_header++, start_of_subcol+=num_rows) {
byte_data = start_of_subcol;
float p0 = Uint16ToFloat(*h, per_col_header->percentile_0),
p25 = Uint16ToFloat(*h, per_col_header->percentile_25),
p75 = Uint16ToFloat(*h, per_col_header->percentile_75),
p100 = Uint16ToFloat(*h, per_col_header->percentile_100);
for (int32 j = 0; j < tgt_rows; j++, byte_data++) {
float f = CharToFloat(p0, p25, p75, p100, *byte_data);
(*dest)(j, i) = f;
}
}
} else if (format == kTwoByte) {
const uint16 *data = reinterpret_cast<const uint16*>(h+1) + col_offset +
(num_cols * row_offset);
float min_value = h->min_value,
increment = h->range * (1.0 / 65535.0);
for (int32 row = 0; row < tgt_rows; row++) {
Real *dest_row = dest->RowData(row);
for (int32 col = 0; col < tgt_cols; col++)
dest_row[col] = min_value + increment * data[col];
data += num_cols;
}
} else {
KALDI_ASSERT(format == kOneByte);
const uint8 *data = reinterpret_cast<const uint8*>(h+1) + col_offset +
(num_cols * row_offset);
float min_value = h->min_value,
increment = h->range * (1.0 / 255.0);
for (int32 row = 0; row < tgt_rows; row++) {
Real *dest_row = dest->RowData(row);
for (int32 col = 0; col < tgt_cols; col++)
dest_row[col] = min_value + increment * data[col];
data += num_cols;
}
}
}
// instantiate the templates.
template void CompressedMatrix::CopyToMat(int32,
int32,
MatrixBase<float> *dest) const;
template void CompressedMatrix::CopyToMat(int32,
int32,
MatrixBase<double> *dest) const;
void CompressedMatrix::Clear() {
if (data_ != NULL) {
delete [] static_cast<float*>(data_);
data_ = NULL;
}
}
CompressedMatrix::CompressedMatrix(const CompressedMatrix &mat): data_(NULL) {
*this = mat; // use assignment operator.
}
CompressedMatrix &CompressedMatrix::operator = (const CompressedMatrix &mat) {
Clear(); // now this->data_ == NULL.
if (mat.data_ != NULL) {
MatrixIndexT data_size = DataSize(*static_cast<GlobalHeader*>(mat.data_));
data_ = AllocateData(data_size);
memcpy(static_cast<void*>(data_),
static_cast<void*>(mat.data_),
data_size);
}
return *this;
}
} // namespace kaldi
// matrix/compressed-matrix.h
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Frantisek Skala, Wei Shi
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_COMPRESSED_MATRIX_H_
#define KALDI_MATRIX_COMPRESSED_MATRIX_H_ 1
#include "matrix/kaldi-matrix.h"
namespace kaldi {
/// \addtogroup matrix_group
/// @{
/*
The enum CompressionMethod is used when creating a CompressedMatrix (a lossily
compressed matrix) from a regular Matrix. It dictates how we choose the
compressed format and how we choose the ranges of floats that are represented
by particular integers.
kAutomaticMethod = 1 This is the default when you don't specify the
compression method. It is a shorthand for using
kSpeechFeature if the num-rows is more than 8, and
kTwoByteAuto otherwise.
kSpeechFeature = 2 This is the most complicated of the compression methods,
and was designed for speech features which have a roughly
Gaussian distribution with different ranges for each
dimension. Each element is stored in one byte, but there
is an 8-byte header per column; the spacing of the
integer values is not uniform but is in 3 ranges.
kTwoByteAuto = 3 Each element is stored in two bytes as a uint16, with
the representable range of values chosen automatically
with the minimum and maximum elements of the matrix as
its edges.
kTwoByteSignedInteger = 4
Each element is stored in two bytes as a uint16, with
the representable range of values chosen to coincide with
what you'd get if you stored signed integers, i.e.
[-32768.0, 32767.0]. Suitable for waveform data that
was previously stored as 16-bit PCM.
kOneByteAuto = 5 Each element is stored in one byte as a uint8, with the
representable range of values chosen automatically with
the minimum and maximum elements of the matrix as its
edges.
kOneByteUnsignedInteger = 6 Each element is stored in
one byte as a uint8, with the representable range of
values equal to [0.0, 255.0].
kOneByteZeroOne = 7 Each element is stored in
one byte as a uint8, with the representable range of
values equal to [0.0, 1.0]. Suitable for image data
that has previously been compressed as int8.
// We can add new methods here as needed: if they just imply different ways
// of selecting the min_value and range, and a num-bytes = 1 or 2, they will
// be trivial to implement.
*/
enum CompressionMethod {
kAutomaticMethod = 1,
kSpeechFeature = 2,
kTwoByteAuto = 3,
kTwoByteSignedInteger = 4,
kOneByteAuto = 5,
kOneByteUnsignedInteger = 6,
kOneByteZeroOne = 7
};
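// For example, under kAutomaticMethod a 100 x 40 feature matrix
// (num-rows > 8) is compressed as kSpeechFeature, while a 4 x 40 matrix
// falls back to kTwoByteAuto; see ComputeGlobalHeader() in
// compressed-matrix.cc.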
/*
This class does lossy compression of a matrix. It supports various compression
methods, see enum CompressionMethod.
*/
class CompressedMatrix {
public:
CompressedMatrix(): data_(NULL) { }
~CompressedMatrix() { Clear(); }
template<typename Real>
explicit CompressedMatrix(const MatrixBase<Real> &mat,
CompressionMethod method = kAutomaticMethod):
data_(NULL) { CopyFromMat(mat, method); }
/// Initializer that can be used to select part of an existing
/// CompressedMatrix without un-compressing and re-compressing (note: unlike
/// similar initializers for class Matrix, it doesn't point to the same memory
/// location).
///
/// This creates a CompressedMatrix with the size (num_rows, num_cols)
/// starting at (row_offset, col_offset).
///
/// If you specify allow_padding = true,
/// it is permitted to have row_offset < 0 and
/// row_offset + num_rows > mat.NumRows(), and the result will contain
/// repeats of the first and last rows of 'mat' as necessary.
CompressedMatrix(const CompressedMatrix &mat,
const MatrixIndexT row_offset,
const MatrixIndexT num_rows,
const MatrixIndexT col_offset,
const MatrixIndexT num_cols,
bool allow_padding = false);
void *Data() const { return this->data_; }
/// This will resize *this and copy the contents of mat to *this.
template<typename Real>
void CopyFromMat(const MatrixBase<Real> &mat,
CompressionMethod method = kAutomaticMethod);
CompressedMatrix(const CompressedMatrix &mat);
CompressedMatrix &operator = (const CompressedMatrix &mat); // assignment operator.
template<typename Real>
CompressedMatrix &operator = (const MatrixBase<Real> &mat); // assignment operator.
/// Copies contents to matrix. Note: mat must have the correct size.
/// The kTrans case uses a temporary.
template<typename Real>
void CopyToMat(MatrixBase<Real> *mat,
MatrixTransposeType trans = kNoTrans) const;
void Write(std::ostream &os, bool binary) const;
void Read(std::istream &is, bool binary);
/// Returns number of rows (or zero for empty matrix).
inline MatrixIndexT NumRows() const { return (data_ == NULL) ? 0 :
(*reinterpret_cast<GlobalHeader*>(data_)).num_rows; }
/// Returns number of columns (or zero for empty matrix).
inline MatrixIndexT NumCols() const { return (data_ == NULL) ? 0 :
(*reinterpret_cast<GlobalHeader*>(data_)).num_cols; }
/// Copies row #row of the matrix into vector v.
/// Note: v must have same size as #cols.
template<typename Real>
void CopyRowToVec(MatrixIndexT row, VectorBase<Real> *v) const;
/// Copies column #col of the matrix into vector v.
/// Note: v must have same size as #rows.
template<typename Real>
void CopyColToVec(MatrixIndexT col, VectorBase<Real> *v) const;
/// Copies a submatrix of the compressed matrix into matrix dest.
/// The submatrix starts at row row_offset and column column_offset, and
/// its size is defined by the size of the provided matrix dest.
template<typename Real>
void CopyToMat(int32 row_offset,
int32 column_offset,
MatrixBase<Real> *dest) const;
void Swap(CompressedMatrix *other) { std::swap(data_, other->data_); }
void Clear();
/// scales all elements of matrix by alpha.
/// It scales the floating point values in GlobalHeader by alpha.
void Scale(float alpha);
friend class Matrix<float>;
friend class Matrix<double>;
private:
// This enum describes the different compressed-data formats: these are
// distinct from the compression methods although all of the methods apart
// from kAutomaticMethod dictate a particular compressed-data format.
//
// kOneByteWithColHeaders means there is a GlobalHeader and each
// column has a PerColHeader; the actual data is stored in
// one byte per element, in column-major order (the mapping
// from integers to floats is a little complicated).
// kTwoByte means there is a global header but no PerColHeader;
// the actual data is stored in two bytes per element in
// row-major order; it's decompressed as:
// uint16 i; GlobalHeader g;
// float f = g.min_value + i * (g.range / 65535.0)
//  kOneByte means there is a global header but no PerColHeader;
// the data is stored in one byte per element in row-major
// order and is decompressed as:
// uint8 i; GlobalHeader g;
// float f = g.min_value + i * (g.range / 255.0)
enum DataFormat {
kOneByteWithColHeaders = 1,
kTwoByte = 2,
kOneByte = 3
};
// allocates data using new [], ensures byte alignment
// sufficient for float.
static void *AllocateData(int32 num_bytes);
struct GlobalHeader {
int32 format; // Represents the enum DataFormat.
float min_value; // min_value and range represent the ranges of the integer
// data in the kTwoByte and kOneByte formats, and the
// range of the PerColHeader uint16's in the
// kOneByteWithColHeaders format.
float range;
int32 num_rows;
int32 num_cols;
};
// This function computes the global header for compressing this data.
template<typename Real>
static inline void ComputeGlobalHeader(const MatrixBase<Real> &mat,
CompressionMethod method,
GlobalHeader *header);
// The number of bytes we need to request when allocating 'data_'.
static MatrixIndexT DataSize(const GlobalHeader &header);
// This struct is only used in format kOneByteWithColHeaders.
struct PerColHeader {
uint16 percentile_0;
uint16 percentile_25;
uint16 percentile_75;
uint16 percentile_100;
};
template<typename Real>
static void CompressColumn(const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
int32 num_rows, PerColHeader *header,
uint8 *byte_data);
template<typename Real>
static void ComputeColHeader(const GlobalHeader &global_header,
const Real *data, MatrixIndexT stride,
int32 num_rows, PerColHeader *header);
static inline uint16 FloatToUint16(const GlobalHeader &global_header,
float value);
// this is used only in the kOneByte compression format.
static inline uint8 FloatToUint8(const GlobalHeader &global_header,
float value);
static inline float Uint16ToFloat(const GlobalHeader &global_header,
uint16 value);
// this is used only in the kOneByteWithColHeaders compression format.
static inline uint8 FloatToChar(float p0, float p25,
float p75, float p100,
float value);
// this is used only in the kOneByteWithColHeaders compression format.
static inline float CharToFloat(float p0, float p25,
float p75, float p100,
uint8 value);
void *data_; // first GlobalHeader, then PerColHeader (repeated), then
// the byte data for each column (repeated). Note: don't intersperse
// the byte data with the PerColHeaders, because of alignment issues.
};
/// @} end of \addtogroup matrix_group
} // namespace kaldi
#endif // KALDI_MATRIX_COMPRESSED_MATRIX_H_
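// A minimal lossy round-trip sketch for this class (an assumed helper, not
// part of the original files; it assumes the Matrix API from
// matrix/kaldi-matrix.h). Note that CopyToMat() requires the destination to
// be pre-sized:
//
//   #include "matrix/compressed-matrix.h"
//
//   void CompressedRoundTrip(const kaldi::Matrix<kaldi::BaseFloat> &feats) {
//     using namespace kaldi;
//     // kAutomaticMethod picks kSpeechFeature for > 8 rows, else kTwoByteAuto.
//     CompressedMatrix cmat(feats, kAutomaticMethod);
//     Matrix<BaseFloat> decompressed(cmat.NumRows(), cmat.NumCols(), kUndefined);
//     cmat.CopyToMat(&decompressed);  // approximate reconstruction of feats
//   }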
// matrix/jama-eig.h
// Copyright 2009-2011 Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
// This file consists of a port and modification of materials from
// JAMA: A Java Matrix Package
// under the following notice: This software is a cooperative product of
// The MathWorks and the National Institute of Standards and Technology (NIST)
// which has been released to the public. This notice and the original code are
// available at http://math.nist.gov/javanumerics/jama/domain.notice
#ifndef KALDI_MATRIX_JAMA_EIG_H_
#define KALDI_MATRIX_JAMA_EIG_H_ 1
#include "matrix/kaldi-matrix.h"
namespace kaldi {
// This class is not to be used externally. See the Eig function in the Matrix
// class in kaldi-matrix.h, which is the external interface.
template<typename Real> class EigenvalueDecomposition {
// This class is based on the EigenvalueDecomposition class from the JAMA
// library (version 1.0.2).
public:
EigenvalueDecomposition(const MatrixBase<Real> &A);
~EigenvalueDecomposition(); // free memory.
void GetV(MatrixBase<Real> *V_out) { // V is what we call P externally; it's the matrix of
// eigenvectors.
KALDI_ASSERT(V_out->NumRows() == static_cast<MatrixIndexT>(n_)
&& V_out->NumCols() == static_cast<MatrixIndexT>(n_));
for (int i = 0; i < n_; i++)
for (int j = 0; j < n_; j++)
(*V_out)(i, j) = V(i, j); // V(i, j) is member function.
}
void GetRealEigenvalues(VectorBase<Real> *r_out) {
// returns real part of eigenvalues.
KALDI_ASSERT(r_out->Dim() == static_cast<MatrixIndexT>(n_));
for (int i = 0; i < n_; i++)
(*r_out)(i) = d_[i];
}
void GetImagEigenvalues(VectorBase<Real> *i_out) {
// returns imaginary part of eigenvalues.
KALDI_ASSERT(i_out->Dim() == static_cast<MatrixIndexT>(n_));
for (int i = 0; i < n_; i++)
(*i_out)(i) = e_[i];
}
private:
inline Real &H(int r, int c) { return H_[r*n_ + c]; }
inline Real &V(int r, int c) { return V_[r*n_ + c]; }
// Complex division (xr + i*xi) / (yr + i*yi); branching on |yr| vs |yi|
// avoids overflow (Smith's algorithm).
inline static void cdiv(Real xr, Real xi, Real yr, Real yi, Real *cdivr, Real *cdivi) {
Real r, d;
if (std::abs(yr) > std::abs(yi)) {
r = yi/yr;
d = yr + r*yi;
*cdivr = (xr + r*xi)/d;
*cdivi = (xi - r*xr)/d;
} else {
r = yr/yi;
d = yi + r*yr;
*cdivr = (r*xr + xi)/d;
*cdivi = (r*xi - xr)/d;
}
}
// Nonsymmetric reduction from Hessenberg to real Schur form.
void Hqr2 ();
int n_; // matrix dimension.
Real *d_, *e_; // real and imaginary parts of eigenvalues.
Real *V_; // the eigenvectors (P in our external notation)
Real *H_; // the nonsymmetric Hessenberg form.
Real *ort_; // working storage for nonsymmetric algorithm.
// Symmetric Householder reduction to tridiagonal form.
void Tred2 ();
// Symmetric tridiagonal QL algorithm.
void Tql2 ();
// Nonsymmetric reduction to Hessenberg form.
void Orthes ();
};
template class EigenvalueDecomposition<float>; // force instantiation.
template class EigenvalueDecomposition<double>; // force instantiation.
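// Example (illustrative sketch; external code would normally call
// MatrixBase::Eig() in kaldi-matrix.h rather than use this class directly;
// assumes the usual kaldi Matrix/Vector types):
//
//   Matrix<double> A(2, 2);
//   A(0, 0) = 2.0; A(0, 1) = 1.0;
//   A(1, 0) = 1.0; A(1, 1) = 2.0;            // symmetric input
//   EigenvalueDecomposition<double> eig(A);
//   Vector<double> re(2), im(2);
//   Matrix<double> P(2, 2);
//   eig.GetRealEigenvalues(&re);             // ~[1, 3] (Tql2 sorts ascending)
//   eig.GetImagEigenvalues(&im);             // ~[0, 0] for a symmetric input
//   eig.GetV(&P);                            // columns are the eigenvectors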
template<typename Real> void EigenvalueDecomposition<Real>::Tred2() {
// This is derived from the Algol procedures tred2 by
// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
// Fortran subroutine in EISPACK.
for (int j = 0; j < n_; j++) {
d_[j] = V(n_-1, j);
}
// Householder reduction to tridiagonal form.
for (int i = n_-1; i > 0; i--) {
// Scale to avoid under/overflow.
Real scale = 0.0;
Real h = 0.0;
for (int k = 0; k < i; k++) {
scale = scale + std::abs(d_[k]);
}
if (scale == 0.0) {
e_[i] = d_[i-1];
for (int j = 0; j < i; j++) {
d_[j] = V(i-1, j);
V(i, j) = 0.0;
V(j, i) = 0.0;
}
} else {
// Generate Householder vector.
for (int k = 0; k < i; k++) {
d_[k] /= scale;
h += d_[k] * d_[k];
}
Real f = d_[i-1];
Real g = std::sqrt(h);
if (f > 0) {
g = -g;
}
e_[i] = scale * g;
h = h - f * g;
d_[i-1] = f - g;
for (int j = 0; j < i; j++) {
e_[j] = 0.0;
}
// Apply similarity transformation to remaining columns.
for (int j = 0; j < i; j++) {
f = d_[j];
V(j, i) = f;
g = e_[j] + V(j, j) * f;
for (int k = j+1; k <= i-1; k++) {
g += V(k, j) * d_[k];
e_[k] += V(k, j) * f;
}
e_[j] = g;
}
f = 0.0;
for (int j = 0; j < i; j++) {
e_[j] /= h;
f += e_[j] * d_[j];
}
Real hh = f / (h + h);
for (int j = 0; j < i; j++) {
e_[j] -= hh * d_[j];
}
for (int j = 0; j < i; j++) {
f = d_[j];
g = e_[j];
for (int k = j; k <= i-1; k++) {
V(k, j) -= (f * e_[k] + g * d_[k]);
}
d_[j] = V(i-1, j);
V(i, j) = 0.0;
}
}
d_[i] = h;
}
// Accumulate transformations.
for (int i = 0; i < n_-1; i++) {
V(n_-1, i) = V(i, i);
V(i, i) = 1.0;
Real h = d_[i+1];
if (h != 0.0) {
for (int k = 0; k <= i; k++) {
d_[k] = V(k, i+1) / h;
}
for (int j = 0; j <= i; j++) {
Real g = 0.0;
for (int k = 0; k <= i; k++) {
g += V(k, i+1) * V(k, j);
}
for (int k = 0; k <= i; k++) {
V(k, j) -= g * d_[k];
}
}
}
for (int k = 0; k <= i; k++) {
V(k, i+1) = 0.0;
}
}
for (int j = 0; j < n_; j++) {
d_[j] = V(n_-1, j);
V(n_-1, j) = 0.0;
}
V(n_-1, n_-1) = 1.0;
e_[0] = 0.0;
}
template<typename Real> void EigenvalueDecomposition<Real>::Tql2() {
// This is derived from the Algol procedures tql2, by
// Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
// Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
// Fortran subroutine in EISPACK.
for (int i = 1; i < n_; i++) {
e_[i-1] = e_[i];
}
e_[n_-1] = 0.0;
Real f = 0.0;
Real tst1 = 0.0;
Real eps = std::numeric_limits<Real>::epsilon();
for (int l = 0; l < n_; l++) {
// Find small subdiagonal element
tst1 = std::max(tst1, std::abs(d_[l]) + std::abs(e_[l]));
int m = l;
while (m < n_) {
if (std::abs(e_[m]) <= eps*tst1) {
break;
}
m++;
}
// If m == l, d_[l] is an eigenvalue,
// otherwise, iterate.
if (m > l) {
int iter = 0;
do {
iter = iter + 1; // (Could check iteration count here.)
// Compute implicit shift
Real g = d_[l];
Real p = (d_[l+1] - g) / (2.0 * e_[l]);
Real r = Hypot(p, static_cast<Real>(1.0)); // This is a Kaldi version of hypot that works with templates.
if (p < 0) {
r = -r;
}
d_[l] = e_[l] / (p + r);
d_[l+1] = e_[l] * (p + r);
Real dl1 = d_[l+1];
Real h = g - d_[l];
for (int i = l+2; i < n_; i++) {
d_[i] -= h;
}
f = f + h;
// Implicit QL transformation.
p = d_[m];
Real c = 1.0;
Real c2 = c;
Real c3 = c;
Real el1 = e_[l+1];
Real s = 0.0;
Real s2 = 0.0;
for (int i = m-1; i >= l; i--) {
c3 = c2;
c2 = c;
s2 = s;
g = c * e_[i];
h = c * p;
r = Hypot(p, e_[i]); // This is a Kaldi version of Hypot that works with templates.
e_[i+1] = s * r;
s = e_[i] / r;
c = p / r;
p = c * d_[i] - s * g;
d_[i+1] = h + s * (c * g + s * d_[i]);
// Accumulate transformation.
for (int k = 0; k < n_; k++) {
h = V(k, i+1);
V(k, i+1) = s * V(k, i) + c * h;
V(k, i) = c * V(k, i) - s * h;
}
}
p = -s * s2 * c3 * el1 * e_[l] / dl1;
e_[l] = s * p;
d_[l] = c * p;
// Check for convergence.
} while (std::abs(e_[l]) > eps*tst1);
}
d_[l] = d_[l] + f;
e_[l] = 0.0;
}
// Sort eigenvalues and corresponding vectors.
for (int i = 0; i < n_-1; i++) {
int k = i;
Real p = d_[i];
for (int j = i+1; j < n_; j++) {
if (d_[j] < p) {
k = j;
p = d_[j];
}
}
if (k != i) {
d_[k] = d_[i];
d_[i] = p;
for (int j = 0; j < n_; j++) {
p = V(j, i);
V(j, i) = V(j, k);
V(j, k) = p;
}
}
}
}
template<typename Real>
void EigenvalueDecomposition<Real>::Orthes() {
// This is derived from the Algol procedures orthes and ortran,
// by Martin and Wilkinson, Handbook for Auto. Comp.,
// Vol.ii-Linear Algebra, and the corresponding
// Fortran subroutines in EISPACK.
int low = 0;
int high = n_-1;
for (int m = low+1; m <= high-1; m++) {
// Scale column.
Real scale = 0.0;
for (int i = m; i <= high; i++) {
scale = scale + std::abs(H(i, m-1));
}
if (scale != 0.0) {
// Compute Householder transformation.
Real h = 0.0;
for (int i = high; i >= m; i--) {
ort_[i] = H(i, m-1)/scale;
h += ort_[i] * ort_[i];
}
Real g = std::sqrt(h);
if (ort_[m] > 0) {
g = -g;
}
h = h - ort_[m] * g;
ort_[m] = ort_[m] - g;
// Apply Householder similarity transformation
// H = (I-u*u'/h)*H*(I-u*u'/h)
for (int j = m; j < n_; j++) {
Real f = 0.0;
for (int i = high; i >= m; i--) {
f += ort_[i]*H(i, j);
}
f = f/h;
for (int i = m; i <= high; i++) {
H(i, j) -= f*ort_[i];
}
}
for (int i = 0; i <= high; i++) {
Real f = 0.0;
for (int j = high; j >= m; j--) {
f += ort_[j]*H(i, j);
}
f = f/h;
for (int j = m; j <= high; j++) {
H(i, j) -= f*ort_[j];
}
}
ort_[m] = scale*ort_[m];
H(m, m-1) = scale*g;
}
}
// Accumulate transformations (Algol's ortran).
for (int i = 0; i < n_; i++) {
for (int j = 0; j < n_; j++) {
V(i, j) = (i == j ? 1.0 : 0.0);
}
}
for (int m = high-1; m >= low+1; m--) {
if (H(m, m-1) != 0.0) {
for (int i = m+1; i <= high; i++) {
ort_[i] = H(i, m-1);
}
for (int j = m; j <= high; j++) {
Real g = 0.0;
for (int i = m; i <= high; i++) {
g += ort_[i] * V(i, j);
}
// Double division avoids possible underflow
g = (g / ort_[m]) / H(m, m-1);
for (int i = m; i <= high; i++) {
V(i, j) += g * ort_[i];
}
}
}
}
}
template<typename Real> void EigenvalueDecomposition<Real>::Hqr2() {
// This is derived from the Algol procedure hqr2,
// by Martin and Wilkinson, Handbook for Auto. Comp.,
// Vol.ii-Linear Algebra, and the corresponding
// Fortran subroutine in EISPACK.
int nn = n_;
int n = nn-1;
int low = 0;
int high = nn-1;
Real eps = std::numeric_limits<Real>::epsilon();
Real exshift = 0.0;
Real p = 0, q = 0, r = 0, s = 0, z=0, t, w, x, y;
// Store roots isolated by balanc and compute matrix norm
Real norm = 0.0;
for (int i = 0; i < nn; i++) {
if (i < low || i > high) {
d_[i] = H(i, i);
e_[i] = 0.0;
}
for (int j = std::max(i-1, 0); j < nn; j++) {
norm = norm + std::abs(H(i, j));
}
}
// Outer loop over eigenvalue index
int iter = 0;
while (n >= low) {
// Look for single small sub-diagonal element
int l = n;
while (l > low) {
s = std::abs(H(l-1, l-1)) + std::abs(H(l, l));
if (s == 0.0) {
s = norm;
}
if (std::abs(H(l, l-1)) < eps * s) {
break;
}
l--;
}
// Check for convergence
// One root found
if (l == n) {
H(n, n) = H(n, n) + exshift;
d_[n] = H(n, n);
e_[n] = 0.0;
n--;
iter = 0;
// Two roots found
} else if (l == n-1) {
w = H(n, n-1) * H(n-1, n);
p = (H(n-1, n-1) - H(n, n)) / 2.0;
q = p * p + w;
z = std::sqrt(std::abs(q));
H(n, n) = H(n, n) + exshift;
H(n-1, n-1) = H(n-1, n-1) + exshift;
x = H(n, n);
// Real pair
if (q >= 0) {
if (p >= 0) {
z = p + z;
} else {
z = p - z;
}
d_[n-1] = x + z;
d_[n] = d_[n-1];
if (z != 0.0) {
d_[n] = x - w / z;
}
e_[n-1] = 0.0;
e_[n] = 0.0;
x = H(n, n-1);
s = std::abs(x) + std::abs(z);
p = x / s;
q = z / s;
r = std::sqrt(p * p+q * q);
p = p / r;
q = q / r;
// Row modification
for (int j = n-1; j < nn; j++) {
z = H(n-1, j);
H(n-1, j) = q * z + p * H(n, j);
H(n, j) = q * H(n, j) - p * z;
}
// Column modification
for (int i = 0; i <= n; i++) {
z = H(i, n-1);
H(i, n-1) = q * z + p * H(i, n);
H(i, n) = q * H(i, n) - p * z;
}
// Accumulate transformations
for (int i = low; i <= high; i++) {
z = V(i, n-1);
V(i, n-1) = q * z + p * V(i, n);
V(i, n) = q * V(i, n) - p * z;
}
// Complex pair
} else {
d_[n-1] = x + p;
d_[n] = x + p;
e_[n-1] = z;
e_[n] = -z;
}
n = n - 2;
iter = 0;
// No convergence yet
} else {
// Form shift
x = H(n, n);
y = 0.0;
w = 0.0;
if (l < n) {
y = H(n-1, n-1);
w = H(n, n-1) * H(n-1, n);
}
// Wilkinson's original ad hoc shift
if (iter == 10) {
exshift += x;
for (int i = low; i <= n; i++) {
H(i, i) -= x;
}
s = std::abs(H(n, n-1)) + std::abs(H(n-1, n-2));
x = y = 0.75 * s;
w = -0.4375 * s * s;
}
// MATLAB's new ad hoc shift
if (iter == 30) {
s = (y - x) / 2.0;
s = s * s + w;
if (s > 0) {
s = std::sqrt(s);
if (y < x) {
s = -s;
}
s = x - w / ((y - x) / 2.0 + s);
for (int i = low; i <= n; i++) {
H(i, i) -= s;
}
exshift += s;
x = y = w = 0.964;
}
}
iter = iter + 1; // (Could check iteration count here.)
// Look for two consecutive small sub-diagonal elements
int m = n-2;
while (m >= l) {
z = H(m, m);
r = x - z;
s = y - z;
p = (r * s - w) / H(m+1, m) + H(m, m+1);
q = H(m+1, m+1) - z - r - s;
r = H(m+2, m+1);
s = std::abs(p) + std::abs(q) + std::abs(r);
p = p / s;
q = q / s;
r = r / s;
if (m == l) {
break;
}
if (std::abs(H(m, m-1)) * (std::abs(q) + std::abs(r)) <
eps * (std::abs(p) * (std::abs(H(m-1, m-1)) + std::abs(z) +
std::abs(H(m+1, m+1))))) {
break;
}
m--;
}
for (int i = m+2; i <= n; i++) {
H(i, i-2) = 0.0;
if (i > m+2) {
H(i, i-3) = 0.0;
}
}
// Double QR step involving rows l:n and columns m:n
for (int k = m; k <= n-1; k++) {
bool notlast = (k != n-1);
if (k != m) {
p = H(k, k-1);
q = H(k+1, k-1);
r = (notlast ? H(k+2, k-1) : 0.0);
x = std::abs(p) + std::abs(q) + std::abs(r);
if (x != 0.0) {
p = p / x;
q = q / x;
r = r / x;
}
}
if (x == 0.0) {
break;
}
s = std::sqrt(p * p + q * q + r * r);
if (p < 0) {
s = -s;
}
if (s != 0) {
if (k != m) {
H(k, k-1) = -s * x;
} else if (l != m) {
H(k, k-1) = -H(k, k-1);
}
p = p + s;
x = p / s;
y = q / s;
z = r / s;
q = q / p;
r = r / p;
// Row modification
for (int j = k; j < nn; j++) {
p = H(k, j) + q * H(k+1, j);
if (notlast) {
p = p + r * H(k+2, j);
H(k+2, j) = H(k+2, j) - p * z;
}
H(k, j) = H(k, j) - p * x;
H(k+1, j) = H(k+1, j) - p * y;
}
// Column modification
for (int i = 0; i <= std::min(n, k+3); i++) {
p = x * H(i, k) + y * H(i, k+1);
if (notlast) {
p = p + z * H(i, k+2);
H(i, k+2) = H(i, k+2) - p * r;
}
H(i, k) = H(i, k) - p;
H(i, k+1) = H(i, k+1) - p * q;
}
// Accumulate transformations
for (int i = low; i <= high; i++) {
p = x * V(i, k) + y * V(i, k+1);
if (notlast) {
p = p + z * V(i, k+2);
V(i, k+2) = V(i, k+2) - p * r;
}
V(i, k) = V(i, k) - p;
V(i, k+1) = V(i, k+1) - p * q;
}
} // (s != 0)
} // k loop
} // check convergence
} // while (n >= low)
// Backsubstitute to find vectors of upper triangular form
if (norm == 0.0) {
return;
}
for (n = nn-1; n >= 0; n--) {
p = d_[n];
q = e_[n];
// Real vector
if (q == 0) {
int l = n;
H(n, n) = 1.0;
for (int i = n-1; i >= 0; i--) {
w = H(i, i) - p;
r = 0.0;
for (int j = l; j <= n; j++) {
r = r + H(i, j) * H(j, n);
}
if (e_[i] < 0.0) {
z = w;
s = r;
} else {
l = i;
if (e_[i] == 0.0) {
if (w != 0.0) {
H(i, n) = -r / w;
} else {
H(i, n) = -r / (eps * norm);
}
// Solve real equations
} else {
x = H(i, i+1);
y = H(i+1, i);
q = (d_[i] - p) * (d_[i] - p) + e_[i] * e_[i];
t = (x * s - z * r) / q;
H(i, n) = t;
if (std::abs(x) > std::abs(z)) {
H(i+1, n) = (-r - w * t) / x;
} else {
H(i+1, n) = (-s - y * t) / z;
}
}
// Overflow control
t = std::abs(H(i, n));
if ((eps * t) * t > 1) {
for (int j = i; j <= n; j++) {
H(j, n) = H(j, n) / t;
}
}
}
}
// Complex vector
} else if (q < 0) {
int l = n-1;
// Last vector component imaginary so matrix is triangular
if (std::abs(H(n, n-1)) > std::abs(H(n-1, n))) {
H(n-1, n-1) = q / H(n, n-1);
H(n-1, n) = -(H(n, n) - p) / H(n, n-1);
} else {
Real cdivr, cdivi;
cdiv(0.0, -H(n-1, n), H(n-1, n-1)-p, q, &cdivr, &cdivi);
H(n-1, n-1) = cdivr;
H(n-1, n) = cdivi;
}
H(n, n-1) = 0.0;
H(n, n) = 1.0;
for (int i = n-2; i >= 0; i--) {
Real ra, sa, vr, vi;
ra = 0.0;
sa = 0.0;
for (int j = l; j <= n; j++) {
ra = ra + H(i, j) * H(j, n-1);
sa = sa + H(i, j) * H(j, n);
}
w = H(i, i) - p;
if (e_[i] < 0.0) {
z = w;
r = ra;
s = sa;
} else {
l = i;
if (e_[i] == 0) {
Real cdivr, cdivi;
cdiv(-ra, -sa, w, q, &cdivr, &cdivi);
H(i, n-1) = cdivr;
H(i, n) = cdivi;
} else {
Real cdivr, cdivi;
// Solve complex equations
x = H(i, i+1);
y = H(i+1, i);
vr = (d_[i] - p) * (d_[i] - p) + e_[i] * e_[i] - q * q;
vi = (d_[i] - p) * 2.0 * q;
if (vr == 0.0 && vi == 0.0) {
vr = eps * norm * (std::abs(w) + std::abs(q) +
std::abs(x) + std::abs(y) + std::abs(z));
}
cdiv(x*r-z*ra+q*sa, x*s-z*sa-q*ra, vr, vi, &cdivr, &cdivi);
H(i, n-1) = cdivr;
H(i, n) = cdivi;
if (std::abs(x) > (std::abs(z) + std::abs(q))) {
H(i+1, n-1) = (-ra - w * H(i, n-1) + q * H(i, n)) / x;
H(i+1, n) = (-sa - w * H(i, n) - q * H(i, n-1)) / x;
} else {
cdiv(-r-y*H(i, n-1), -s-y*H(i, n), z, q, &cdivr, &cdivi);
H(i+1, n-1) = cdivr;
H(i+1, n) = cdivi;
}
}
// Overflow control
t = std::max(std::abs(H(i, n-1)), std::abs(H(i, n)));
if ((eps * t) * t > 1) {
for (int j = i; j <= n; j++) {
H(j, n-1) = H(j, n-1) / t;
H(j, n) = H(j, n) / t;
}
}
}
}
}
}
// Vectors of isolated roots
for (int i = 0; i < nn; i++) {
if (i < low || i > high) {
for (int j = i; j < nn; j++) {
V(i, j) = H(i, j);
}
}
}
// Back transformation to get eigenvectors of original matrix
for (int j = nn-1; j >= low; j--) {
for (int i = low; i <= high; i++) {
z = 0.0;
for (int k = low; k <= std::min(j, high); k++) {
z = z + V(i, k) * H(k, j);
}
V(i, j) = z;
}
}
}
template<typename Real>
EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A) {
KALDI_ASSERT(A.NumCols() == A.NumRows() && A.NumCols() >= 1);
n_ = A.NumRows();
V_ = new Real[n_*n_];
d_ = new Real[n_];
e_ = new Real[n_];
H_ = NULL;
ort_ = NULL;
if (A.IsSymmetric(0.0)) {
for (int i = 0; i < n_; i++)
for (int j = 0; j < n_; j++)
V(i, j) = A(i, j); // Note that V(i, j) is a member function; A(i, j) is an operator
// of the matrix A.
// Tridiagonalize.
Tred2();
// Diagonalize.
Tql2();
} else {
H_ = new Real[n_*n_];
ort_ = new Real[n_];
for (int i = 0; i < n_; i++)
for (int j = 0; j < n_; j++)
H(i, j) = A(i, j); // as before: H is member function, A(i, j) is operator of matrix.
// Reduce to Hessenberg form.
Orthes();
// Reduce Hessenberg to real Schur form.
Hqr2();
}
}
template<typename Real>
EigenvalueDecomposition<Real>::~EigenvalueDecomposition() {
delete [] d_;
delete [] e_;
delete [] V_;
delete [] H_;
delete [] ort_;
}
// see function MatrixBase<Real>::Eig in kaldi-matrix.cc
} // namespace kaldi
#endif // KALDI_MATRIX_JAMA_EIG_H_
// matrix/jama-svd.h
// Copyright 2009-2011 Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
// This file consists of a port and modification of materials from
// JAMA: A Java Matrix Package
// under the following notice: This software is a cooperative product of
// The MathWorks and the National Institute of Standards and Technology (NIST)
// which has been released to the public. This notice and the original code are
// available at http://math.nist.gov/javanumerics/jama/domain.notice
#ifndef KALDI_MATRIX_JAMA_SVD_H_
#define KALDI_MATRIX_JAMA_SVD_H_ 1
#include "matrix/kaldi-matrix.h"
#include "matrix/sp-matrix.h"
#include "matrix/cblas-wrappers.h"
namespace kaldi {
#if defined(HAVE_ATLAS) || defined(USE_KALDI_SVD)
// using ATLAS as our math library, which doesn't have SVD -> need
// to implement it.
// This routine is a modified form of jama_svd.h which is part of the TNT distribution.
// (originally comes from JAMA).
/** Singular Value Decomposition.
* <P>
* For an m-by-n matrix A with m >= n, the singular value decomposition is
* an m-by-n orthogonal matrix U, an n-by-n diagonal matrix S, and
* an n-by-n orthogonal matrix V so that A = U*S*V'.
* <P>
* The singular values, sigma[k] = S(k, k), are ordered so that
* sigma[0] >= sigma[1] >= ... >= sigma[n-1].
* <P>
* The singular value decomposition always exists, so the constructor will
* never fail. The matrix condition number and the effective numerical
* rank can be computed from this decomposition.
* <p>
* (Adapted from JAMA, a Java Matrix Library, developed jointly
* by The MathWorks and NIST; see http://math.nist.gov/javanumerics/jama).
*/
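// Example (illustrative sketch): callers normally reach this routine through
// MatrixBase::Svd() declared in kaldi-matrix.h, which falls back to JamaSvd
// when the build has no LAPACK Svd (the case guarded by the #if above):
//
//   Matrix<float> A(4, 3);
//   A.SetRandn();
//   Vector<float> s(3);
//   Matrix<float> U(4, 3), Vt(3, 3);
//   A.Svd(&s, &U, &Vt);          // A ~= U * diag(s) * Vt
//   SortSvd(&s, &U, &Vt);        // optionally order the singular values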
template<typename Real>
bool MatrixBase<Real>::JamaSvd(VectorBase<Real> *s_in,
MatrixBase<Real> *U_in,
MatrixBase<Real> *V_in) { // Destructive!
KALDI_ASSERT(s_in != NULL && U_in != this && V_in != this);
int wantu = (U_in != NULL), wantv = (V_in != NULL);
Matrix<Real> Utmp, Vtmp;
MatrixBase<Real> &U = (U_in ? *U_in : Utmp), &V = (V_in ? *V_in : Vtmp);
VectorBase<Real> &s = *s_in;
int m = num_rows_, n = num_cols_;
KALDI_ASSERT(m>=n && m != 0 && n != 0);
if (wantu) KALDI_ASSERT((int)U.num_rows_ == m && (int)U.num_cols_ == n);
if (wantv) KALDI_ASSERT((int)V.num_rows_ == n && (int)V.num_cols_ == n);
KALDI_ASSERT((int)s.Dim() == n); // n<=m so n is min.
int nu = n;
U.SetZero(); // make sure all zero.
Vector<Real> e(n);
Vector<Real> work(m);
MatrixBase<Real> &A(*this);
Real *adata = A.Data(), *workdata = work.Data(), *edata = e.Data(),
*udata = U.Data(), *vdata = V.Data();
int astride = static_cast<int>(A.Stride()),
ustride = static_cast<int>(U.Stride()),
vstride = static_cast<int>(V.Stride());
int i = 0, j = 0, k = 0;
// Reduce A to bidiagonal form, storing the diagonal elements
// in s and the super-diagonal elements in e.
int nct = std::min(m-1, n);
int nrt = std::max(0, std::min(n-2, m));
for (k = 0; k < std::max(nct, nrt); k++) {
if (k < nct) {
// Compute the transformation for the k-th column and
// place the k-th diagonal in s(k).
// Compute 2-norm of k-th column without under/overflow.
s(k) = 0;
for (i = k; i < m; i++) {
s(k) = hypot(s(k), A(i, k));
}
if (s(k) != 0.0) {
if (A(k, k) < 0.0) {
s(k) = -s(k);
}
for (i = k; i < m; i++) {
A(i, k) /= s(k);
}
A(k, k) += 1.0;
}
s(k) = -s(k);
}
for (j = k+1; j < n; j++) {
if ((k < nct) && (s(k) != 0.0)) {
// Apply the transformation.
Real t = cblas_Xdot(m - k, adata + astride*k + k, astride,
adata + astride*k + j, astride);
/*for (i = k; i < m; i++) {
t += adata[i*astride + k]*adata[i*astride + j]; // A(i, k)*A(i, j); // 3
}*/
t = -t/A(k, k);
cblas_Xaxpy(m - k, t, adata + k*astride + k, astride,
adata + k*astride + j, astride);
/*for (i = k; i < m; i++) {
adata[i*astride + j] += t*adata[i*astride + k]; // A(i, j) += t*A(i, k); // 5
}*/
}
// Place the k-th row of A into e for the
// subsequent calculation of the row transformation.
e(j) = A(k, j);
}
if (wantu && (k < nct)) {
// Place the transformation in U for subsequent back
// multiplication.
for (i = k; i < m; i++) {
U(i, k) = A(i, k);
}
}
if (k < nrt) {
// Compute the k-th row transformation and place the
// k-th super-diagonal in e(k).
// Compute 2-norm without under/overflow.
e(k) = 0;
for (i = k+1; i < n; i++) {
e(k) = hypot(e(k), e(i));
}
if (e(k) != 0.0) {
if (e(k+1) < 0.0) {
e(k) = -e(k);
}
for (i = k+1; i < n; i++) {
e(i) /= e(k);
}
e(k+1) += 1.0;
}
e(k) = -e(k);
if ((k+1 < m) && (e(k) != 0.0)) {
// Apply the transformation.
for (i = k+1; i < m; i++) {
work(i) = 0.0;
}
for (j = k+1; j < n; j++) {
for (i = k+1; i < m; i++) {
workdata[i] += edata[j] * adata[i*astride + j]; // work(i) += e(j)*A(i, j); // 5
}
}
for (j = k+1; j < n; j++) {
Real t(-e(j)/e(k+1));
cblas_Xaxpy(m - (k+1), t, workdata + (k+1), 1,
adata + (k+1)*astride + j, astride);
/*
for (i = k+1; i < m; i++) {
adata[i*astride + j] += t*workdata[i]; // A(i, j) += t*work(i); // 5
}*/
}
}
if (wantv) {
// Place the transformation in V for subsequent
// back multiplication.
for (i = k+1; i < n; i++) {
V(i, k) = e(i);
}
}
}
}
// Set up the final bidiagonal matrix of order p.
int p = std::min(n, m+1);
if (nct < n) {
s(nct) = A(nct, nct);
}
if (m < p) {
s(p-1) = 0.0;
}
if (nrt+1 < p) {
e(nrt) = A(nrt, p-1);
}
e(p-1) = 0.0;
// If required, generate U.
if (wantu) {
for (j = nct; j < nu; j++) {
for (i = 0; i < m; i++) {
U(i, j) = 0.0;
}
U(j, j) = 1.0;
}
for (k = nct-1; k >= 0; k--) {
if (s(k) != 0.0) {
for (j = k+1; j < nu; j++) {
Real t = cblas_Xdot(m - k, udata + k*ustride + k, ustride, udata + k*ustride + j, ustride);
//for (i = k; i < m; i++) {
// t += udata[i*ustride + k]*udata[i*ustride + j]; // t += U(i, k)*U(i, j); // 8
// }
t = -t/U(k, k);
cblas_Xaxpy(m - k, t, udata + ustride*k + k, ustride,
udata + k*ustride + j, ustride);
/*for (i = k; i < m; i++) {
udata[i*ustride + j] += t*udata[i*ustride + k]; // U(i, j) += t*U(i, k); // 4
}*/
}
for (i = k; i < m; i++ ) {
U(i, k) = -U(i, k);
}
U(k, k) = 1.0 + U(k, k);
for (i = 0; i < k-1; i++) {
U(i, k) = 0.0;
}
} else {
for (i = 0; i < m; i++) {
U(i, k) = 0.0;
}
U(k, k) = 1.0;
}
}
}
// If required, generate V.
if (wantv) {
for (k = n-1; k >= 0; k--) {
if ((k < nrt) && (e(k) != 0.0)) {
for (j = k+1; j < nu; j++) {
Real t = cblas_Xdot(n - (k+1), vdata + (k+1)*vstride + k, vstride,
vdata + (k+1)*vstride + j, vstride);
/*Real t (0.0);
for (i = k+1; i < n; i++) {
t += vdata[i*vstride + k]*vdata[i*vstride + j]; // t += V(i, k)*V(i, j); // 7
}*/
t = -t/V(k+1, k);
cblas_Xaxpy(n - (k+1), t, vdata + (k+1)*vstride + k, vstride,
vdata + (k+1)*vstride + j, vstride);
/*for (i = k+1; i < n; i++) {
vdata[i*vstride + j] += t*vdata[i*vstride + k]; // V(i, j) += t*V(i, k); // 7
}*/
}
}
for (i = 0; i < n; i++) {
V(i, k) = 0.0;
}
V(k, k) = 1.0;
}
}
// Main iteration loop for the singular values.
int pp = p-1;
int iter = 0;
// note: -52.0 is from Jama code; the -23 is the extension
// to float, because mantissa length in (double, float)
// is (52, 23) bits respectively.
Real eps(pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0));
// Note: the -966 was taken from Jama code, but the -120 is a guess
// of how to extend this to float... the exponent in double goes
// from -1022 .. 1023, and in float from -126..127. I'm not sure
// what the significance of 966 is, so -120 just represents a number
// that's a bit less negative than -126. If we get convergence
// failure in float only, this may mean that we have to make the
// -120 value less negative.
Real tiny(pow(2.0, sizeof(Real) == 4 ? -120.0 : -966.0));
while (p > 0) {
int k = 0;
int kase = 0;
if (iter == 500 || iter == 750) {
KALDI_WARN << "Svd taking a long time: making convergence criterion less exact.";
eps = pow(static_cast<Real>(0.8), eps);
tiny = pow(static_cast<Real>(0.8), tiny);
}
if (iter > 1000) {
KALDI_WARN << "Svd not converging on matrix of size " << m << " by " <<n;
return false;
}
// This section of the program inspects for
// negligible elements in the s and e arrays. On
// completion the variables kase and k are set as follows.
// kase = 1 if s(p) and e(k-1) are negligible and k < p
// kase = 2 if s(k) is negligible and k < p
// kase = 3 if e(k-1) is negligible, k < p, and
// s(k), ..., s(p) are not negligible (qr step).
// kase = 4 if e(p-1) is negligible (convergence).
for (k = p-2; k >= -1; k--) {
if (k == -1) {
break;
}
if (std::abs(e(k)) <=
tiny + eps*(std::abs(s(k)) + std::abs(s(k+1)))) {
e(k) = 0.0;
break;
}
}
if (k == p-2) {
kase = 4;
} else {
int ks;
for (ks = p-1; ks >= k; ks--) {
if (ks == k) {
break;
}
Real t( (ks != p ? std::abs(e(ks)) : 0.) +
(ks != k+1 ? std::abs(e(ks-1)) : 0.));
if (std::abs(s(ks)) <= tiny + eps*t) {
s(ks) = 0.0;
break;
}
}
if (ks == k) {
kase = 3;
} else if (ks == p-1) {
kase = 1;
} else {
kase = 2;
k = ks;
}
}
k++;
// Perform the task indicated by kase.
switch (kase) {
// Deflate negligible s(p).
case 1: {
Real f(e(p-2));
e(p-2) = 0.0;
for (j = p-2; j >= k; j--) {
Real t( hypot(s(j), f));
Real cs(s(j)/t);
Real sn(f/t);
s(j) = t;
if (j != k) {
f = -sn*e(j-1);
e(j-1) = cs*e(j-1);
}
if (wantv) {
for (i = 0; i < n; i++) {
t = cs*V(i, j) + sn*V(i, p-1);
V(i, p-1) = -sn*V(i, j) + cs*V(i, p-1);
V(i, j) = t;
}
}
}
}
break;
// Split at negligible s(k).
case 2: {
Real f(e(k-1));
e(k-1) = 0.0;
for (j = k; j < p; j++) {
Real t(hypot(s(j), f));
Real cs( s(j)/t);
Real sn(f/t);
s(j) = t;
f = -sn*e(j);
e(j) = cs*e(j);
if (wantu) {
for (i = 0; i < m; i++) {
t = cs*U(i, j) + sn*U(i, k-1);
U(i, k-1) = -sn*U(i, j) + cs*U(i, k-1);
U(i, j) = t;
}
}
}
}
break;
// Perform one qr step.
case 3: {
// Calculate the shift.
Real scale = std::max(std::max(std::max(std::max(
std::abs(s(p-1)), std::abs(s(p-2))), std::abs(e(p-2))),
std::abs(s(k))), std::abs(e(k)));
Real sp = s(p-1)/scale;
Real spm1 = s(p-2)/scale;
Real epm1 = e(p-2)/scale;
Real sk = s(k)/scale;
Real ek = e(k)/scale;
Real b = ((spm1 + sp)*(spm1 - sp) + epm1*epm1)/2.0;
Real c = (sp*epm1)*(sp*epm1);
Real shift = 0.0;
if ((b != 0.0) || (c != 0.0)) {
shift = std::sqrt(b*b + c);
if (b < 0.0) {
shift = -shift;
}
shift = c/(b + shift);
}
Real f = (sk + sp)*(sk - sp) + shift;
Real g = sk*ek;
// Chase zeros.
for (j = k; j < p-1; j++) {
Real t = hypot(f, g);
Real cs = f/t;
Real sn = g/t;
if (j != k) {
e(j-1) = t;
}
f = cs*s(j) + sn*e(j);
e(j) = cs*e(j) - sn*s(j);
g = sn*s(j+1);
s(j+1) = cs*s(j+1);
if (wantv) {
cblas_Xrot(n, vdata + j, vstride, vdata + j+1, vstride, cs, sn);
/*for (i = 0; i < n; i++) {
t = cs*vdata[i*vstride + j] + sn*vdata[i*vstride + j+1]; // t = cs*V(i, j) + sn*V(i, j+1); // 13
vdata[i*vstride + j+1] = -sn*vdata[i*vstride + j] + cs*vdata[i*vstride + j+1]; // V(i, j+1) = -sn*V(i, j) + cs*V(i, j+1); // 5
vdata[i*vstride + j] = t; // V(i, j) = t; // 4
}*/
}
t = hypot(f, g);
cs = f/t;
sn = g/t;
s(j) = t;
f = cs*e(j) + sn*s(j+1);
s(j+1) = -sn*e(j) + cs*s(j+1);
g = sn*e(j+1);
e(j+1) = cs*e(j+1);
if (wantu && (j < m-1)) {
cblas_Xrot(m, udata + j, ustride, udata + j+1, ustride, cs, sn);
/*for (i = 0; i < m; i++) {
t = cs*udata[i*ustride + j] + sn*udata[i*ustride + j+1]; // t = cs*U(i, j) + sn*U(i, j+1); // 7
udata[i*ustride + j+1] = -sn*udata[i*ustride + j] +cs*udata[i*ustride + j+1]; // U(i, j+1) = -sn*U(i, j) + cs*U(i, j+1); // 8
udata[i*ustride + j] = t; // U(i, j) = t; // 1
}*/
}
}
e(p-2) = f;
iter = iter + 1;
}
break;
// Convergence.
case 4: {
// Make the singular values positive.
if (s(k) <= 0.0) {
s(k) = (s(k) < 0.0 ? -s(k) : 0.0);
if (wantv) {
for (i = 0; i <= pp; i++) {
V(i, k) = -V(i, k);
}
}
}
// Order the singular values.
while (k < pp) {
if (s(k) >= s(k+1)) {
break;
}
Real t = s(k);
s(k) = s(k+1);
s(k+1) = t;
if (wantv && (k < n-1)) {
for (i = 0; i < n; i++) {
t = V(i, k+1); V(i, k+1) = V(i, k); V(i, k) = t;
}
}
if (wantu && (k < m-1)) {
for (i = 0; i < m; i++) {
t = U(i, k+1); U(i, k+1) = U(i, k); U(i, k) = t;
}
}
k++;
}
iter = 0;
p--;
}
break;
}
}
return true;
}
#endif // defined(HAVE_ATLAS) || defined(USE_KALDI_SVD)
} // namespace kaldi
#endif // KALDI_MATRIX_JAMA_SVD_H_
// matrix/kaldi-blas.h
// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_KALDI_BLAS_H_
#define KALDI_MATRIX_KALDI_BLAS_H_
// This file handles the #includes for BLAS, LAPACK and so on.
// It manipulates the declarations into a common format that kaldi can handle.
// However, the kaldi code will check whether HAVE_ATLAS is defined as that
// code is called a bit differently from CLAPACK that comes from other sources.
// There are three alternatives:
// (i) you have ATLAS, which includes the ATLAS implementation of CBLAS
// plus a subset of CLAPACK (but with clapack_ in the function declarations).
// In this case, define HAVE_ATLAS and make sure the relevant directories are
// in the include path.
// (ii) you have CBLAS (some implementation thereof) plus CLAPACK.
// In this case, define HAVE_CLAPACK.
// [Since CLAPACK depends on BLAS, the presence of BLAS is implicit].
// (iii) you have the MKL library, which includes CLAPACK and CBLAS.
// Note that if we are using ATLAS, no Svd implementation is supplied,
// so we define HAVE_Svd to be zero and this directs our implementation to
// supply its own "by hand" implementation which is based on TNT code.
#define HAVE_OPENBLAS
#if (defined(HAVE_CLAPACK) && (defined(HAVE_ATLAS) || defined(HAVE_MKL))) \
|| (defined(HAVE_ATLAS) && defined(HAVE_MKL))
#error "Do not define more than one of HAVE_CLAPACK, HAVE_ATLAS and HAVE_MKL"
#endif
#ifdef HAVE_ATLAS
extern "C" {
#include "cblas.h"
#include "clapack.h"
}
#elif defined(HAVE_CLAPACK)
#ifdef __APPLE__
#ifndef __has_extension
#define __has_extension(x) 0
#endif
#define vImage_Utilities_h
#define vImage_CVUtilities_h
#include <Accelerate/Accelerate.h>
typedef __CLPK_integer integer;
typedef __CLPK_logical logical;
typedef __CLPK_real real;
typedef __CLPK_doublereal doublereal;
typedef __CLPK_complex complex;
typedef __CLPK_doublecomplex doublecomplex;
typedef __CLPK_ftnlen ftnlen;
#else
extern "C" {
// May be in /usr/[local]/include if installed; else this uses the one
// from the tools/CLAPACK_include directory.
#include <cblas.h>
#include <f2c.h>
#include <clapack.h>
// get rid of macros from f2c.h -- these are dangerous.
#undef abs
#undef dabs
#undef min
#undef max
#undef dmin
#undef dmax
#undef bit_test
#undef bit_clear
#undef bit_set
}
#endif
#elif defined(HAVE_MKL)
extern "C" {
#include <mkl.h>
}
#elif defined(HAVE_OPENBLAS)
// getting cblas.h and lapacke.h from <openblas-install-dir>/.
// putting in "" not <> to search -I before system libraries.
#if defined(_MSC_VER)
#include <complex.h>
#define LAPACK_COMPLEX_CUSTOM
#define lapack_complex_float _Fcomplex
#define lapack_complex_double _Dcomplex
#endif
#include "cblas.h"
#include "lapacke.h"
#undef I
#undef complex
// get rid of macros from f2c.h -- these are dangerous.
#undef abs
#undef dabs
#undef min
#undef max
#undef dmin
#undef dmax
#undef bit_test
#undef bit_clear
#undef bit_set
#else
#error "You need to define (using the preprocessor) either HAVE_CLAPACK or HAVE_ATLAS or HAVE_MKL (but not more than one)"
#endif
#ifdef HAVE_OPENBLAS
typedef int KaldiBlasInt; // try int.
#endif
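// Example (illustrative): with HAVE_OPENBLAS selected above, the cblas.h
// included here exposes the standard CBLAS interface, e.g. a dot product:
//
//   double x[3] = {1.0, 2.0, 3.0}, y[3] = {4.0, 5.0, 6.0};
//   double dp = cblas_ddot(3, x, 1, y, 1);   // 1*4 + 2*5 + 3*6 == 32.0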
#ifdef HAVE_CLAPACK
typedef integer KaldiBlasInt;
#endif
#ifdef HAVE_MKL
typedef MKL_INT KaldiBlasInt;
#endif
#ifdef HAVE_ATLAS
// in this case there is no need for KaldiBlasInt-- this typedef is only needed
// for Svd code which is not included in ATLAS (we re-implement it).
#endif
#endif // KALDI_MATRIX_KALDI_BLAS_H_
// matrix/matrix-functions-inl.h
// Copyright 2009-2011 Microsoft Corporation
//
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// (*) incorporates, with permission, FFT code from Henrique (Rico) Malvar's
// book "Signal Processing with Lapped Transforms", Artech, 1992.
#ifndef KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
#define KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
namespace kaldi {
//! ComplexMul implements, inline, the complex multiplication b *= a.
template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
Real *b_re, Real *b_im) {
Real tmp_re = (*b_re * a_re) - (*b_im * a_im);
*b_im = *b_re * a_im + *b_im * a_re;
*b_re = tmp_re;
}
template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
const Real &b_re, const Real &b_im,
Real *c_re, Real *c_im) {
*c_re += b_re*a_re - b_im*a_im;
*c_im += b_re*a_im + b_im*a_re;
}
template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
*a_re = std::cos(x);
*a_im = std::sin(x);
}
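// Worked example (illustrative) of the helpers above: multiplying
// b = (1 + 2i) in place by a = (3 + 4i),
//
//   float b_re = 1.0f, b_im = 2.0f;
//   ComplexMul(3.0f, 4.0f, &b_re, &b_im);
//
// leaves b_re == -5.0f and b_im == 10.0f, since (1+2i)(3+4i) = -5 + 10i.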
} // end namespace kaldi
#endif // KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
// matrix/matrix-functions.cc
// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.; Jan Silovsky
// Yanmin Qian; Saarland University; Johns Hopkins University (Author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// (*) incorporates, with permission, FFT code from Henrique (Rico) Malvar's
// book "Signal Processing with Lapped Transforms", Artech, 1992.
#include "matrix/matrix-functions.h"
#include "matrix/sp-matrix.h"
namespace kaldi {
template<typename Real> void ComplexFt (const VectorBase<Real> &in,
VectorBase<Real> *out, bool forward) {
int exp_sign = (forward ? -1 : 1);
KALDI_ASSERT(out != NULL);
KALDI_ASSERT(in.Dim() == out->Dim());
KALDI_ASSERT(in.Dim() % 2 == 0);
int twoN = in.Dim(), N = twoN / 2;
const Real *data_in = in.Data();
Real *data_out = out->Data();
Real exp1N_re, exp1N_im; // forward -> exp(-2pi / N), backward -> exp(2pi / N).
Real fraction = exp_sign * M_2PI / static_cast<Real>(N); // forward -> -2pi/N, backward -> 2pi/N
ComplexImExp(fraction, &exp1N_re, &exp1N_im);
Real expm_re = 1.0, expm_im = 0.0; // forward -> exp(-2pi m / N).
for (int two_m = 0; two_m < twoN; two_m+=2) { // For each output component.
Real expmn_re = 1.0, expmn_im = 0.0; // forward -> exp(-2pi m n / N).
Real sum_re = 0.0, sum_im = 0.0; // complex output for index m (the sum expression)
for (int two_n = 0; two_n < twoN; two_n+=2) {
ComplexAddProduct(data_in[two_n], data_in[two_n+1],
expmn_re, expmn_im,
&sum_re, &sum_im);
ComplexMul(expm_re, expm_im, &expmn_re, &expmn_im);
}
data_out[two_m] = sum_re;
data_out[two_m + 1] = sum_im;
if (two_m % 10 == 0) { // occasionally renew "expm" from scratch to avoid
// loss of precision.
int nextm = 1 + two_m/2;
Real fraction_mult = fraction * nextm;
ComplexImExp(fraction_mult, &expm_re, &expm_im);
} else {
ComplexMul(exp1N_re, exp1N_im, &expm_re, &expm_im);
}
}
}
template
void ComplexFt (const VectorBase<float> &in,
VectorBase<float> *out, bool forward);
template
void ComplexFt (const VectorBase<double> &in,
VectorBase<double> *out, bool forward);
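// Example (illustrative): ComplexFt is the O(N^2) reference transform, mainly
// useful for testing the fast version below. Both vectors hold N complex
// points as interleaved (re, im) pairs:
//
//   Vector<float> in(8), out(8);   // 4 complex points
//   in.SetRandn();
//   ComplexFt(in, &out, true);     // forward DFT; no 1/N factor is applied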
#define KALDI_COMPLEXFFT_BLOCKSIZE 8192
// This #define affects how we recurse in ComplexFftRecursive.
// We assume that memory-caching happens on a scale at
// least as small as this.
//! ComplexFftRecursive is a recursive function that computes the
//! complex FFT of size N. The "nffts" argument specifies how many
//! separate FFTs to compute in parallel (we assume the data for
//! each one is consecutive in memory). The "forward" argument
//! specifies whether to do the FFT (true) or IFFT (false), although
//! note that we do not include the factor of 1/N (the user should
//! do this if required). The iterators factor_begin and factor_end
//! point to the beginning and end (i.e. one past the last element)
//! of an array of small factors of N (typically prime factors).
//! See the comments below this code for the detailed equations
//! of the recursion.
template<typename Real>
void ComplexFftRecursive (Real *data, int nffts, int N,
const int *factor_begin,
const int *factor_end, bool forward,
Vector<Real> *tmp_vec) {
if (factor_begin == factor_end) {
KALDI_ASSERT(N == 1);
return;
}
{ // an optimization: compute in smaller blocks.
// this block of code could be removed and it would still work.
MatrixIndexT size_perblock = N * 2 * sizeof(Real);
if (nffts > 1 && size_perblock*nffts > KALDI_COMPLEXFFT_BLOCKSIZE) { // can break it up...
// Break up into multiple blocks. This is an optimization. We make
// no progress on the FFT when we do this.
int block_skip = KALDI_COMPLEXFFT_BLOCKSIZE / size_perblock; // n blocks per call
if (block_skip == 0) block_skip = 1;
if (block_skip < nffts) {
int blocks_left = nffts;
while (blocks_left > 0) {
int skip_now = std::min(blocks_left, block_skip);
ComplexFftRecursive(data, skip_now, N, factor_begin, factor_end, forward, tmp_vec);
blocks_left -= skip_now;
data += skip_now * N*2;
}
return;
} // else do the actual algorithm.
} // else do the actual algorithm.
}
int P = *factor_begin;
KALDI_ASSERT(P > 1);
int Q = N / P;
if (P > 1 && Q > 1) { // Do the rearrangement. C.f. eq. (8) below. Transform
// (a) to (b).
Real *data_thisblock = data;
if (tmp_vec->Dim() < (MatrixIndexT)N) tmp_vec->Resize(N);
Real *data_tmp = tmp_vec->Data();
for (int thisfft = 0; thisfft < nffts; thisfft++, data_thisblock+=N*2) {
for (int offset = 0; offset < 2; offset++) { // 0 == real, 1 == im.
for (int p = 0; p < P; p++) {
for (int q = 0; q < Q; q++) {
int aidx = q*P + p, bidx = p*Q + q;
data_tmp[bidx] = data_thisblock[2*aidx+offset];
}
}
for (int n = 0;n < P*Q;n++) data_thisblock[2*n+offset] = data_tmp[n];
}
}
}
{ // Recurse.
ComplexFftRecursive(data, nffts*P, Q, factor_begin+1, factor_end, forward, tmp_vec);
}
int exp_sign = (forward ? -1 : 1);
Real rootN_re, rootN_im; // Nth root of unity.
ComplexImExp(static_cast<Real>(exp_sign * M_2PI / N), &rootN_re, &rootN_im);
Real rootP_re, rootP_im; // Pth root of unity.
ComplexImExp(static_cast<Real>(exp_sign * M_2PI / P), &rootP_re, &rootP_im);
{ // Do the multiplication
// could avoid a bunch of complex multiplies by moving the loop over data_thisblock
// inside.
if (tmp_vec->Dim() < (MatrixIndexT)(P*2)) tmp_vec->Resize(P*2);
Real *temp_a = tmp_vec->Data();
Real *data_thisblock = data, *data_end = data+(N*2*nffts);
for (; data_thisblock != data_end; data_thisblock += N*2) { // for each separate fft.
Real qd_re = 1.0, qd_im = 0.0; // 1^(q'/N)
for (int qd = 0; qd < Q; qd++) {
Real pdQ_qd_re = qd_re, pdQ_qd_im = qd_im; // 1^((p'Q+q') / N) == 1^((p'/P) + (q'/N))
// Initialize to q'/N, corresponding to p' == 0.
for (int pd = 0; pd < P; pd++) { // pd == p'
{ // This is the p = 0 case of the loop below [an optimization].
temp_a[pd*2] = data_thisblock[qd*2];
temp_a[pd*2 + 1] = data_thisblock[qd*2 + 1];
}
{ // This is the p = 1 case of the loop below [an optimization]
// **** MOST OF THE TIME (>60% I think) gets spent here. ***
ComplexAddProduct(pdQ_qd_re, pdQ_qd_im,
data_thisblock[(qd+Q)*2], data_thisblock[(qd+Q)*2 + 1],
&(temp_a[pd*2]), &(temp_a[pd*2 + 1]));
}
if (P > 2) {
Real p_pdQ_qd_re = pdQ_qd_re, p_pdQ_qd_im = pdQ_qd_im; // 1^(p(p'Q+q')/N)
for (int p = 2; p < P; p++) {
ComplexMul(pdQ_qd_re, pdQ_qd_im, &p_pdQ_qd_re, &p_pdQ_qd_im); // p_pdQ_qd *= pdQ_qd.
int data_idx = p*Q + qd;
ComplexAddProduct(p_pdQ_qd_re, p_pdQ_qd_im,
data_thisblock[data_idx*2], data_thisblock[data_idx*2 + 1],
&(temp_a[pd*2]), &(temp_a[pd*2 + 1]));
}
}
if (pd != P-1)
ComplexMul(rootP_re, rootP_im, &pdQ_qd_re, &pdQ_qd_im); // pdQ_qd *= (rootP == 1^{1/P})
// (using 1/P == Q/N)
}
for (int pd = 0; pd < P; pd++) {
data_thisblock[(pd*Q + qd)*2] = temp_a[pd*2];
data_thisblock[(pd*Q + qd)*2 + 1] = temp_a[pd*2 + 1];
}
ComplexMul(rootN_re, rootN_im, &qd_re, &qd_im); // qd *= rootN.
}
}
}
}
/* Equations for ComplexFftRecursive.
We consider here one of the "nffts" separate ffts; it's just a question of
doing them all in parallel. We also write all equations in terms of
complex math (the conversion to real arithmetic is not hard, and anyway
takes place inside function calls).
Let the input (i.e. "data" at start) be a_n, n = 0..N-1, and
the output (Fourier transform) be d_k, k = 0..N-1. We use these letters because
there will be two intermediate variables b and c.
We want to compute:
d_k = \sum_n a_n 1^(kn/N) (1)
where we use 1^x as shorthand for exp(-2pi i x) for the forward algorithm
and exp(2pi i x) for the backward one.
We factorize N = P Q (P small, Q usually large).
With p = 0..P-1 and q = 0..Q-1, and also p'=0..P-1 and q'=0..Q-1, we let:
k == p'Q + q' (2)
n == qP + p (3)
That is, we let p, q, p', q' range over these indices and observe that this way we
can cover all n, k. Expanding (1) using (2) and (3), we can write:
d_k = \sum_{p, q} a_n 1^((p'Q+q')(qP+p)/N)
= \sum_{p, q} a_n 1^(p'pQ/N) 1^(q'qP/N) 1^(q'p/N) (4)
using 1^(PQ/N) = 1 to get rid of the terms with PQ in them. Rearranging (4),
d_k = \sum_p 1^(p'pQ/N) 1^(q'p/N) \sum_q 1^(q'qP/N) a_n (5)
The point here is to separate the index q. Now we can expand out the remaining
instances of k and n using (2) and (3):
d_(p'Q+q') = \sum_p 1^(p'pQ/N) 1^(q'p/N) \sum_q 1^(q'qP/N) a_(qP+p) (6)
The expression \sum_q varies with the indices p and q'. Let us define
C_{p, q'} = \sum_q 1^(q'qP/N) a_(qP+p) (7)
Here, C_{p, q'}, viewed as a sequence in q', is just the DFT of the points
a_(qP+p) for q = 1..Q-1. These points are not consecutive in memory though,
they jump by P each time. Let us define b as a rearranged version of a,
so that
b_(pQ+q) = a_(qP+p) (8)
How can we do this rearrangement? (Pseudocode for it is given further below.)
We can rearrange (7) to be written in terms of the b's, using (8), so that
C_{p, q'} = \sum_q 1^(q'q (P/N)) b_(pQ+q) (9)
Here, the sequence of C_{p, q'} over q'=0..Q-1, is just the DFT of the sequence
of b_(pQ) .. b_(p(Q+1)-1). Let's arrange the C_{p, q'} in a single array in
memory in the same way as the b's, i.e. we define
c_(pQ+q') == C_{p, q'}. (10)
Note that we could have written (10) with q in place of q', as there is only
one index of type q present, but q' is just a more natural variable name to use
since we use q' elsewhere to subscript c and C.
Rewriting (9), we have:
c_(pQ+q') = \sum_q 1^(q'q (P/N)) b_(pQ+q) (11)
which is the DFT computed by the recursive call to this function [after computing
the b's by rearranging the a's]. From the c's we want to compute the d's.
Taking (6), substituting in the sum (7), and using (10) to write it as an array,
we have:
d_(p'Q+q') = \sum_p 1^(p'pQ/N) 1^(q'p/N) c_(pQ+q') (12)
This sum is independent for different values of q'. Note that d overwrites c
in memory. We compute this in a direct way, using a little array of size P to
store the computed d values for one value of q' (we reuse the array for each value
of q').
So the overall picture is this:
We get a call to compute DFT on size N.
- If N == 1 we return (nothing to do).
- We factor N = P Q (typically, P is small).
- Using (8), we rearrange the data in memory so that we have b not a in memory
(this is the block "do the rearrangement").
The pseudocode for this is as follows. For simplicity we use a temporary array.
for p = 0..P-1
for q = 0..Q-1
bidx = pQ + q
aidx = qP + p
tmp[bidx] = data[aidx].
end
end
data <-- tmp
The reason this accomplishes (8) is that bidx = pQ+q and aidx = qP+p each
range over the entire set of indices 0..N-1, so every element of a gets copied
to its new position exactly once; an in-place version without the temporary
array would swap pairs instead, and would need a guard such as "if aidx > bidx"
to make sure each swap happens only once rather than twice.
- We do the DFT on the smaller block size to compute c from b (this eq eq. (11)).
Note that this is actually multiple DFTs, one for each value of p, but this
goes to the "nffts" argument of the function call, which we have ignored up to now.
- We compute eq. (12) via a loop, as follows:
allocate temporary array e of size P.
For q' = 0..Q-1:
for p' = 0..P-1:
set sum to zero [this will go in e[p']]
for p = 0..P-1:
sum += 1^(p'pQ/N) 1^(q'p/N) c_(pQ+q')
end
e[p'] = sum
end
for p' = 0..P-1:
d_(p'Q+q') = e[p']
end
end
delete temporary array e
*/
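/* A concrete instance of the rearrangement (8), for N = 6 with P = 2, Q = 3
   (so b_(pQ+q) = a_(qP+p)):
     b_0 = a_0, b_1 = a_2, b_2 = a_4   [p = 0: the a's with index = 0 mod P]
     b_3 = a_1, b_4 = a_3, b_5 = a_5   [p = 1: the a's with index = 1 mod P]
   i.e. each length-Q sub-DFT in (11) acts on the a's that share a residue
   p modulo P. */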
// This is the outer-layer calling code for ComplexFftRecursive.
// It factorizes the dimension and then calls the FFT routine.
template<typename Real> void ComplexFft(VectorBase<Real> *v, bool forward, Vector<Real> *tmp_in) {
KALDI_ASSERT(v != NULL);
if (v->Dim()<=1) return;
KALDI_ASSERT(v->Dim() % 2 == 0); // complex input.
int N = v->Dim() / 2;
std::vector<int> factors;
Factorize(N, &factors);
int *factor_beg = NULL;
if (factors.size() > 0)
factor_beg = &(factors[0]);
Vector<Real> tmp; // allocated in ComplexFftRecursive.
ComplexFftRecursive(v->Data(), 1, N, factor_beg, factor_beg+factors.size(), forward, (tmp_in?tmp_in:&tmp));
}
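// Round-trip example (illustrative sketch; assumes the usual kaldi Vector
// API). The vector stores N complex points as interleaved (re, im) pairs, and
// the inverse transform does not divide by N, so we rescale by hand:
//
//   Vector<float> v(8);            // 4 complex points
//   v.SetRandn();
//   Vector<float> orig(v);
//   ComplexFft(&v, true);          // forward
//   ComplexFft(&v, false);         // inverse, unnormalized
//   v.Scale(0.25);                 // divide by N == 4; now v ~= orig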
//! Inefficient version of Fourier transform, for testing purposes.
template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward) {
KALDI_ASSERT(v != NULL);
MatrixIndexT N = v->Dim();
KALDI_ASSERT(N%2 == 0);
if (N == 0) return;
Vector<Real> vtmp(N*2); // store as complex.
if (forward) {
for (MatrixIndexT i = 0; i < N; i++) vtmp(i*2) = (*v)(i);
ComplexFft(&vtmp, forward); // this is already tested so we can use this.
v->CopyFromVec( vtmp.Range(0, N) );
(*v)(1) = vtmp(N); // Copy the N/2'th fourier component, which is real,
// to the imaginary part of the 1st complex output.
} else {
// reverse the transformation above to get the complex spectrum.
vtmp(0) = (*v)(0); // copy F_0 which is real
vtmp(N) = (*v)(1); // copy F_{N/2} which is real
for (MatrixIndexT i = 1; i < N/2; i++) {
// Copy i'th to i'th fourier component
vtmp(2*i) = (*v)(2*i);
vtmp(2*i+1) = (*v)(2*i+1);
// Copy i'th to N-i'th, conjugated.
vtmp(2*(N-i)) = (*v)(2*i);
vtmp(2*(N-i)+1) = -(*v)(2*i+1);
}
ComplexFft(&vtmp, forward); // actually backward since forward == false
// Copy back real part. Complex part should be zero.
for (MatrixIndexT i = 0; i < N; i++)
(*v)(i) = vtmp(i*2);
}
}
template void RealFftInefficient (VectorBase<float> *v, bool forward);
template void RealFftInefficient (VectorBase<double> *v, bool forward);
template
void ComplexFft(VectorBase<float> *v, bool forward, Vector<float> *tmp_in);
template
void ComplexFft(VectorBase<double> *v, bool forward, Vector<double> *tmp_in);
// See the long comment below for the math behind this.
template<typename Real> void RealFft (VectorBase<Real> *v, bool forward) {
KALDI_ASSERT(v != NULL);
MatrixIndexT N = v->Dim(), N2 = N/2;
KALDI_ASSERT(N%2 == 0);
if (N == 0) return;
if (forward) ComplexFft(v, true);
Real *data = v->Data();
Real rootN_re, rootN_im; // exp(-2pi/N), forward; exp(2pi/N), backward
int forward_sign = forward ? -1 : 1;
ComplexImExp(static_cast<Real>(M_2PI/N *forward_sign), &rootN_re, &rootN_im);
Real kN_re = -forward_sign, kN_im = 0.0; // exp(-2pik/N), forward; -exp(2pik/N), backward
// kN starts out as 1.0 for forward algorithm but -1.0 for backward.
for (MatrixIndexT k = 1; 2*k <= N2; k++) {
ComplexMul(rootN_re, rootN_im, &kN_re, &kN_im);
Real Ck_re, Ck_im, Dk_re, Dk_im;
// C_k = 1/2 (B_k + B_{N/2 - k}^*) :
Ck_re = 0.5 * (data[2*k] + data[N - 2*k]);
Ck_im = 0.5 * (data[2*k + 1] - data[N - 2*k + 1]);
// re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})):
Dk_re = 0.5 * (data[2*k + 1] + data[N - 2*k + 1]);
// im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k}))
Dk_im =-0.5 * (data[2*k] - data[N - 2*k]);
// A_k = C_k + 1^(k/N) D_k:
data[2*k] = Ck_re; // A_k <-- C_k
data[2*k+1] = Ck_im;
// now A_k += D_k 1^(k/N)
ComplexAddProduct(Dk_re, Dk_im, kN_re, kN_im, &(data[2*k]), &(data[2*k+1]));
MatrixIndexT kdash = N2 - k;
if (kdash != k) {
// Next we handle the index k' = N/2 - k. This is necessary
// to do now, to avoid invalidating data that we will later need.
// The quantities C_{k'} and D_{k'} are just the conjugates of C_k
// and D_k, so the equations are simple modifications of the above,
// replacing Ck_im and Dk_im with their negatives.
data[2*kdash] = Ck_re; // A_k' <-- C_k'
data[2*kdash+1] = -Ck_im;
// now A_k' += D_k' 1^(k'/N)
// We use 1^(k'/N) = 1^((N/2 - k) / N) = 1^(1/2) 1^(-k/N) = -1 * (1^(k/N))^*
// so it's the same as 1^(k/N) but with the real part negated.
ComplexAddProduct(Dk_re, -Dk_im, -kN_re, kN_im, &(data[2*kdash]), &(data[2*kdash+1]));
}
}
{ // Now handle k = 0.
// In simple terms: after the complex fft, data[0] becomes the sum of real
// parts input[0], input[2]... and data[1] becomes the sum of imaginary
// parts input[1], input[3]...
// "zeroth" [A_0] is just the sum of input[0]+input[1]+input[2]..
// and "n2th" [A_{N/2}] is input[0]-input[1]+input[2]... .
Real zeroth = data[0] + data[1],
n2th = data[0] - data[1];
data[0] = zeroth;
data[1] = n2th;
if (!forward) {
data[0] /= 2;
data[1] /= 2;
}
}
if (!forward) {
ComplexFft(v, false);
v->Scale(2.0); // This is so we get a factor of N increase, rather than N/2 which we would
// otherwise get from [ComplexFft, forward] + [ComplexFft, backward] in dimension N/2.
// It's for consistency with our normal FFT conventions.
}
}
template void RealFft (VectorBase<float> *v, bool forward);
template void RealFft (VectorBase<double> *v, bool forward);
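// Packing and round-trip example (illustrative sketch). After a forward
// RealFft of N real points, data[0] holds A_0 and data[1] holds A_{N/2}
// (both real); A_1 .. A_{N/2-1} follow as (re, im) pairs. Inversion gains a
// factor of N, per the Scale(2.0) comment inside RealFft above:
//
//   Vector<double> v(8);
//   v.SetRandn();
//   Vector<double> orig(v);
//   RealFft(&v, true);
//   RealFft(&v, false);
//   v.Scale(1.0 / 8);              // divide by N == 8; now v ~= orig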
/* Notes for real FFTs.
We are using the same convention as above, 1^x to mean exp(-2\pi i x) for the forward transform.
Actually, in a slight abuse of notation, we use this meaning for 1^x in both the forward and
backward cases because it's more convenient in this section.
Suppose we have real data a[0...N-1], with N even, and want to compute its Fourier transform.
We can make do with the first N/2 points of the transform, since the remaining ones are complex
conjugates of the first. We want to compute:
for k = 0...N/2-1,
A_k = \sum_{n = 0}^{N-1} a_n 1^(kn/N) (1)
We treat a[0..N-1] as a complex sequence of length N/2, i.e. a sequence b[0..N/2 - 1].
Viewed as sequences of length N/2, we have:
b = c + i d,
where c = a_0, a_2 ... and d = a_1, a_3 ...
We can recover the length-N/2 Fourier transforms of c and d by doing FT on b and
then doing the equations below. Derivation is marked by (*) in a comment below (search
for it). Let B, C, D be the FTs.
We have
C_k = 1/2 (B_k + B_{N/2 - k}^*) (z0)
D_k =-1/2i (B_k - B_{N/2 - k}^*) (z1)
so: re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})) (z2)
im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k})) (z3)
To recover the FT A from C and D, we write, rearranging (1):
A_k = \sum_{n = 0, 2, ..., N-2} a_n 1^(kn/N)
+\sum_{n = 1, 3, ..., N-1} a_n 1^(kn/N)
= \sum_{n = 0, 1, ..., N/2-1} a_{2n} 1^(2kn/N) + a_{2n+1} 1^(2kn/N) 1^(k/N)
= \sum_{n = 0, 1, ..., N/2-1} c_n 1^(2kn/N) + d_n 1^(2kn/N) 1^(k/N)
A_k = C_k + 1^(k/N) D_k (a0)
This equation is valid for k = 0...N/2-1, which is the range of the sequences B_k and
C_k. We don't use it for k = 0, which is a special case considered below. For
1 < k < N/2, it's convenient to consider the pair k, k', where k' = N/2 - k.
Remember that C_k' = C_k^* and D_k' = D_k^* [where * is conjugation]. Also,
1^(N/2 / N) = -1. So we have:
A_k' = C_k^* - 1^(k/N) D_k^* (a0b)
We do (a0) and (a0b) together.
By symmetry this gives us the Fourier components for N/2+1, ... N, if we want
them. However, it doesn't give us the value for exactly k = N/2. For k = 0 and k = N/2, it
is easiest to argue directly about the meaning of the A_k, B_k and C_k in terms of
sums of points.
A_0 and A_{N/2} are both real, with A_0=\sum_n a_n, and A_{N/2} an alternating sum
A_{N/2} = a_0 - a_1 + a_2 ...
It's easy to show that
A_0 = C_0 + D_0 (a1)
A_{N/2} = C_0 - D_0. (a2)
Since C_0 and D_0 are both real, C_0 is the real part of B_0 and D_0 is the
imaginary part.
*REVERSING THE PROCESS*
Next we want to reverse this process. We just need to work out C_k and D_k from the
sequence A_k. Then we do the inverse complex fft and we get back where we started.
For 0 and N/2, working from (a1) and (a2) above, we can see that:
C_0 = 1/2 (A_0 + A_{N/2}) (y0)
D_0 = 1/2 (A_0 - A_{N/2}) (y1)
and we use
B_0 = C_0 + i D_0
to get the zeroth complex coefficient of B. This is exactly the same as the forward process
except with an extra factor of 1/2.
Consider equations (a0) and (a0b). We want to work out C_k and D_k from A_k and A_k'. Remember
k' = N/2 - k.
Write down
A_k = C_k + 1^(k/N) D_k (copying a0)
A_k'^* = C_k - 1^(k/N) D_k (conjugate of a0b)
So
C_k = 0.5 (A_k + A_k'^*) (p0)
D_k = 1^(-k/N) . 0.5 (A_k - A_k'^*) (p1)
Next, we want to compute B_k and B_k' from C_k and D_k. C.f. (z0)..(z3), and remember
that k' = N/2-k. We can see
that
B_k = C_k + i D_k (p2)
B_k' = C_k - i D_k (p3)
We would like to make the equations (p0) ... (p3) look like the forward equations (z0), (z1),
(a0) and (a0b) so we can reuse the code. Define E_k = -i 1^(k/N) D_k. Then write down (p0)..(p3).
We have
C_k = 0.5 (A_k + A_k'^*) (p0')
E_k = -0.5 i (A_k - A_k'^*) (p1')
B_k = C_k - 1^(-k/N) E_k (p2')
B_k' = C_k + 1^(-k/N) E_k (p3')
So these are exactly the same as (z0), (z1), (a0), (a0b) except replacing 1^(k/N) with
-1^(-k/N). Remember that we defined 1^x above to be exp(-2pi i x), so the signs here
might be opposite to what you see in the code.
MODIFICATION: we need to take care of a factor of two. The complex FFT we implemented
does not divide by N in the reverse case. So upon inversion we get larger by N/2.
However, this is not consistent with normal FFT conventions where you get a factor of N.
For this reason we multiply by two after the process described above.
*/
/*
(*) [this token is referred to in a comment above].
Notes for separating 2 real transforms from one complex one. Note that the
letters here (A, B, C and N) are all distinct from the same letters used in the
place where this comment is used.
Suppose we
have two sequences a_n and b_n, n = 0..N-1. We combine them into a complex
number,
c_n = a_n + i b_n.
Then we take the fourier transform to get
C_k = \sum_{n = 0}^{N-1} c_n 1^(n/N) .
Then we use symmetry. Define A_k and B_k as the DFTs of a and b.
We use A_k = A_{N-k}^*, and B_k = B_{N-k}^*, since a and b are real. Using
C_k = A_k + i B_k,
C_{N-k} = A_k^* + i B_k^*
= A_k^* - (i B_k)^*
So:
A_k = 1/2 (C_k + C_{N-k}^*)
i B_k = 1/2 (C_k - C_{N-k}^*)
-> B_k =-1/2i (C_k - C_{N-k}^*)
-> re(B_k) = 1/2 (im(C_k) + im(C_{N-k}))
im(B_k) =-1/2 (re(C_k) - re(C_{N-k}))
*/
template<typename Real> void ComputeDctMatrix(Matrix<Real> *M) {
//KALDI_ASSERT(M->NumRows() == M->NumCols());
MatrixIndexT K = M->NumRows();
MatrixIndexT N = M->NumCols();
KALDI_ASSERT(K > 0);
KALDI_ASSERT(N > 0);
Real normalizer = std::sqrt(1.0 / static_cast<Real>(N)); // normalizer for
// X_0.
for (MatrixIndexT j = 0; j < N; j++) (*M)(0, j) = normalizer;
normalizer = std::sqrt(2.0 / static_cast<Real>(N)); // normalizer for other
// elements.
for (MatrixIndexT k = 1; k < K; k++)
for (MatrixIndexT n = 0; n < N; n++)
(*M)(k, n) = normalizer
* std::cos( static_cast<double>(M_PI)/N * (n + 0.5) * k );
}
template void ComputeDctMatrix(Matrix<float> *M);
template void ComputeDctMatrix(Matrix<double> *M);
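// Usage example (illustrative sketch; the sizes and names here are made up,
// but follow the typical feature-extraction pattern of keeping the first few
// DCT coefficients of a mel-energy frame):
//
//   Matrix<float> dct(13, 40);     // keep 13 of 40 coefficients
//   ComputeDctMatrix(&dct);
//   Vector<float> mel(40), ceps(13);
//   mel.SetRandn();
//   ceps.AddMatVec(1.0, dct, kNoTrans, mel, 0.0);   // ceps = dct * mel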
template<typename Real>
void ComputePca(const MatrixBase<Real> &X,
MatrixBase<Real> *U,
MatrixBase<Real> *A,
bool print_eigs,
bool exact) {
// Note that some of these matrices may be transposed w.r.t. the
// way it's most natural to describe them in math... it's the rows
// of X and U that correspond to the (data-points, basis elements).
MatrixIndexT N = X.NumRows(), D = X.NumCols();
// N = #points, D = feature dim.
KALDI_ASSERT(U != NULL && U->NumCols() == D);
MatrixIndexT G = U->NumRows(); // # of retained basis elements.
KALDI_ASSERT(A == NULL || (A->NumRows() == N && A->NumCols() == G));
KALDI_ASSERT(G <= N && G <= D);
if (D < N) { // Do conventional PCA.
SpMatrix<Real> Msp(D); // Matrix of outer products.
Msp.AddMat2(1.0, X, kTrans, 0.0); // M <-- X^T X
Matrix<Real> Utmp;
Vector<Real> l;
if (exact) {
Utmp.Resize(D, D);
l.Resize(D);
//Matrix<Real> M(Msp);
//M.DestructiveSvd(&l, &Utmp, NULL);
Msp.Eig(&l, &Utmp);
} else {
Utmp.Resize(D, G);
l.Resize(G);
Msp.TopEigs(&l, &Utmp);
}
SortSvd(&l, &Utmp);
for (MatrixIndexT g = 0; g < G; g++)
U->Row(g).CopyColFromMat(Utmp, g);
if (print_eigs)
KALDI_LOG << (exact ? "" : "Retained ")
<< "PCA eigenvalues are " << l;
if (A != NULL)
A->AddMatMat(1.0, X, kNoTrans, *U, kTrans, 0.0);
} else { // Do inner-product PCA.
SpMatrix<Real> Nsp(N); // Matrix of inner products.
Nsp.AddMat2(1.0, X, kNoTrans, 0.0); // M <-- X X^T
Matrix<Real> Vtmp;
Vector<Real> l;
if (exact) {
Vtmp.Resize(N, N);
l.Resize(N);
Matrix<Real> Nmat(Nsp);
Nmat.DestructiveSvd(&l, &Vtmp, NULL);
} else {
Vtmp.Resize(N, G);
l.Resize(G);
Nsp.TopEigs(&l, &Vtmp);
}
MatrixIndexT num_zeroed = 0;
for (MatrixIndexT g = 0; g < G; g++) {
if (l(g) < 0.0) {
KALDI_WARN << "In PCA, setting element " << l(g) << " to zero.";
l(g) = 0.0;
num_zeroed++;
}
}
SortSvd(&l, &Vtmp); // Make sure zero elements are last, this
// is necessary for Orthogonalize() to work properly later.
Vtmp.Transpose(); // So eigenvalues are the rows.
for (MatrixIndexT g = 0; g < G; g++) {
Real sqrtlg = sqrt(l(g));
if (l(g) != 0.0) {
U->Row(g).AddMatVec(1.0 / sqrtlg, X, kTrans, Vtmp.Row(g), 0.0);
} else {
U->Row(g).SetZero();
(*U)(g, g) = 1.0; // arbitrary direction. Will later orthogonalize.
}
if (A != NULL)
for (MatrixIndexT n = 0; n < N; n++)
(*A)(n, g) = sqrtlg * Vtmp(g, n);
}
// Now orthogonalize. This is mainly useful in
// case there were zero eigenvalues, but we do it
// for all of them.
U->OrthogonalizeRows();
if (print_eigs)
KALDI_LOG << "(inner-product) PCA eigenvalues are " << l;
}
}
template
void ComputePca(const MatrixBase<float> &X,
MatrixBase<float> *U,
MatrixBase<float> *A,
bool print_eigs,
bool exact);
template
void ComputePca(const MatrixBase<double> &X,
MatrixBase<double> *U,
MatrixBase<double> *A,
bool print_eigs,
bool exact);
// Added by Dan, Feb. 13 2012.
// This function does, elementwise: *plus += max(0, alpha * a b^T),
// *minus += max(0, -(alpha * a b^T)).
template<typename Real>
void AddOuterProductPlusMinus(Real alpha,
const VectorBase<Real> &a,
const VectorBase<Real> &b,
MatrixBase<Real> *plus,
MatrixBase<Real> *minus) {
KALDI_ASSERT(a.Dim() == plus->NumRows() && b.Dim() == plus->NumCols()
&& a.Dim() == minus->NumRows() && b.Dim() == minus->NumCols());
int32 nrows = a.Dim(), ncols = b.Dim(), pskip = plus->Stride() - ncols,
mskip = minus->Stride() - ncols;
const Real *adata = a.Data(), *bdata = b.Data();
Real *plusdata = plus->Data(), *minusdata = minus->Data();
for (int32 i = 0; i < nrows; i++) {
const Real *btmp = bdata;
Real multiple = alpha * *adata;
if (multiple > 0.0) {
for (int32 j = 0; j < ncols; j++, plusdata++, minusdata++, btmp++) {
if (*btmp > 0.0) *plusdata += multiple * *btmp;
else *minusdata -= multiple * *btmp;
}
} else {
for (int32 j = 0; j < ncols; j++, plusdata++, minusdata++, btmp++) {
if (*btmp < 0.0) *plusdata += multiple * *btmp;
else *minusdata -= multiple * *btmp;
}
}
plusdata += pskip;
minusdata += mskip;
adata++;
}
}
// Instantiate template
template
void AddOuterProductPlusMinus<float>(float alpha,
const VectorBase<float> &a,
const VectorBase<float> &b,
MatrixBase<float> *plus,
MatrixBase<float> *minus);
template
void AddOuterProductPlusMinus<double>(double alpha,
const VectorBase<double> &a,
const VectorBase<double> &b,
MatrixBase<double> *plus,
MatrixBase<double> *minus);
} // end namespace kaldi
// matrix/matrix-functions.h
// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.; Jan Silovsky;
// Yanmin Qian; 1991 Henrique (Rico) Malvar (*)
//
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// (*) incorporates, with permission, FFT code from his book
// "Signal Processing with Lapped Transforms", Artech, 1992.
#ifndef KALDI_MATRIX_MATRIX_FUNCTIONS_H_
#define KALDI_MATRIX_MATRIX_FUNCTIONS_H_
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
namespace kaldi {
/// @addtogroup matrix_funcs_misc
/// @{
/** The function ComplexFft does an Fft on the vector argument v.
v is a vector of even dimension, interpreted for both input
and output as a vector of complex numbers i.e.
\f[ v = ( re_0, im_0, re_1, im_1, ... ) \f]
If "forward == true" this routine does the Discrete Fourier Transform
(DFT), i.e.:
\f[ vout[m] \leftarrow \sum_{n = 0}^{N-1} vin[n] exp( -2pi i m n / N ) \f]
If "backward" it does the Inverse Discrete Fourier Transform (IDFT)
*WITHOUT THE FACTOR 1/N*,
i.e.:
\f[ vout[m] \leftarrow \sum_{n = 0}^{N-1} vin[n] exp( 2pi i m n / N ) \f]
[note the sign difference on the 2 pi for the backward one.]
Note that this is the definition of the FT given in most texts, but
it differs from the Numerical Recipes version in which the forward
and backward algorithms are flipped.
Note that you would have to multiply by 1/N after the IDFT to get
back to where you started from. We don't do this because
in some contexts, the transform is made symmetric by scaling by
1/sqrt(N) in both passes. The user can do this themselves.
See also SplitRadixComplexFft, declared in srfft.h, which is more efficient
but only works if the length of the input is a power of 2.
*/
template<typename Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);
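// A minimal usage sketch of ComplexFft (illustrative only): a forward/inverse
// round trip over 4 complex points stored in 8 reals.
//
//   Vector<BaseFloat> v(8);
//   v.SetRandn();
//   Vector<BaseFloat> orig(v);
//   ComplexFft(&v, true);    // forward DFT, in place
//   ComplexFft(&v, false);   // inverse DFT, *without* the 1/N factor
//   v.Scale(1.0 / 4);        // divide by N = 4; v now approximates orig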
/// ComplexFt is the same as ComplexFft but it implements the Fourier
/// transform in an inefficient way. It is mainly included for testing purposes.
/// See comment for ComplexFft to describe the input and outputs and what it does.
template<typename Real> void ComplexFt (const VectorBase<Real> &in,
VectorBase<Real> *out, bool forward);
/// RealFft is a fourier transform of real inputs. Internally it uses
/// ComplexFft. The input dimension N must be even. If forward == true,
/// it transforms from a sequence of N real points to its complex fourier
/// transform; otherwise it goes in the reverse direction. If you call it
/// in the forward and then reverse direction and multiply by 1.0/N, you
/// will get back the original data.
/// The interpretation of the complex-FFT data is as follows: the array
/// is a sequence of complex numbers C_n of length N/2 with (real, im) format,
/// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...]; since
/// im0 and im_{N/2} are zero for real input, real_{N/2} is packed into the
/// otherwise-unused second slot.
/// See also SplitRadixRealFft, declared in srfft.h, which is more efficient
/// but only works if the length of the input is a power of 2.
template<typename Real> void RealFft (VectorBase<Real> *v, bool forward);
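// Usage sketch (illustrative only): a round trip on N = 8 real points,
// scaled by 1/N to recover the input.
//
//   Vector<BaseFloat> v(8);
//   v.SetRandn();
//   Vector<BaseFloat> orig(v);
//   RealFft(&v, true);    // to the packed complex format described above
//   RealFft(&v, false);   // back to the time domain, scaled up by N
//   v.Scale(1.0 / 8);     // v now approximates orig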
/// RealFftInefficient has the same input and output format as RealFft above,
/// but it is an inefficient implementation included for testing purposes.
template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward);
/// ComputeDctMatrix computes a matrix corresponding to the DCT, such that
/// M * v equals the DCT of vector v. M does not have to be square: a K x N
/// matrix with K <= N gives the first K coefficients of the N-point DCT.
/// This is the normalized DCT, corresponding to the following equations
/// (matching the normalizers in the code), where x is the signal and X is
/// the DCT:
/// X_0 = sqrt(1/N) \sum_{n = 0}^{N-1} x_n
/// X_k = sqrt(2/N) \sum_{n = 0}^{N-1} x_n cos( \pi/N (n + 1/2) k )
/// When M is square it is orthogonal, so its transpose is its inverse and
/// gives the inverse DCT.
/// Caution: with the (n + 1/2) k convention above this is the type-II DCT;
/// its transpose implements the type-III DCT, which is generally known as
/// the "inverse DCT". This was probably set up this way for HTK
/// compatibility. We don't change it because it was this way from the start
/// and changing it would affect the feature generation.
template<typename Real> void ComputeDctMatrix(Matrix<Real> *M);
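// Usage sketch (illustrative only): keep the first 13 DCT coefficients of a
// 40-dimensional input, as in MFCC extraction.
//
//   Matrix<BaseFloat> dct(13, 40);   // K = 13 retained rows, N = 40
//   ComputeDctMatrix(&dct);
//   Vector<BaseFloat> x(40), X(13);
//   x.SetRandn();
//   X.AddMatVec(1.0, dct, kNoTrans, x, 0.0);   // X = dct * x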
/// ComplexMul implements, inline, the complex multiplication b *= a.
template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
Real *b_re, Real *b_im);
/// ComplexAddProduct implements, inline, the complex operation c += (a * b).
template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
const Real &b_re, const Real &b_im,
Real *c_re, Real *c_im);
/// ComplexImExp implements a <-- exp(i x), inline.
template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
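// Together these inline helpers cover the complex arithmetic used by the FFT
// routines. Sketch (illustrative only): rotate the complex number
// (b_re, b_im) by an angle x.
//
//   Real a_re, a_im;
//   ComplexImExp(x, &a_re, &a_im);          // a = exp(i x)
//   ComplexMul(a_re, a_im, &b_re, &b_im);   // b *= a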
/**
ComputePca does a PCA computation, using either outer products
or inner products, whichever is more efficient. Let D be
the dimension of the data points, N be the number of data
points, and G be the PCA dimension we want to retain. We assume
G <= N and G <= D.
@param X [in] An N x D matrix. Each row of X is a point x_i.
@param U [out] A G x D matrix. Each row of U is a basis element u_i.
@param A [out] An N x G matrix, or NULL. Each row of A is the set of
coefficients of a point x_i in the basis, so A(i, g) is the coefficient
of u_g in x_i.
@param print_eigs [in] If true, prints out diagnostic information about the
eigenvalues.
@param exact [in] If true, does the exact computation; if false, does
a much faster (but almost exact) computation based on the Lanczos
method.
*/
template<typename Real>
void ComputePca(const MatrixBase<Real> &X,
MatrixBase<Real> *U,
MatrixBase<Real> *A,
bool print_eigs = false,
bool exact = true);
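// Usage sketch (illustrative only): project 1000 points of dimension 40 onto
// their top 10 principal directions.
//
//   Matrix<BaseFloat> X(1000, 40);       // one data point per row
//   X.SetRandn();
//   Matrix<BaseFloat> U(10, 40);         // basis elements, one per row
//   Matrix<BaseFloat> A(1000, 10);       // per-point coefficients
//   ComputePca(X, &U, &A, true, false);  // print eigs; fast (Lanczos) path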
// This function does, elementwise: *plus += max(0, alpha * a b^T),
// *minus += max(0, -(alpha * a b^T)).
template<typename Real>
void AddOuterProductPlusMinus(Real alpha,
const VectorBase<Real> &a,
const VectorBase<Real> &b,
MatrixBase<Real> *plus,
MatrixBase<Real> *minus);
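// So (*plus - *minus) accumulates exactly alpha * a b^T, with the positive
// and negative parts kept separate. E.g. (illustrative) with alpha = 1,
// a = [1, -1] and b = [2], the call adds 2 to (*plus)(0, 0) and 2 to
// (*minus)(1, 0).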
template<typename Real1, typename Real2>
inline void AssertSameDim(const MatrixBase<Real1> &mat1, const MatrixBase<Real2> &mat2) {
KALDI_ASSERT(mat1.NumRows() == mat2.NumRows()
&& mat1.NumCols() == mat2.NumCols());
}
/// @} end of "addtogroup matrix_funcs_misc"
} // end namespace kaldi
#include "matrix/matrix-functions-inl.h"
#endif
// matrix/matrix-lib.h
// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Haihua Xu
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
// Include everything from this directory.
// These files include other stuff that we need.
#ifndef KALDI_MATRIX_MATRIX_LIB_H_
#define KALDI_MATRIX_MATRIX_LIB_H_
#include "base/kaldi-common.h"
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
#include "matrix/sp-matrix.h"
#include "matrix/tp-matrix.h"
#include "matrix/matrix-functions.h"
#include "matrix/srfft.h"
#include "matrix/compressed-matrix.h"
#include "matrix/sparse-matrix.h"
#include "matrix/optimization.h"
#endif
// matrix/optimization.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// (*) incorporates, with permission, FFT code from his book
// "Signal Processing with Lapped Transforms", Artech, 1992.
#include <algorithm>
#include "matrix/optimization.h"
#include "matrix/sp-matrix.h"
namespace kaldi {
// Below, N&W refers to Nocedal and Wright, "Numerical Optimization", 2nd Ed.
template<typename Real>
OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
const LbfgsOptions &opts):
opts_(opts), k_(0), computation_state_(kBeforeStep), H_was_set_(false) {
KALDI_ASSERT(opts.m > 0); // number of stored (s, y) vector pairs.
MatrixIndexT dim = x.Dim();
KALDI_ASSERT(dim > 0);
x_ = x; // this is the value of x_k
new_x_ = x; // this is where we'll evaluate the function next.
deriv_.Resize(dim);
temp_.Resize(dim);
data_.Resize(2 * opts.m, dim);
rho_.Resize(opts.m);
// Just set f_ to some invalid value, as we haven't yet set it.
f_ = (opts.minimize ? 1 : -1 ) * std::numeric_limits<Real>::infinity();
best_f_ = f_;
best_x_ = x_;
}
template<typename Real>
Real OptimizeLbfgs<Real>::RecentStepLength() const {
size_t n = step_lengths_.size();
if (n == 0) return std::numeric_limits<Real>::infinity();
else {
if (n >= 2 && step_lengths_[n-1] == 0.0 && step_lengths_[n-2] == 0.0)
return 0.0; // two zeros in a row means repeated restarts, which is
// a loop. Short-circuit this by returning zero.
Real avg = 0.0;
for (size_t i = 0; i < n; i++)
avg += step_lengths_[i] / n;
return avg;
}
}
template<typename Real>
void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
if (k_ == 0) {
if (H_.Dim() == 0) {
// H was never set up. Set it up for the first time.
Real learning_rate;
if (opts_.first_step_length > 0.0) { // this takes
// precedence over first_step_learning_rate, if set.
// We are setting up H for the first time.
Real gradient_length = gradient.Norm(2.0);
learning_rate = (gradient_length > 0.0 ?
opts_.first_step_length / gradient_length :
1.0);
} else if (opts_.first_step_impr > 0.0) {
Real gradient_length = gradient.Norm(2.0);
learning_rate = (gradient_length > 0.0 ?
opts_.first_step_impr / (gradient_length * gradient_length) :
1.0);
} else {
learning_rate = opts_.first_step_learning_rate;
}
H_.Resize(x_.Dim());
KALDI_ASSERT(learning_rate > 0.0);
H_.Set(opts_.minimize ? learning_rate : -learning_rate);
}
} else { // k_ > 0
if (!H_was_set_) { // The user never specified an approximate
// diagonal inverse Hessian.
// Set it using formula 7.20: H_k^{(0)} = \gamma_k I, where
// \gamma_k = s_{k-1}^T y_{k-1} / y_{k-1}^T y_{k-1}
SubVector<Real> y_km1 = Y(k_-1);
double gamma_k = VecVec(S(k_-1), y_km1) / VecVec(y_km1, y_km1);
if (KALDI_ISNAN(gamma_k) || KALDI_ISINF(gamma_k)) {
KALDI_WARN << "NaN encountered in L-BFGS (already converged?)";
gamma_k = (opts_.minimize ? 1.0 : -1.0);
}
H_.Set(gamma_k);
}
}
}
// This represents the first 2 lines of Algorithm 7.5 (N&W), which
// in fact is mostly a call to Algorithm 7.4.
// Note: this is valid whether we are minimizing or maximizing.
template<typename Real>
void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
const VectorBase<Real> &gradient) {
KALDI_ASSERT(computation_state_ == kBeforeStep);
SignedMatrixIndexT m = M(), k = k_;
ComputeHifNeeded(gradient);
// The rest of this is computing p_k <-- - H_k \nabla f_k using Algorithm
// 7.4 of N&W.
Vector<Real> &q(deriv_), &r(new_x_); // Use deriv_ as a temporary place to put
// q, and new_x_ as a temporary place to put r.
// The if-statement below is just to get rid of spurious warnings from
// valgrind about memcpy source and destination overlap, since sometimes q and
// gradient are the same variable.
if (&q != &gradient)
q.CopyFromVec(gradient); // q <-- \nabla f_k.
Vector<Real> alpha(m);
// for i = k - 1, k - 2, ... k - m
for (SignedMatrixIndexT i = k - 1;
i >= std::max(k - m, static_cast<SignedMatrixIndexT>(0));
i--) {
alpha(i % m) = rho_(i % m) * VecVec(S(i), q); // \alpha_i <-- \rho_i s_i^T q.
q.AddVec(-alpha(i % m), Y(i)); // q <-- q - \alpha_i y_i
}
r.SetZero();
r.AddVecVec(1.0, H_, q, 0.0); // r <-- H_k^{(0)} q.
// for k = k - m, k - m + 1, ... , k - 1
for (SignedMatrixIndexT i = std::max(k - m, static_cast<SignedMatrixIndexT>(0));
i < k;
i++) {
Real beta = rho_(i % m) * VecVec(Y(i), r); // \beta <-- \rho_i y_i^T r
r.AddVec(alpha(i % m) - beta, S(i)); // r <-- r + s_i (\alpha_i - \beta)
}
{ // TEST. Note, -r will be the direction.
Real dot = VecVec(gradient, r);
if ((opts_.minimize && dot < 0) || (!opts_.minimize && dot > 0))
KALDI_WARN << "Step direction has the wrong sign! Routine will fail.";
}
// Now we're out of Alg. 7.4 and back into Alg. 7.5.
// Alg. 7.4 returned r (using new_x_ as the location), and with \alpha_k = 1
// as the initial guess, we're setting x_{k+1} = x_k + \alpha_k p_k, with
// p_k = -r [hence the statement new_x_.Scale(-1.0)], and \alpha_k = 1.
// This is the first place we'll get the user to evaluate the function;
// any backtracking (or acceptance of that step) occurs inside StepSizeIteration.
// We're still within iteration k; we haven't yet finalized the step size.
new_x_.Scale(-1.0);
new_x_.AddVec(1.0, x_);
if (&deriv_ != &gradient)
deriv_.CopyFromVec(gradient);
f_ = function_value;
d_ = opts_.d;
num_wolfe_i_failures_ = 0;
num_wolfe_ii_failures_ = 0;
last_failure_type_ = kNone;
computation_state_ = kWithinStep;
}
template<typename Real>
bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
const VectorBase<Real> &gradient) {
// Save s_k = x_{k+1} - x_{k}, and y_k = \nabla f_{k+1} - \nabla f_k.
SubVector<Real> s = S(k_), y = Y(k_);
s.CopyFromVec(new_x_);
s.AddVec(-1.0, x_); // s = new_x_ - x_.
y.CopyFromVec(gradient);
y.AddVec(-1.0, deriv_); // y = gradient - deriv_.
// Warning: there is a division in the next line. This could
// generate inf or nan, but this wouldn't necessarily be an error
// at this point because for zero step size or derivative we should
// terminate the iterations. But this is up to the calling code.
Real prod = VecVec(y, s);
rho_(k_ % opts_.m) = 1.0 / prod;
Real len = s.Norm(2.0);
if ((opts_.minimize && prod <= 1.0e-20) || (!opts_.minimize && prod >= -1.0e-20)
|| len == 0.0)
return false; // This will force restart.
KALDI_VLOG(3) << "Accepted step; length was " << len
<< ", prod was " << prod;
RecordStepLength(len);
// store x_{k+1} and the function value f_{k+1}.
x_.CopyFromVec(new_x_);
f_ = function_value;
k_++;
return true; // We successfully accepted the step.
}
template<typename Real>
void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
step_lengths_.push_back(s);
if (step_lengths_.size() > static_cast<size_t>(opts_.avg_step_length))
step_lengths_.erase(step_lengths_.begin(), step_lengths_.begin() + 1);
}
template<typename Real>
void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
Real f,
const VectorBase<Real> &gradient) {
// Note: we will consider restarting (the transition of x_ -> x)
// as a step, even if it has zero step size. This is necessary in
// order for convergence to be detected.
{
Vector<Real> &diff(temp_);
diff.CopyFromVec(x);
diff.AddVec(-1.0, x_);
RecordStepLength(diff.Norm(2.0));
}
k_ = 0; // Restart the iterations! [But note that the Hessian,
// whatever it was, stays as before.]
if (&x_ != &x)
x_.CopyFromVec(x);
new_x_.CopyFromVec(x);
f_ = f;
computation_state_ = kBeforeStep;
ComputeNewDirection(f, gradient);
}
template<typename Real>
void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
const VectorBase<Real> &gradient) {
KALDI_VLOG(3) << "In step size iteration, function value changed "
<< f_ << " to " << function_value;
// We're in some part of the backtracking, and the user is providing
// the objective function value and gradient.
// We're checking two conditions: Wolfe i) [the Armijo rule] and
// Wolfe ii).
// The Armijo rule (when minimizing) is:
// f(x_k + \alpha_k p_k) <= f(x_k) + c_1 \alpha_k p_k^T \nabla f(x_k), where
// \nabla means the derivative.
// Below, "temp" is the RHS of this equation, where (\alpha_k p_k) equals
// (new_x_ - x_); we don't store \alpha or p_k separately, they are implicit
// as the difference new_x_ - x_.
// Below, pf is \alpha_k p_k^T \nabla f(x_k).
Real pf = VecVec(new_x_, deriv_) - VecVec(x_, deriv_);
Real temp = f_ + opts_.c1 * pf;
bool wolfe_i_ok;
if (opts_.minimize) wolfe_i_ok = (function_value <= temp);
else wolfe_i_ok = (function_value >= temp);
// Wolfe condition ii) can be written as:
// p_k^T \nabla f(x_k + \alpha_k p_k) >= c_2 p_k^T \nabla f(x_k)
// p2f equals \alpha_k p_k^T \nabla f(x_k + \alpha_k p_k), where
// (\alpha_k p_k^T) is (new_x_ - x_).
// Note that in our version of Wolfe condition (ii) we have an extra
// factor alpha, which doesn't affect anything.
Real p2f = VecVec(new_x_, gradient) - VecVec(x_, gradient);
//eps = (sizeof(Real) == 4 ? 1.0e-05 : 1.0e-10) *
//(std::abs(p2f) + std::abs(pf));
bool wolfe_ii_ok;
if (opts_.minimize) wolfe_ii_ok = (p2f >= opts_.c2 * pf);
else wolfe_ii_ok = (p2f <= opts_.c2 * pf);
enum { kDecrease, kNoChange } d_action; // What to do with d_: leave it alone,
// or take the square root.
enum { kAccept, kDecreaseStep, kIncreaseStep, kRestart } iteration_action;
// What we'll do in the overall iteration: accept this value, DecreaseStep
// (reduce the step size), IncreaseStep (increase the step size), or kRestart
// (set k back to zero). Generally, when we can't get both conditions to be
// true within a reasonable period of time, it makes sense to restart, because
// we've probably almost converged and run into numerical issues; from here
// we'd just produce NaNs. Restarting is a safe thing to do and the outer
// code will quickly detect convergence.
d_action = kNoChange; // the default.
if (wolfe_i_ok && wolfe_ii_ok) {
iteration_action = kAccept;
d_action = kNoChange; // actually doesn't matter, it'll get reset.
} else if (!wolfe_i_ok) {
// If wolfe i) [the Armijo rule] failed then we went too far (or are
// meeting numerical problems).
if (last_failure_type_ == kWolfeII) { // Last time we failed it was Wolfe ii).
// When we switch between them we decrease d.
d_action = kDecrease;
}
iteration_action = kDecreaseStep;
last_failure_type_ = kWolfeI;
num_wolfe_i_failures_++;
} else if (!wolfe_ii_ok) {
// Curvature condition failed -> we did not go far enough.
if (last_failure_type_ == kWolfeI) // switching between wolfe i and ii failures->
d_action = kDecrease; // decrease value of d.
iteration_action = kIncreaseStep;
last_failure_type_ = kWolfeII;
num_wolfe_ii_failures_++;
}
// Test whether we've been switching too many times between wolfe i) and ii)
// failures, or overall have an excessive number of failures. We just give up
// and restart L-BFGS. Probably we've almost converged.
if (num_wolfe_i_failures_ + num_wolfe_ii_failures_ >
opts_.max_line_search_iters) {
KALDI_VLOG(2) << "Too many steps in line search -> restarting.";
iteration_action = kRestart;
}
if (d_action == kDecrease)
d_ = std::sqrt(d_);
KALDI_VLOG(3) << "d = " << d_ << ", iter = " << k_ << ", action = "
<< (iteration_action == kAccept ? "accept" :
(iteration_action == kDecreaseStep ? "decrease" :
(iteration_action == kIncreaseStep ? "increase" :
"reject")));
// Note: even if iteration_action != Restart at this point,
// some code below may set it to Restart.
if (iteration_action == kAccept) {
if (AcceptStep(function_value, gradient)) { // If we did
// not detect a problem while accepting the step..
computation_state_ = kBeforeStep;
ComputeNewDirection(function_value, gradient);
} else {
KALDI_VLOG(2) << "Restarting L-BFGS computation; problem found while "
<< "accepting step.";
iteration_action = kRestart; // We'll have to restart now.
}
}
if (iteration_action == kDecreaseStep || iteration_action == kIncreaseStep) {
Real scale = (iteration_action == kDecreaseStep ? 1.0 / d_ : d_);
temp_.CopyFromVec(new_x_);
new_x_.Scale(scale);
new_x_.AddVec(1.0 - scale, x_);
if (new_x_.ApproxEqual(temp_, 0.0)) {
// Value of new_x_ did not change at all --> we must restart.
KALDI_VLOG(3) << "Value of x did not change, when taking step; "
<< "will restart computation.";
iteration_action = kRestart;
}
if (new_x_.ApproxEqual(temp_, 1.0e-08) &&
std::abs(f_ - function_value) < 1.0e-08 *
std::abs(f_) && iteration_action == kDecreaseStep) {
// This is common and due to roundoff.
KALDI_VLOG(3) << "We appear to be backtracking while we are extremely "
<< "close to the old value; restarting.";
iteration_action = kRestart;
}
if (iteration_action == kDecreaseStep) {
num_wolfe_i_failures_++;
last_failure_type_ = kWolfeI;
} else {
num_wolfe_ii_failures_++;
last_failure_type_ = kWolfeII;
}
}
if (iteration_action == kRestart) {
// We want to restart the computation. If the objf at new_x_ is
// better than it was at x_, we'll start at new_x_, else at x_.
bool use_newx;
if (opts_.minimize) use_newx = (function_value < f_);
else use_newx = (function_value > f_);
KALDI_VLOG(3) << "Restarting computation.";
if (use_newx) Restart(new_x_, function_value, gradient);
else Restart(x_, f_, deriv_);
}
}
template<typename Real>
void OptimizeLbfgs<Real>::DoStep(Real function_value,
const VectorBase<Real> &gradient) {
if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) {
best_f_ = function_value;
best_x_.CopyFromVec(new_x_);
}
if (computation_state_ == kBeforeStep)
ComputeNewDirection(function_value, gradient);
else // kWithinStep{1,2,3}
StepSizeIteration(function_value, gradient);
}
template<typename Real>
void OptimizeLbfgs<Real>::DoStep(Real function_value,
const VectorBase<Real> &gradient,
const VectorBase<Real> &diag_approx_2nd_deriv) {
if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) {
best_f_ = function_value;
best_x_.CopyFromVec(new_x_);
}
if (opts_.minimize) {
KALDI_ASSERT(diag_approx_2nd_deriv.Min() > 0.0);
} else {
KALDI_ASSERT(diag_approx_2nd_deriv.Max() < 0.0);
}
H_was_set_ = true;
H_.CopyFromVec(diag_approx_2nd_deriv);
H_.InvertElements();
DoStep(function_value, gradient);
}
template<typename Real>
const VectorBase<Real>&
OptimizeLbfgs<Real>::GetValue(Real *objf_value) const {
if (objf_value != NULL) *objf_value = best_f_;
return best_x_;
}
// To compute the alpha, we are minimizing f(x) = 0.5 x^T A x - x^T b along
// the direction p_k... consider alpha.
// The gradient is d/dx f(x) = A x - b = r.
// Notation based on Sec. 5.1 of Nocedal and Wright
// Computation based on Alg. 5.2 of Nocedal and Wright (Pg. 112)
// Notation (replicated for convenience):
// To solve Ax=b for x
// k : current iteration
// x_k : estimate of x (at iteration k)
// r_k : residual ( r_k \eqdef A x_k - b )
// \alpha_k : step size
// p_k : A-conjugate direction
// \beta_k : coefficient used in A-conjugate direction computation for next
// iteration
//
// Algo. LinearCG(A,b,x_0)
// ========================
// r_0 = Ax_0 - b
// p_0 = -r_0
// k = 0
//
// while r_k != 0
// \alpha_k = (r_k^T r_k) / (p_k^T A p_k)
// x_{k+1} = x_k + \alpha_k p_k;
// r_{k+1} = r_k + \alpha_k A p_k
// \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_k}
// p_{k+1} = -r_{k+1} + \beta_{k+1} p_k
// k = k + 1
// end
template<class Real>
int32 LinearCgd(const LinearCgdOptions &opts,
const SpMatrix<Real> &A,
const VectorBase<Real> &b,
VectorBase<Real> *x) {
// Initialize the variables
//
int32 M = A.NumCols();
Matrix<Real> storage(4, M);
SubVector<Real> r(storage, 0), p(storage, 1), Ap(storage, 2), x_orig(storage, 3);
p.CopyFromVec(b);
p.AddSpVec(-1.0, A, *x, 1.0); // p_0 = b - A x_0
r.AddVec(-1.0, p); // r_0 = - p_0
x_orig.CopyFromVec(*x); // in case of failure.
Real r_cur_norm_sq = VecVec(r, r),
r_initial_norm_sq = r_cur_norm_sq,
r_recompute_norm_sq = r_cur_norm_sq;
KALDI_VLOG(5) << "In linear CG: initial norm-square of residual = "
<< r_initial_norm_sq;
KALDI_ASSERT(opts.recompute_residual_factor <= 1.0);
Real max_error_sq = std::max<Real>(opts.max_error * opts.max_error,
std::numeric_limits<Real>::min()),
residual_factor = opts.recompute_residual_factor *
opts.recompute_residual_factor,
inv_residual_factor = 1.0 / residual_factor;
// Note: although from a mathematical point of view the method should converge
// after M iterations, in practice (due to roundoff) it does not always
// converge to good precision after that many iterations so we let the maximum
// be M + 5 instead.
int32 k = 0;
for (; k < M + 5 && k != opts.max_iters; k++) {
// Note: we'll break from this loop if we converge sooner due to
// max_error.
Ap.AddSpVec(1.0, A, p, 0.0); // Ap = A p
// Below is how the code used to look.
// // next line: \alpha_k = (r_k^T r_k) / (p_k^T A p_k)
// Real alpha = r_cur_norm_sq / VecVec(p, Ap);
//
// We changed r_cur_norm_sq below to -VecVec(p, r). Although this is
// slightly less efficient, it seems to make the algorithm dramatically more
// robust. Note that -p^T r is the mathematically more natural quantity to
// use here, that corresponds to minimizing along that direction... r^T r is
// recommended in Nocedal and Wright only as a kind of optimization as it is
// supposed to be the same as -p^T r and we already have it computed.
Real alpha = -VecVec(p, r) / VecVec(p, Ap);
// next line: x_{k+1} = x_k + \alpha_k p_k;
x->AddVec(alpha, p);
// next line: r_{k+1} = r_k + \alpha_k A p_k
r.AddVec(alpha, Ap);
Real r_next_norm_sq = VecVec(r, r);
if (r_next_norm_sq < residual_factor * r_recompute_norm_sq ||
r_next_norm_sq > inv_residual_factor * r_recompute_norm_sq) {
// Recompute the residual from scratch if the residual norm has decreased
// a lot; this costs an extra matrix-vector multiply, but helps keep the
// residual accurate.
// Also do the same if the residual norm has increased a lot since
// the last time we recomputed... this shouldn't happen often, but
// it can indicate bad stuff is happening.
// r_{k+1} = A x_{k+1} - b
r.AddSpVec(1.0, A, *x, 0.0);
r.AddVec(-1.0, b);
r_next_norm_sq = VecVec(r, r);
r_recompute_norm_sq = r_next_norm_sq;
KALDI_VLOG(5) << "In linear CG: recomputing residual.";
}
KALDI_VLOG(5) << "In linear CG: k = " << k
<< ", r_next_norm_sq = " << r_next_norm_sq;
// Check if converged.
if (r_next_norm_sq <= max_error_sq)
break;
// next line: \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_k}
Real beta_next = r_next_norm_sq / r_cur_norm_sq;
// next lines: p_{k+1} = -r_{k+1} + \beta_{k+1} p_k
p.Scale(beta_next);
p.AddVec(-1.0, r);
r_cur_norm_sq = r_next_norm_sq;
}
// note: the first element of the && is only there to save compute.
// the residual r is A x - b, and r_cur_norm_sq and r_initial_norm_sq are
// of the form r * r, so it's clear that b * b has the right dimension to
// compare with the residual.
if (r_cur_norm_sq > r_initial_norm_sq &&
r_cur_norm_sq > r_initial_norm_sq + 1.0e-10 * VecVec(b, b)) {
KALDI_WARN << "Doing linear CGD in dimension " << A.NumRows() << ", after " << k
<< " iterations the squared residual has got worse, "
<< r_cur_norm_sq << " > " << r_initial_norm_sq
<< ". Will do an exact optimization.";
SolverOptions opts("called-from-linearCGD");
x->CopyFromVec(x_orig);
SolveQuadraticProblem(A, b, opts, x);
}
return k;
}
// Instantiate the class for float and double.
template
class OptimizeLbfgs<float>;
template
class OptimizeLbfgs<double>;
template
int32 LinearCgd<float>(const LinearCgdOptions &opts,
const SpMatrix<float> &A, const VectorBase<float> &b,
VectorBase<float> *x);
template
int32 LinearCgd<double>(const LinearCgdOptions &opts,
const SpMatrix<double> &A, const VectorBase<double> &b,
VectorBase<double> *x);
} // end namespace kaldi
// matrix/optimization.h
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
//
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// (*) incorporates, with permission, FFT code from his book
// "Signal Processing with Lapped Transforms", Artech, 1992.
#ifndef KALDI_MATRIX_OPTIMIZATION_H_
#define KALDI_MATRIX_OPTIMIZATION_H_
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
namespace kaldi {
/// @addtogroup matrix_optimization
/// @{
struct LinearCgdOptions {
int32 max_iters; // Maximum number of iters (if >= 0).
BaseFloat max_error; // Maximum 2-norm of the residual A x - b (convergence
// test)
// Every time the residual 2-norm decreases by this recompute_residual_factor
// since the last time it was computed from scratch, recompute it from
// scratch. This helps to keep the computed residual accurate even in the
// presence of roundoff.
BaseFloat recompute_residual_factor;
LinearCgdOptions(): max_iters(-1),
max_error(0.0),
recompute_residual_factor(0.01) { }
};
/*
This function uses linear conjugate gradient descent to approximately solve
the system A x = b. The value of x at entry corresponds to the initial guess
of x. The algorithm continues until the number of iterations reaches
b.Dim() + 5 (the few extra iterations absorb roundoff), or until the 2-norm
of (A x - b) is <= max_error, or until the number of iterations equals
max_iters, whichever happens sooner. It is a requirement that A be positive
definite.
It returns the number of iterations that were actually executed (this is
useful for testing purposes).
*/
template<typename Real>
int32 LinearCgd(const LinearCgdOptions &opts,
const SpMatrix<Real> &A, const VectorBase<Real> &b,
VectorBase<Real> *x);
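// Usage sketch (illustrative only): solve A x = b for a small positive
// definite system, starting from x = 0.
//
//   Matrix<BaseFloat> B(20, 10);
//   B.SetRandn();
//   SpMatrix<BaseFloat> A(10);
//   A.AddMat2(1.0, B, kTrans, 0.0);  // A = B^T B, positive semi-definite
//   A.AddToDiag(0.1);                // make it strictly positive definite
//   Vector<BaseFloat> b(10), x(10);
//   b.SetRandn();
//   int32 iters = LinearCgd(LinearCgdOptions(), A, b, &x);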
/**
This is an implementation of L-BFGS. It pushes responsibility for
determining when to stop, onto the user. There is no call-back here:
everything is done via calls to the class itself (see the example in
matrix-lib-test.cc). This does not implement constrained L-BFGS, but it will
handle constrained problems correctly as long as the function approaches
+infinity (or -infinity for maximization problems) when it gets close to the
bound of the constraint. In these types of problems, you just let the
function value be +infinity for minimization problems, or -infinity for
maximization problems, outside these bounds.
*/
struct LbfgsOptions {
bool minimize; // if true, we're minimizing, else maximizing.
int m; // m is the number of stored vectors L-BFGS keeps.
float first_step_learning_rate; // The very first step of L-BFGS is
// like gradient descent. If you want to configure the size of that step,
// you can do it using this variable.
float first_step_length; // If this variable is >0.0, it overrides
// first_step_learning_rate; on the first step we choose an approximate
// Hessian that is the multiple of the identity that would generate this
// step-length, or 1.0 if the gradient is zero.
float first_step_impr; // If this variable is >0.0, it overrides
// first_step_learning_rate; on the first step we choose an approximate
// Hessian that is the multiple of the identity that would generate this
// amount of objective function improvement (assuming the "real" objf
// was linear).
float c1; // A constant in Armijo rule = Wolfe condition i)
float c2; // A constant in Wolfe condition ii)
float d; // An amount > 1.0 (default 2.0) that we initially multiply or
// divide the step length by, in the line search.
int max_line_search_iters; // after this many iters we restart L-BFGS.
int avg_step_length; // number of iters to avg step length over, in
// RecentStepLength().
LbfgsOptions (bool minimize = true):
minimize(minimize),
m(10),
first_step_learning_rate(1.0),
first_step_length(0.0),
first_step_impr(0.0),
c1(1.0e-04),
c2(0.9),
d(2.0),
max_line_search_iters(50),
avg_step_length(4) { }
};
template<typename Real>
class OptimizeLbfgs {
public:
/// Initializer takes the starting value of x.
OptimizeLbfgs(const VectorBase<Real> &x,
const LbfgsOptions &opts);
/// This returns the value of the variable x that has the best objective
/// function so far, and the corresponding objective function value if
/// requested. This would typically be called only at the end.
const VectorBase<Real>& GetValue(Real *objf_value = NULL) const;
/// This returns the value at which the function wants us
/// to compute the objective function and gradient.
const VectorBase<Real>& GetProposedValue() const { return new_x_; }
/// Returns the average magnitude of the last n steps (but not
/// more than the number we have stored). Before we have taken
/// any steps, returns +infinity. Note: if the most recent
/// step length was 0, it returns 0, regardless of the other
/// step lengths. This makes it suitable as a convergence test
/// (else we'd generate NaN's).
Real RecentStepLength() const;
/// The user calls this function to provide the class with the
/// function and gradient info at the point GetProposedValue().
/// If this point is outside the constraints you can set function_value
/// to {+infinity,-infinity} for {minimization,maximization} problems.
/// In this case the gradient, and also the second derivative (if you call
/// the second overloaded version of this function) will be ignored.
void DoStep(Real function_value,
const VectorBase<Real> &gradient);
/// The user can call this version of DoStep() if it is desired to set some
/// kind of approximate Hessian on this iteration. Note: it is a prerequisite
/// that diag_approx_2nd_deriv must be strictly positive (minimizing), or
/// negative (maximizing).
void DoStep(Real function_value,
const VectorBase<Real> &gradient,
const VectorBase<Real> &diag_approx_2nd_deriv);
private:
KALDI_DISALLOW_COPY_AND_ASSIGN(OptimizeLbfgs);
// The following variable says what stage of the computation we're at.
// Refer to Algorithm 7.5 (L-BFGS) of Nocedal & Wright, "Numerical
// Optimization", 2nd edition.
// kBeforeStep means we're about to do
// "compute p_k <-- - H_k \nabla f_k" (i.e. Algorithm 7.4).
// kWithinStep means we're at some point within line search; note
// that line search is iterative so we can stay in this state more
// than one time on each iteration.
enum ComputationState {
kBeforeStep,
kWithinStep, // This means we're within the step-size computation, and
// have not yet done the 1st function evaluation.
};
inline MatrixIndexT Dim() { return x_.Dim(); }
inline MatrixIndexT M() { return opts_.m; }
SubVector<Real> Y(MatrixIndexT i) {
return SubVector<Real>(data_, (i % M()) * 2); // vector y_i
}
SubVector<Real> S(MatrixIndexT i) {
return SubVector<Real>(data_, (i % M()) * 2 + 1); // vector s_i
}
// The following are subroutines within DoStep():
bool AcceptStep(Real function_value,
const VectorBase<Real> &gradient);
void Restart(const VectorBase<Real> &x,
Real function_value,
const VectorBase<Real> &gradient);
void ComputeNewDirection(Real function_value,
const VectorBase<Real> &gradient);
void ComputeHifNeeded(const VectorBase<Real> &gradient);
void StepSizeIteration(Real function_value,
const VectorBase<Real> &gradient);
void RecordStepLength(Real s);
LbfgsOptions opts_;
SignedMatrixIndexT k_; // Iteration number, starts from zero. Gets set back to zero
// when we restart.
ComputationState computation_state_;
bool H_was_set_; // True if the user specified H_; if false,
// we'll use a heuristic to estimate it.
Vector<Real> x_; // current x.
Vector<Real> new_x_; // the x proposed in the line search.
Vector<Real> best_x_; // the x with the best objective function so far
// (either the same as x_ or something in the current line search.)
Vector<Real> deriv_; // The most recently evaluated derivative-- at x_k.
Vector<Real> temp_;
Real f_; // The function evaluated at x_k.
Real best_f_; // the best objective function so far.
Real d_; // a number d > 1.0, but during an iteration we may decrease this, when
// we switch between armijo and wolfe failures.
int num_wolfe_i_failures_; // the num times we decreased step size.
int num_wolfe_ii_failures_; // the num times we increased step size.
enum { kWolfeI, kWolfeII, kNone } last_failure_type_; // last type of step-search
// failure on this iter.
Vector<Real> H_; // Current inverse-Hessian estimate. May be computed by this class itself,
// or provided by the user via the second form of DoStep().
Matrix<Real> data_; // dimension (m*2) x dim. Even rows store
// gradients y_i, odd rows store steps s_i.
Vector<Real> rho_; // dimension m; rho_(i % m) = 1/(y_i^T s_i), Eq. 7.17.
std::vector<Real> step_lengths_; // The step sizes we took on the last
// (up to avg_step_length) iterations; these are not stored in a rotating
// buffer but are shifted by one each time (this is more convenient when we
// restart, as we keep this info past restarting).
};
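// Typical driving loop (a sketch, not part of the library; dim, max_iters,
// tol and ComputeObjfAndGradient are hypothetical placeholders for the
// user's problem):
//
//   Vector<BaseFloat> x0(dim);      // starting point
//   LbfgsOptions opts(true);        // minimize
//   OptimizeLbfgs<BaseFloat> lbfgs(x0, opts);
//   for (int32 iter = 0; iter < max_iters; iter++) {
//     Vector<BaseFloat> grad(dim);
//     BaseFloat f = ComputeObjfAndGradient(lbfgs.GetProposedValue(), &grad);
//     lbfgs.DoStep(f, grad);
//     if (lbfgs.RecentStepLength() < tol) break;  // converged
//   }
//   BaseFloat best_f;
//   const VectorBase<BaseFloat> &best_x = lbfgs.GetValue(&best_f);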
/// @}
} // end namespace kaldi
#endif
// matrix/packed-matrix.cc
// Copyright 2009-2012 Microsoft Corporation Saarland University
// Johns Hopkins University (Author: Daniel Povey);
// Haihua Xu
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
/**
* @file packed-matrix.cc
*
* Implementation of specialized PackedMatrix template methods
*/
#include "matrix/cblas-wrappers.h"
#include "matrix/packed-matrix.h"
#include "matrix/kaldi-vector.h"
namespace kaldi {
template<typename Real>
void PackedMatrix<Real>::Scale(Real alpha) {
size_t nr = num_rows_,
sz = (nr * (nr + 1)) / 2;
cblas_Xscal(sz, alpha, data_, 1);
}
template<typename Real>
void PackedMatrix<Real>::AddPacked(const Real alpha, const PackedMatrix<Real> &rMa) {
KALDI_ASSERT(num_rows_ == rMa.NumRows());
size_t nr = num_rows_,
sz = (nr * (nr + 1)) / 2;
cblas_Xaxpy(sz, alpha, rMa.Data(), 1, data_, 1);
}
template<typename Real>
void PackedMatrix<Real>::SetRandn() {
Real *data = data_;
size_t dim = num_rows_, size = ((dim*(dim+1))/2);
for (size_t i = 0; i < size; i++)
data[i] = RandGauss();
}
template<typename Real>
inline void PackedMatrix<Real>::Init(MatrixIndexT r) {
if (r == 0) {
num_rows_ = 0;
data_ = 0;
return;
}
size_t size = ((static_cast<size_t>(r) * static_cast<size_t>(r + 1)) / 2);
if (static_cast<size_t>(static_cast<MatrixIndexT>(size)) != size) {
KALDI_WARN << "Allocating packed matrix whose full dimension does not fit "
<< "in MatrixIndexT: not all code is tested for this case.";
}
void *data; // aligned memory block
void *temp;
if ((data = KALDI_MEMALIGN(16, size * sizeof(Real), &temp)) != NULL) {
this->data_ = static_cast<Real *> (data);
this->num_rows_ = r;
} else {
throw std::bad_alloc();
}
}
template<typename Real>
void PackedMatrix<Real>::Swap(PackedMatrix<Real> *other) {
std::swap(data_, other->data_);
std::swap(num_rows_, other->num_rows_);
}
template<typename Real>
void PackedMatrix<Real>::Swap(Matrix<Real> *other) {
std::swap(data_, other->data_);
std::swap(num_rows_, other->num_rows_);
}
template<typename Real>
void PackedMatrix<Real>::Resize(MatrixIndexT r, MatrixResizeType resize_type) {
// the next block uses recursion to handle what we have to do if
// resize_type == kCopyData.
if (resize_type == kCopyData) {
if (this->data_ == NULL || r == 0) resize_type = kSetZero; // nothing to copy.
else if (this->num_rows_ == r) { return; } // nothing to do.
else {
// set tmp to a packed matrix of the desired size.
PackedMatrix<Real> tmp(r, kUndefined);
size_t r_min = std::min(r, num_rows_);
size_t mem_size_min = sizeof(Real) * (r_min*(r_min+1))/2,
mem_size_full = sizeof(Real) * (r*(r+1))/2;
// Copy the contents to tmp.
memcpy(tmp.data_, data_, mem_size_min);
char *ptr = static_cast<char*>(static_cast<void*>(tmp.data_));
// Set the rest of the contents of tmp to zero.
memset(static_cast<void*>(ptr + mem_size_min), 0, mem_size_full-mem_size_min);
tmp.Swap(this);
return;
}
}
if (data_ != NULL) Destroy();
Init(r);
if (resize_type == kSetZero) SetZero();
}
template<typename Real>
void PackedMatrix<Real>::AddToDiag(Real r) {
Real *ptr = data_;
for (MatrixIndexT i = 2; i <= num_rows_+1; i++) {
*ptr += r;
ptr += i;
}
}
template<typename Real>
void PackedMatrix<Real>::ScaleDiag(Real alpha) {
Real *ptr = data_;
for (MatrixIndexT i = 2; i <= num_rows_+1; i++) {
*ptr *= alpha;
ptr += i;
}
}
template<typename Real>
void PackedMatrix<Real>::SetDiag(Real alpha) {
Real *ptr = data_;
for (MatrixIndexT i = 2; i <= num_rows_+1; i++) {
*ptr = alpha;
ptr += i;
}
}
template<typename Real>
template<typename OtherReal>
void PackedMatrix<Real>::CopyFromPacked(const PackedMatrix<OtherReal> &orig) {
KALDI_ASSERT(NumRows() == orig.NumRows());
if (sizeof(Real) == sizeof(OtherReal)) {
memcpy(data_, orig.Data(), SizeInBytes());
} else {
Real *dst = data_;
const OtherReal *src = orig.Data();
size_t nr = NumRows(),
size = (nr * (nr + 1)) / 2;
for (size_t i = 0; i < size; i++, dst++, src++)
*dst = *src;
}
}
// template instantiations.
template
void PackedMatrix<float>::CopyFromPacked(const PackedMatrix<double> &orig);
template
void PackedMatrix<double>::CopyFromPacked(const PackedMatrix<float> &orig);
template
void PackedMatrix<double>::CopyFromPacked(const PackedMatrix<double> &orig);
template
void PackedMatrix<float>::CopyFromPacked(const PackedMatrix<float> &orig);
template<typename Real>
template<typename OtherReal>
void PackedMatrix<Real>::CopyFromVec(const SubVector<OtherReal> &vec) {
MatrixIndexT size = (NumRows()*(NumRows()+1)) / 2;
KALDI_ASSERT(vec.Dim() == size);
if (sizeof(Real) == sizeof(OtherReal)) {
memcpy(data_, vec.Data(), size * sizeof(Real));
} else {
Real *dst = data_;
const OtherReal *src = vec.Data();
for (MatrixIndexT i = 0; i < size; i++, dst++, src++)
*dst = *src;
}
}
// template instantiations.
template
void PackedMatrix<float>::CopyFromVec(const SubVector<double> &orig);
template
void PackedMatrix<double>::CopyFromVec(const SubVector<float> &orig);
template
void PackedMatrix<double>::CopyFromVec(const SubVector<double> &orig);
template
void PackedMatrix<float>::CopyFromVec(const SubVector<float> &orig);
template<typename Real>
void PackedMatrix<Real>::SetZero() {
memset(data_, 0, SizeInBytes());
}
template<typename Real>
void PackedMatrix<Real>::SetUnit() {
memset(data_, 0, SizeInBytes());
for (MatrixIndexT row = 0;row < num_rows_;row++)
(*this)(row, row) = 1.0;
}
template<typename Real>
Real PackedMatrix<Real>::Trace() const {
Real ans = 0.0;
for (MatrixIndexT row = 0;row < num_rows_;row++)
ans += (*this)(row, row);
return ans;
}
template<typename Real>
void PackedMatrix<Real>::Destroy() {
// we need to free the data block if it was defined
if (data_ != NULL) KALDI_MEMALIGN_FREE(data_);
data_ = NULL;
num_rows_ = 0;
}
template<typename Real>
void PackedMatrix<Real>::Write(std::ostream &os, bool binary) const {
if (!os.good()) {
KALDI_ERR << "Failed to write packed matrix to stream: stream not good";
}
int32 size = this->NumRows(); // make the size 32-bit on disk.
KALDI_ASSERT(this->NumRows() == (MatrixIndexT) size);
MatrixIndexT num_elems = ((size+1)*(MatrixIndexT)size)/2;
if(binary) {
std::string my_token = (sizeof(Real) == 4 ? "FP" : "DP");
WriteToken(os, binary, my_token);
WriteBasicType(os, binary, size);
// We don't use the built-in Kaldi write routines for the floats, as they are
// not efficient enough.
os.write((const char*) data_, sizeof(Real) * num_elems);
}
else {
if(size == 0)
os<<"[ ]\n";
else {
os<<"[\n";
MatrixIndexT i = 0;
for (int32 j = 0; j < size; j++) {
for (int32 k = 0; k < j + 1; k++) {
WriteBasicType(os, binary, data_[i++]);
}
os << ( (j==size-1)? "]\n" : "\n");
}
KALDI_ASSERT(i == num_elems);
}
}
if (os.fail()) {
KALDI_ERR << "Failed to write packed matrix to stream";
}
}
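// For example (illustrative), a 3 x 3 packed matrix holding 1..6 in its
// lower triangle is written in text mode roughly as:
//
//   [
//   1
//   2 3
//   4 5 6 ]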
// template<typename Real>
// void Save (std::ostream & os, const PackedMatrix<Real>& rM)
// {
// const Real* p_elem = rM.data();
// for (MatrixIndexT i = 0; i < rM.NumRows(); i++) {
// for (MatrixIndexT j = 0; j <= i ; j++) {
// os << *p_elem;
// p_elem++;
// if (j == i) {
// os << '\n';
// }
// else {
// os << ' ';
// }
// }
// }
// if (os.fail())
// KALDI_ERR("Failed to write packed matrix to stream");
// }
template<typename Real>
void PackedMatrix<Real>::Read(std::istream& is, bool binary, bool add) {
if (add) {
PackedMatrix<Real> tmp;
tmp.Read(is, binary, false); // read without adding.
if (this->NumRows() == 0) this->Resize(tmp.NumRows());
else {
if (this->NumRows() != tmp.NumRows()) {
if (tmp.NumRows() == 0) return; // do nothing in this case.
else KALDI_ERR << "PackedMatrix::Read, size mismatch " << this->NumRows()
<< " vs. " << tmp.NumRows();
}
}
this->AddPacked(1.0, tmp);
return;
} // now assume add == false.
std::ostringstream specific_error;
MatrixIndexT pos_at_start = is.tellg();
int peekval = Peek(is, binary);
const char *my_token = (sizeof(Real) == 4 ? "FP" : "DP");
const char *new_format_token = "[";
bool is_new_format = false;//added by hxu
char other_token_start = (sizeof(Real) == 4 ? 'D' : 'F');
int32 size;
MatrixIndexT num_elems;
if (peekval == other_token_start) { // need to instantiate the other type to read it.
typedef typename OtherReal<Real>::Real OtherType; // if Real == float, OtherType == double, and vice versa.
PackedMatrix<OtherType> other(this->NumRows());
other.Read(is, binary, false); // add is false at this point.
this->Resize(other.NumRows());
this->CopyFromPacked(other);
return;
}
std::string token;
ReadToken(is, binary, &token);
if (token != my_token) {
if(token != new_format_token) {
specific_error << ": Expected token " << my_token << ", got " << token;
goto bad;
}
//new format it is
is_new_format = true;
}
if(!is_new_format) {
ReadBasicType(is, binary, &size); // throws on error.
if ((MatrixIndexT)size != this->NumRows()) {
KALDI_ASSERT(size>=0);
this->Resize(size);
}
num_elems = ((size+1)*(MatrixIndexT)size)/2;
if (!binary) {
for (MatrixIndexT i = 0; i < num_elems; i++) {
ReadBasicType(is, false, data_+i); // will throw on error.
}
} else {
if (num_elems)
is.read(reinterpret_cast<char*>(data_), sizeof(Real)*num_elems);
}
if (is.fail()) goto bad;
return;
}
else {
std::vector<Real> data;
while(1) {
int32 num_lines = 0;
int i = is.peek();
if (i == -1) { specific_error << "Got EOF while reading matrix data"; goto bad; }
else if (static_cast<char>(i) == ']') { // Finished reading matrix.
is.get(); // eat the "]".
i = is.peek();
if (static_cast<char>(i) == '\r') {
is.get(); // eat the '\r'
is.get(); // and the following '\n' (must eat what we wrote)
}
else if (static_cast<char>(i) == '\n') { is.get(); } // get \n (must eat what we wrote)
if (is.fail()) {
KALDI_WARN << "After end of matrix data, read error.";
// we got the data we needed, so just warn for this error.
}
//now process the data:
num_lines = int32(sqrt(data.size()*2));
KALDI_ASSERT(data.size() == num_lines*(num_lines+1)/2);
this->Resize(num_lines);
for(int32 i = 0; i < data.size(); i++) {
data_[i] = data[i];
}
return;
//std::cout<<"here!!!!!hxu!!!!!"<<std::endl;
}
else if ( (i >= '0' && i <= '9') || i == '-' ) { // A number...
Real r;
is >> r;
if (is.fail()) {
specific_error << "Stream failure/EOF while reading matrix data.";
goto bad;
}
data.push_back(r);
}
else if (isspace(i)) {
is.get(); // eat the space and do nothing.
} else { // NaN or inf or error.
std::string str;
is >> str;
if (!KALDI_STRCASECMP(str.c_str(), "inf") ||
!KALDI_STRCASECMP(str.c_str(), "infinity")) {
data.push_back(std::numeric_limits<Real>::infinity());
KALDI_WARN << "Reading infinite value into matrix.";
} else if (!KALDI_STRCASECMP(str.c_str(), "nan")) {
data.push_back(std::numeric_limits<Real>::quiet_NaN());
KALDI_WARN << "Reading NaN value into matrix.";
} else {
specific_error << "Expecting numeric matrix data, got " << str;
goto bad;
}
}
}
}
bad:
KALDI_ERR << "Failed to read packed matrix from stream. " << specific_error.str()
<< " File position at start is "
<< pos_at_start << ", currently " << is.tellg();
}
// Instantiate PackedMatrix for float and double.
template
class PackedMatrix<float>;
template
class PackedMatrix<double>;
} // namespace kaldi
// matrix/packed-matrix.h
// Copyright 2009-2013 Ondrej Glembek; Lukas Burget; Microsoft Corporation;
// Saarland University; Yanmin Qian;
// Johns Hopkins University (Author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_PACKED_MATRIX_H_
#define KALDI_MATRIX_PACKED_MATRIX_H_
#include "matrix/matrix-common.h"
#include <algorithm>
namespace kaldi {
/// \addtogroup matrix_funcs_io
// we need to declare the friend << operator here
template<typename Real>
std::ostream & operator <<(std::ostream & out, const PackedMatrix<Real>& M);
/// \addtogroup matrix_group
/// @{
/// @brief Packed matrix: base class for triangular and symmetric matrices.
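/// Storage is the lower triangle only, packed row-major with no gaps:
/// [ (0,0), (1,0), (1,1), (2,0), (2,1), (2,2), ... ], so element (r, c)
/// with c <= r lives at offset r * (r + 1) / 2 + c (see operator() below).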
template<typename Real> class PackedMatrix {
friend class CuPackedMatrix<Real>;
public:
//friend class CuPackedMatrix<Real>;
PackedMatrix() : data_(NULL), num_rows_(0) {}
explicit PackedMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero):
data_(NULL) { Resize(r, resize_type); }
explicit PackedMatrix(const PackedMatrix<Real> &orig) : data_(NULL) {
Resize(orig.num_rows_, kUndefined);
CopyFromPacked(orig);
}
template<typename OtherReal>
explicit PackedMatrix(const PackedMatrix<OtherReal> &orig) : data_(NULL) {
Resize(orig.NumRows(), kUndefined);
CopyFromPacked(orig);
}
void SetZero(); ///< Set to zero.
void SetUnit(); ///< Set to the unit matrix.
void SetRandn(); ///< Set to random values from a standard normal distribution.
Real Trace() const;
// Needed for inclusion in std::vector
PackedMatrix<Real> & operator =(const PackedMatrix<Real> &other) {
Resize(other.NumRows());
CopyFromPacked(other);
return *this;
}
~PackedMatrix() {
Destroy();
}
/// Set packed matrix to a specified size (can be zero).
/// The value of the new data depends on resize_type:
/// -if kSetZero, the new data will be zero
/// -if kUndefined, the new data will be undefined
/// -if kCopyData, the new data will be the same as the old data in any
/// shared positions, and zero elsewhere.
/// This function takes time proportional to the number of data elements.
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero);
void AddToDiag(const Real r); // Adds r to the diagonal.
void ScaleDiag(const Real alpha); // Scales the diagonal by alpha.
void SetDiag(const Real alpha); // Sets the diagonal to this value.
template<typename OtherReal>
void CopyFromPacked(const PackedMatrix<OtherReal> &orig);
/// CopyFromVec just interprets the vector as having the same layout
/// as the packed matrix. Must have the same dimension, i.e.
/// orig.Dim() == (NumRows()*(NumRows()+1)) / 2;
template<typename OtherReal>
void CopyFromVec(const SubVector<OtherReal> &orig);
Real* Data() { return data_; }
const Real* Data() const { return data_; }
inline MatrixIndexT NumRows() const { return num_rows_; }
inline MatrixIndexT NumCols() const { return num_rows_; }
size_t SizeInBytes() const {
size_t nr = static_cast<size_t>(num_rows_);
return ((nr * (nr+1)) / 2) * sizeof(Real);
}
//MatrixIndexT Stride() const { return stride_; }
// This code is duplicated in child classes to avoid extra levels of calls.
Real operator() (MatrixIndexT r, MatrixIndexT c) const {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(num_rows_) &&
static_cast<UnsignedMatrixIndexT>(c) <
static_cast<UnsignedMatrixIndexT>(num_rows_)
&& c <= r);
return *(data_ + (r * (r + 1)) / 2 + c);
}
// This code is duplicated in child classes to avoid extra levels of calls.
Real &operator() (MatrixIndexT r, MatrixIndexT c) {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(num_rows_) &&
static_cast<UnsignedMatrixIndexT>(c) <
static_cast<UnsignedMatrixIndexT>(num_rows_)
&& c <= r);
return *(data_ + (r * (r + 1)) / 2 + c);
}
Real Max() const {
KALDI_ASSERT(num_rows_ > 0);
return * (std::max_element(data_, data_ + ((num_rows_*(num_rows_+1))/2) ));
}
Real Min() const {
KALDI_ASSERT(num_rows_ > 0);
return * (std::min_element(data_, data_ + ((num_rows_*(num_rows_+1))/2) ));
}
void Scale(Real c);
friend std::ostream & operator << <> (std::ostream & out,
const PackedMatrix<Real> &m);
  // Reads from C++ stream; if add == true, adds the data read to the
  // existing contents rather than replacing them.
  // Will throw exception on failure.
void Read(std::istream &in, bool binary, bool add = false);
void Write(std::ostream &out, bool binary) const;
void Destroy();
/// Swaps the contents of *this and *other. Shallow swap.
void Swap(PackedMatrix<Real> *other);
void Swap(Matrix<Real> *other);
protected:
// Will only be called from this class or derived classes.
void AddPacked(const Real alpha, const PackedMatrix<Real>& M);
Real *data_;
MatrixIndexT num_rows_;
//MatrixIndexT stride_;
private:
  /// Init assumes the current contents of the class are invalid (i.e. junk or
  /// have already been freed), and it sets the matrix to newly allocated memory
  /// with the specified dimension.  dim == 0 is acceptable.  The memory contents
/// pointed to by data_ will be undefined.
void Init(MatrixIndexT dim);
};
/// @} end "addtogroup matrix_group"
/// \addtogroup matrix_funcs_io
/// @{
template<typename Real>
std::ostream & operator << (std::ostream & os, const PackedMatrix<Real>& M) {
M.Write(os, false);
return os;
}
template<typename Real>
std::istream & operator >> (std::istream &is, PackedMatrix<Real> &M) {
M.Read(is, false);
return is;
}
/// @}
} // namespace kaldi
#endif
// matrix/qr.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <limits>
#include "matrix/sp-matrix.h"
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
#include "matrix/matrix-functions.h"
#include "matrix/cblas-wrappers.h"
// This file contains an implementation of the Symmetric QR Algorithm
// for the symmetric eigenvalue problem. See Golub and Van Loan,
// 3rd ed., Algorithm 8.3.3.
namespace kaldi {
/* This is from Golub and Van Loan 3rd ed., sec. 5.1.3,
p210.
   x is the input of dimension 'dim', v is the output of dimension
dim, and beta is a scalar. Note: we use zero-based
not one-based indexing. */
/*
// We are commenting out the function below ("House") because it's not
// needed, but we keep it just to show how we came up with HouseBackward.
template<typename Real>
void House(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
KALDI_ASSERT(dim > 0);
  // To avoid overflow, we first compute the max of |x_i| (or
  // one if that's zero), and we'll replace "x" by x/max|x_i|
// below. The householder vector is anyway invariant to
// the magnitude of x. We could actually avoid this extra loop
// over x if we wanted to be a bit smarter, but anyway this
// doesn't dominate the O(N) performance of the algorithm.
Real s; // s is a scale on x.
{
Real max_x = std::numeric_limits<Real>::min();
for (MatrixIndexT i = 0; i < dim; i++)
max_x = std::max(max_x, (x[i] < 0 ? -x[i] : x[i]));
if (max_x == 0.0) max_x = 1.0;
s = 1.0 / max_x;
}
Real sigma = 0.0;
v[0] = 1.0;
for (MatrixIndexT i = 1; i < dim; i++) {
sigma += (x[i]*s) * (x[i]*s);
v[i] = x[i]*s;
}
if (sigma == 0.0) *beta = 0.0;
else {
// When we say x1 = x[0], we reference the one-based indexing
// in Golub and Van Loan.
Real x1 = x[0] * s, mu = std::sqrt(x1*x1 + sigma);
if (x1 <= 0) {
v[0] = x1 - mu;
} else {
v[0] = -sigma / (x1 + mu);
KALDI_ASSERT(KALDI_ISFINITE(v[dim-1]));
}
Real v1 = v[0];
Real v1sq = v1 * v1;
*beta = 2 * v1sq / (sigma + v1sq);
Real inv_v1 = 1.0 / v1;
if (KALDI_ISINF(inv_v1)) {
// can happen if v1 is denormal.
KALDI_ASSERT(v1 == v1 && v1 != 0.0);
for (MatrixIndexT i = 0; i < dim; i++) v[i] /= v1;
} else {
cblas_Xscal(dim, inv_v1, v, 1);
}
if (KALDI_ISNAN(inv_v1)) {
KALDI_ERR << "NaN encountered in HouseBackward";
}
}
}
*/
// This is a backward version of the "House" routine above:
// backward because it's the last index, not the first index of
// the vector that is "special". This is convenient in
// the Tridiagonalize routine that uses reversed indexes for
// compatibility with the packed lower triangular format.
template<typename Real>
void HouseBackward(MatrixIndexT dim, const Real *x, Real *v, Real *beta) {
KALDI_ASSERT(dim > 0);
  // To avoid overflow, we first compute the max of |x_i| (or
  // one if that's zero), and we'll replace "x" by x/max|x_i|
// below. The householder vector is anyway invariant to
// the magnitude of x. We could actually avoid this extra loop
// over x if we wanted to be a bit smarter, but anyway this
// doesn't dominate the O(N) performance of the algorithm.
Real s; // s is a scale on x.
{
Real max_x = std::numeric_limits<Real>::min();
for (MatrixIndexT i = 0; i < dim; i++)
max_x = std::max(max_x, (x[i] < 0 ? -x[i] : x[i]));
s = 1.0 / max_x;
}
Real sigma = 0.0;
v[dim-1] = 1.0;
for (MatrixIndexT i = 0; i + 1 < dim; i++) {
sigma += (x[i] * s) * (x[i] * s);
v[i] = x[i] * s;
}
KALDI_ASSERT(KALDI_ISFINITE(sigma) &&
"Tridiagonalizing matrix that is too large or has NaNs.");
if (sigma == 0.0) *beta = 0.0;
else {
Real x1 = x[dim-1] * s, mu = std::sqrt(x1 * x1 + sigma);
if (x1 <= 0) {
v[dim-1] = x1 - mu;
} else {
v[dim-1] = -sigma / (x1 + mu);
KALDI_ASSERT(KALDI_ISFINITE(v[dim-1]));
}
Real v1 = v[dim-1];
Real v1sq = v1 * v1;
*beta = 2 * v1sq / (sigma + v1sq);
Real inv_v1 = 1.0 / v1;
if (KALDI_ISINF(inv_v1)) {
// can happen if v1 is denormal.
KALDI_ASSERT(v1 == v1 && v1 != 0.0);
for (MatrixIndexT i = 0; i < dim; i++) v[i] /= v1;
} else {
cblas_Xscal(dim, inv_v1, v, 1);
}
if (KALDI_ISNAN(inv_v1)) {
KALDI_ERR << "NaN encountered in HouseBackward";
}
}
}
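// A quick numeric check of HouseBackward (exposition only): with dim = 2 and
// x = (3, 4) the routine yields v = (-3, 1) and beta = 0.2, and the reflector
// P = I - beta v v^T annihilates everything but the last component of x:
/*
  double x[2] = {3.0, 4.0}, v[2], beta;
  HouseBackward(2, x, v, &beta);  // v == {-3, 1}, beta == 0.2.
  // P x = x - beta * v * (v^T x); here v^T x == -5, so
  // P x == (3, 4) - 0.2 * (-5) * (-3, 1) == (0, 5) == ||x|| e_{dim-1}.
*/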
/**
This routine tridiagonalizes *this. C.f. Golub and Van Loan 3rd ed., sec.
8.3.1 (p415). We reverse the order of the indices as it's more natural
with packed lower-triangular matrices to do it this way. There's also
a shift from one-based to zero-based indexing, so the index
k is transformed k -> n - k, and a corresponding transpose...
   Let the original *this be A. This algorithm replaces *this with
a tridiagonal matrix T such that T = Q A Q^T for an orthogonal Q.
Caution: Q is transposed vs. Golub and Van Loan.
If Q != NULL it outputs Q.
*/
template<typename Real>
void SpMatrix<Real>::Tridiagonalize(MatrixBase<Real> *Q) {
MatrixIndexT n = this->NumRows();
KALDI_ASSERT(Q == NULL || (Q->NumRows() == n &&
Q->NumCols() == n));
if (Q != NULL) Q->SetUnit();
Real *data = this->Data();
Real *qdata = (Q == NULL ? NULL : Q->Data());
MatrixIndexT qstride = (Q == NULL ? 0 : Q->Stride());
Vector<Real> tmp_v(n-1), tmp_p(n);
Real beta, *v = tmp_v.Data(), *p = tmp_p.Data(), *w = p, *x = p;
for (MatrixIndexT k = n-1; k >= 2; k--) {
MatrixIndexT ksize = ((k+1)*k)/2;
// ksize is the packed size of the lower-triangular matrix of size k,
// which is the size of "all rows previous to this one."
Real *Arow = data + ksize; // In Golub+Van Loan it was A(k+1:n, k), we
// have Arow = A(k, 0:k-1).
HouseBackward(k, Arow, v, &beta); // sets v and beta.
cblas_Xspmv(k, beta, data, v, 1, 0.0, p, 1); // p = beta * A(0:k-1,0:k-1) v
Real minus_half_beta_pv = -0.5 * beta * cblas_Xdot(k, p, 1, v, 1);
cblas_Xaxpy(k, minus_half_beta_pv, v, 1, w, 1); // w = p - (beta p^T v/2) v;
// this relies on the fact that w and p are the same pointer.
// We're doing A(k, k-1) = ||Arow||. It happens that this element
// is indexed at ksize + k - 1 in the packed lower-triangular format.
data[ksize + k - 1] = std::sqrt(cblas_Xdot(k, Arow, 1, Arow, 1));
for (MatrixIndexT i = 0; i + 1 < k; i++)
data[ksize + i] = 0; // This is not in Golub and Van Loan but is
// necessary if we're not using parts of A to store the Householder
// vectors.
// We're doing A(0:k-1,0:k-1) -= (v w' + w v')
cblas_Xspr2(k, -1.0, v, 1, w, 1, data);
if (Q != NULL) { // C.f. Golub, Q is H_1 .. H_n-2... in this
// case we apply them in the opposite order so it's H_n-1 .. H_1,
// but also Q is transposed so we really have Q = H_1 .. H_n-1.
// It's a double negative.
      // Anyway, we left-multiply Q by each one. The H_k would each be
      // diag(I - beta v v', I) but we don't ever touch the last dims.
      // We do (in Matlab notation):
      // Q(0:k-1,:) = (I - beta v v') * Q(0:k-1,:), i.e.:
      // Q(0:k-1,:) += -beta v (v' Q(0:k-1,:)) .. let x = -beta Q(0:k-1,:)' v.
cblas_Xgemv(kTrans, k, n, -beta, qdata, qstride, v, 1, 0.0, x, 1);
      // now x = -beta Q(0:k-1,:)^T v, a vector of dimension n.
      // The next line does: Q(0:k-1,:) += v x^T.
cblas_Xger(k, n, 1.0, v, 1, x, 1, qdata, qstride);
}
}
}
// Instantiate these functions, as it wasn't implemented in sp-matrix.cc
// where we instantiated the whole class.
template
void SpMatrix<float>::Tridiagonalize(MatrixBase<float> *Q);
template
void SpMatrix<double>::Tridiagonalize(MatrixBase<double> *Q);
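// Usage sketch (exposition only, assuming a random symmetric input):
/*
  MatrixIndexT n = 5;
  SpMatrix<double> A(n);
  A.SetRandn();
  SpMatrix<double> T(A);
  Matrix<double> Q(n, n);
  T.Tridiagonalize(&Q);  // T is now tridiagonal with T = Q A Q^T,
                         // so A can be recovered as Q^T T Q.
*/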
/// Create Givens rotations, as in Golub and Van Loan 3rd ed., page 216.
template<typename Real>
inline void Givens(Real a, Real b, Real *c, Real *s) {
if (b == 0) {
*c = 1;
*s = 0;
} else {
if (std::abs(b) > std::abs(a)) {
Real tau = -a / b;
*s = 1 / std::sqrt(1 + tau*tau);
*c = *s * tau;
} else {
Real tau = -b / a;
*c = 1 / std::sqrt(1 + tau*tau);
*s = *c * tau;
}
}
}
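// Worked example (exposition only): Givens(3, 4, &c, &s) takes the
// |b| > |a| branch, giving tau = -0.75, s = 0.8, c = -0.6. The defining
// property is that the rotation annihilates b:
//   s*a + c*b = 0.8*3 + (-0.6)*4 = 0, while c*a - s*b = -5 = -sqrt(a^2 + b^2).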
// Some internal code for the QR algorithm: one "QR step".
// This is Golub and Van Loan 3rd ed., Algorithm 8.3.2 "Implicit Symmetric QR step
// with Wilkinson shift." A couple of differences: this code is
// in zero based arithmetic, and we represent Q transposed from
// their Q for memory locality with row-major-indexed matrices.
template <typename Real>
void QrStep(MatrixIndexT n,
Real *diag,
Real *off_diag,
MatrixBase<Real> *Q) {
KALDI_ASSERT(n >= 2);
// below, "scale" could be any number; we introduce it to keep the
// floating point quantities within a good range.
Real d = (diag[n-2] - diag[n-1]) / 2.0,
t = off_diag[n-2],
inv_scale = std::max(std::max(std::abs(d), std::abs(t)),
std::numeric_limits<Real>::min()),
scale = 1.0 / inv_scale,
d_scaled = d * scale,
off_diag_n2_scaled = off_diag[n-2] * scale,
t2_n_n1_scaled = off_diag_n2_scaled * off_diag_n2_scaled,
sgn_d = (d > 0.0 ? 1.0 : -1.0),
mu = diag[n-1] - inv_scale * t2_n_n1_scaled /
(d_scaled + sgn_d * std::sqrt(d_scaled * d_scaled + t2_n_n1_scaled)),
x = diag[0] - mu,
z = off_diag[0];
KALDI_ASSERT(KALDI_ISFINITE(x));
Real *Qdata = (Q == NULL ? NULL : Q->Data());
MatrixIndexT Qstride = (Q == NULL ? 0 : Q->Stride()),
Qcols = (Q == NULL ? 0 : Q->NumCols());
for (MatrixIndexT k = 0; k < n-1; k++) {
Real c, s;
Givens(x, z, &c, &s);
// Rotate dimensions k and k+1 with the Givens matrix G, as
// T <== G^T T G.
// In 2d, a Givens matrix is [ c s; -s c ]. Forget about
// the dimension-indexing issues and assume we have a 2x2
// symmetric matrix [ p q ; q r ]
// We ask our friends at Wolfram Alpha about
// { { c, -s}, {s, c} } * { {p, q}, {q, r} } * { { c, s}, {-s, c} }
    // Interpreting the result as [ p', q' ; q', r' ]
// p' = c (c p - s q) - s (c q - s r)
// q' = s (c p - s q) + c (c q - s r)
// r' = s (s p + c q) + c (s q + c r)
Real p = diag[k], q = off_diag[k], r = diag[k+1];
// p is element k,k; r is element k+1,k+1; q is element k,k+1 or k+1,k.
// We'll let the compiler optimize this.
diag[k] = c * (c*p - s*q) - s * (c*q - s*r);
off_diag[k] = s * (c*p - s*q) + c * (c*q - s*r);
diag[k+1] = s * (s*p + c*q) + c * (s*q + c*r);
// We also have some other elements to think of that
// got rotated in a simpler way: if k>0,
// then element (k, k-1) and (k+1, k-1) get rotated. Here,
// element k+1, k-1 will be present as z; it's the out-of-band
// element that we remembered from last time. This is
// on the left as it's the row indexes that differ, so think of
// this as being premultiplied by G^T. In fact we're multiplying
// T by in some sense the opposite/transpose of the Givens rotation.
if (k > 0) { // Note, in rotations, going backward, (x,y) -> ((cx - sy), (sx + cy))
Real &elem_k_km1 = off_diag[k-1],
elem_kp1_km1 = z; // , tmp = elem_k_km1;
elem_k_km1 = c*elem_k_km1 - s*elem_kp1_km1;
// The next line will set elem_kp1_km1 to zero and we'll never access this
// value, so we comment it out.
// elem_kp1_km1 = s*tmp + c*elem_kp1_km1;
}
if (Q != NULL)
cblas_Xrot(Qcols, Qdata + k*Qstride, 1,
Qdata + (k+1)*Qstride, 1, c, -s);
if (k < n-2) {
// Next is the elements (k+2, k) and (k+2, k-1), to be rotated, again
// backwards.
Real &elem_kp2_k = z,
&elem_kp2_kp1 = off_diag[k+1];
// Note: elem_kp2_k == z would start off as zero because it's
// two off the diagonal, and not been touched yet. Therefore
// we eliminate it in expressions below, commenting it out.
// If we didn't do this we should set it to zero first.
elem_kp2_k = - s * elem_kp2_kp1; // + c*elem_kp2_k
elem_kp2_kp1 = c * elem_kp2_kp1; // + s*elem_kp2_k (original value).
// The next part is from the algorithm they describe: x = t_{k+1,k}
x = off_diag[k];
}
}
}
// Internal code for the QR algorithm, where the diagonal
// and off-diagonal of the symmetric matrix are represented as
// vectors of length n and n-1.
template <typename Real>
void QrInternal(MatrixIndexT n,
Real *diag,
Real *off_diag,
MatrixBase<Real> *Q) {
KALDI_ASSERT(Q == NULL || Q->NumCols() == n); // We may
// later relax the condition that Q->NumCols() == n.
MatrixIndexT counter = 0, max_iters = 500 + 4*n, // Should never take this many iters.
large_iters = 100 + 2*n;
Real epsilon = (pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0));
for (; counter < max_iters; counter++) { // this takes the place of "until
// q=n"... we'll break out of the
// loop when we converge.
if (counter == large_iters ||
(counter > large_iters && (counter - large_iters) % 50 == 0)) {
KALDI_WARN << "Took " << counter
<< " iterations in QR (dim is " << n << "), doubling epsilon.";
SubVector<Real> d(diag, n), o(off_diag, n-1);
KALDI_WARN << "Diag, off-diag are " << d << " and " << o;
epsilon *= 2.0;
}
for (MatrixIndexT i = 0; i+1 < n; i++) {
if (std::abs(off_diag[i]) <= epsilon *
(std::abs(diag[i]) + std::abs(diag[i+1])))
off_diag[i] = 0.0;
}
// The next code works out p, q, and npq which is n - p - q.
// For the definitions of q and p, see Golub and Van Loan; we
// partition the n dims into pieces of size (p, n-p-q, q) where
    // the part of size q is diagonal and the part of size n-p-q is
// "unreduced", i.e. has no zero off-diagonal elements.
MatrixIndexT q = 0;
// Note: below, "n-q < 2" should more clearly be "n-2-q < 0", but that
// causes problems if MatrixIndexT is unsigned.
while (q < n && (n-q < 2 || off_diag[n-2-q] == 0.0))
q++;
if (q == n) break; // we're done. It's diagonal.
KALDI_ASSERT(n - q >= 2);
MatrixIndexT npq = 2; // Value of n - p - q, where n - p - q must be
// unreduced. This is the size of "middle" band of elements. If q != n,
// we must have hit a nonzero off-diag element, so the size of this
// band must be at least two.
while (npq + q < n && (n-q-npq-1 < 0 || off_diag[n-q-npq-1] != 0.0))
npq++;
MatrixIndexT p = n - q - npq;
{ // Checks.
for (MatrixIndexT i = 0; i+1 < npq; i++)
KALDI_ASSERT(off_diag[p + i] != 0.0);
for (MatrixIndexT i = 0; i+1 < q; i++)
KALDI_ASSERT(off_diag[p + npq - 1 + i] == 0.0);
if (p > 1) // Something must have stopped npq from growing further..
KALDI_ASSERT(off_diag[p-1] == 0.0); // so last off-diag elem in
// group of size p must be zero.
}
if (Q != NULL) {
// Do one QR step on the middle part of Q only.
// Qpart will be a subset of the rows of Q.
SubMatrix<Real> Qpart(*Q, p, npq, 0, Q->NumCols());
QrStep(npq, diag + p, off_diag + p, &Qpart);
} else {
QrStep(npq, diag + p, off_diag + p,
static_cast<MatrixBase<Real>*>(NULL));
}
}
if (counter == max_iters) {
KALDI_WARN << "Failure to converge in QR algorithm. "
<< "Exiting with partial output.";
}
}
/**
This is the symmetric QR algorithm, from Golub and Van Loan 3rd ed., Algorithm
8.3.3. Q is transposed w.r.t. there, though.
*/
template <typename Real>
void SpMatrix<Real>::Qr(MatrixBase<Real> *Q) {
KALDI_ASSERT(this->IsTridiagonal());
// We envisage that Q would be square but we don't check for this,
// as there are situations where you might not want this.
KALDI_ASSERT(Q == NULL || Q->NumRows() == this->NumRows());
// Note: the first couple of lines of the algorithm they give would be done
// outside of this function, by calling Tridiagonalize().
MatrixIndexT n = this->NumRows();
Vector<Real> diag(n), off_diag(n-1);
for (MatrixIndexT i = 0; i < n; i++) {
diag(i) = (*this)(i, i);
if (i > 0) off_diag(i-1) = (*this)(i, i-1);
}
QrInternal(n, diag.Data(), off_diag.Data(), Q);
// Now set *this to the value represented by diag and off_diag.
this->SetZero();
for (MatrixIndexT i = 0; i < n; i++) {
(*this)(i, i) = diag(i);
if (i > 0) (*this)(i, i-1) = off_diag(i-1);
}
}
template<typename Real>
void SpMatrix<Real>::Eig(VectorBase<Real> *s, MatrixBase<Real> *P) const {
MatrixIndexT dim = this->NumRows();
KALDI_ASSERT(s->Dim() == dim);
KALDI_ASSERT(P == NULL || (P->NumRows() == dim && P->NumCols() == dim));
SpMatrix<Real> A(*this); // Copy *this, since the tridiagonalization
// and QR decomposition are destructive.
// Note: for efficiency of memory access, the tridiagonalization
// algorithm makes the *rows* of P the eigenvectors, not the columns.
// We'll transpose P before we exit.
// Also note: P may be null if you don't want the eigenvectors. This
// will make this function more efficient.
A.Tridiagonalize(P); // Tridiagonalizes.
A.Qr(P); // Diagonalizes.
if(P) P->Transpose();
s->CopyDiagFromPacked(A);
}
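// Usage sketch (exposition only): full eigendecomposition of an SpMatrix.
/*
  MatrixIndexT n = 4;
  SpMatrix<double> S(n);
  S.SetRandn();
  Vector<double> s(n);
  Matrix<double> P(n, n);
  S.Eig(&s, &P);  // now S == P * diag(s) * P^T; columns of P are eigenvectors.
  SpMatrix<double> S2(n);
  S2.AddMat2Vec(1.0, P, kNoTrans, s, 0.0);  // reconstructs S up to roundoff.
*/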
template<typename Real>
void SpMatrix<Real>::TopEigs(VectorBase<Real> *s, MatrixBase<Real> *P,
MatrixIndexT lanczos_dim) const {
const SpMatrix<Real> &S(*this); // call this "S" for easy notation.
MatrixIndexT eig_dim = s->Dim(); // Space of dim we want to retain.
if (lanczos_dim <= 0)
lanczos_dim = std::max(eig_dim + 50, eig_dim + eig_dim/2);
MatrixIndexT dim = this->NumRows();
if (lanczos_dim >= dim) {
// There would be no speed advantage in using this method, so just
// use the regular approach.
Vector<Real> s_tmp(dim);
Matrix<Real> P_tmp(dim, dim);
this->Eig(&s_tmp, &P_tmp);
SortSvd(&s_tmp, &P_tmp);
s->CopyFromVec(s_tmp.Range(0, eig_dim));
P->CopyFromMat(P_tmp.Range(0, dim, 0, eig_dim));
return;
}
KALDI_ASSERT(eig_dim <= dim && eig_dim > 0);
KALDI_ASSERT(P->NumRows() == dim && P->NumCols() == eig_dim); // each column
// is one eigenvector.
Matrix<Real> Q(lanczos_dim, dim); // The rows of Q will be the
// orthogonal vectors of the Krylov subspace.
SpMatrix<Real> T(lanczos_dim); // This will be equal to Q S Q^T,
// i.e. *this projected into the Krylov subspace. Note: only the
  // diagonal and off-diagonal of T are nonzero, i.e. it's tridiagonal,
// but we don't have access to the low-level algorithms that work
// on that type of matrix (since we want to use ATLAS). So we just
// do normal SVD, on a full matrix; it won't typically dominate.
Q.Row(0).SetRandn();
Q.Row(0).Scale(1.0 / Q.Row(0).Norm(2));
for (MatrixIndexT d = 0; d < lanczos_dim; d++) {
Vector<Real> r(dim);
r.AddSpVec(1.0, S, Q.Row(d), 0.0);
// r = S * q_d
MatrixIndexT counter = 0;
Real end_prod;
while (1) { // Normally we'll do this loop only once:
// we repeat to handle cases where r gets very much smaller
// and we want to orthogonalize again.
// We do "full orthogonalization" to preserve stability,
// even though this is usually a waste of time.
Real start_prod = VecVec(r, r);
for (SignedMatrixIndexT e = d; e >= 0; e--) { // e must be signed!
SubVector<Real> q_e(Q, e);
Real prod = VecVec(r, q_e);
if (counter == 0 && static_cast<MatrixIndexT>(e) + 1 >= d) // Keep T tridiagonal, which
T(d, e) = prod; // mathematically speaking, it is.
r.AddVec(-prod, q_e); // Subtract component in q_e.
}
if (d+1 == lanczos_dim) break;
end_prod = VecVec(r, r);
if (end_prod <= 0.1 * start_prod) {
// also handles case where both are 0.
// We're not confident any more that it's completely
// orthogonal to the rest so we want to re-do.
if (end_prod == 0.0)
r.SetRandn(); // "Restarting".
counter++;
if (counter > 100)
KALDI_ERR << "Loop detected in Lanczos iteration.";
} else {
break;
}
}
if (d+1 != lanczos_dim) {
// OK, at this point we're satisfied that r is orthogonal
// to all previous rows.
KALDI_ASSERT(end_prod != 0.0); // should have looped.
r.Scale(1.0 / std::sqrt(end_prod)); // make it unit.
Q.Row(d+1).CopyFromVec(r);
}
}
Matrix<Real> R(lanczos_dim, lanczos_dim);
R.SetUnit();
T.Qr(&R); // Diagonalizes T.
Vector<Real> s_tmp(lanczos_dim);
s_tmp.CopyDiagFromSp(T);
// Now T = R * diag(s_tmp) * R^T.
// The next call sorts the elements of s from greatest to least absolute value,
// and moves around the rows of R in the corresponding way. This picks out
// the largest (absolute) eigenvalues.
SortSvd(&s_tmp, static_cast<Matrix<Real>*>(NULL), &R);
// Keep only the initial rows of R, those corresponding to greatest (absolute)
// eigenvalues.
SubMatrix<Real> Rsub(R, 0, eig_dim, 0, lanczos_dim);
SubVector<Real> s_sub(s_tmp, 0, eig_dim);
s->CopyFromVec(s_sub);
// For working out what to do now, just assume the other eigenvalues were
// zero. This is just for purposes of knowing how to get the result, and
// not getting things wrongly transposed.
// We have T = Rsub^T * diag(s_sub) * Rsub.
// Now, T = Q S Q^T, with Q orthogonal, so S = Q^T T Q = Q^T Rsub^T * diag(s) * Rsub * Q.
// The output is P and we want S = P * diag(s) * P^T, so we need P = Q^T Rsub^T.
P->AddMatMat(1.0, Q, kTrans, Rsub, kTrans, 0.0);
}
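// Usage sketch (exposition only): keep the eig_dim eigenvalues of largest
// magnitude of a large matrix without a full eigendecomposition.
/*
  MatrixIndexT dim = 1000, eig_dim = 10;
  SpMatrix<double> S(dim);
  S.SetRandn();
  Vector<double> s(eig_dim);
  Matrix<double> P(dim, eig_dim);  // one eigenvector per column.
  S.TopEigs(&s, &P, 0);  // lanczos_dim <= 0 lets the method pick a default.
*/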
// Instantiate the templates for Eig and TopEig.
template
void SpMatrix<float>::Eig(VectorBase<float>*, MatrixBase<float>*) const;
template
void SpMatrix<double>::Eig(VectorBase<double>*, MatrixBase<double>*) const;
template
void SpMatrix<float>::TopEigs(VectorBase<float>*, MatrixBase<float>*, MatrixIndexT) const;
template
void SpMatrix<double>::TopEigs(VectorBase<double>*, MatrixBase<double>*, MatrixIndexT) const;
// Someone had a problem with the Intel compiler with -O3, with Qr not being
// defined for some strange reason (should automatically happen when
// we instantiate Eig and TopEigs), so we explicitly instantiate it here.
template
void SpMatrix<float>::Qr(MatrixBase<float> *Q);
template
void SpMatrix<double>::Qr(MatrixBase<double> *Q);
}
// namespace kaldi
// matrix/sp-matrix-inl.h
// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Haihua Xu
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_SP_MATRIX_INL_H_
#define KALDI_MATRIX_SP_MATRIX_INL_H_
#include "matrix/tp-matrix.h"
namespace kaldi {
// The declarations in this file are template specializations: they tell the
// compiler that we implement the templated function separately for the
// different template arguments (float, double).
template<>
double SolveQuadraticProblem(const SpMatrix<double> &H, const VectorBase<double> &g,
const SolverOptions &opts, VectorBase<double> *x);
template<>
float SolveQuadraticProblem(const SpMatrix<float> &H, const VectorBase<float> &g,
const SolverOptions &opts, VectorBase<float> *x);
} // namespace kaldi
#endif // KALDI_MATRIX_SP_MATRIX_INL_H_
// matrix/sp-matrix.cc
// Copyright 2009-2011 Lukas Burget; Ondrej Glembek; Microsoft Corporation
// Saarland University; Petr Schwarz; Yanmin Qian;
// Haihua Xu
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <limits>
#include "matrix/sp-matrix.h"
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
#include "matrix/matrix-functions.h"
#include "matrix/cblas-wrappers.h"
namespace kaldi {
// ****************************************************************************
// Returns the log-determinant if +ve definite, else KALDI_ERR.
// ****************************************************************************
template<typename Real>
Real SpMatrix<Real>::LogPosDefDet() const {
TpMatrix<Real> chol(this->NumRows());
double det = 0.0;
double diag;
chol.Cholesky(*this); // Will throw exception if not +ve definite!
for (MatrixIndexT i = 0; i < this->NumRows(); i++) {
diag = static_cast<double>(chol(i, i));
det += kaldi::Log(diag);
}
return static_cast<Real>(2*det);
}
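// Worked example (exposition only): for S = diag(4, 9) the Cholesky factor
// has diagonal (2, 3), the loop accumulates log 2 + log 3, and the function
// returns 2 * (log 2 + log 3) = log 36 = log det(S).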
template<typename Real>
void SpMatrix<Real>::Swap(SpMatrix<Real> *other) {
std::swap(this->data_, other->data_);
std::swap(this->num_rows_, other->num_rows_);
}
template<typename Real>
void SpMatrix<Real>::SymPosSemiDefEig(VectorBase<Real> *s,
MatrixBase<Real> *P,
Real tolerance) const {
Eig(s, P);
Real max = s->Max(), min = s->Min();
KALDI_ASSERT(-min <= tolerance * max);
s->ApplyFloor(0.0);
}
template<typename Real>
Real SpMatrix<Real>::MaxAbsEig() const {
Vector<Real> s(this->NumRows());
this->Eig(&s, static_cast<MatrixBase<Real>*>(NULL));
return std::max(s.Max(), -s.Min());
}
// returns true if positive definite--uses cholesky.
template<typename Real>
bool SpMatrix<Real>::IsPosDef() const {
MatrixIndexT D = (*this).NumRows();
KALDI_ASSERT(D > 0);
try {
TpMatrix<Real> C(D);
C.Cholesky(*this);
for (MatrixIndexT r = 0; r < D; r++)
if (C(r, r) == 0.0) return false;
return true;
}
catch(...) { // not positive semidefinite.
return false;
}
}
template<typename Real>
void SpMatrix<Real>::ApplyPow(Real power) {
if (power == 1) return; // can do nothing.
MatrixIndexT D = this->NumRows();
KALDI_ASSERT(D > 0);
Matrix<Real> U(D, D);
Vector<Real> l(D);
(*this).SymPosSemiDefEig(&l, &U);
Vector<Real> l_copy(l);
try {
l.ApplyPow(power * 0.5);
}
catch(...) {
KALDI_ERR << "Error taking power " << (power * 0.5) << " of vector "
<< l_copy;
}
U.MulColsVec(l);
(*this).AddMat2(1.0, U, kNoTrans, 0.0);
}
template<typename Real>
void SpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
SpCopyType copy_type) {
KALDI_ASSERT(this->NumRows() == M.NumRows() && M.NumRows() == M.NumCols());
MatrixIndexT D = this->NumRows();
switch (copy_type) {
case kTakeMeanAndCheck:
{
Real good_sum = 0.0, bad_sum = 0.0;
for (MatrixIndexT i = 0; i < D; i++) {
for (MatrixIndexT j = 0; j < i; j++) {
Real a = M(i, j), b = M(j, i), avg = 0.5*(a+b), diff = 0.5*(a-b);
(*this)(i, j) = avg;
good_sum += std::abs(avg);
bad_sum += std::abs(diff);
}
good_sum += std::abs(M(i, i));
(*this)(i, i) = M(i, i);
}
if (bad_sum > 0.01 * good_sum) {
KALDI_ERR << "SpMatrix::Copy(), source matrix is not symmetric: "
<< bad_sum << ">" << good_sum;
}
break;
}
case kTakeMean:
{
for (MatrixIndexT i = 0; i < D; i++) {
for (MatrixIndexT j = 0; j < i; j++) {
(*this)(i, j) = 0.5*(M(i, j) + M(j, i));
}
(*this)(i, i) = M(i, i);
}
break;
}
case kTakeLower:
{ // making this one a bit more efficient.
const Real *src = M.Data();
Real *dest = this->data_;
MatrixIndexT stride = M.Stride();
for (MatrixIndexT i = 0; i < D; i++) {
for (MatrixIndexT j = 0; j <= i; j++)
dest[j] = src[j];
dest += i + 1;
src += stride;
}
}
break;
case kTakeUpper:
for (MatrixIndexT i = 0; i < D; i++)
for (MatrixIndexT j = 0; j <= i; j++)
(*this)(i, j) = M(j, i);
break;
    default:
      KALDI_ERR << "Invalid argument to SpMatrix::CopyFromMat";
}
}
template<typename Real>
Real SpMatrix<Real>::Trace() const {
const Real *data = this->data_;
MatrixIndexT num_rows = this->num_rows_;
Real ans = 0.0;
for (int32 i = 1; i <= num_rows; i++, data += i)
ans += *data;
return ans;
}
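// For exposition: the diagonal entries of a packed lower-triangular matrix
// sit at indices 0, 2, 5, 9, ..., whose successive gaps are 2, 3, 4, ...;
// the comma expression "i++, data += i" in the loop above advances the
// pointer by exactly those gaps, so *data is always a diagonal element.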
// diagonal update, this <-- this + diag(v)
template<typename Real>
template<typename OtherReal>
void SpMatrix<Real>::AddDiagVec(const Real alpha, const VectorBase<OtherReal> &v) {
int32 num_rows = this->num_rows_;
KALDI_ASSERT(num_rows == v.Dim() && num_rows > 0);
const OtherReal *src = v.Data();
Real *dst = this->data_;
if (alpha == 1.0)
for (int32 i = 1; i <= num_rows; i++, src++, dst += i)
*dst += *src;
else
for (int32 i = 1; i <= num_rows; i++, src++, dst += i)
*dst += alpha * *src;
}
// instantiate the template above.
template
void SpMatrix<float>::AddDiagVec(const float alpha,
const VectorBase<double> &v);
template
void SpMatrix<double>::AddDiagVec(const double alpha,
const VectorBase<float> &v);
template
void SpMatrix<float>::AddDiagVec(const float alpha,
const VectorBase<float> &v);
template
void SpMatrix<double>::AddDiagVec(const double alpha,
const VectorBase<double> &v);
template<>
template<>
void SpMatrix<double>::AddVec2(const double alpha, const VectorBase<double> &v);
#ifndef HAVE_ATLAS
template<typename Real>
void SpMatrix<Real>::Invert(Real *logdet, Real *det_sign, bool need_inverse) {
// these are CLAPACK types
KaldiBlasInt result;
KaldiBlasInt rows = static_cast<int>(this->num_rows_);
KaldiBlasInt* p_ipiv = new KaldiBlasInt[rows];
Real *p_work; // workspace for the lapack function
void *temp;
if ((p_work = static_cast<Real*>(
KALDI_MEMALIGN(16, sizeof(Real) * rows, &temp))) == NULL) {
delete[] p_ipiv;
throw std::bad_alloc();
}
#ifdef HAVE_OPENBLAS
memset(p_work, 0, sizeof(Real) * rows); // gets rid of a probably
// spurious Valgrind warning about jumps depending upon uninitialized values.
#endif
// NOTE: Even though "U" is for upper, lapack assumes column-wise storage
// of the data. We have a row-wise storage, therefore, we need to "invert"
clapack_Xsptrf(&rows, this->data_, p_ipiv, &result);
KALDI_ASSERT(result >= 0 && "Call to CLAPACK ssptrf_ called with wrong arguments");
if (result > 0) { // Singular...
if (det_sign) *det_sign = 0;
if (logdet) *logdet = -std::numeric_limits<Real>::infinity();
    if (need_inverse) KALDI_ERR << "CLAPACK ssptrf_ : factorization failed";
} else { // Not singular.. compute log-determinant if needed.
if (logdet != NULL || det_sign != NULL) {
Real prod = 1.0, log_prod = 0.0;
int sign = 1;
for (int i = 0; i < (int)this->num_rows_; i++) {
if (p_ipiv[i] > 0) { // not a 2x2 block...
// if (p_ipiv[i] != i+1) sign *= -1; // row swap.
Real diag = (*this)(i, i);
prod *= diag;
} else { // negative: 2x2 block. [we are in first of the two].
i++; // skip over the first of the pair.
// each 2x2 block...
Real diag1 = (*this)(i, i), diag2 = (*this)(i-1, i-1),
offdiag = (*this)(i, i-1);
Real thisdet = diag1*diag2 - offdiag*offdiag;
// thisdet == determinant of 2x2 block.
// The following line is more complex than it looks: there are 2 offsets of
// 1 that cancel.
prod *= thisdet;
}
if (i == (int)(this->num_rows_-1) || fabs(prod) < 1.0e-10 || fabs(prod) > 1.0e+10) {
if (prod < 0) { prod = -prod; sign *= -1; }
log_prod += kaldi::Log(std::abs(prod));
prod = 1.0;
}
}
if (logdet != NULL) *logdet = log_prod;
if (det_sign != NULL) *det_sign = sign;
}
}
if (!need_inverse) {
delete [] p_ipiv;
KALDI_MEMALIGN_FREE(p_work);
return; // Don't need what is computed next.
}
// NOTE: Even though "U" is for upper, lapack assumes column-wise storage
// of the data. We have a row-wise storage, therefore, we need to "invert"
clapack_Xsptri(&rows, this->data_, p_ipiv, p_work, &result);
KALDI_ASSERT(result >=0 &&
"Call to CLAPACK ssptri_ called with wrong arguments");
if (result != 0) {
KALDI_ERR << "CLAPACK ssptrf_ : Matrix is singular";
}
delete [] p_ipiv;
KALDI_MEMALIGN_FREE(p_work);
}
#else
// in the ATLAS case, these are not implemented using a library and we back off to something else.
template<typename Real>
void SpMatrix<Real>::Invert(Real *logdet, Real *det_sign, bool need_inverse) {
Matrix<Real> M(this->NumRows(), this->NumCols());
M.CopyFromSp(*this);
M.Invert(logdet, det_sign, need_inverse);
if (need_inverse)
for (MatrixIndexT i = 0; i < this->NumRows(); i++)
for (MatrixIndexT j = 0; j <= i; j++)
(*this)(i, j) = M(i, j);
}
#endif
template<typename Real>
void SpMatrix<Real>::InvertDouble(Real *logdet, Real *det_sign,
bool inverse_needed) {
SpMatrix<double> dmat(*this);
double logdet_tmp, det_sign_tmp;
dmat.Invert(logdet ? &logdet_tmp : NULL,
det_sign ? &det_sign_tmp : NULL,
inverse_needed);
if (logdet) *logdet = logdet_tmp;
if (det_sign) *det_sign = det_sign_tmp;
(*this).CopyFromSp(dmat);
}
double TraceSpSp(const SpMatrix<double> &A, const SpMatrix<double> &B) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
const double *Aptr = A.Data();
const double *Bptr = B.Data();
MatrixIndexT R = A.NumRows();
MatrixIndexT RR = (R * (R + 1)) / 2;
double all_twice = 2.0 * cblas_Xdot(RR, Aptr, 1, Bptr, 1);
// "all_twice" contains twice the vector-wise dot-product... this is
// what we want except the diagonal elements are represented
// twice.
double diag_once = 0.0;
for (MatrixIndexT row_plus_two = 2; row_plus_two <= R + 1; row_plus_two++) {
diag_once += *Aptr * *Bptr;
Aptr += row_plus_two;
Bptr += row_plus_two;
}
return all_twice - diag_once;
}
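// For exposition: for symmetric A and B, tr(A B) = sum_{i,j} A_ij * B_ij over
// the full matrices. The packed dot product visits each off-diagonal pair once
// and the diagonal once, so doubling it double-counts only the diagonal; the
// loop above therefore walks the diagonal entries (packed indices 0, 2, 5, ...)
// and subtracts them once.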
float TraceSpSp(const SpMatrix<float> &A, const SpMatrix<float> &B) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
const float *Aptr = A.Data();
const float *Bptr = B.Data();
MatrixIndexT R = A.NumRows();
MatrixIndexT RR = (R * (R + 1)) / 2;
float all_twice = 2.0 * cblas_Xdot(RR, Aptr, 1, Bptr, 1);
// "all_twice" contains twice the vector-wise dot-product... this is
// what we want except the diagonal elements are represented
// twice.
float diag_once = 0.0;
for (MatrixIndexT row_plus_two = 2; row_plus_two <= R + 1; row_plus_two++) {
diag_once += *Aptr * *Bptr;
Aptr += row_plus_two;
Bptr += row_plus_two;
}
return all_twice - diag_once;
}
template<typename Real, typename OtherReal>
Real TraceSpSp(const SpMatrix<Real> &A, const SpMatrix<OtherReal> &B) {
KALDI_ASSERT(A.NumRows() == B.NumRows());
Real ans = 0.0;
const Real *Aptr = A.Data();
const OtherReal *Bptr = B.Data();
MatrixIndexT row, col, R = A.NumRows();
for (row = 0; row < R; row++) {
for (col = 0; col < row; col++)
ans += 2.0 * *(Aptr++) * *(Bptr++);
ans += *(Aptr++) * *(Bptr++); // Diagonal.
}
return ans;
}
template
float TraceSpSp<float, double>(const SpMatrix<float> &A, const SpMatrix<double> &B);
template
double TraceSpSp<double, float>(const SpMatrix<double> &A, const SpMatrix<float> &B);
template<typename Real>
Real TraceSpMat(const SpMatrix<Real> &A, const MatrixBase<Real> &B) {
KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols() &&
"KALDI_ERR: TraceSpMat: arguments have mismatched dimension");
MatrixIndexT R = A.NumRows();
Real ans = (Real)0.0;
const Real *Aptr = A.Data(), *Bptr = B.Data();
MatrixIndexT bStride = B.Stride();
for (MatrixIndexT r = 0;r < R;r++) {
for (MatrixIndexT c = 0;c < r;c++) {
// ans += A(r, c) * (B(r, c) + B(c, r));
ans += *(Aptr++) * (Bptr[r*bStride + c] + Bptr[c*bStride + r]);
}
// ans += A(r, r) * B(r, r);
ans += *(Aptr++) * Bptr[r*bStride + r];
}
return ans;
}
template
float TraceSpMat(const SpMatrix<float> &A, const MatrixBase<float> &B);
template
double TraceSpMat(const SpMatrix<double> &A, const MatrixBase<double> &B);
template<typename Real>
Real TraceMatSpMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
const SpMatrix<Real> &B, const MatrixBase<Real> &C,
MatrixTransposeType transC) {
KALDI_ASSERT((transA == kTrans?A.NumCols():A.NumRows()) ==
(transC == kTrans?C.NumRows():C.NumCols()) &&
(transA == kTrans?A.NumRows():A.NumCols()) == B.NumRows() &&
(transC == kTrans?C.NumCols():C.NumRows()) == B.NumRows() &&
"TraceMatSpMat: arguments have wrong dimension.");
Matrix<Real> tmp(B.NumRows(), B.NumRows());
tmp.AddMatMat(1.0, C, transC, A, transA, 0.0); // tmp = C * A.
return TraceSpMat(B, tmp);
}
template
float TraceMatSpMat(const MatrixBase<float> &A, MatrixTransposeType transA,
const SpMatrix<float> &B, const MatrixBase<float> &C,
MatrixTransposeType transC);
template
double TraceMatSpMat(const MatrixBase<double> &A, MatrixTransposeType transA,
const SpMatrix<double> &B, const MatrixBase<double> &C,
MatrixTransposeType transC);
template<typename Real>
Real TraceMatSpMatSp(const MatrixBase<Real> &A, MatrixTransposeType transA,
const SpMatrix<Real> &B, const MatrixBase<Real> &C,
MatrixTransposeType transC, const SpMatrix<Real> &D) {
  KALDI_ASSERT((transA == kTrans ? A.NumCols() : A.NumRows()) == D.NumCols() &&
               (transA == kTrans ? A.NumRows() : A.NumCols()) == B.NumRows() &&
               (transC == kTrans ? C.NumCols() : C.NumRows()) == B.NumCols() &&
               (transC == kTrans ? C.NumRows() : C.NumCols()) == D.NumRows() &&
               "KALDI_ERR: TraceMatSpMatSp: arguments have mismatched dimension.");
// Could perhaps optimize this more depending on dimensions of quantities.
Matrix<Real> tmpAB(transA == kTrans ? A.NumCols():A.NumRows(), B.NumCols());
tmpAB.AddMatSp(1.0, A, transA, B, 0.0);
Matrix<Real> tmpCD(transC == kTrans ? C.NumCols():C.NumRows(), D.NumCols());
tmpCD.AddMatSp(1.0, C, transC, D, 0.0);
return TraceMatMat(tmpAB, tmpCD, kNoTrans);
}
template
float TraceMatSpMatSp(const MatrixBase<float> &A, MatrixTransposeType transA,
const SpMatrix<float> &B, const MatrixBase<float> &C,
MatrixTransposeType transC, const SpMatrix<float> &D);
template
double TraceMatSpMatSp(const MatrixBase<double> &A, MatrixTransposeType transA,
const SpMatrix<double> &B, const MatrixBase<double> &C,
MatrixTransposeType transC, const SpMatrix<double> &D);
template<typename Real>
bool SpMatrix<Real>::IsDiagonal(Real cutoff) const {
MatrixIndexT R = this->NumRows();
Real bad_sum = 0.0, good_sum = 0.0;
for (MatrixIndexT i = 0; i < R; i++) {
for (MatrixIndexT j = 0; j <= i; j++) {
if (i == j)
good_sum += std::abs((*this)(i, j));
else
bad_sum += std::abs((*this)(i, j));
}
}
return (!(bad_sum > good_sum * cutoff));
}
template<typename Real>
bool SpMatrix<Real>::IsUnit(Real cutoff) const {
MatrixIndexT R = this->NumRows();
Real max = 0.0; // max error
for (MatrixIndexT i = 0; i < R; i++)
for (MatrixIndexT j = 0; j <= i; j++)
max = std::max(max, static_cast<Real>(std::abs((*this)(i, j) -
(i == j ? 1.0 : 0.0))));
return (max <= cutoff);
}
template<typename Real>
bool SpMatrix<Real>::IsTridiagonal(Real cutoff) const {
MatrixIndexT R = this->NumRows();
Real max_abs_2diag = 0.0, max_abs_offdiag = 0.0;
for (MatrixIndexT i = 0; i < R; i++)
for (MatrixIndexT j = 0; j <= i; j++) {
if (j+1 < i)
max_abs_offdiag = std::max(max_abs_offdiag,
std::abs((*this)(i, j)));
else
max_abs_2diag = std::max(max_abs_2diag,
std::abs((*this)(i, j)));
}
return (max_abs_offdiag <= cutoff * max_abs_2diag);
}
template<typename Real>
bool SpMatrix<Real>::IsZero(Real cutoff) const {
if (this->num_rows_ == 0) return true;
return (this->Max() <= cutoff && this->Min() >= -cutoff);
}
template<typename Real>
Real SpMatrix<Real>::FrobeniusNorm() const {
Real sum = 0.0;
MatrixIndexT R = this->NumRows();
for (MatrixIndexT i = 0; i < R; i++) {
for (MatrixIndexT j = 0; j < i; j++)
sum += (*this)(i, j) * (*this)(i, j) * 2;
sum += (*this)(i, i) * (*this)(i, i);
}
return std::sqrt(sum);
}
template<typename Real>
bool SpMatrix<Real>::ApproxEqual(const SpMatrix<Real> &other, float tol) const {
if (this->NumRows() != other.NumRows())
KALDI_ERR << "SpMatrix::AproxEqual, size mismatch, "
<< this->NumRows() << " vs. " << other.NumRows();
SpMatrix<Real> tmp(*this);
tmp.AddSp(-1.0, other);
return (tmp.FrobeniusNorm() <= tol * std::max(this->FrobeniusNorm(), other.FrobeniusNorm()));
}
// function Floor: A = Floor(B, alpha * C) ... see tutorial document.
template<typename Real>
int SpMatrix<Real>::ApplyFloor(const SpMatrix<Real> &C, Real alpha,
bool verbose) {
MatrixIndexT dim = this->NumRows();
int nfloored = 0;
KALDI_ASSERT(C.NumRows() == dim);
KALDI_ASSERT(alpha > 0);
TpMatrix<Real> L(dim);
L.Cholesky(C);
L.Scale(std::sqrt(alpha)); // equivalent to scaling C by alpha.
TpMatrix<Real> LInv(L);
LInv.Invert();
SpMatrix<Real> D(dim);
{ // D = L^{-1} * (*this) * L^{-T}
Matrix<Real> LInvFull(LInv);
D.AddMat2Sp(1.0, LInvFull, kNoTrans, (*this), 0.0);
}
Vector<Real> l(dim);
Matrix<Real> U(dim, dim);
D.Eig(&l, &U);
if (verbose) {
KALDI_LOG << "ApplyFloor: flooring following diagonal to 1: " << l;
}
for (MatrixIndexT i = 0; i < l.Dim(); i++) {
if (l(i) < 1.0) {
nfloored++;
l(i) = 1.0;
}
}
l.ApplyPow(0.5);
U.MulColsVec(l);
D.AddMat2(1.0, U, kNoTrans, 0.0);
{ // D' := U * diag(l') * U^T ... l'=floor(l, 1)
Matrix<Real> LFull(L);
(*this).AddMat2Sp(1.0, LFull, kNoTrans, D, 0.0); // A := L * D' * L^T
}
return nfloored;
}
template<typename Real>
Real SpMatrix<Real>::LogDet(Real *det_sign) const {
Real log_det;
SpMatrix<Real> tmp(*this);
  // false == inverse not needed (saves some computation).
tmp.Invert(&log_det, det_sign, false);
return log_det;
}
template<typename Real>
int SpMatrix<Real>::ApplyFloor(Real floor) {
MatrixIndexT Dim = this->NumRows();
int nfloored = 0;
Vector<Real> s(Dim);
Matrix<Real> P(Dim, Dim);
(*this).Eig(&s, &P);
for (MatrixIndexT i = 0; i < Dim; i++) {
if (s(i) < floor) {
nfloored++;
s(i) = floor;
}
}
(*this).AddMat2Vec(1.0, P, kNoTrans, s, 0.0);
return nfloored;
}
template<typename Real>
MatrixIndexT SpMatrix<Real>::LimitCond(Real maxCond, bool invert) { // e.g. maxCond = 1.0e+05.
MatrixIndexT Dim = this->NumRows();
Vector<Real> s(Dim);
Matrix<Real> P(Dim, Dim);
(*this).SymPosSemiDefEig(&s, &P);
KALDI_ASSERT(maxCond > 1);
Real floor = s.Max() / maxCond;
if (floor < 0) floor = 0;
if (floor < 1.0e-40) {
KALDI_WARN << "LimitCond: limiting " << floor << " to 1.0e-40";
floor = 1.0e-40;
}
MatrixIndexT nfloored = 0;
for (MatrixIndexT i = 0; i < Dim; i++) {
if (s(i) <= floor) nfloored++;
if (invert)
s(i) = 1.0 / std::sqrt(std::max(s(i), floor));
else
s(i) = std::sqrt(std::max(s(i), floor));
}
P.MulColsVec(s);
(*this).AddMat2(1.0, P, kNoTrans, 0.0); // (*this) = P*P^T. ... (*this) = P * floor(s) * P^T ... if P was original P.
return nfloored;
}
void SolverOptions::Check() const {
KALDI_ASSERT(K>10 && eps<1.0e-10);
}
template<> double SolveQuadraticProblem(const SpMatrix<double> &H,
const VectorBase<double> &g,
const SolverOptions &opts,
VectorBase<double> *x) {
KALDI_ASSERT(H.NumRows() == g.Dim() && g.Dim() == x->Dim() && x->Dim() != 0);
opts.Check();
MatrixIndexT dim = x->Dim();
if (H.IsZero(0.0)) {
KALDI_WARN << "Zero quadratic term in quadratic vector problem for "
<< opts.name << ": leaving it unchanged.";
return 0.0;
}
if (opts.diagonal_precondition) {
// We can re-cast the problem with a diagonal preconditioner to
// make H better-conditioned.
Vector<double> H_diag(dim);
H_diag.CopyDiagFromSp(H);
H_diag.ApplyFloor(std::numeric_limits<double>::min() * 1.0E+3);
Vector<double> H_diag_sqrt(H_diag);
H_diag_sqrt.ApplyPow(0.5);
Vector<double> H_diag_inv_sqrt(H_diag_sqrt);
H_diag_inv_sqrt.InvertElements();
Vector<double> x_scaled(*x);
x_scaled.MulElements(H_diag_sqrt);
Vector<double> g_scaled(g);
g_scaled.MulElements(H_diag_inv_sqrt);
SpMatrix<double> H_scaled(dim);
H_scaled.AddVec2Sp(1.0, H_diag_inv_sqrt, H, 0.0);
double ans;
SolverOptions new_opts(opts);
new_opts.diagonal_precondition = false;
ans = SolveQuadraticProblem(H_scaled, g_scaled, new_opts, &x_scaled);
x->CopyFromVec(x_scaled);
x->MulElements(H_diag_inv_sqrt);
return ans;
}
Vector<double> gbar(g);
if (opts.optimize_delta) gbar.AddSpVec(-1.0, H, *x, 1.0); // gbar = g - H x
Matrix<double> U(dim, dim);
Vector<double> l(dim);
H.SymPosSemiDefEig(&l, &U); // does svd H = U L V^T and checks that H == U L U^T to within a tolerance.
// floor l.
double f = std::max(static_cast<double>(opts.eps), l.Max() / opts.K);
MatrixIndexT nfloored = 0;
for (MatrixIndexT i = 0; i < dim; i++) { // floor l.
if (l(i) < f) {
nfloored++;
l(i) = f;
}
}
if (nfloored != 0 && opts.print_debug_output) {
KALDI_LOG << "Solving quadratic problem for " << opts.name
<< ": floored " << nfloored<< " eigenvalues. ";
}
Vector<double> tmp(dim);
tmp.AddMatVec(1.0, U, kTrans, gbar, 0.0); // tmp = U^T \bar{g}
tmp.DivElements(l); // divide each element of tmp by l: tmp = \tilde{L}^{-1} U^T \bar{g}
Vector<double> delta(dim);
delta.AddMatVec(1.0, U, kNoTrans, tmp, 0.0); // delta = U tmp = U \tilde{L}^{-1} U^T \bar{g}
Vector<double> &xhat(tmp);
xhat.CopyFromVec(delta);
if (opts.optimize_delta) xhat.AddVec(1.0, *x); // xhat = x + delta.
double auxf_before = VecVec(g, *x) - 0.5 * VecSpVec(*x, H, *x),
auxf_after = VecVec(g, xhat) - 0.5 * VecSpVec(xhat, H, xhat);
if (auxf_after < auxf_before) { // Reject change.
if (auxf_after < auxf_before - 1.0e-10 && opts.print_debug_output)
KALDI_WARN << "Optimizing vector auxiliary function for "
                 << opts.name << ": auxf decreased " << auxf_before
<< " to " << auxf_after << ", change is "
<< (auxf_after-auxf_before);
return 0.0;
} else {
x->CopyFromVec(xhat);
return auxf_after - auxf_before;
}
}
template<> float SolveQuadraticProblem(const SpMatrix<float> &H,
const VectorBase<float> &g,
const SolverOptions &opts,
VectorBase<float> *x) {
KALDI_ASSERT(H.NumRows() == g.Dim() && g.Dim() == x->Dim() && x->Dim() != 0);
SpMatrix<double> Hd(H);
Vector<double> gd(g);
Vector<double> xd(*x);
float ans = static_cast<float>(SolveQuadraticProblem(Hd, gd, opts, &xd));
x->CopyFromVec(xd);
return ans;
}
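// Usage sketch (exposition only): maximize Q(x) = x^T g - 0.5 x^T H x, a
// numerically safer stand-in for computing x = H^{-1} g directly. Assumes
// SolverOptions has a name-only constructor, as suggested by opts.name above.
/*
  MatrixIndexT dim = 10;
  Matrix<double> R(dim, dim);
  R.SetRandn();
  SpMatrix<double> H(dim);
  H.AddMat2(1.0, R, kNoTrans, 0.0);  // H = R R^T, positive semidefinite.
  Vector<double> g(dim), x(dim);     // x starts at zero; it is also the output.
  g.SetRandn();
  SolverOptions opts("my-update");
  double impr = SolveQuadraticProblem(H, g, opts, &x);
  // impr >= 0 is the increase in Q(x); x now holds the regularized solution.
*/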
// Maximizes the auxiliary function Q(x) = tr(M^T SigmaInv Y) - 0.5 tr(SigmaInv M Q M^T).
// Like a numerically stable version of M := Y Q^{-1}.
template<typename Real>
Real
SolveQuadraticMatrixProblem(const SpMatrix<Real> &Q,
const MatrixBase<Real> &Y,
const SpMatrix<Real> &SigmaInv,
const SolverOptions &opts,
MatrixBase<Real> *M) {
KALDI_ASSERT(Q.NumRows() == M->NumCols() &&
SigmaInv.NumRows() == M->NumRows() && Y.NumRows() == M->NumRows()
&& Y.NumCols() == M->NumCols() && M->NumCols() != 0);
opts.Check();
MatrixIndexT rows = M->NumRows(), cols = M->NumCols();
if (Q.IsZero(0.0)) {
KALDI_WARN << "Zero quadratic term in quadratic matrix problem for "
<< opts.name << ": leaving it unchanged.";
return 0.0;
}
if (opts.diagonal_precondition) {
// We can re-cast the problem with a diagonal preconditioner in the space
// of Q (columns of M). Helps to improve the condition of Q.
Vector<Real> Q_diag(cols);
Q_diag.CopyDiagFromSp(Q);
Q_diag.ApplyFloor(std::numeric_limits<Real>::min() * 1.0E+3);
Vector<Real> Q_diag_sqrt(Q_diag);
Q_diag_sqrt.ApplyPow(0.5);
Vector<Real> Q_diag_inv_sqrt(Q_diag_sqrt);
Q_diag_inv_sqrt.InvertElements();
Matrix<Real> M_scaled(*M);
M_scaled.MulColsVec(Q_diag_sqrt);
Matrix<Real> Y_scaled(Y);
Y_scaled.MulColsVec(Q_diag_inv_sqrt);
SpMatrix<Real> Q_scaled(cols);
Q_scaled.AddVec2Sp(1.0, Q_diag_inv_sqrt, Q, 0.0);
Real ans;
SolverOptions new_opts(opts);
new_opts.diagonal_precondition = false;
ans = SolveQuadraticMatrixProblem(Q_scaled, Y_scaled, SigmaInv,
new_opts, &M_scaled);
M->CopyFromMat(M_scaled);
M->MulColsVec(Q_diag_inv_sqrt);
return ans;
}
Matrix<Real> Ybar(Y);
if (opts.optimize_delta) {
Matrix<Real> Qfull(Q);
Ybar.AddMatMat(-1.0, *M, kNoTrans, Qfull, kNoTrans, 1.0);
} // Ybar = Y - M Q.
Matrix<Real> U(cols, cols);
Vector<Real> l(cols);
Q.SymPosSemiDefEig(&l, &U); // does svd Q = U L V^T and checks that Q == U L U^T to within a tolerance.
// floor l.
Real f = std::max<Real>(static_cast<Real>(opts.eps), l.Max() / opts.K);
MatrixIndexT nfloored = 0;
for (MatrixIndexT i = 0; i < cols; i++) { // floor l.
if (l(i) < f) { nfloored++; l(i) = f; }
}
if (nfloored != 0 && opts.print_debug_output)
KALDI_LOG << "Solving matrix problem for " << opts.name
<< ": floored " << nfloored << " eigenvalues. ";
Matrix<Real> tmpDelta(rows, cols);
tmpDelta.AddMatMat(1.0, Ybar, kNoTrans, U, kNoTrans, 0.0); // tmpDelta = Ybar * U.
l.InvertElements(); KALDI_ASSERT(1.0/l.Max() != 0); // check not infinite. eps should take care of this.
tmpDelta.MulColsVec(l); // tmpDelta = Ybar * U * \tilde{L}^{-1}
Matrix<Real> Delta(rows, cols);
Delta.AddMatMat(1.0, tmpDelta, kNoTrans, U, kTrans, 0.0); // Delta = Ybar * U * \tilde{L}^{-1} * U^T
Real auxf_before, auxf_after;
SpMatrix<Real> MQM(rows);
Matrix<Real> &SigmaInvY(tmpDelta);
{ Matrix<Real> SigmaInvFull(SigmaInv); SigmaInvY.AddMatMat(1.0, SigmaInvFull, kNoTrans, Y, kNoTrans, 0.0); }
{ // get auxf_before. Q(x) = tr(M^T SigmaInv Y) - 0.5 tr(SigmaInv M Q M^T).
MQM.AddMat2Sp(1.0, *M, kNoTrans, Q, 0.0);
auxf_before = TraceMatMat(*M, SigmaInvY, kaldi::kTrans) - 0.5*TraceSpSp(SigmaInv, MQM);
}
Matrix<Real> Mhat(Delta);
if (opts.optimize_delta) Mhat.AddMat(1.0, *M); // Mhat = Delta + M.
{ // get auxf_after.
MQM.AddMat2Sp(1.0, Mhat, kNoTrans, Q, 0.0);
auxf_after = TraceMatMat(Mhat, SigmaInvY, kaldi::kTrans) - 0.5*TraceSpSp(SigmaInv, MQM);
}
if (auxf_after < auxf_before) {
if (auxf_after < auxf_before - 1.0e-10)
KALDI_WARN << "Optimizing matrix auxiliary function for "
<< opts.name << ", auxf decreased "
<< auxf_before << " to " << auxf_after << ", change is "
<< (auxf_after-auxf_before);
return 0.0;
} else {
M->CopyFromMat(Mhat);
return auxf_after - auxf_before;
}
}
template<typename Real>
Real SolveDoubleQuadraticMatrixProblem(const MatrixBase<Real> &G,
const SpMatrix<Real> &P1,
const SpMatrix<Real> &P2,
const SpMatrix<Real> &Q1,
const SpMatrix<Real> &Q2,
const SolverOptions &opts,
MatrixBase<Real> *M) {
KALDI_ASSERT(Q1.NumRows() == M->NumCols() && P1.NumRows() == M->NumRows() &&
G.NumRows() == M->NumRows() && G.NumCols() == M->NumCols() &&
M->NumCols() != 0 && Q2.NumRows() == M->NumCols() &&
P2.NumRows() == M->NumRows());
MatrixIndexT rows = M->NumRows(), cols = M->NumCols();
  // The following check should not fail, as we stipulate that P1, P2 and one
  // of Q1, Q2 must be +ve def, and the other of Q1, Q2 must be +ve semidef.
TpMatrix<Real> LInv(rows);
LInv.Cholesky(P1);
LInv.Invert(); // Will throw exception if fails.
SpMatrix<Real> S(rows);
Matrix<Real> LInvFull(LInv);
S.AddMat2Sp(1.0, LInvFull, kNoTrans, P2, 0.0); // S := L^{-1} P_2 L^{-T}
Matrix<Real> U(rows, rows);
Vector<Real> d(rows);
S.SymPosSemiDefEig(&d, &U);
Matrix<Real> T(rows, rows);
T.AddMatMat(1.0, U, kTrans, LInvFull, kNoTrans, 0.0); // T := U^T * L^{-1}
#ifdef KALDI_PARANOID // checking mainly for errors in the code or math.
{
SpMatrix<Real> P1Trans(rows);
P1Trans.AddMat2Sp(1.0, T, kNoTrans, P1, 0.0);
KALDI_ASSERT(P1Trans.IsUnit(0.01));
}
{
SpMatrix<Real> P2Trans(rows);
P2Trans.AddMat2Sp(1.0, T, kNoTrans, P2, 0.0);
KALDI_ASSERT(P2Trans.IsDiagonal(0.01));
}
#endif
Matrix<Real> TInv(T);
TInv.Invert();
Matrix<Real> Gdash(rows, cols);
Gdash.AddMatMat(1.0, T, kNoTrans, G, kNoTrans, 0.0); // G' = T G
Matrix<Real> MdashOld(rows, cols);
MdashOld.AddMatMat(1.0, TInv, kTrans, *M, kNoTrans, 0.0); // M' = T^{-T} M
Matrix<Real> MdashNew(MdashOld);
Real objf_impr = 0.0;
for (MatrixIndexT n = 0; n < rows; n++) {
SpMatrix<Real> Qsum(Q1);
Qsum.AddSp(d(n), Q2);
SubVector<Real> mdash_n = MdashNew.Row(n);
SubVector<Real> gdash_n = Gdash.Row(n);
Matrix<Real> QsumInv(Qsum);
try {
QsumInv.Invert();
Real old_objf = VecVec(mdash_n, gdash_n)
- 0.5 * VecSpVec(mdash_n, Qsum, mdash_n);
mdash_n.AddMatVec(1.0, QsumInv, kNoTrans, gdash_n, 0.0); // m'_n := g'_n * (Q_1 + d_n Q_2)^{-1}
Real new_objf = VecVec(mdash_n, gdash_n)
- 0.5 * VecSpVec(mdash_n, Qsum, mdash_n);
if (new_objf < old_objf) {
if (new_objf < old_objf - 1.0e-05) {
KALDI_WARN << "In double quadratic matrix problem: objective "
"function decreasing during optimization of " << opts.name
<< ", " << old_objf << "->" << new_objf << ", change is "
<< (new_objf - old_objf);
KALDI_ERR << "Auxiliary function decreasing."; // Will be caught.
} else { // Reset to old value, didn't improve (very close to optimum).
MdashNew.Row(n).CopyFromVec(MdashOld.Row(n));
}
}
objf_impr += new_objf - old_objf;
}
catch (...) {
KALDI_WARN << "Matrix inversion or optimization failed during double "
"quadratic problem, solving for" << opts.name
<< ": trying more stable approach.";
objf_impr += SolveQuadraticProblem(Qsum, gdash_n, opts, &mdash_n);
}
}
M->AddMatMat(1.0, T, kTrans, MdashNew, kNoTrans, 0.0); // M := T^T M'.
return objf_impr;
}
// rank-one update, this <-- this + alpha V V'
template<>
template<>
void SpMatrix<float>::AddVec2(const float alpha, const VectorBase<float> &v) {
KALDI_ASSERT(v.Dim() == this->NumRows());
cblas_Xspr(v.Dim(), alpha, v.Data(), 1,
this->data_);
}
template<class Real>
void SpMatrix<Real>::AddVec2Sp(const Real alpha, const VectorBase<Real> &v,
const SpMatrix<Real> &S, const Real beta) {
KALDI_ASSERT(v.Dim() == this->NumRows() && S.NumRows() == this->NumRows());
const Real *Sdata = S.Data();
const Real *vdata = v.Data();
Real *data = this->data_;
MatrixIndexT dim = this->num_rows_;
for (MatrixIndexT r = 0; r < dim; r++)
for (MatrixIndexT c = 0; c <= r; c++, Sdata++, data++)
*data = beta * *data + alpha * vdata[r] * vdata[c] * *Sdata;
}
// rank-one update, this <-- this + alpha V V'
template<>
template<>
void SpMatrix<double>::AddVec2(const double alpha, const VectorBase<double> &v) {
KALDI_ASSERT(v.Dim() == num_rows_);
cblas_Xspr(v.Dim(), alpha, v.Data(), 1, data_);
}
template<typename Real>
template<typename OtherReal>
void SpMatrix<Real>::AddVec2(const Real alpha, const VectorBase<OtherReal> &v) {
KALDI_ASSERT(v.Dim() == this->NumRows());
Real *data = this->data_;
const OtherReal *v_data = v.Data();
MatrixIndexT nr = this->num_rows_;
for (MatrixIndexT i = 0; i < nr; i++)
for (MatrixIndexT j = 0; j <= i; j++, data++)
*data += alpha * v_data[i] * v_data[j];
}
// instantiate the template above.
template
void SpMatrix<float>::AddVec2(const float alpha, const VectorBase<double> &v);
template
void SpMatrix<double>::AddVec2(const double alpha, const VectorBase<float> &v);
template<typename Real>
Real VecSpVec(const VectorBase<Real> &v1, const SpMatrix<Real> &M,
const VectorBase<Real> &v2) {
MatrixIndexT D = M.NumRows();
KALDI_ASSERT(v1.Dim() == D && v1.Dim() == v2.Dim());
Vector<Real> tmp_vec(D);
cblas_Xspmv(D, 1.0, M.Data(), v1.Data(), 1, 0.0, tmp_vec.Data(), 1);
return VecVec(tmp_vec, v2);
}
template
float VecSpVec(const VectorBase<float> &v1, const SpMatrix<float> &M,
const VectorBase<float> &v2);
template
double VecSpVec(const VectorBase<double> &v1, const SpMatrix<double> &M,
const VectorBase<double> &v2);
template<typename Real>
void SpMatrix<Real>::AddMat2Sp(
const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const SpMatrix<Real> &A, const Real beta) {
if (transM == kNoTrans) {
KALDI_ASSERT(M.NumCols() == A.NumRows() && M.NumRows() == this->num_rows_);
} else {
KALDI_ASSERT(M.NumRows() == A.NumRows() && M.NumCols() == this->num_rows_);
}
Vector<Real> tmp_vec(A.NumRows());
Real *tmp_vec_data = tmp_vec.Data();
SpMatrix<Real> tmp_A;
const Real *p_A_data = A.Data();
Real *p_row_data = this->Data();
MatrixIndexT M_other_dim = (transM == kNoTrans ? M.NumCols() : M.NumRows()),
M_same_dim = (transM == kNoTrans ? M.NumRows() : M.NumCols()),
M_stride = M.Stride(), dim = this->NumRows();
KALDI_ASSERT(M_same_dim == dim);
const Real *M_data = M.Data();
if (this->Data() <= A.Data() + A.SizeInBytes() &&
this->Data() + this->SizeInBytes() >= A.Data()) {
// Matrices A and *this overlap. Make copy of A
tmp_A.Resize(A.NumRows());
tmp_A.CopyFromSp(A);
p_A_data = tmp_A.Data();
}
if (transM == kNoTrans) {
for (MatrixIndexT r = 0; r < dim; r++, p_row_data += r) {
cblas_Xspmv(A.NumRows(), 1.0, p_A_data, M.RowData(r), 1, 0.0, tmp_vec_data, 1);
cblas_Xgemv(transM, r+1, M_other_dim, alpha, M_data, M_stride,
tmp_vec_data, 1, beta, p_row_data, 1);
}
} else {
for (MatrixIndexT r = 0; r < dim; r++, p_row_data += r) {
cblas_Xspmv(A.NumRows(), 1.0, p_A_data, M.Data() + r, M.Stride(), 0.0, tmp_vec_data, 1);
cblas_Xgemv(transM, M_other_dim, r+1, alpha, M_data, M_stride,
tmp_vec_data, 1, beta, p_row_data, 1);
}
}
}
template<typename Real>
void SpMatrix<Real>::AddSmat2Sp(
const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const SpMatrix<Real> &A,
const Real beta) {
KALDI_ASSERT((transM == kNoTrans && M.NumCols() == A.NumRows()) ||
(transM == kTrans && M.NumRows() == A.NumRows()));
if (transM == kNoTrans) {
KALDI_ASSERT(M.NumCols() == A.NumRows() && M.NumRows() == this->num_rows_);
} else {
KALDI_ASSERT(M.NumRows() == A.NumRows() && M.NumCols() == this->num_rows_);
}
MatrixIndexT Adim = A.NumRows(), dim = this->num_rows_;
Matrix<Real> temp_A(A); // represent A as full matrix.
Matrix<Real> temp_MA(dim, Adim);
temp_MA.AddSmatMat(1.0, M, transM, temp_A, kNoTrans, 0.0);
// Next-- we want to do *this = alpha * temp_MA * M^T + beta * *this.
// To make it sparse vector multiplies, since M is sparse, we'd like
// to do: for each column c, (*this column c) += temp_MA * (M^T's column c.)
// [ignoring the alpha and beta here.]
// It's not convenient to process columns in the symmetric
// packed format because they don't have a constant stride. However,
// we can use the fact that temp_MA * M is symmetric, to just assign
// each row of *this instead of each column.
// So the final iteration is:
// for i = 0... dim-1,
// [the i'th row of *this] = beta * [the i'th row of *this] + alpha *
// temp_MA * [the i'th column of M].
// Of course, we only process the first 0 ... i elements of this row,
// as that's all that are kept in the symmetric packed format.
Matrix<Real> temp_this(*this);
Real *data = this->data_;
const Real *Mdata = M.Data(), *MAdata = temp_MA.Data();
MatrixIndexT temp_MA_stride = temp_MA.Stride(), Mstride = M.Stride();
if (transM == kNoTrans) {
// The column of M^T corresponds to the rows of the supplied matrix.
for (MatrixIndexT i = 0; i < dim; i++, data += i) {
MatrixIndexT num_rows = i + 1, num_cols = Adim;
Xgemv_sparsevec(kNoTrans, num_rows, num_cols, alpha, MAdata,
temp_MA_stride, Mdata + (i * Mstride), 1, beta, data, 1);
}
} else {
// The column of M^T corresponds to the columns of the supplied matrix.
for (MatrixIndexT i = 0; i < dim; i++, data += i) {
MatrixIndexT num_rows = i + 1, num_cols = Adim;
Xgemv_sparsevec(kNoTrans, num_rows, num_cols, alpha, MAdata,
temp_MA_stride, Mdata + i, Mstride, beta, data, 1);
}
}
}
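// [Editor's sketch -- illustrative only.] AddSmat2Sp below is an
// algebraically identical specialization of AddMat2Sp for sparse M, so on any
// input the two should agree to rounding error; a hypothetical spot-check:
#if 0
void ExampleAddSmat2Sp() {
  Matrix<double> M(4, 5);
  M.SetRandn();  // in practice M would be mostly zeros for this to pay off
  SpMatrix<double> A(5);
  A.SetRandn();
  SpMatrix<double> S1(4), S2(4);
  S1.AddMat2Sp(1.0, M, kNoTrans, A, 0.0);   // S1 = M A M^T
  S2.AddSmat2Sp(1.0, M, kNoTrans, A, 0.0);  // same result, sparse-friendly path
  KALDI_ASSERT(S1.ApproxEqual(S2, 1e-6));
}
#endif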
template<typename Real>
void SpMatrix<Real>::AddMat2Vec(const Real alpha,
const MatrixBase<Real> &M,
MatrixTransposeType transM,
const VectorBase<Real> &v,
const Real beta) {
this->Scale(beta);
KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows() &&
M.NumCols() == v.Dim()) ||
(transM == kTrans && this->NumRows() == M.NumCols() &&
M.NumRows() == v.Dim()));
if (transM == kNoTrans) {
const Real *Mdata = M.Data(), *vdata = v.Data();
Real *data = this->data_;
MatrixIndexT dim = this->NumRows(), mcols = M.NumCols(),
mstride = M.Stride();
for (MatrixIndexT col = 0; col < mcols; col++, vdata++, Mdata += 1)
cblas_Xspr(dim, *vdata*alpha, Mdata, mstride, data);
} else {
const Real *Mdata = M.Data(), *vdata = v.Data();
Real *data = this->data_;
MatrixIndexT dim = this->NumRows(), mrows = M.NumRows(),
mstride = M.Stride();
for (MatrixIndexT row = 0; row < mrows; row++, vdata++, Mdata += mstride)
cblas_Xspr(dim, *vdata*alpha, Mdata, 1, data);
}
}
template<typename Real>
void SpMatrix<Real>::AddMat2(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const Real beta) {
KALDI_ASSERT((transM == kNoTrans && this->NumRows() == M.NumRows())
|| (transM == kTrans && this->NumRows() == M.NumCols()));
// Cblas has no function *sprk (i.e. symmetric packed rank-k update), so we
// use a regular matrix as temporary storage, of which we only access the
// lower triangle.
MatrixIndexT this_dim = this->NumRows(),
m_other_dim = (transM == kNoTrans ? M.NumCols() : M.NumRows());
if (this_dim == 0) return;
if (alpha == 0.0) {
if (beta != 1.0) this->Scale(beta);
return;
}
Matrix<Real> temp_mat(*this); // wastefully copies upper triangle too, but this
// doesn't dominate O(N) time.
// This function call is hard-coded to update the lower triangle.
cblas_Xsyrk(transM, this_dim, m_other_dim, alpha, M.Data(),
M.Stride(), beta, temp_mat.Data(), temp_mat.Stride());
this->CopyFromMat(temp_mat, kTakeLower);
}
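// [Editor's sketch -- illustrative only.] The rank-N update above computes
// S = beta*S + alpha * M * M^T (or M^T * M for kTrans); a hypothetical check
// against an explicit full-matrix product:
#if 0
void ExampleAddMat2() {
  Matrix<double> M(3, 7);
  M.SetRandn();
  SpMatrix<double> S(3);
  S.AddMat2(1.0, M, kNoTrans, 0.0);      // S = M * M^T
  Matrix<double> P(3, 3);
  P.AddMatMat(1.0, M, kNoTrans, M, kTrans, 0.0);
  SpMatrix<double> S2(P, kTakeLower);    // P is symmetric already
  KALDI_ASSERT(S.ApproxEqual(S2, 1e-6));
}
#endif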
template<typename Real>
void SpMatrix<Real>::AddTp2Sp(const Real alpha, const TpMatrix<Real> &T,
MatrixTransposeType transM, const SpMatrix<Real> &A,
const Real beta) {
Matrix<Real> Tmat(T);
AddMat2Sp(alpha, Tmat, transM, A, beta);
}
template<typename Real>
void SpMatrix<Real>::AddVecVec(const Real alpha, const VectorBase<Real> &v,
const VectorBase<Real> &w) {
int32 dim = this->NumRows();
KALDI_ASSERT(dim == v.Dim() && dim == w.Dim() && dim > 0);
cblas_Xspr2(dim, alpha, v.Data(), 1, w.Data(), 1, this->data_);
}
template<typename Real>
void SpMatrix<Real>::AddTp2(const Real alpha, const TpMatrix<Real> &T,
MatrixTransposeType transM, const Real beta) {
Matrix<Real> Tmat(T);
AddMat2(alpha, Tmat, transM, beta);
}
// Explicit instantiation of the class.
// This needs to be after the definition of all the class member functions.
template class SpMatrix<float>;
template class SpMatrix<double>;
template<typename Real>
Real TraceSpSpLower(const SpMatrix<Real> &A, const SpMatrix<Real> &B) {
MatrixIndexT adim = A.NumRows();
KALDI_ASSERT(adim == B.NumRows());
MatrixIndexT dim = (adim*(adim+1))/2;
return cblas_Xdot(dim, A.Data(), 1, B.Data(), 1);
}
// Instantiate the template above.
template
double TraceSpSpLower(const SpMatrix<double> &A, const SpMatrix<double> &B);
template
float TraceSpSpLower(const SpMatrix<float> &A, const SpMatrix<float> &B);
// Instantiate the template above.
template float SolveQuadraticMatrixProblem(const SpMatrix<float> &Q,
const MatrixBase<float> &Y,
const SpMatrix<float> &SigmaInv,
const SolverOptions &opts,
MatrixBase<float> *M);
template double SolveQuadraticMatrixProblem(const SpMatrix<double> &Q,
const MatrixBase<double> &Y,
const SpMatrix<double> &SigmaInv,
const SolverOptions &opts,
MatrixBase<double> *M);
// Instantiate the template above.
template float SolveDoubleQuadraticMatrixProblem(
const MatrixBase<float> &G,
const SpMatrix<float> &P1,
const SpMatrix<float> &P2,
const SpMatrix<float> &Q1,
const SpMatrix<float> &Q2,
const SolverOptions &opts,
MatrixBase<float> *M);
template double SolveDoubleQuadraticMatrixProblem(
const MatrixBase<double> &G,
const SpMatrix<double> &P1,
const SpMatrix<double> &P2,
const SpMatrix<double> &Q1,
const SpMatrix<double> &Q2,
const SolverOptions &opts,
MatrixBase<double> *M);
} // namespace kaldi
// matrix/sp-matrix.h
// Copyright 2009-2011 Ondrej Glembek; Microsoft Corporation; Lukas Burget;
// Saarland University; Ariya Rastrow; Yanmin Qian;
// Jan Silovsky
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_SP_MATRIX_H_
#define KALDI_MATRIX_SP_MATRIX_H_
#include <algorithm>
#include <vector>
#include "matrix/packed-matrix.h"
namespace kaldi {
/// \addtogroup matrix_group
/// @{
template<typename Real> class SpMatrix;
/**
 * @brief Packed symmetric matrix class
*/
template<typename Real>
class SpMatrix : public PackedMatrix<Real> {
friend class CuSpMatrix<Real>;
public:
// so it can use our assignment operator.
friend class std::vector<Matrix<Real> >;
SpMatrix(): PackedMatrix<Real>() {}
/// Copy constructor from CUDA version of SpMatrix
/// This is defined in ../cudamatrix/cu-sp-matrix.h
explicit SpMatrix(const CuSpMatrix<Real> &cu);
explicit SpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
: PackedMatrix<Real>(r, resize_type) {}
SpMatrix(const SpMatrix<Real> &orig)
: PackedMatrix<Real>(orig) {}
template<typename OtherReal>
explicit SpMatrix(const SpMatrix<OtherReal> &orig)
: PackedMatrix<Real>(orig) {}
#ifdef KALDI_PARANOID
explicit SpMatrix(const MatrixBase<Real> & orig,
SpCopyType copy_type = kTakeMeanAndCheck)
: PackedMatrix<Real>(orig.NumRows(), kUndefined) {
CopyFromMat(orig, copy_type);
}
#else
explicit SpMatrix(const MatrixBase<Real> & orig,
SpCopyType copy_type = kTakeMean)
: PackedMatrix<Real>(orig.NumRows(), kUndefined) {
CopyFromMat(orig, copy_type);
}
#endif
/// Shallow swap.
void Swap(SpMatrix *other);
inline void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero) {
PackedMatrix<Real>::Resize(nRows, resize_type);
}
void CopyFromSp(const SpMatrix<Real> &other) {
PackedMatrix<Real>::CopyFromPacked(other);
}
template<typename OtherReal>
void CopyFromSp(const SpMatrix<OtherReal> &other) {
PackedMatrix<Real>::CopyFromPacked(other);
}
#ifdef KALDI_PARANOID
void CopyFromMat(const MatrixBase<Real> &orig,
SpCopyType copy_type = kTakeMeanAndCheck);
#else // different default arg if non-paranoid mode.
void CopyFromMat(const MatrixBase<Real> &orig,
SpCopyType copy_type = kTakeMean);
#endif
inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
// if the column index is greater than the row index, swap them, since
// only one triangle is stored... only allowed for const matrix object.
if (static_cast<UnsignedMatrixIndexT>(c) >
static_cast<UnsignedMatrixIndexT>(r))
std::swap(c, r);
// c<=r now so don't have to check c.
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
return *(this->data_ + (r*(r+1)) / 2 + c);
// Duplicating code from PackedMatrix.h
}
inline Real &operator() (MatrixIndexT r, MatrixIndexT c) {
if (static_cast<UnsignedMatrixIndexT>(c) >
static_cast<UnsignedMatrixIndexT>(r))
std::swap(c, r);
// c<=r now so don't have to check c.
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
return *(this->data_ + (r * (r + 1)) / 2 + c);
// Duplicating code from PackedMatrix.h
}
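// [Editor's note -- illustrative only.] For c <= r, element (r, c) lives at
// flat offset r*(r+1)/2 + c, i.e. the lower triangle laid out row by row; for
// a 3x3 matrix the packed order is (0,0), (1,0), (1,1), (2,0), (2,1), (2,2).
// A hypothetical helper showing the same mapping as the operators above:
#if 0
static inline MatrixIndexT PackedIndex(MatrixIndexT r, MatrixIndexT c) {
  if (c > r) std::swap(r, c);    // symmetry: (r,c) and (c,r) share storage
  return (r * (r + 1)) / 2 + c;  // triangular-number offset of row r, plus c
}
#endif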
SpMatrix<Real>& operator=(const SpMatrix<Real> &other) {
PackedMatrix<Real>::operator=(other);
return *this;
}
using PackedMatrix<Real>::Scale;
/// matrix inverse.
/// if inverse_needed = false, will fill matrix with garbage.
/// (only useful if logdet wanted).
void Invert(Real *logdet = NULL, Real *det_sign= NULL,
bool inverse_needed = true);
// Below routine does inversion in double precision,
// even for single-precision object.
void InvertDouble(Real *logdet = NULL, Real *det_sign = NULL,
bool inverse_needed = true);
/// Returns maximum ratio of singular values.
inline Real Cond() const {
Matrix<Real> tmp(*this);
return tmp.Cond();
}
/// Takes matrix to a fraction power via Svd.
/// Will throw exception if matrix is not positive semidefinite
/// (to within a tolerance)
void ApplyPow(Real exponent);
/// This is the version of SVD that we implement for symmetric positive
/// definite matrices. This exists for historical reasons; right now its
/// internal implementation is the same as Eig(). It computes the eigenvalue
/// decomposition (*this) = P * diag(s) * P^T with P orthogonal. Will throw
/// exception if input is not positive semidefinite to within a tolerance.
void SymPosSemiDefEig(VectorBase<Real> *s, MatrixBase<Real> *P,
Real tolerance = 0.001) const;
/// Solves the symmetric eigenvalue problem: at end we should have (*this) = P
/// * diag(s) * P^T. We solve the problem using the symmetric QR method.
/// P may be NULL.
/// Implemented in qr.cc.
/// If you need the eigenvalues sorted, the function SortSvd declared in
/// kaldi-matrix is suitable.
void Eig(VectorBase<Real> *s, MatrixBase<Real> *P = NULL) const;
/// This function gives you, approximately, the largest eigenvalues of the
/// symmetric matrix and the corresponding eigenvectors. (largest meaning,
/// further from zero). It does this by doing a SVD within the Krylov
/// subspace generated by this matrix and a random vector. This is
/// a form of the Lanczos method with complete reorthogonalization, followed
/// by SVD within a smaller dimension ("lanczos_dim").
///
/// If *this is m by m, s should be of dimension n and P should be of
/// dimension m by n, with n <= m. The *columns* of P are the approximate
/// eigenvectors; P * diag(s) * P^T would be a low-rank reconstruction of
/// *this. The columns of P will be orthogonal, and the elements of s will be
/// the eigenvalues of *this projected into that subspace, but beyond that
/// there are no exact guarantees. (This is because the convergence of this
/// method is statistical). Note: it only makes sense to use this
/// method if you are in very high dimension and n is substantially smaller
/// than m: for example, if you want the 100 top eigenvalues of a 10k by 10k
/// matrix. This function calls Rand() to initialize the lanczos
/// iterations and also for restarting.
/// If lanczos_dim is zero, it will default to the greater of:
/// s->Dim() + 50 or s->Dim() + s->Dim()/2, but not more than this->Dim().
/// If lanczos_dim == this->Dim(), you might as well just call the function
/// Eig() since the result will be the same, and Eig() would be faster; the
/// whole point of this function is to reduce the dimension of the SVD
/// computation.
void TopEigs(VectorBase<Real> *s, MatrixBase<Real> *P,
MatrixIndexT lanczos_dim = 0) const;
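// [Editor's sketch -- illustrative, hypothetical usage.] E.g., to get
// roughly the top 100 eigenpairs of a large S, following the dimensions
// described above:
#if 0
void ExampleTopEigs(const SpMatrix<double> &S) {  // say S is 10000 x 10000
  Vector<double> s(100);                 // approximate top eigenvalues
  Matrix<double> P(S.NumRows(), 100);    // columns: approximate eigenvectors
  S.TopEigs(&s, &P);                     // lanczos_dim left at its default
  // P * diag(s) * P^T is now a rank-100 approximation of S.
}
#endif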
/// Returns the maximum of the absolute values of any of the
/// eigenvalues.
Real MaxAbsEig() const;
void PrintEigs(const char *name) {
Vector<Real> s((*this).NumRows());
Matrix<Real> P((*this).NumRows(), (*this).NumCols());
SymPosSemiDefEig(&s, &P);
KALDI_LOG << "PrintEigs: " << name << ": " << s;
}
bool IsPosDef() const; // returns true if Cholesky succeeds.
void AddSp(const Real alpha, const SpMatrix<Real> &Ma) {
this->AddPacked(alpha, Ma);
}
/// Computes log determinant but only for +ve-def matrices
/// (it uses Cholesky).
/// If matrix is not +ve-def, it will throw an exception.
/// (This function was formerly named LogPDDeterminant().)
Real LogPosDefDet() const;
Real LogDet(Real *det_sign = NULL) const;
/// rank-one update, this <-- this + alpha v v'
template<typename OtherReal>
void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
/// rank-two update, this <-- this + alpha (v w' + w v').
void AddVecVec(const Real alpha, const VectorBase<Real> &v,
const VectorBase<Real> &w);
/// Does *this = beta * *this + alpha * diag(v) * S * diag(v)
void AddVec2Sp(const Real alpha, const VectorBase<Real> &v,
const SpMatrix<Real> &S, const Real beta);
/// diagonal update, this <-- this + diag(v)
template<typename OtherReal>
void AddDiagVec(const Real alpha, const VectorBase<OtherReal> &v);
/// rank-N update:
/// if (transM == kNoTrans)
/// (*this) = beta*(*this) + alpha * M * M^T,
/// or (if transM == kTrans)
/// (*this) = beta*(*this) + alpha * M^T * M
/// Note: beta used to default to 0.0.
void AddMat2(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const Real beta);
/// Extension of rank-N update:
/// this <-- beta*this + alpha * M * A * M^T.
/// (*this) and A are allowed to be the same.
/// If transM == kTrans, then we do it as M^T * A * M.
void AddMat2Sp(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const SpMatrix<Real> &A,
const Real beta = 0.0);
/// This is a version of AddMat2Sp specialized for when M is fairly sparse.
/// This was required for making the raw-fMLLR code efficient.
void AddSmat2Sp(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const SpMatrix<Real> &A,
const Real beta = 0.0);
/// The following function does:
/// this <-- beta*this + alpha * T * A * T^T.
/// (*this) and A are allowed to be the same.
/// If transM == kTrans, then we do it as alpha * T^T * A * T.
/// Currently it just calls AddMat2Sp, but if needed we
/// can implement it more efficiently.
void AddTp2Sp(const Real alpha, const TpMatrix<Real> &T,
MatrixTransposeType transM, const SpMatrix<Real> &A,
const Real beta = 0.0);
/// The following function does:
/// this <-- beta*this + alpha * T * T^T.
/// If transM == kTrans, then we do it as alpha * T^T * T
/// Currently it just calls AddMat2, but if needed we
/// can implement it more efficiently.
void AddTp2(const Real alpha, const TpMatrix<Real> &T,
MatrixTransposeType transM, const Real beta = 0.0);
/// Extension of rank-N update:
/// this <-- beta*this + alpha * M * diag(v) * M^T.
/// if transM == kTrans, then
/// this <-- beta*this + alpha * M^T * diag(v) * M.
void AddMat2Vec(const Real alpha, const MatrixBase<Real> &M,
MatrixTransposeType transM, const VectorBase<Real> &v,
const Real beta = 0.0);
/// Floors this symmetric matrix to the matrix
/// alpha * Floor, where the matrix Floor is positive
/// definite.
/// It is floored in the sense that after flooring,
/// x^T (*this) x >= x^T (alpha*Floor) x.
/// This is accomplished using an Svd. It will crash
/// if Floor is not positive definite. Returns the number of
/// elements that were floored.
int ApplyFloor(const SpMatrix<Real> &Floor, Real alpha = 1.0,
bool verbose = false);
/// Floor: Given a positive semidefinite matrix, floors the eigenvalues
/// to the specified quantity. A previous version of this function had
/// a tolerance which is now no longer needed since we have code to
/// do the symmetric eigenvalue decomposition and no longer use the SVD
/// code for that purpose.
int ApplyFloor(Real floor);
bool IsDiagonal(Real cutoff = 1.0e-05) const;
bool IsUnit(Real cutoff = 1.0e-05) const;
bool IsZero(Real cutoff = 1.0e-05) const;
bool IsTridiagonal(Real cutoff = 1.0e-05) const;
/// sqrt of sum of square elements.
Real FrobeniusNorm() const;
/// Returns true if ((*this)-other).FrobeniusNorm() <=
/// tol*(*this).FrobeniusNorm()
bool ApproxEqual(const SpMatrix<Real> &other, float tol = 0.01) const;
// LimitCond:
// Limits the condition of symmetric positive semidefinite matrix to
// a specified value
// by flooring all eigenvalues to a positive number which is some multiple
// of the largest one (or zero if there are no positive eigenvalues).
// Takes the condition number we are willing to accept, and floors
// eigenvalues to the largest eigenvalue divided by this.
// Returns #eigs floored or already equal to the floor.
// Throws exception if input is not positive definite.
// returns #floored.
MatrixIndexT LimitCond(Real maxCond = 1.0e+5, bool invert = false);
// as LimitCond but all done in double precision. // returns #floored.
MatrixIndexT LimitCondDouble(Real maxCond = 1.0e+5, bool invert = false) {
SpMatrix<double> dmat(*this);
MatrixIndexT ans = dmat.LimitCond(maxCond, invert);
(*this).CopyFromSp(dmat);
return ans;
}
Real Trace() const;
/// Tridiagonalize the matrix with an orthogonal transformation. If
/// *this starts as S, produce T (and Q, if non-NULL) such that
/// T = Q S Q^T, i.e. S = Q^T T Q. Caution: this is the other way
/// round from most authors (it's more efficient in row-major indexing).
void Tridiagonalize(MatrixBase<Real> *Q);
/// The symmetric QR algorithm. This will mostly be useful in internal code.
/// Typically, you will call this after Tridiagonalize(), on the same object.
/// When called, *this (call it A at this point) must be tridiagonal; at exit,
/// *this will be a diagonal matrix D that is similar to A via orthogonal
/// transformations. This algorithm right-multiplies Q by orthogonal
/// transformations. It turns *this from a tridiagonal into a diagonal matrix
/// while maintaining that (Q *this Q^T) has the same value at entry and exit.
/// At entry Q should probably be either NULL or orthogonal, but we don't check
/// this.
void Qr(MatrixBase<Real> *Q);
private:
void EigInternal(VectorBase<Real> *s, MatrixBase<Real> *P,
Real tolerance, int recurse) const;
};
/// @} end of "addtogroup matrix_group"
/// \addtogroup matrix_funcs_scalar
/// @{
/// Returns tr(A B).
float TraceSpSp(const SpMatrix<float> &A, const SpMatrix<float> &B);
double TraceSpSp(const SpMatrix<double> &A, const SpMatrix<double> &B);
template<typename Real>
inline bool ApproxEqual(const SpMatrix<Real> &A,
const SpMatrix<Real> &B, Real tol = 0.01) {
return A.ApproxEqual(B, tol);
}
template<typename Real>
inline void AssertEqual(const SpMatrix<Real> &A,
const SpMatrix<Real> &B, Real tol = 0.01) {
KALDI_ASSERT(ApproxEqual(A, B, tol));
}
/// Returns tr(A B).
template<typename Real, typename OtherReal>
Real TraceSpSp(const SpMatrix<Real> &A, const SpMatrix<OtherReal> &B);
// TraceSpSpLower is the same as Trace(A B) except the lower-diagonal elements
// are counted only once not twice as they should be. It is useful in certain
// optimizations.
template<typename Real>
Real TraceSpSpLower(const SpMatrix<Real> &A, const SpMatrix<Real> &B);
/// Returns tr(A B).
/// No option to transpose B because it would make no difference.
template<typename Real>
Real TraceSpMat(const SpMatrix<Real> &A, const MatrixBase<Real> &B);
/// Returns tr(A B C)
/// (A and C may be transposed as specified by transA and transC).
template<typename Real>
Real TraceMatSpMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
const SpMatrix<Real> &B, const MatrixBase<Real> &C,
MatrixTransposeType transC);
/// Returns tr (A B C D)
/// (A and C may be transposed as specified by transA and transC).
template<typename Real>
Real TraceMatSpMatSp(const MatrixBase<Real> &A, MatrixTransposeType transA,
const SpMatrix<Real> &B, const MatrixBase<Real> &C,
MatrixTransposeType transC, const SpMatrix<Real> &D);
/// Returns \f$ v_1^T M v_2 \f$.
/// Not as efficient as it could be where v1 == v2
/// (but no suitable blas routines are available).
template<typename Real>
Real VecSpVec(const VectorBase<Real> &v1, const SpMatrix<Real> &M,
const VectorBase<Real> &v2);
/// @} \addtogroup matrix_funcs_scalar
/// \addtogroup matrix_funcs_misc
/// @{
/// This class describes the options for maximizing various quadratic objective
/// functions. It's mostly as described in the SGMM paper "the subspace
/// Gaussian mixture model -- a structured model for speech recognition", but
/// the diagonal_precondition option is newly added, to handle problems where
/// different dimensions have very different scaling (we recommend to use the
/// option but it's set false for backward compatibility).
struct SolverOptions {
BaseFloat K; // maximum condition number
BaseFloat eps;
std::string name;
bool optimize_delta;
bool diagonal_precondition;
bool print_debug_output;
explicit SolverOptions(const std::string &name):
K(1.0e+4), eps(1.0e-40), name(name),
optimize_delta(true), diagonal_precondition(false),
print_debug_output(true) { }
SolverOptions(): K(1.0e+4), eps(1.0e-40), name("[unknown]"),
optimize_delta(true), diagonal_precondition(false),
print_debug_output(true) { }
void Check() const;
};
/// Maximizes the auxiliary function
/// \f[ Q(x) = x.g - 0.5 x^T H x \f]
/// using a numerically stable method. Like a numerically stable version of
/// \f$ x := Q^{-1} g. \f$
/// Assumes H positive semidefinite.
/// Returns the objective-function change.
template<typename Real>
Real SolveQuadraticProblem(const SpMatrix<Real> &H,
const VectorBase<Real> &g,
const SolverOptions &opts,
VectorBase<Real> *x);
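// [Editor's sketch -- illustrative, hypothetical usage.] Maximizing
// Q(x) = x.g - 0.5 x^T H x for given H and g:
#if 0
void ExampleSolveQuadraticProblem(const SpMatrix<double> &H,
                                  const Vector<double> &g) {
  Vector<double> x(g.Dim());          // current value; overwritten with argmax
  SolverOptions opts("example");      // the name appears in debug output
  opts.diagonal_precondition = true;  // recommended when dims scale unevenly
  double objf_impr = SolveQuadraticProblem(H, g, opts, &x);
  KALDI_LOG << "Objective-function improvement: " << objf_impr;
}
#endif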
/// Maximizes the auxiliary function :
/// \f[ Q(x) = tr(M^T P Y) - 0.5 tr(P M Q M^T) \f]
/// Like a numerically stable version of \f$ M := Y Q^{-1} \f$.
/// Assumes Q and P positive semidefinite, and matrix dimensions match
/// enough to make expressions meaningful.
/// This is mostly as described in the SGMM paper "the subspace Gaussian mixture
/// model -- a structured model for speech recognition", but the
/// diagonal_precondition option is newly added, to handle problems
/// where different dimensions have very different scaling (we recommend to use
/// the option but it's set false for backward compatibility).
template<typename Real>
Real SolveQuadraticMatrixProblem(const SpMatrix<Real> &Q,
const MatrixBase<Real> &Y,
const SpMatrix<Real> &P,
const SolverOptions &opts,
MatrixBase<Real> *M);
/// Maximizes the auxiliary function :
/// \f[ Q(M) = tr(M^T G) -0.5 tr(P_1 M Q_1 M^T) -0.5 tr(P_2 M Q_2 M^T). \f]
/// Encountered in matrix update with a prior. We also apply a limit on the
/// condition but it should be less frequently necessary, and can be set larger.
template<typename Real>
Real SolveDoubleQuadraticMatrixProblem(const MatrixBase<Real> &G,
const SpMatrix<Real> &P1,
const SpMatrix<Real> &P2,
const SpMatrix<Real> &Q1,
const SpMatrix<Real> &Q2,
const SolverOptions &opts,
MatrixBase<Real> *M);
/// @} End of "addtogroup matrix_funcs_misc"
} // namespace kaldi
// Including the implementation (now actually just includes some
// template specializations).
#include "matrix/sp-matrix-inl.h"
#endif // KALDI_MATRIX_SP_MATRIX_H_
// matrix/sparse-matrix.cc
// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
// 2015 Guoguo Chen
// 2017 Shiyin Kang
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <limits>
#include <string>
#include "matrix/sparse-matrix.h"
#include "matrix/kaldi-matrix.h"
namespace kaldi {
template <typename Real>
std::pair<MatrixIndexT, Real>* SparseVector<Real>::Data() {
if (pairs_.empty())
return NULL;
else
return &(pairs_[0]);
}
template <typename Real>
const std::pair<MatrixIndexT, Real>* SparseVector<Real>::Data() const {
if (pairs_.empty())
return NULL;
else
return &(pairs_[0]);
}
template <typename Real>
Real SparseVector<Real>::Sum() const {
Real sum = 0;
for (int32 i = 0; i < pairs_.size(); ++i) {
sum += pairs_[i].second;
}
return sum;
}
template<typename Real>
void SparseVector<Real>::Scale(Real alpha) {
for (int32 i = 0; i < pairs_.size(); ++i)
pairs_[i].second *= alpha;
}
template <typename Real>
template <typename OtherReal>
void SparseVector<Real>::CopyElementsToVec(VectorBase<OtherReal> *vec) const {
KALDI_ASSERT(vec->Dim() == this->dim_);
vec->SetZero();
OtherReal *other_data = vec->Data();
typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
iter = pairs_.begin(), end = pairs_.end();
for (; iter != end; ++iter)
other_data[iter->first] = iter->second;
}
template
void SparseVector<float>::CopyElementsToVec(VectorBase<float> *vec) const;
template
void SparseVector<float>::CopyElementsToVec(VectorBase<double> *vec) const;
template
void SparseVector<double>::CopyElementsToVec(VectorBase<float> *vec) const;
template
void SparseVector<double>::CopyElementsToVec(VectorBase<double> *vec) const;
template <typename Real>
template <typename OtherReal>
void SparseVector<Real>::AddToVec(Real alpha,
VectorBase<OtherReal> *vec) const {
KALDI_ASSERT(vec->Dim() == dim_);
OtherReal *other_data = vec->Data();
typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
iter = pairs_.begin(), end = pairs_.end();
if (alpha == 1.0) { // treat alpha==1.0 case specially.
for (; iter != end; ++iter)
other_data[iter->first] += iter->second;
} else {
for (; iter != end; ++iter)
other_data[iter->first] += alpha * iter->second;
}
}
template
void SparseVector<float>::AddToVec(float alpha, VectorBase<float> *vec) const;
template
void SparseVector<float>::AddToVec(float alpha, VectorBase<double> *vec) const;
template
void SparseVector<double>::AddToVec(double alpha, VectorBase<float> *vec) const;
template
void SparseVector<double>::AddToVec(double alpha,
VectorBase<double> *vec) const;
template <typename Real>
template <typename OtherReal>
void SparseVector<Real>::CopyFromSvec(const SparseVector<OtherReal> &other) {
dim_ = other.Dim();
pairs_.clear();
if (dim_ == 0) return;
for (int32 i = 0; i < other.NumElements(); ++i) {
pairs_.push_back(std::make_pair(
other.GetElement(i).first,
static_cast<Real>(other.GetElement(i).second)));
}
}
template
void SparseVector<float>::CopyFromSvec(const SparseVector<float> &svec);
template
void SparseVector<float>::CopyFromSvec(const SparseVector<double> &svec);
template
void SparseVector<double>::CopyFromSvec(const SparseVector<float> &svec);
template
void SparseVector<double>::CopyFromSvec(const SparseVector<double> &svec);
template <typename Real>
SparseVector<Real>& SparseVector<Real>::operator = (
const SparseVector<Real> &other) {
this->CopyFromSvec(other);
return *this;
}
template <typename Real>
void SparseVector<Real>::Swap(SparseVector<Real> *other) {
pairs_.swap(other->pairs_);
std::swap(dim_, other->dim_);
}
template <typename Real>
void SparseVector<Real>::Write(std::ostream &os, bool binary) const {
if (binary) {
WriteToken(os, binary, "SV");
WriteBasicType(os, binary, dim_);
MatrixIndexT num_elems = pairs_.size();
WriteBasicType(os, binary, num_elems);
typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
iter = pairs_.begin(), end = pairs_.end();
for (; iter != end; ++iter) {
WriteBasicType(os, binary, iter->first);
WriteBasicType(os, binary, iter->second);
}
} else {
// In text-mode, use a human-friendly, script-friendly format;
// format is "dim=5 [ 0 0.2 3 0.9 ] "
os << "dim=" << dim_ << " [ ";
typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
iter = pairs_.begin(), end = pairs_.end();
for (; iter != end; ++iter)
os << iter->first << ' ' << iter->second << ' ';
os << "] ";
}
}
template <typename Real>
void SparseVector<Real>::Read(std::istream &is, bool binary) {
if (binary) {
ExpectToken(is, binary, "SV");
ReadBasicType(is, binary, &dim_);
KALDI_ASSERT(dim_ >= 0);
int32 num_elems;
ReadBasicType(is, binary, &num_elems);
KALDI_ASSERT(num_elems >= 0 && num_elems <= dim_);
pairs_.resize(num_elems);
typename std::vector<std::pair<MatrixIndexT, Real> >::iterator
iter = pairs_.begin(), end = pairs_.end();
for (; iter != end; ++iter) {
ReadBasicType(is, binary, &(iter->first));
ReadBasicType(is, binary, &(iter->second));
}
} else {
// In text-mode, the format is "dim=5 [ 0 0.2 3 0.9 ] "
std::string str;
is >> str;
if (str.substr(0, 4) != "dim=")
KALDI_ERR << "Reading sparse vector, expected 'dim=xxx', got " << str;
std::string dim_str = str.substr(4, std::string::npos);
std::istringstream dim_istr(dim_str);
int32 dim = -1;
dim_istr >> dim;
if (dim < 0 || dim_istr.fail()) {
KALDI_ERR << "Reading sparse vector, expected 'dim=[int]', got " << str;
}
dim_ = dim;
is >> std::ws;
is >> str;
if (str != "[")
KALDI_ERR << "Reading sparse vector, expected '[', got " << str;
pairs_.clear();
while (1) {
is >> std::ws;
if (is.peek() == ']') {
is.get();
break;
}
MatrixIndexT i;
BaseFloat p;
is >> i >> p;
if (is.fail())
KALDI_ERR << "Error reading sparse vector, expecting numbers.";
KALDI_ASSERT(i >= 0 && i < dim
&& (pairs_.empty() || i > pairs_.back().first));
pairs_.push_back(std::pair<MatrixIndexT, BaseFloat>(i, p));
}
}
}
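// [Editor's sketch -- illustrative only; assumes <sstream>.] Round-tripping
// the text format documented above ("dim=5 [ 0 0.2 3 0.9 ] "):
#if 0
void ExampleSparseVectorIo() {
  std::vector<std::pair<MatrixIndexT, float> > pairs;
  pairs.push_back(std::make_pair(0, 0.2f));
  pairs.push_back(std::make_pair(3, 0.9f));
  SparseVector<float> v(5, pairs);   // dim 5, two stored elements
  std::ostringstream os;
  v.Write(os, false);                // text mode
  std::istringstream is(os.str());
  SparseVector<float> v2;
  v2.Read(is, false);
  KALDI_ASSERT(v2.Dim() == 5 && v2.NumElements() == 2);
}
#endif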
namespace sparse_vector_utils {
template <typename Real>
struct CompareFirst {
inline bool operator() (const std::pair<MatrixIndexT, Real> &p1,
const std::pair<MatrixIndexT, Real> &p2) const {
return p1.first < p2.first;
}
};
}
template <typename Real>
SparseVector<Real>::SparseVector(
MatrixIndexT dim, const std::vector<std::pair<MatrixIndexT, Real> > &pairs):
dim_(dim),
pairs_(pairs) {
std::sort(pairs_.begin(), pairs_.end(),
sparse_vector_utils::CompareFirst<Real>());
typename std::vector<std::pair<MatrixIndexT, Real> >::iterator
out = pairs_.begin(), in = out, end = pairs_.end();
// special case: while there is nothing to be changed, skip over
// initial input (avoids unnecessary copying).
while (in + 1 < end && in[0].first != in[1].first && in[0].second != 0.0) {
in++;
out++;
}
while (in < end) {
// We reach this point only at the first element of
// each stretch of identical .first elements.
*out = *in;
++in;
while (in < end && in->first == out->first) {
out->second += in->second; // this is the merge operation.
++in;
}
if (out->second != Real(0.0)) // Don't keep zero elements.
out++;
}
pairs_.erase(out, end);
if (!pairs_.empty()) {
// range check.
KALDI_ASSERT(pairs_.front().first >= 0 && pairs_.back().first < dim_);
}
}
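// [Editor's sketch -- illustrative only.] The constructor above sorts the
// pairs, sums duplicate indexes, and drops exact zeros; a hypothetical
// example:
#if 0
void ExampleDuplicateMerge() {
  std::vector<std::pair<MatrixIndexT, double> > pairs;
  pairs.push_back(std::make_pair(2, 1.5));
  pairs.push_back(std::make_pair(2, 0.5));  // duplicate index: summed
  pairs.push_back(std::make_pair(4, 0.0));  // exact zero: dropped
  SparseVector<double> v(6, pairs);
  KALDI_ASSERT(v.NumElements() == 1);       // only (2, 2.0) survives
}
#endif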
template <typename Real>
void SparseVector<Real>::SetRandn(BaseFloat zero_prob) {
pairs_.clear();
KALDI_ASSERT(zero_prob >= 0 && zero_prob <= 1.0);
for (MatrixIndexT i = 0; i < dim_; i++)
if (WithProb(1.0 - zero_prob))
pairs_.push_back(std::pair<MatrixIndexT, Real>(i, RandGauss()));
}
template <typename Real>
void SparseVector<Real>::Resize(MatrixIndexT dim,
MatrixResizeType resize_type) {
if (resize_type != kCopyData || dim == 0)
pairs_.clear();
KALDI_ASSERT(dim >= 0);
if (dim < dim_ && resize_type == kCopyData)
while (!pairs_.empty() && pairs_.back().first >= dim)
pairs_.pop_back();
dim_ = dim;
}
template <typename Real>
MatrixIndexT SparseMatrix<Real>::NumRows() const {
return rows_.size();
}
template <typename Real>
MatrixIndexT SparseMatrix<Real>::NumCols() const {
if (rows_.empty())
return 0;
else
return rows_[0].Dim();
}
template <typename Real>
MatrixIndexT SparseMatrix<Real>::NumElements() const {
int32 num_elements = 0;
for (int32 i = 0; i < rows_.size(); ++i) {
num_elements += rows_[i].NumElements();
}
return num_elements;
}
template <typename Real>
SparseVector<Real>* SparseMatrix<Real>::Data() {
if (rows_.empty())
return NULL;
else
return rows_.data();
}
template <typename Real>
const SparseVector<Real>* SparseMatrix<Real>::Data() const {
if (rows_.empty())
return NULL;
else
return rows_.data();
}
template <typename Real>
Real SparseMatrix<Real>::Sum() const {
Real sum = 0;
for (int32 i = 0; i < rows_.size(); ++i) {
sum += rows_[i].Sum();
}
return sum;
}
template<typename Real>
Real SparseMatrix<Real>::FrobeniusNorm() const {
Real squared_sum = 0;
for (int32 i = 0; i < rows_.size(); ++i) {
const std::pair<MatrixIndexT, Real> *row_data = rows_[i].Data();
for (int32 j = 0; j < rows_[i].NumElements(); ++j) {
squared_sum += row_data[j].second * row_data[j].second;
}
}
return std::sqrt(squared_sum);
}
template <typename Real>
template <typename OtherReal>
void SparseMatrix<Real>::CopyToMat(MatrixBase<OtherReal> *other,
MatrixTransposeType trans) const {
if (trans == kNoTrans) {
MatrixIndexT num_rows = rows_.size();
KALDI_ASSERT(other->NumRows() == num_rows);
for (MatrixIndexT i = 0; i < num_rows; i++) {
SubVector<OtherReal> vec(*other, i);
rows_[i].CopyElementsToVec(&vec);
}
} else {
OtherReal *other_col_data = other->Data();
MatrixIndexT other_stride = other->Stride(),
num_rows = NumRows(), num_cols = NumCols();
KALDI_ASSERT(num_rows == other->NumCols() && num_cols == other->NumRows());
other->SetZero();
for (MatrixIndexT row = 0; row < num_rows; row++, other_col_data++) {
const SparseVector<Real> &svec = rows_[row];
MatrixIndexT num_elems = svec.NumElements();
const std::pair<MatrixIndexT, Real> *sdata = svec.Data();
for (MatrixIndexT e = 0; e < num_elems; e++)
other_col_data[sdata[e].first * other_stride] = sdata[e].second;
}
}
}
template
void SparseMatrix<float>::CopyToMat(MatrixBase<float> *other,
MatrixTransposeType trans) const;
template
void SparseMatrix<float>::CopyToMat(MatrixBase<double> *other,
MatrixTransposeType trans) const;
template
void SparseMatrix<double>::CopyToMat(MatrixBase<float> *other,
MatrixTransposeType trans) const;
template
void SparseMatrix<double>::CopyToMat(MatrixBase<double> *other,
MatrixTransposeType trans) const;
template <typename Real>
void SparseMatrix<Real>::CopyElementsToVec(VectorBase<Real> *other) const {
KALDI_ASSERT(other->Dim() == NumElements());
Real *dst_data = other->Data();
int32 dst_index = 0;
for (int32 i = 0; i < rows_.size(); ++i) {
for (int32 j = 0; j < rows_[i].NumElements(); ++j) {
dst_data[dst_index] =
static_cast<Real>(rows_[i].GetElement(j).second);
dst_index++;
}
}
}
template<typename Real>
template<typename OtherReal>
void SparseMatrix<Real>::CopyFromSmat(const SparseMatrix<OtherReal> &other,
MatrixTransposeType trans) {
if (trans == kNoTrans) {
rows_.resize(other.NumRows());
if (rows_.size() == 0)
return;
for (int32 r = 0; r < rows_.size(); ++r) {
rows_[r].CopyFromSvec(other.Row(r));
}
} else {
std::vector<std::vector<std::pair<MatrixIndexT, Real> > > pairs(
other.NumCols());
for (MatrixIndexT i = 0; i < other.NumRows(); ++i) {
for (int id = 0; id < other.Row(i).NumElements(); ++id) {
MatrixIndexT j = other.Row(i).GetElement(id).first;
Real v = static_cast<Real>(other.Row(i).GetElement(id).second);
pairs[j].push_back( { i, v });
}
}
SparseMatrix<Real> temp(other.NumRows(), pairs);
Swap(&temp);
}
}
template
void SparseMatrix<float>::CopyFromSmat(const SparseMatrix<float> &other,
MatrixTransposeType trans);
template
void SparseMatrix<float>::CopyFromSmat(const SparseMatrix<double> &other,
MatrixTransposeType trans);
template
void SparseMatrix<double>::CopyFromSmat(const SparseMatrix<float> &other,
MatrixTransposeType trans);
template
void SparseMatrix<double>::CopyFromSmat(const SparseMatrix<double> &other,
MatrixTransposeType trans);
template <typename Real>
void SparseMatrix<Real>::Write(std::ostream &os, bool binary) const {
if (binary) {
// Note: we can use the same marker for float and double SparseMatrix,
// because internally we use WriteBasicType and ReadBasicType to read the
// floats and doubles, and this will automatically take care of type
// conversion.
WriteToken(os, binary, "SM");
int32 num_rows = rows_.size();
WriteBasicType(os, binary, num_rows);
for (int32 row = 0; row < num_rows; row++)
rows_[row].Write(os, binary);
} else {
// The format is "rows=10 dim=20 [ 1 0.4 9 1.2 ] dim=20 [ 3 1.7 19 0.6 ] ...".
// not 100% efficient, but easy to work with, and we can re-use the
// read/write code from SparseVector.
int32 num_rows = rows_.size();
os << "rows=" << num_rows << " ";
for (int32 row = 0; row < num_rows; row++)
rows_[row].Write(os, binary);
os << "\n"; // Might make it a little more readable.
}
}
template <typename Real>
void SparseMatrix<Real>::Read(std::istream &is, bool binary) {
if (binary) {
ExpectToken(is, binary, "SM");
int32 num_rows;
ReadBasicType(is, binary, &num_rows);
KALDI_ASSERT(num_rows >= 0 && num_rows < 10000000);
rows_.resize(num_rows);
for (int32 row = 0; row < num_rows; row++)
rows_[row].Read(is, binary);
} else {
std::string str;
is >> str;
if (str.substr(0, 5) != "rows=")
KALDI_ERR << "Reading sparse matrix, expected 'rows=xxx', got " << str;
std::string rows_str = str.substr(5, std::string::npos);
std::istringstream rows_istr(rows_str);
int32 num_rows = -1;
rows_istr >> num_rows;
if (num_rows < 0 || rows_istr.fail()) {
KALDI_ERR << "Reading sparse vector, expected 'rows=[int]', got " << str;
}
rows_.resize(num_rows);
for (int32 row = 0; row < num_rows; row++)
rows_[row].Read(is, binary);
}
}
template <typename Real>
void SparseMatrix<Real>::AddToMat(BaseFloat alpha,
MatrixBase<Real> *other,
MatrixTransposeType trans) const {
if (trans == kNoTrans) {
MatrixIndexT num_rows = rows_.size();
KALDI_ASSERT(other->NumRows() == num_rows);
for (MatrixIndexT i = 0; i < num_rows; i++) {
SubVector<Real> vec(*other, i);
rows_[i].AddToVec(alpha, &vec);
}
} else {
Real *other_col_data = other->Data();
MatrixIndexT other_stride = other->Stride(),
num_rows = NumRows(), num_cols = NumCols();
KALDI_ASSERT(num_rows == other->NumCols() && num_cols == other->NumRows());
for (MatrixIndexT row = 0; row < num_rows; row++, other_col_data++) {
const SparseVector<Real> &svec = rows_[row];
MatrixIndexT num_elems = svec.NumElements();
const std::pair<MatrixIndexT, Real> *sdata = svec.Data();
for (MatrixIndexT e = 0; e < num_elems; e++)
other_col_data[sdata[e].first * other_stride] +=
alpha * sdata[e].second;
}
}
}
template <typename Real>
Real VecSvec(const VectorBase<Real> &vec,
const SparseVector<Real> &svec) {
KALDI_ASSERT(vec.Dim() == svec.Dim());
MatrixIndexT n = svec.NumElements();
const std::pair<MatrixIndexT, Real> *sdata = svec.Data();
const Real *data = vec.Data();
Real ans = 0.0;
for (MatrixIndexT i = 0; i < n; i++)
ans += data[sdata[i].first] * sdata[i].second;
return ans;
}
template
float VecSvec(const VectorBase<float> &vec,
const SparseVector<float> &svec);
template
double VecSvec(const VectorBase<double> &vec,
const SparseVector<double> &svec);
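// [Editor's sketch -- illustrative only.] Only the stored indexes of the
// sparse vector contribute to the dot product; a hypothetical example:
#if 0
void ExampleVecSvec() {
  Vector<double> dense(4);
  dense(0) = 1.0; dense(1) = 2.0; dense(2) = 3.0; dense(3) = 4.0;
  std::vector<std::pair<MatrixIndexT, double> > pairs;
  pairs.push_back(std::make_pair(1, 10.0));
  pairs.push_back(std::make_pair(3, 100.0));
  SparseVector<double> sparse(4, pairs);
  KALDI_ASSERT(VecSvec(dense, sparse) == 2.0 * 10.0 + 4.0 * 100.0);  // 420
}
#endif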
template <typename Real>
const SparseVector<Real> &SparseMatrix<Real>::Row(MatrixIndexT r) const {
KALDI_ASSERT(static_cast<size_t>(r) < rows_.size());
return rows_[r];
}
template <typename Real>
void SparseMatrix<Real>::SetRow(int32 r, const SparseVector<Real> &vec) {
KALDI_ASSERT(static_cast<size_t>(r) < rows_.size() &&
vec.Dim() == rows_[0].Dim());
rows_[r] = vec;
}
template<typename Real>
void SparseMatrix<Real>::SelectRows(const std::vector<int32> &row_indexes,
const SparseMatrix<Real> &smat_other) {
Resize(row_indexes.size(), smat_other.NumCols());
for (int i = 0; i < row_indexes.size(); ++i) {
SetRow(i, smat_other.Row(row_indexes[i]));
}
}
template<typename Real>
SparseMatrix<Real>::SparseMatrix(const std::vector<int32> &indexes, int32 dim,
MatrixTransposeType trans) {
const std::vector<int32>& idx = indexes;
std::vector<std::vector<std::pair<MatrixIndexT, Real> > > pair(idx.size());
for (int i = 0; i < idx.size(); ++i) {
if (idx[i] >= 0) {
pair[i].push_back( { idx[i], Real(1) });
}
}
SparseMatrix<Real> smat_cpu(dim, pair);
if (trans == kNoTrans) {
this->Swap(&smat_cpu);
} else {
SparseMatrix<Real> tmp(smat_cpu, kTrans);
this->Swap(&tmp);
}
}
template<typename Real>
SparseMatrix<Real>::SparseMatrix(const std::vector<int32> &indexes,
const VectorBase<Real> &weights, int32 dim,
MatrixTransposeType trans) {
const std::vector<int32>& idx = indexes;
const VectorBase<Real>& w = weights;
std::vector<std::vector<std::pair<MatrixIndexT, Real> > > pair(idx.size());
for (int i = 0; i < idx.size(); ++i) {
if (idx[i] >= 0) {
pair[i].push_back( { idx[i], w(i) });
}
}
SparseMatrix<Real> smat_cpu(dim, pair);
if (trans == kNoTrans) {
this->Swap(&smat_cpu);
} else {
SparseMatrix<Real> tmp(smat_cpu, kTrans);
this->Swap(&tmp);
}
}
template <typename Real>
SparseMatrix<Real>& SparseMatrix<Real>::operator = (
const SparseMatrix<Real> &other) {
rows_ = other.rows_;
return *this;
}
template <typename Real>
void SparseMatrix<Real>::Swap(SparseMatrix<Real> *other) {
rows_.swap(other->rows_);
}
template<typename Real>
SparseMatrix<Real>::SparseMatrix(
MatrixIndexT dim,
const std::vector<std::vector<std::pair<MatrixIndexT, Real> > > &pairs):
rows_(pairs.size()) {
MatrixIndexT num_rows = pairs.size();
for (MatrixIndexT row = 0; row < num_rows; row++) {
SparseVector<Real> svec(dim, pairs[row]);
rows_[row].Swap(&svec);
}
}
template <typename Real>
void SparseMatrix<Real>::SetRandn(BaseFloat zero_prob) {
MatrixIndexT num_rows = rows_.size();
for (MatrixIndexT row = 0; row < num_rows; row++)
rows_[row].SetRandn(zero_prob);
}
template <typename Real>
void SparseMatrix<Real>::Resize(MatrixIndexT num_rows,
MatrixIndexT num_cols,
MatrixResizeType resize_type) {
KALDI_ASSERT(num_rows >= 0 && num_cols >= 0);
if (resize_type == kSetZero || resize_type == kUndefined) {
rows_.clear();
Resize(num_rows, num_cols, kCopyData);
} else {
// Assume resize_type == kCopyData from here.
int32 old_num_rows = rows_.size(), old_num_cols = NumCols();
SparseVector<Real> initializer(num_cols);
rows_.resize(num_rows, initializer);
if (num_cols != old_num_cols)
for (int32 row = 0; row < old_num_rows; row++)
rows_[row].Resize(num_cols, kCopyData);
}
}
template <typename Real>
void SparseMatrix<Real>::AppendSparseMatrixRows(
std::vector<SparseMatrix<Real> > *inputs) {
rows_.clear();
size_t num_rows = 0;
typename std::vector<SparseMatrix<Real> >::iterator
input_iter = inputs->begin(),
input_end = inputs->end();
for (; input_iter != input_end; ++input_iter)
num_rows += input_iter->rows_.size();
rows_.resize(num_rows);
typename std::vector<SparseVector<Real> >::iterator
row_iter = rows_.begin(),
row_end = rows_.end();
for (input_iter = inputs->begin(); input_iter != input_end; ++input_iter) {
typename std::vector<SparseVector<Real> >::iterator
input_row_iter = input_iter->rows_.begin(),
input_row_end = input_iter->rows_.end();
for (; input_row_iter != input_row_end; ++input_row_iter, ++row_iter)
row_iter->Swap(&(*input_row_iter));
}
KALDI_ASSERT(row_iter == row_end);
int32 num_cols = NumCols();
for (row_iter = rows_.begin(); row_iter != row_end; ++row_iter) {
if (row_iter->Dim() != num_cols)
KALDI_ERR << "Appending rows with inconsistent dimensions, "
<< row_iter->Dim() << " vs. " << num_cols;
}
inputs->clear();
}
template<typename Real>
void SparseMatrix<Real>::Scale(Real alpha) {
MatrixIndexT num_rows = rows_.size();
for (MatrixIndexT row = 0; row < num_rows; row++)
rows_[row].Scale(alpha);
}
template<typename Real>
SparseMatrix<Real>::SparseMatrix(const MatrixBase<Real> &mat) {
MatrixIndexT num_rows = mat.NumRows();
rows_.resize(num_rows);
for (int32 row = 0; row < num_rows; row++) {
SparseVector<Real> this_row(mat.Row(row));
rows_[row].Swap(&this_row);
}
}
template<typename Real>
Real TraceMatSmat(const MatrixBase<Real> &A,
const SparseMatrix<Real> &B,
MatrixTransposeType trans) {
Real sum = 0.0;
if (trans == kTrans) {
MatrixIndexT num_rows = A.NumRows();
KALDI_ASSERT(B.NumRows() == num_rows);
for (MatrixIndexT r = 0; r < num_rows; r++)
sum += VecSvec(A.Row(r), B.Row(r));
} else {
const Real *A_col_data = A.Data();
MatrixIndexT Astride = A.Stride(), Acols = A.NumCols(), Arows = A.NumRows();
KALDI_ASSERT(Arows == B.NumCols() && Acols == B.NumRows());
sum = 0.0;
for (MatrixIndexT i = 0; i < Acols; i++, A_col_data++) {
Real col_sum = 0.0;
const SparseVector<Real> &svec = B.Row(i);
MatrixIndexT num_elems = svec.NumElements();
const std::pair<MatrixIndexT, Real> *sdata = svec.Data();
for (MatrixIndexT e = 0; e < num_elems; e++)
col_sum += A_col_data[Astride * sdata[e].first] * sdata[e].second;
sum += col_sum;
}
}
return sum;
}
template
float TraceMatSmat(const MatrixBase<float> &A,
const SparseMatrix<float> &B,
MatrixTransposeType trans);
template
double TraceMatSmat(const MatrixBase<double> &A,
const SparseMatrix<double> &B,
MatrixTransposeType trans);
void GeneralMatrix::Clear() {
mat_.Resize(0, 0);
cmat_.Clear();
smat_.Resize(0, 0);
}
GeneralMatrix& GeneralMatrix::operator= (const MatrixBase<BaseFloat> &mat) {
Clear();
mat_ = mat;
return *this;
}
GeneralMatrix& GeneralMatrix::operator= (const CompressedMatrix &cmat) {
Clear();
cmat_ = cmat;
return *this;
}
GeneralMatrix& GeneralMatrix::operator= (const SparseMatrix<BaseFloat> &smat) {
Clear();
smat_ = smat;
return *this;
}
GeneralMatrix& GeneralMatrix::operator= (const GeneralMatrix &gmat) {
mat_ = gmat.mat_;
smat_ = gmat.smat_;
cmat_ = gmat.cmat_;
return *this;
}
GeneralMatrixType GeneralMatrix::Type() const {
if (smat_.NumRows() != 0)
return kSparseMatrix;
else if (cmat_.NumRows() != 0)
return kCompressedMatrix;
else
return kFullMatrix;
}
MatrixIndexT GeneralMatrix::NumRows() const {
MatrixIndexT r = smat_.NumRows();
if (r != 0)
return r;
r = cmat_.NumRows();
if (r != 0)
return r;
return mat_.NumRows();
}
MatrixIndexT GeneralMatrix::NumCols() const {
MatrixIndexT r = smat_.NumCols();
if (r != 0)
return r;
r = cmat_.NumCols();
if (r != 0)
return r;
return mat_.NumCols();
}
void GeneralMatrix::Compress() {
if (mat_.NumRows() != 0) {
cmat_.CopyFromMat(mat_);
mat_.Resize(0, 0);
}
}
void GeneralMatrix::Uncompress() {
if (cmat_.NumRows() != 0) {
mat_.Resize(cmat_.NumRows(), cmat_.NumCols(), kUndefined);
cmat_.CopyToMat(&mat_);
cmat_.Clear();
}
}
void GeneralMatrix::GetMatrix(Matrix<BaseFloat> *mat) const {
if (mat_.NumRows() !=0) {
*mat = mat_;
} else if (cmat_.NumRows() != 0) {
mat->Resize(cmat_.NumRows(), cmat_.NumCols(), kUndefined);
cmat_.CopyToMat(mat);
} else if (smat_.NumRows() != 0) {
mat->Resize(smat_.NumRows(), smat_.NumCols(), kUndefined);
smat_.CopyToMat(mat);
} else {
mat->Resize(0, 0);
}
}
void GeneralMatrix::CopyToMat(MatrixBase<BaseFloat> *mat,
MatrixTransposeType trans) const {
if (mat_.NumRows() !=0) {
mat->CopyFromMat(mat_, trans);
} else if (cmat_.NumRows() != 0) {
cmat_.CopyToMat(mat, trans);
} else if (smat_.NumRows() != 0) {
smat_.CopyToMat(mat, trans);
} else {
KALDI_ASSERT(mat->NumRows() == 0);
}
}
void GeneralMatrix::Scale(BaseFloat alpha) {
if (mat_.NumRows() != 0) {
mat_.Scale(alpha);
} else if (cmat_.NumRows() != 0) {
cmat_.Scale(alpha);
} else if (smat_.NumRows() != 0) {
smat_.Scale(alpha);
}
}
const SparseMatrix<BaseFloat>& GeneralMatrix::GetSparseMatrix() const {
if (mat_.NumRows() != 0 || cmat_.NumRows() != 0)
KALDI_ERR << "GetSparseMatrix called on GeneralMatrix of wrong type.";
return smat_;
}
void GeneralMatrix::SwapSparseMatrix(SparseMatrix<BaseFloat> *smat) {
if (mat_.NumRows() != 0 || cmat_.NumRows() != 0)
KALDI_ERR << "GetSparseMatrix called on GeneralMatrix of wrong type.";
smat->Swap(&smat_);
}
void GeneralMatrix::SwapCompressedMatrix(CompressedMatrix *cmat) {
if (mat_.NumRows() != 0 || smat_.NumRows() != 0)
KALDI_ERR << "GetSparseMatrix called on GeneralMatrix of wrong type.";
cmat->Swap(&cmat_);
}
const CompressedMatrix &GeneralMatrix::GetCompressedMatrix() const {
if (mat_.NumRows() != 0 || smat_.NumRows() != 0)
KALDI_ERR << "GetCompressedMatrix called on GeneralMatrix of wrong type.";
return cmat_;
}
const Matrix<BaseFloat> &GeneralMatrix::GetFullMatrix() const {
if (smat_.NumRows() != 0 || cmat_.NumRows() != 0)
KALDI_ERR << "GetFullMatrix called on GeneralMatrix of wrong type.";
return mat_;
}
void GeneralMatrix::SwapFullMatrix(Matrix<BaseFloat> *mat) {
if (cmat_.NumRows() != 0 || smat_.NumRows() != 0)
KALDI_ERR << "SwapMatrix called on GeneralMatrix of wrong type.";
mat->Swap(&mat_);
}
void GeneralMatrix::Write(std::ostream &os, bool binary) const {
if (smat_.NumRows() != 0) {
smat_.Write(os, binary);
} else if (cmat_.NumRows() != 0) {
cmat_.Write(os, binary);
} else {
mat_.Write(os, binary);
}
}
void GeneralMatrix::Read(std::istream &is, bool binary) {
Clear();
if (binary) {
int peekval = is.peek();
if (peekval == 'C') {
// Token CM for compressed matrix
cmat_.Read(is, binary);
} else if (peekval == 'S') {
// Token SM for sparse matrix
smat_.Read(is, binary);
} else {
mat_.Read(is, binary);
}
} else {
// note: in text mode we will only ever read regular
// or sparse matrices, because the compressed-matrix format just
// gets written as a regular matrix in text mode.
is >> std::ws; // Eat up white space.
int peekval = is.peek();
if (peekval == 'r') { // sparse format starts rows=[int].
smat_.Read(is, binary);
} else {
mat_.Read(is, binary);
}
}
}
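// [Editor's sketch -- illustrative, hypothetical usage.] Because Read()
// dispatches on the first byte of the stream, callers need not know which of
// the three representations was written:
#if 0
void ExampleGeneralMatrixRead(std::istream &is) {
  GeneralMatrix gmat;
  gmat.Read(is, true /* binary */);  // full, compressed, or sparse
  Matrix<BaseFloat> full;
  gmat.GetMatrix(&full);             // always yields an uncompressed copy
}
#endif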
void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
GeneralMatrix *mat) {
mat->Clear();
int32 size = src.size();
if (size == 0)
return;
bool all_sparse = true;
for (int32 i = 0; i < size; i++) {
if (src[i]->Type() != kSparseMatrix && src[i]->NumRows() != 0) {
all_sparse = false;
break;
}
}
if (all_sparse) {
std::vector<SparseMatrix<BaseFloat> > sparse_mats(size);
for (int32 i = 0; i < size; i++)
sparse_mats[i] = src[i]->GetSparseMatrix();
SparseMatrix<BaseFloat> appended_mat;
appended_mat.AppendSparseMatrixRows(&sparse_mats);
mat->SwapSparseMatrix(&appended_mat);
} else {
int32 tot_rows = 0, num_cols = -1;
for (int32 i = 0; i < size; i++) {
const GeneralMatrix &src_mat = *(src[i]);
int32 src_rows = src_mat.NumRows(), src_cols = src_mat.NumCols();
if (src_rows != 0) {
tot_rows += src_rows;
if (num_cols == -1) num_cols = src_cols;
else if (num_cols != src_cols)
KALDI_ERR << "Appending rows of matrices with inconsistent num-cols: "
<< num_cols << " vs. " << src_cols;
}
}
Matrix<BaseFloat> appended_mat(tot_rows, num_cols, kUndefined);
int32 row_offset = 0;
for (int32 i = 0; i < size; i++) {
const GeneralMatrix &src_mat = *(src[i]);
int32 src_rows = src_mat.NumRows();
if (src_rows != 0) {
SubMatrix<BaseFloat> dest_submat(appended_mat, row_offset, src_rows,
0, num_cols);
src_mat.CopyToMat(&dest_submat);
row_offset += src_rows;
}
}
KALDI_ASSERT(row_offset == tot_rows);
mat->SwapFullMatrix(&appended_mat);
}
}
void FilterCompressedMatrixRows(const CompressedMatrix &in,
const std::vector<bool> &keep_rows,
Matrix<BaseFloat> *out) {
KALDI_ASSERT(keep_rows.size() == static_cast<size_t>(in.NumRows()));
int32 num_kept_rows = 0;
std::vector<bool>::const_iterator iter = keep_rows.begin(),
end = keep_rows.end();
for (; iter != end; ++iter)
if (*iter)
num_kept_rows++;
if (num_kept_rows == 0)
KALDI_ERR << "No kept rows";
if (num_kept_rows == static_cast<int32>(keep_rows.size())) {
out->Resize(in.NumRows(), in.NumCols(), kUndefined);
in.CopyToMat(out);
return;
}
const BaseFloat heuristic = 0.33;
// should be > 0 and < 1.0. represents the performance hit we get from
// iterating row-wise versus column-wise in compressed-matrix uncompression.
if (num_kept_rows > heuristic * in.NumRows()) {
// if quite a few of the rows are kept, it may be more efficient
// to uncompress the entire compressed matrix, since per-column operation
// is more efficient.
Matrix<BaseFloat> full_mat(in);
FilterMatrixRows(full_mat, keep_rows, out);
} else {
out->Resize(num_kept_rows, in.NumCols(), kUndefined);
iter = keep_rows.begin();
int32 out_row = 0;
for (int32 in_row = 0; iter != end; ++iter, ++in_row) {
if (*iter) {
SubVector<BaseFloat> dest(*out, out_row);
in.CopyRowToVec(in_row, &dest);
out_row++;
}
}
KALDI_ASSERT(out_row == num_kept_rows);
}
}
template <typename Real>
void FilterMatrixRows(const Matrix<Real> &in,
const std::vector<bool> &keep_rows,
Matrix<Real> *out) {
KALDI_ASSERT(keep_rows.size() == static_cast<size_t>(in.NumRows()));
int32 num_kept_rows = 0;
std::vector<bool>::const_iterator iter = keep_rows.begin(),
end = keep_rows.end();
for (; iter != end; ++iter)
if (*iter)
num_kept_rows++;
if (num_kept_rows == 0)
KALDI_ERR << "No kept rows";
if (num_kept_rows == static_cast<int32>(keep_rows.size())) {
*out = in;
return;
}
out->Resize(num_kept_rows, in.NumCols(), kUndefined);
iter = keep_rows.begin();
int32 out_row = 0;
for (int32 in_row = 0; iter != end; ++iter, ++in_row) {
if (*iter) {
SubVector<Real> src(in, in_row);
SubVector<Real> dest(*out, out_row);
dest.CopyFromVec(src);
out_row++;
}
}
KALDI_ASSERT(out_row == num_kept_rows);
}
template
void FilterMatrixRows(const Matrix<float> &in,
const std::vector<bool> &keep_rows,
Matrix<float> *out);
template
void FilterMatrixRows(const Matrix<double> &in,
const std::vector<bool> &keep_rows,
Matrix<double> *out);
template <typename Real>
void FilterSparseMatrixRows(const SparseMatrix<Real> &in,
const std::vector<bool> &keep_rows,
SparseMatrix<Real> *out) {
KALDI_ASSERT(keep_rows.size() == static_cast<size_t>(in.NumRows()));
int32 num_kept_rows = 0;
std::vector<bool>::const_iterator iter = keep_rows.begin(),
end = keep_rows.end();
for (; iter != end; ++iter)
if (*iter)
num_kept_rows++;
if (num_kept_rows == 0)
KALDI_ERR << "No kept rows";
if (num_kept_rows == static_cast<int32>(keep_rows.size())) {
*out = in;
return;
}
out->Resize(num_kept_rows, in.NumCols(), kUndefined);
iter = keep_rows.begin();
int32 out_row = 0;
for (int32 in_row = 0; iter != end; ++iter, ++in_row) {
if (*iter) {
out->SetRow(out_row, in.Row(in_row));
out_row++;
}
}
KALDI_ASSERT(out_row == num_kept_rows);
}
template
void FilterSparseMatrixRows(const SparseMatrix<float> &in,
const std::vector<bool> &keep_rows,
SparseMatrix<float> *out);
template
void FilterSparseMatrixRows(const SparseMatrix<double> &in,
const std::vector<bool> &keep_rows,
SparseMatrix<double> *out);
void FilterGeneralMatrixRows(const GeneralMatrix &in,
const std::vector<bool> &keep_rows,
GeneralMatrix *out) {
out->Clear();
KALDI_ASSERT(keep_rows.size() == static_cast<size_t>(in.NumRows()));
int32 num_kept_rows = 0;
std::vector<bool>::const_iterator iter = keep_rows.begin(),
end = keep_rows.end();
for (; iter != end; ++iter)
if (*iter)
num_kept_rows++;
if (num_kept_rows == 0)
KALDI_ERR << "No kept rows";
if (num_kept_rows == static_cast<int32>(keep_rows.size())) {
*out = in;
return;
}
switch (in.Type()) {
case kCompressedMatrix: {
const CompressedMatrix &cmat = in.GetCompressedMatrix();
Matrix<BaseFloat> full_mat;
FilterCompressedMatrixRows(cmat, keep_rows, &full_mat);
out->SwapFullMatrix(&full_mat);
return;
}
case kSparseMatrix: {
const SparseMatrix<BaseFloat> &smat = in.GetSparseMatrix();
SparseMatrix<BaseFloat> smat_out;
FilterSparseMatrixRows(smat, keep_rows, &smat_out);
out->SwapSparseMatrix(&smat_out);
return;
}
case kFullMatrix: {
const Matrix<BaseFloat> &full_mat = in.GetFullMatrix();
Matrix<BaseFloat> full_mat_out;
FilterMatrixRows(full_mat, keep_rows, &full_mat_out);
out->SwapFullMatrix(&full_mat_out);
return;
}
default:
KALDI_ERR << "Invalid general-matrix type.";
}
}
void GeneralMatrix::AddToMat(BaseFloat alpha, MatrixBase<BaseFloat> *mat,
MatrixTransposeType trans) const {
switch (this->Type()) {
case kFullMatrix: {
mat->AddMat(alpha, mat_, trans);
break;
}
case kSparseMatrix: {
smat_.AddToMat(alpha, mat, trans);
break;
}
case kCompressedMatrix: {
Matrix<BaseFloat> temp_mat(cmat_);
mat->AddMat(alpha, temp_mat, trans);
break;
}
default:
KALDI_ERR << "Invalid general-matrix type.";
}
}
template <class Real>
Real SparseVector<Real>::Max(int32 *index_out) const {
KALDI_ASSERT(dim_ > 0 && pairs_.size() <= static_cast<size_t>(dim_));
Real ans = -std::numeric_limits<Real>::infinity();
int32 index = 0;
typename std::vector<std::pair<MatrixIndexT, Real> >::const_iterator
iter = pairs_.begin(), end = pairs_.end();
for (; iter != end; ++iter) {
if (iter->second > ans) {
ans = iter->second;
index = iter->first;
}
}
if (ans >= 0 || pairs_.size() == dim_) {
// ans >= 0 will be the normal case.
// if pairs_.size() == dim_ then we need to return
// even a negative answer as there are no gaps (hence no unlisted zeros).
*index_out = index;
return ans;
}
// all the stored elements are < 0, but there are unlisted
// elements -> pick the first unlisted element.
// Note that this class requires that the indexes are sorted
// and unique.
index = 0; // "index" will always be the next index that
// we haven't yet seen listed.
iter = pairs_.begin();
for (; iter != end; ++iter) {
if (iter->first > index) { // index "index" is not listed.
*index_out = index;
return 0.0;
} else {
// index is the next potential gap in the indexes.
index = iter->first + 1;
}
}
// we can reach here if either pairs_.empty(), or
// pairs_ is nonempty but contains a sequence (0, 1, 2,...).
if (!pairs_.empty())
index = pairs_.back().first + 1;
// else leave index at zero
KALDI_ASSERT(index < dim_);
*index_out = index;
return 0.0;
}
template <typename Real>
SparseVector<Real>::SparseVector(const VectorBase<Real> &vec) {
MatrixIndexT dim = vec.Dim();
dim_ = dim;
if (dim == 0)
return;
const Real *ptr = vec.Data();
for (MatrixIndexT i = 0; i < dim; i++) {
Real val = ptr[i];
if (val != 0.0)
pairs_.push_back(std::pair<MatrixIndexT,Real>(i,val));
}
}
void GeneralMatrix::Swap(GeneralMatrix *other) {
mat_.Swap(&(other->mat_));
cmat_.Swap(&(other->cmat_));
smat_.Swap(&(other->smat_));
}
void ExtractRowRangeWithPadding(
const GeneralMatrix &in,
int32 row_offset,
int32 num_rows,
GeneralMatrix *out) {
// make sure 'out' is empty to start with.
Matrix<BaseFloat> empty_mat;
*out = empty_mat;
if (num_rows == 0) return;
switch (in.Type()) {
case kFullMatrix: {
const Matrix<BaseFloat> &mat_in = in.GetFullMatrix();
int32 num_rows_in = mat_in.NumRows(), num_cols = mat_in.NumCols();
KALDI_ASSERT(num_rows_in > 0); // we can't extract >0 rows from an empty
// matrix.
Matrix<BaseFloat> mat_out(num_rows, num_cols, kUndefined);
for (int32 row = 0; row < num_rows; row++) {
int32 row_in = row + row_offset;
if (row_in < 0) row_in = 0;
else if (row_in >= num_rows_in) row_in = num_rows_in - 1;
SubVector<BaseFloat> vec_in(mat_in, row_in),
vec_out(mat_out, row);
vec_out.CopyFromVec(vec_in);
}
out->SwapFullMatrix(&mat_out);
break;
}
case kSparseMatrix: {
const SparseMatrix<BaseFloat> &smat_in = in.GetSparseMatrix();
int32 num_rows_in = smat_in.NumRows(),
num_cols = smat_in.NumCols();
KALDI_ASSERT(num_rows_in > 0); // we can't extract >0 rows from an empty
// matrix.
SparseMatrix<BaseFloat> smat_out(num_rows, num_cols);
for (int32 row = 0; row < num_rows; row++) {
int32 row_in = row + row_offset;
if (row_in < 0) row_in = 0;
else if (row_in >= num_rows_in) row_in = num_rows_in - 1;
smat_out.SetRow(row, smat_in.Row(row_in));
}
out->SwapSparseMatrix(&smat_out);
break;
}
case kCompressedMatrix: {
const CompressedMatrix &cmat_in = in.GetCompressedMatrix();
bool allow_padding = true;
CompressedMatrix cmat_out(cmat_in, row_offset, num_rows,
0, cmat_in.NumCols(), allow_padding);
out->SwapCompressedMatrix(&cmat_out);
break;
}
default:
KALDI_ERR << "Bad matrix type.";
}
}
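// Editor's note (illustrative, not part of the original source): a worked
// example of the padding above. With in.NumRows() == 5, row_offset == -2 and
// num_rows == 4, the clamped source rows are {0, 0, 0, 1}: rows that fall
// outside the input are copies of its first (or last) row, not zeros.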
template class SparseVector<float>;
template class SparseVector<double>;
template class SparseMatrix<float>;
template class SparseMatrix<double>;
} // namespace kaldi
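// Editor's sketch (illustrative, not part of the original source): minimal
// use of the row-filtering helpers defined above. The function name and the
// "drop the last row" policy are hypothetical, chosen only to show the API.
static void DropLastRowSketch(const kaldi::GeneralMatrix &in,
kaldi::GeneralMatrix *out) {
// FilterGeneralMatrixRows requires at least one kept row.
KALDI_ASSERT(in.NumRows() >= 2);
std::vector<bool> keep(in.NumRows(), true);
keep.back() = false; // drop the final row
kaldi::FilterGeneralMatrixRows(in, keep, out);
KALDI_ASSERT(out->NumRows() == in.NumRows() - 1);
}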
// matrix/sparse-matrix.h
// Copyright 2015 Johns Hopkins University (author: Daniel Povey)
// 2015 Guoguo Chen
// 2017 Shiyin Kang
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_SPARSE_MATRIX_H_
#define KALDI_MATRIX_SPARSE_MATRIX_H_ 1
#include <utility>
#include <vector>
#include "matrix/matrix-common.h"
#include "matrix/kaldi-matrix.h"
#include "matrix/kaldi-vector.h"
#include "matrix/compressed-matrix.h"
namespace kaldi {
/// \addtogroup matrix_group
/// @{
template <typename Real>
class SparseVector {
public:
MatrixIndexT Dim() const { return dim_; }
Real Sum() const;
template <class OtherReal>
void CopyElementsToVec(VectorBase<OtherReal> *vec) const;
// *vec += alpha * *this.
template <class OtherReal>
void AddToVec(Real alpha,
VectorBase<OtherReal> *vec) const;
template <class OtherReal>
void CopyFromSvec(const SparseVector<OtherReal> &other);
SparseVector<Real> &operator = (const SparseVector<Real> &other);
SparseVector(const SparseVector<Real> &other) { *this = other; }
void Swap(SparseVector<Real> *other);
// Returns the maximum value in this vector and outputs the index associated
// with it. This is not the index into the Data() pointer; it is the index
// into the vector this object represents, i.e. the .first value in the pair.
// It is an error to call this function if this vector's Dim() is zero.
// If all the stored elements are negative and the underlying vector has
// indexes not listed among them (which are implicitly zero), or if no
// elements are stored, it returns the first unlisted index, whose value is
// (implicitly) zero.
Real Max(int32 *index) const;
/// Returns the number of nonzero elements.
MatrixIndexT NumElements() const { return pairs_.size(); }
/// get an indexed element (0 <= i < NumElements()).
const std::pair<MatrixIndexT, Real> &GetElement(MatrixIndexT i) const {
return pairs_[i];
}
// returns pointer to element data, or NULL if empty (use with NumElements()).
std::pair<MatrixIndexT, Real> *Data();
// returns pointer to element data, or NULL if empty (use with NumElements());
// const version
const std::pair<MatrixIndexT, Real> *Data() const;
/// Sets each element to zero with probability zero_prob, and otherwise to a
/// normally distributed value. Useful in testing.
void SetRandn(BaseFloat zero_prob);
SparseVector(): dim_(0) { }
explicit SparseVector(MatrixIndexT dim): dim_(dim) { KALDI_ASSERT(dim >= 0); }
// constructor from pairs; does not assume input pairs are sorted and unique
SparseVector(MatrixIndexT dim,
const std::vector<std::pair<MatrixIndexT, Real> > &pairs);
// constructor from a VectorBase that keeps only the nonzero elements of 'vec'.
explicit SparseVector(const VectorBase<Real> &vec);
/// Resizes to this dimension. resize_type == kUndefined
/// behaves the same as kSetZero.
void Resize(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero);
void Write(std::ostream &os, bool binary) const;
void Read(std::istream &os, bool binary);
/// Scale all elements of sparse vector.
void Scale(Real alpha);
private:
MatrixIndexT dim_;
// pairs of (index, value). Stored in sorted order with no duplicates.
// For now we use std::vector, but we could change this.
std::vector<std::pair<MatrixIndexT, Real> > pairs_;
};
template <typename Real>
Real VecSvec(const VectorBase<Real> &vec,
const SparseVector<Real> &svec);
template <typename Real>
class SparseMatrix {
public:
MatrixIndexT NumRows() const;
MatrixIndexT NumCols() const;
MatrixIndexT NumElements() const;
Real Sum() const;
Real FrobeniusNorm() const;
/// This constructor creates a SparseMatrix that just contains the nonzero
/// elements of 'mat'.
explicit SparseMatrix(const MatrixBase<Real> &mat);
/// Copy to matrix. It must already have the correct size.
template <class OtherReal>
void CopyToMat(MatrixBase<OtherReal> *other,
MatrixTransposeType t = kNoTrans) const;
/// Copies the values of all the elements in SparseMatrix into a VectorBase
/// object.
void CopyElementsToVec(VectorBase<Real> *other) const;
/// Copies data from another sparse matrix.
template<class OtherReal>
void CopyFromSmat(const SparseMatrix<OtherReal> &other,
MatrixTransposeType trans = kNoTrans);
/// Does *other = *other + alpha * *this.
void AddToMat(BaseFloat alpha, MatrixBase<Real> *other,
MatrixTransposeType t = kNoTrans) const;
SparseMatrix<Real> &operator = (const SparseMatrix<Real> &other);
SparseMatrix(const SparseMatrix<Real> &other, MatrixTransposeType trans =
kNoTrans) {
this->CopyFromSmat(other, trans);
}
void Swap(SparseMatrix<Real> *other);
// returns pointer to element data, or NULL if empty (use with NumElements()).
SparseVector<Real> *Data();
// returns pointer to element data, or NULL if empty (use with NumElements());
// const version
const SparseVector<Real> *Data() const;
// Initializer from the type that elsewhere in Kaldi is referred to as
// Posterior: indexed first by row index; the pairs are (column-index, value),
// and the constructor does not require them to be sorted and unique.
SparseMatrix(
int32 dim,
const std::vector<std::vector<std::pair<MatrixIndexT, Real> > > &pairs);
/// Sets *this to a pseudo-randomly initialized matrix, with each element
/// zero with probability zero_prob and otherwise normally distributed;
/// mostly for purposes of testing.
void SetRandn(BaseFloat zero_prob);
void Write(std::ostream &os, bool binary) const;
void Read(std::istream &os, bool binary);
const SparseVector<Real> &Row(MatrixIndexT r) const;
/// Sets row r to "vec"; makes sure it has the correct dimension.
void SetRow(int32 r, const SparseVector<Real> &vec);
/// Select a subset of the rows of a SparseMatrix.
/// Sets *this to only the rows of 'smat_other' that are listed
/// in 'row_indexes'.
/// 'row_indexes' must satisfy 0 <= row_indexes[i] < smat_other.NumRows().
void SelectRows(const std::vector<int32> &row_indexes,
const SparseMatrix<Real> &smat_other);
/// Sets *this to all the rows of *inputs appended together; this
/// function is destructive of the inputs. Requires, obviously,
/// that the inputs all have the same dimension (although some may be
/// empty).
void AppendSparseMatrixRows(std::vector<SparseMatrix<Real> > *inputs);
SparseMatrix() { }
SparseMatrix(int32 num_rows, int32 num_cols) { Resize(num_rows, num_cols); }
/// Constructor from an array of indexes.
/// If trans == kNoTrans, construct a sparse matrix
/// with num-rows == indexes.Dim() and num-cols = 'dim'.
/// 'indexes' is expected to contain elements in the
/// range [0, dim - 1]. Each row 'i' of *this after
/// calling the constructor will contain a single
/// element at column-index indexes[i] with value 1.0.
///
/// If trans == kTrans, the result will be the transpose
/// of the sparse matrix described above.
SparseMatrix(const std::vector<int32> &indexes, int32 dim,
MatrixTransposeType trans = kNoTrans);
/// Constructor from an array of indexes and an array of
/// weights; requires indexes.Dim() == weights.Dim().
/// If trans == kNoTrans, construct a sparse matrix
/// with num-rows == indexes.Dim() and num-cols = 'dim'.
/// 'indexes' is expected to contain elements in the
/// range [0, dim - 1]. Each row 'i' of *this after
/// calling the constructor will contain a single
/// element at column-index indexes[i] with value weights[i].
/// If trans == kTrans, the result will be the transpose
/// of the sparse matrix described above.
SparseMatrix(const std::vector<int32> &indexes,
const VectorBase<Real> &weights, int32 dim,
MatrixTransposeType trans = kNoTrans);
/// Resizes the matrix; analogous to Matrix::Resize(). resize_type ==
/// kUndefined behaves the same as kSetZero.
void Resize(MatrixIndexT rows, MatrixIndexT cols,
MatrixResizeType resize_type = kSetZero);
/// Scale all elements in sparse matrix.
void Scale(Real alpha);
// Use the Matrix::CopyFromSmat() function to copy from this to Matrix. Also
// see Matrix::AddSmat(). There is not very extensive functionality for
// SparseMat just yet (e.g. no matrix multiply); we will add things as needed
// and as it seems necessary.
private:
// Vector of SparseVectors, all of the same Dim() (we use a std::vector for
// now; this could change).
std::vector<SparseVector<Real> > rows_;
};
template<typename Real>
Real TraceMatSmat(const MatrixBase<Real> &A,
const SparseMatrix<Real> &B,
MatrixTransposeType trans = kNoTrans);
enum GeneralMatrixType {
kFullMatrix,
kCompressedMatrix,
kSparseMatrix
};
/// This class is a wrapper that enables you to store a matrix
/// in one of three forms: either as a Matrix<BaseFloat>, or a CompressedMatrix,
/// or a SparseMatrix<BaseFloat>. It handles the I/O for you, i.e. you read
/// and write a single object type. It is useful for neural-net training
/// targets which might be sparse or not, and might be compressed or not.
class GeneralMatrix {
public:
/// Returns the type of the matrix: kSparseMatrix, kCompressedMatrix or
/// kFullMatrix. If this matrix is empty, returns kFullMatrix.
GeneralMatrixType Type() const;
void Compress(); // If it was a full matrix, compresses, changing Type() to
// kCompressedMatrix; otherwise does nothing.
void Uncompress(); // If it was a compressed matrix, uncompresses, changing
// Type() to kFullMatrix; otherwise does nothing.
void Write(std::ostream &os, bool binary) const;
/// Note: if you write a compressed matrix in text form, it will be read as
/// a regular full matrix.
void Read(std::istream &is, bool binary);
/// Returns the contents as a SparseMatrix. This will only work if
/// Type() returns kSparseMatrix, or NumRows() == 0; otherwise it will crash.
const SparseMatrix<BaseFloat> &GetSparseMatrix() const;
/// Swaps *this with the given SparseMatrix. This will only work if
/// Type() returns kSparseMatrix, or NumRows() == 0.
void SwapSparseMatrix(SparseMatrix<BaseFloat> *smat);
/// Returns the contents as a compressed matrix. This will only work if
/// Type() returns kCompressedMatrix, or NumRows() == 0; otherwise it will
/// crash.
const CompressedMatrix &GetCompressedMatrix() const;
/// Swaps *this with the given CompressedMatrix. This will only work if
/// Type() returns kCompressedMatrix, or NumRows() == 0.
void SwapCompressedMatrix(CompressedMatrix *cmat);
/// Returns the contents as a Matrix<BaseFloat>. This will only work if
/// Type() returns kFullMatrix, or NumRows() == 0; otherwise it will crash.
const Matrix<BaseFloat>& GetFullMatrix() const;
/// Outputs the contents as a matrix. This will work regardless of
/// Type(). Sizes its output, unlike CopyToMat().
void GetMatrix(Matrix<BaseFloat> *mat) const;
/// Swaps *this with the given Matrix. This will only work if
/// Type() returns kFullMatrix, or NumRows() == 0.
void SwapFullMatrix(Matrix<BaseFloat> *mat);
/// Copies contents, regardless of type, to "mat", which must be correctly
/// sized. See also GetMatrix(), which will size its output for you.
void CopyToMat(MatrixBase<BaseFloat> *mat,
MatrixTransposeType trans = kNoTrans) const;
/// Copies contents, regardless of type, to "cu_mat", which must be
/// correctly sized. Implemented in ../cudamatrix/cu-sparse-matrix.cc
void CopyToMat(CuMatrixBase<BaseFloat> *cu_mat,
MatrixTransposeType trans = kNoTrans) const;
/// Adds alpha times *this to mat.
void AddToMat(BaseFloat alpha, MatrixBase<BaseFloat> *mat,
MatrixTransposeType trans = kNoTrans) const;
/// Adds alpha times *this to cu_mat.
/// Implemented in ../cudamatrix/cu-sparse-matrix.cc
void AddToMat(BaseFloat alpha, CuMatrixBase<BaseFloat> *cu_mat,
MatrixTransposeType trans = kNoTrans) const;
/// Scale each element of matrix by alpha.
void Scale(BaseFloat alpha);
/// Assignment from regular matrix.
GeneralMatrix &operator= (const MatrixBase<BaseFloat> &mat);
/// Assignment from compressed matrix.
GeneralMatrix &operator= (const CompressedMatrix &mat);
/// Assignment from SparseMatrix<BaseFloat>
GeneralMatrix &operator= (const SparseMatrix<BaseFloat> &smat);
MatrixIndexT NumRows() const;
MatrixIndexT NumCols() const;
explicit GeneralMatrix(const MatrixBase<BaseFloat> &mat) { *this = mat; }
explicit GeneralMatrix(const CompressedMatrix &cmat) { *this = cmat; }
explicit GeneralMatrix(const SparseMatrix<BaseFloat> &smat) { *this = smat; }
GeneralMatrix() { }
// Assignment operator.
GeneralMatrix &operator =(const GeneralMatrix &other);
// Copy constructor
GeneralMatrix(const GeneralMatrix &other) { *this = other; }
// Sets to the empty matrix.
void Clear();
// shallow swap
void Swap(GeneralMatrix *other);
private:
// We don't explicitly store the type of the matrix. Rather, we make
// sure that only one of the matrices is ever nonempty, and the Type()
// returns that one, or kFullMatrix if all are empty.
Matrix<BaseFloat> mat_;
CompressedMatrix cmat_;
SparseMatrix<BaseFloat> smat_;
};
/// Appends all the matrix rows of a list of GeneralMatrix objects, to get a single
/// GeneralMatrix. Preserves sparsity if all inputs were sparse (or empty).
/// Does not preserve compression, if inputs were compressed; you have to
/// re-compress manually, if that's what you need.
void AppendGeneralMatrixRows(const std::vector<const GeneralMatrix *> &src,
GeneralMatrix *mat);
/// Outputs a SparseMatrix<Real> containing only the rows r of "in" such that
/// keep_rows[r] == true. keep_rows.size() must equal in.NumRows(), and
/// keep_rows must contain at least one "true" element.
template <typename Real>
void FilterSparseMatrixRows(const SparseMatrix<Real> &in,
const std::vector<bool> &keep_rows,
SparseMatrix<Real> *out);
/// Outputs a Matrix<Real> containing only the rows r of "in" such that
/// keep_rows[r] == true. keep_rows.size() must equal in.NumRows(), and
/// keep_rows must contain at least one "true" element.
template <typename Real>
void FilterMatrixRows(const Matrix<Real> &in,
const std::vector<bool> &keep_rows,
Matrix<Real> *out);
/// Outputs a Matrix<Real> containing only the rows r of "in" such that
/// keep_rows[r] == true. keep_rows.size() must equal in.NumRows(), and
/// keep_rows must contain at least one "true" element.
void FilterCompressedMatrixRows(const CompressedMatrix &in,
const std::vector<bool> &keep_rows,
Matrix<BaseFloat> *out);
/// Outputs a GeneralMatrix containing only the rows r of "in" such that
/// keep_rows[r] == true. keep_rows.size() must equal in.NumRows(), and
/// keep_rows must contain at least one "true" element. If in.Type() is
/// kCompressedMatrix, the result will not be compressed; otherwise, the type
/// is preserved.
void FilterGeneralMatrixRows(const GeneralMatrix &in,
const std::vector<bool> &keep_rows,
GeneralMatrix *out);
/// This function extracts a row-range of a GeneralMatrix and writes
/// as a GeneralMatrix containing the same type of underlying
/// matrix. If the row-range is partly outside the row-range of 'in'
/// (i.e. if row_offset < 0 or row_offset + num_rows > in.NumRows())
/// then it will pad with copies of the first and last row as
/// needed.
/// This is more efficient than un-compressing and
/// re-compressing the underlying CompressedMatrix, and causes
/// less accuracy loss due to re-compression (no loss in most cases).
void ExtractRowRangeWithPadding(
const GeneralMatrix &in,
int32 row_offset,
int32 num_rows,
GeneralMatrix *out);
/// @} end of \addtogroup matrix_group
} // namespace kaldi
#endif // KALDI_MATRIX_SPARSE_MATRIX_H_
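// Editor's sketch (illustrative, not part of the original source): building a
// one-hot SparseMatrix from integer labels with the constructor declared
// above, then accumulating it into a dense matrix. Names are hypothetical.
static kaldi::Matrix<kaldi::BaseFloat> OneHotSketch(
const std::vector<kaldi::int32> &labels, kaldi::int32 num_classes) {
using namespace kaldi;
// Row i contains a single element 1.0 at column labels[i].
SparseMatrix<BaseFloat> one_hot(labels, num_classes, kNoTrans);
Matrix<BaseFloat> dense(one_hot.NumRows(), one_hot.NumCols()); // zero-filled
one_hot.AddToMat(1.0, &dense, kNoTrans); // dense += 1.0 * one_hot
return dense;
}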
// matrix/srfft.cc
// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// This file includes a modified version of code originally published in Malvar,
// H., "Signal processing with lapped transforms, " Artech House, Inc., 1992. The
// current copyright holder of the original code, Henrique S. Malvar, has given
// his permission for the release of this modified version under the Apache
// License v2.0.
#include "matrix/srfft.h"
#include "matrix/matrix-functions.h"
namespace kaldi {
template<typename Real>
SplitRadixComplexFft<Real>::SplitRadixComplexFft(MatrixIndexT N) {
if ( (N & (N-1)) != 0 || N <= 1)
KALDI_ERR << "SplitRadixComplexFft called with invalid number of points "
<< N;
N_ = N;
logn_ = 0;
while (N > 1) {
N >>= 1;
logn_ ++;
}
ComputeTables();
}
template <typename Real>
SplitRadixComplexFft<Real>::SplitRadixComplexFft(
const SplitRadixComplexFft<Real> &other):
N_(other.N_), logn_(other.logn_) {
// This code duplicates tables from a previously computed object.
// Compare with the code in ComputeTables().
MatrixIndexT lg2 = logn_ >> 1;
if (logn_ & 1) lg2++;
MatrixIndexT brseed_size = 1 << lg2;
brseed_ = new MatrixIndexT[brseed_size];
std::memcpy(brseed_, other.brseed_, sizeof(MatrixIndexT) * brseed_size);
if (logn_ < 4) {
tab_ = NULL;
} else {
tab_ = new Real*[logn_ - 3];
for (MatrixIndexT i = logn_; i >= 4 ; i--) {
MatrixIndexT m = 1 << i, m2 = m / 2, m4 = m2 / 2;
MatrixIndexT this_array_size = 6 * (m4 - 2);
tab_[i-4] = new Real[this_array_size];
std::memcpy(tab_[i-4], other.tab_[i-4],
sizeof(Real) * this_array_size);
}
}
}
template<typename Real>
void SplitRadixComplexFft<Real>::ComputeTables() {
MatrixIndexT imax, lg2, i, j;
MatrixIndexT m, m2, m4, m8, nel, n;
Real *cn, *spcn, *smcn, *c3n, *spc3n, *smc3n;
Real ang, c, s;
lg2 = logn_ >> 1;
if (logn_ & 1) lg2++;
brseed_ = new MatrixIndexT[1 << lg2];
brseed_[0] = 0;
brseed_[1] = 1;
for (j = 2; j <= lg2; j++) {
imax = 1 << (j - 1);
for (i = 0; i < imax; i++) {
brseed_[i] <<= 1;
brseed_[i + imax] = brseed_[i] + 1;
}
}
if (logn_ < 4) {
tab_ = NULL;
} else {
tab_ = new Real* [logn_-3];
for (i = logn_; i>=4 ; i--) {
/* Compute a few constants */
m = 1 << i; m2 = m / 2; m4 = m2 / 2; m8 = m4 /2;
/* Allocate memory for tables */
nel = m4 - 2;
tab_[i-4] = new Real[6*nel];
/* Initialize pointers */
cn = tab_[i-4]; spcn = cn + nel; smcn = spcn + nel;
c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel;
/* Compute tables */
for (n = 1; n < m4; n++) {
if (n == m8) continue;
ang = n * M_2PI / m;
c = std::cos(ang); s = std::sin(ang);
*cn++ = c; *spcn++ = - (s + c); *smcn++ = s - c;
ang = 3 * n * M_2PI / m;
c = std::cos(ang); s = std::sin(ang);
*c3n++ = c; *spc3n++ = - (s + c); *smc3n++ = s - c;
}
}
}
}
template<typename Real>
SplitRadixComplexFft<Real>::~SplitRadixComplexFft() {
delete [] brseed_;
if (tab_ != NULL) {
for (MatrixIndexT i = 0; i < logn_-3; i++)
delete [] tab_[i];
delete [] tab_;
}
}
template<typename Real>
void SplitRadixComplexFft<Real>::Compute(Real *xr, Real *xi, bool forward) const {
if (!forward) { // reverse real and imaginary parts for complex FFT.
Real *tmp = xr;
xr = xi;
xi = tmp;
}
ComputeRecursive(xr, xi, logn_);
if (logn_ > 1) {
BitReversePermute(xr, logn_);
BitReversePermute(xi, logn_);
}
}
template<typename Real>
void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward,
std::vector<Real> *temp_buffer) const {
KALDI_ASSERT(temp_buffer != NULL);
if (temp_buffer->size() != N_)
temp_buffer->resize(N_);
Real *temp_ptr = &((*temp_buffer)[0]);
for (MatrixIndexT i = 0; i < N_; i++) {
x[i] = x[i * 2]; // put the real part in the first half of x.
temp_ptr[i] = x[i * 2 + 1]; // put the imaginary part in temp_buffer.
}
// copy the imaginary part back to the second half of x.
memcpy(static_cast<void*>(x + N_),
static_cast<void*>(temp_ptr),
sizeof(Real) * N_);
Compute(x, x + N_, forward);
// Now change the format back to interleaved.
memcpy(static_cast<void*>(temp_ptr),
static_cast<void*>(x + N_),
sizeof(Real) * N_);
for (MatrixIndexT i = N_-1; i > 0; i--) { // Don't include i = 0: if
// MatrixIndexT were unsigned the loop would not terminate, so we treat
// it as a special case below.
x[i*2] = x[i];
x[i*2 + 1] = temp_ptr[i];
}
x[1] = temp_ptr[0]; // special case of i = 0.
}
template<typename Real>
void SplitRadixComplexFft<Real>::Compute(Real *x, bool forward) {
this->Compute(x, forward, &temp_buffer_);
}
template<typename Real>
void SplitRadixComplexFft<Real>::BitReversePermute(Real *x, MatrixIndexT logn) const {
MatrixIndexT i, j, lg2, n;
MatrixIndexT off, fj, gno, *brp;
Real tmp, *xp, *xq;
lg2 = logn >> 1;
n = 1 << lg2;
if (logn & 1) lg2++;
/* Unshuffling loop */
for (off = 1; off < n; off++) {
fj = n * brseed_[off]; i = off; j = fj;
tmp = x[i]; x[i] = x[j]; x[j] = tmp;
xp = &x[i];
brp = &(brseed_[1]);
for (gno = 1; gno < brseed_[off]; gno++) {
xp += n;
j = fj + *brp++;
xq = x + j;
tmp = *xp; *xp = *xq; *xq = tmp;
}
}
}
template<typename Real>
void SplitRadixComplexFft<Real>::ComputeRecursive(Real *xr, Real *xi, MatrixIndexT logn) const {
MatrixIndexT m, m2, m4, m8, nel, n;
Real *xr1, *xr2, *xi1, *xi2;
Real *cn = nullptr, *spcn = nullptr, *smcn = nullptr, *c3n = nullptr,
*spc3n = nullptr, *smc3n = nullptr;
Real tmp1, tmp2;
Real sqhalf = M_SQRT1_2;
/* Check range of logn */
if (logn < 0)
KALDI_ERR << "Error: logn is out of bounds in SRFFT";
/* Compute trivial cases */
if (logn < 3) {
if (logn == 2) { /* length m = 4 */
xr2 = xr + 2;
xi2 = xi + 2;
tmp1 = *xr + *xr2;
*xr2 = *xr - *xr2;
*xr = tmp1;
tmp1 = *xi + *xi2;
*xi2 = *xi - *xi2;
*xi = tmp1;
xr1 = xr + 1;
xi1 = xi + 1;
xr2++;
xi2++;
tmp1 = *xr1 + *xr2;
*xr2 = *xr1 - *xr2;
*xr1 = tmp1;
tmp1 = *xi1 + *xi2;
*xi2 = *xi1 - *xi2;
*xi1 = tmp1;
xr2 = xr + 1;
xi2 = xi + 1;
tmp1 = *xr + *xr2;
*xr2 = *xr - *xr2;
*xr = tmp1;
tmp1 = *xi + *xi2;
*xi2 = *xi - *xi2;
*xi = tmp1;
xr1 = xr + 2;
xi1 = xi + 2;
xr2 = xr + 3;
xi2 = xi + 3;
tmp1 = *xr1 + *xi2;
tmp2 = *xi1 + *xr2;
*xi1 = *xi1 - *xr2;
*xr2 = *xr1 - *xi2;
*xr1 = tmp1;
*xi2 = tmp2;
return;
}
else if (logn == 1) { /* length m = 2 */
xr2 = xr + 1;
xi2 = xi + 1;
tmp1 = *xr + *xr2;
*xr2 = *xr - *xr2;
*xr = tmp1;
tmp1 = *xi + *xi2;
*xi2 = *xi - *xi2;
*xi = tmp1;
return;
}
else if (logn == 0) return; /* length m = 1 */
}
/* Compute a few constants */
m = 1 << logn; m2 = m / 2; m4 = m2 / 2; m8 = m4 /2;
/* Step 1 */
xr1 = xr; xr2 = xr1 + m2;
xi1 = xi; xi2 = xi1 + m2;
for (n = 0; n < m2; n++) {
tmp1 = *xr1 + *xr2;
*xr2 = *xr1 - *xr2;
xr2++;
*xr1++ = tmp1;
tmp2 = *xi1 + *xi2;
*xi2 = *xi1 - *xi2;
xi2++;
*xi1++ = tmp2;
}
/* Step 2 */
xr1 = xr + m2; xr2 = xr1 + m4;
xi1 = xi + m2; xi2 = xi1 + m4;
for (n = 0; n < m4; n++) {
tmp1 = *xr1 + *xi2;
tmp2 = *xi1 + *xr2;
*xi1 = *xi1 - *xr2;
xi1++;
*xr2++ = *xr1 - *xi2;
*xr1++ = tmp1;
*xi2++ = tmp2;
// xr1++; xr2++; xi1++; xi2++;
}
/* Steps 3 & 4 */
xr1 = xr + m2; xr2 = xr1 + m4;
xi1 = xi + m2; xi2 = xi1 + m4;
if (logn >= 4) {
nel = m4 - 2;
cn = tab_[logn-4]; spcn = cn + nel; smcn = spcn + nel;
c3n = smcn + nel; spc3n = c3n + nel; smc3n = spc3n + nel;
}
xr1++; xr2++; xi1++; xi2++;
// xr1++; xi1++;
for (n = 1; n < m4; n++) {
if (n == m8) {
tmp1 = sqhalf * (*xr1 + *xi1);
*xi1 = sqhalf * (*xi1 - *xr1);
*xr1 = tmp1;
tmp2 = sqhalf * (*xi2 - *xr2);
*xi2 = -sqhalf * (*xr2 + *xi2);
*xr2 = tmp2;
} else {
tmp2 = *cn++ * (*xr1 + *xi1);
tmp1 = *spcn++ * *xr1 + tmp2;
*xr1 = *smcn++ * *xi1 + tmp2;
*xi1 = tmp1;
tmp2 = *c3n++ * (*xr2 + *xi2);
tmp1 = *spc3n++ * *xr2 + tmp2;
*xr2 = *smc3n++ * *xi2 + tmp2;
*xi2 = tmp1;
}
xr1++; xr2++; xi1++; xi2++;
}
/* Call ssrec again with half DFT length */
ComputeRecursive(xr, xi, logn-1);
/* Call ssrec again twice with one quarter DFT length.
Constants have to be recomputed, because they are static! */
// m = 1 << logn; m2 = m / 2;
ComputeRecursive(xr + m2, xi + m2, logn - 2);
// m = 1 << logn;
m4 = 3 * (m / 4);
ComputeRecursive(xr + m4, xi + m4, logn - 2);
}
template<typename Real>
void SplitRadixRealFft<Real>::Compute(Real *data, bool forward) {
Compute(data, forward, &this->temp_buffer_);
}
// This code is mostly the same as the RealFft function. It would be
// possible to replace it with more efficient code from Rico's book.
template<typename Real>
void SplitRadixRealFft<Real>::Compute(Real *data, bool forward,
std::vector<Real> *temp_buffer) const {
MatrixIndexT N = N_, N2 = N/2;
KALDI_ASSERT(N%2 == 0);
if (forward) // call to base class
SplitRadixComplexFft<Real>::Compute(data, true, temp_buffer);
Real rootN_re, rootN_im; // exp(-2pi/N), forward; exp(2pi/N), backward
int forward_sign = forward ? -1 : 1;
ComplexImExp(static_cast<Real>(M_2PI/N *forward_sign), &rootN_re, &rootN_im);
Real kN_re = -forward_sign, kN_im = 0.0; // exp(-2pi k/N) for forward; -exp(2pi k/N) for backward.
// kN starts out as 1.0 for forward algorithm but -1.0 for backward.
for (MatrixIndexT k = 1; 2*k <= N2; k++) {
ComplexMul(rootN_re, rootN_im, &kN_re, &kN_im);
Real Ck_re, Ck_im, Dk_re, Dk_im;
// C_k = 1/2 (B_k + B_{N/2 - k}^*) :
Ck_re = 0.5 * (data[2*k] + data[N - 2*k]);
Ck_im = 0.5 * (data[2*k + 1] - data[N - 2*k + 1]);
// re(D_k)= 1/2 (im(B_k) + im(B_{N/2-k})):
Dk_re = 0.5 * (data[2*k + 1] + data[N - 2*k + 1]);
// im(D_k) = -1/2 (re(B_k) - re(B_{N/2-k}))
Dk_im =-0.5 * (data[2*k] - data[N - 2*k]);
// A_k = C_k + 1^(k/N) D_k:
data[2*k] = Ck_re; // A_k <-- C_k
data[2*k+1] = Ck_im;
// now A_k += D_k 1^(k/N)
ComplexAddProduct(Dk_re, Dk_im, kN_re, kN_im, &(data[2*k]), &(data[2*k+1]));
MatrixIndexT kdash = N2 - k;
if (kdash != k) {
// Next we handle the index k' = N/2 - k. This is necessary
// to do now, to avoid invalidating data that we will later need.
// The quantities C_{k'} and D_{k'} are just the conjugates of C_k
// and D_k, so the equations are simple modifications of the above,
// replacing Ck_im and Dk_im with their negatives.
data[2*kdash] = Ck_re; // A_k' <-- C_k'
data[2*kdash+1] = -Ck_im;
// now A_k' += D_k' 1^(k'/N)
// We use 1^(k'/N) = 1^((N/2 - k) / N) = 1^(1/2) 1^(-k/N) = -1 * (1^(k/N))^*
// so it's the same as 1^(k/N) but with the real part negated.
ComplexAddProduct(Dk_re, -Dk_im, -kN_re, kN_im, &(data[2*kdash]), &(data[2*kdash+1]));
}
}
{ // Now handle k = 0.
// In simple terms: after the complex fft, data[0] becomes the sum of real
// parts input[0], input[2]... and data[1] becomes the sum of imaginary
// parts input[1], input[3]...
// "zeroth" [A_0] is just the sum of input[0]+input[1]+input[2]..
// and "n2th" [A_{N/2}] is input[0]-input[1]+input[2]... .
Real zeroth = data[0] + data[1],
n2th = data[0] - data[1];
data[0] = zeroth;
data[1] = n2th;
if (!forward) {
data[0] /= 2;
data[1] /= 2;
}
}
if (!forward) { // call to base class
SplitRadixComplexFft<Real>::Compute(data, false, temp_buffer);
for (MatrixIndexT i = 0; i < N; i++)
data[i] *= 2.0;
// This is so we get a factor of N increase, rather than N/2 which we would
// otherwise get from [ComplexFft, forward] + [ComplexFft, backward] in dimension N/2.
// It's for consistency with our normal FFT conventions.
}
}
template class SplitRadixComplexFft<float>;
template class SplitRadixComplexFft<double>;
template class SplitRadixRealFft<float>;
template class SplitRadixRealFft<double>;
} // end namespace kaldi
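// Editor's sketch (illustrative, not part of the original source): a forward
// / inverse roundtrip with SplitRadixComplexFft on split real and imaginary
// arrays. As noted above, the inverse omits the 1/N factor, so we rescale to
// recover the input. The size must be a power of two; names are hypothetical.
static void ComplexFftRoundtripSketch(std::vector<float> *re,
std::vector<float> *im) {
using namespace kaldi;
KALDI_ASSERT(re->size() == im->size() && re->size() >= 2);
SplitRadixComplexFft<float> fft(re->size());
fft.Compute(re->data(), im->data(), true); // forward FFT, in place
fft.Compute(re->data(), im->data(), false); // inverse FFT (unnormalized)
float scale = 1.0f / re->size();
for (size_t i = 0; i < re->size(); i++) { // undo the factor of N
(*re)[i] *= scale;
(*im)[i] *= scale;
}
}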
// matrix/srfft.h
// Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.
// 2014 Daniel Povey
//
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
//
// This file includes a modified version of code originally published in Malvar,
// H., "Signal processing with lapped transforms, " Artech House, Inc., 1992. The
// current copyright holder of the original code, Henrique S. Malvar, has given
// his permission for the release of this modified version under the Apache
// License v2.0.
#ifndef KALDI_MATRIX_SRFFT_H_
#define KALDI_MATRIX_SRFFT_H_
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
namespace kaldi {
/// @addtogroup matrix_funcs_misc
/// @{
// This class is based on code by Henrique (Rico) Malvar, from his book
// "Signal Processing with Lapped Transforms" (1992). Copied with
// permission, optimized by Go Vivace Inc., and converted into C++ by
// Microsoft Corporation.
// This is a more efficient way of doing the complex FFT than ComplexFft
// (declared in matrix-functions.h), but it only works for powers of 2.
// Note: in multi-threaded code, you would need to have one of these objects per
// thread, because multiple calls to Compute in parallel would not work.
template<typename Real>
class SplitRadixComplexFft {
public:
typedef MatrixIndexT Integer;
// N is the number of complex points (must be a power of two, or this
// will crash). Note that the constructor does some work so it's best to
// initialize the object once and do the computation many times.
SplitRadixComplexFft(Integer N);
// Copy constructor
SplitRadixComplexFft(const SplitRadixComplexFft &other);
// Does the FFT computation, given pointers to the real and
// imaginary parts. If "forward", do the forward FFT; else
// do the inverse FFT (without the 1/N factor).
// xr and xi are pointers to zero-based arrays of size N,
// containing the real and imaginary parts
// respectively.
void Compute(Real *xr, Real *xi, bool forward) const;
// This version of Compute takes a single array of size N*2,
// containing [ r0 im0 r1 im1 ... ]. Otherwise its behavior is the
// same as the version above.
void Compute(Real *x, bool forward);
// This version of Compute is const; it operates on an array of size N*2
// containing [ r0 im0 r1 im1 ... ], but it uses the argument "temp_buffer" as
// temporary storage instead of a class-member variable. It will allocate it if
// needed.
void Compute(Real *x, bool forward, std::vector<Real> *temp_buffer) const;
~SplitRadixComplexFft();
protected:
// temp_buffer_ is allocated only if someone calls Compute with only one Real*
// argument and we need a temporary buffer while creating interleaved data.
std::vector<Real> temp_buffer_;
private:
void ComputeTables();
void ComputeRecursive(Real *xr, Real *xi, Integer logn) const;
void BitReversePermute(Real *x, Integer logn) const;
Integer N_;
Integer logn_; // log2(N)
Integer *brseed_;
// brseed_ is Evans' seed table (Ref: D. M. W. Evans, "An improved
// digit-reversal permutation algorithm ...", IEEE Trans. ASSP, Aug. 1987,
// pp. 1120-1125).
Real **tab_; // Tables of butterfly coefficients.
// Disallow assignment.
SplitRadixComplexFft &operator =(const SplitRadixComplexFft<Real> &other);
};
template<typename Real>
class SplitRadixRealFft: private SplitRadixComplexFft<Real> {
public:
SplitRadixRealFft(MatrixIndexT N): // will fail unless N>=4 and N is a power of 2.
SplitRadixComplexFft<Real> (N/2), N_(N) { }
// Copy constructor
SplitRadixRealFft(const SplitRadixRealFft<Real> &other):
SplitRadixComplexFft<Real>(other), N_(other.N_) { }
/// If forward == true, this function transforms from a sequence of N real points to its complex Fourier
/// transform; otherwise it goes in the reverse direction. If you call it
/// in the forward and then reverse direction and multiply by 1.0/N, you
/// will get back the original data.
/// The interpretation of the complex-FFT data is as follows: the array is a
/// sequence of N/2 complex numbers C_n in (real, im) format, except that the
/// slot that would hold the always-zero im0 instead stores real_{N/2}:
/// [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...].
void Compute(Real *x, bool forward);
/// This is as the other Compute() function, but it is a const version that
/// uses a user-supplied buffer.
void Compute(Real *x, bool forward, std::vector<Real> *temp_buffer) const;
private:
// Disallow assignment.
SplitRadixRealFft &operator =(const SplitRadixRealFft<Real> &other);
int N_;
};
/// @} end of "addtogroup matrix_funcs_misc"
} // end namespace kaldi
#endif
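// Editor's sketch (illustrative, not part of the original source): a real-FFT
// roundtrip in the packed format documented above. Per the Compute() comment,
// forward followed by inverse scales the data by N, so we multiply by 1.0/N.
// Assumes data->Dim() is a power of two and >= 4; the name is hypothetical.
static void RealFftRoundtripSketch(kaldi::Vector<float> *data) {
using namespace kaldi;
SplitRadixRealFft<float> fft(data->Dim());
fft.Compute(data->Data(), true); // forward: packed spectrum, in place
fft.Compute(data->Data(), false); // inverse: unnormalized
data->Scale(1.0f / data->Dim()); // undo the factor of N
}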
// matrix/tp-matrix.cc
// Copyright 2009-2011 Ondrej Glembek; Lukas Burget; Microsoft Corporation
// Saarland University; Yanmin Qian; Haihua Xu
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#include "matrix/tp-matrix.h"
#include "matrix/sp-matrix.h"
#include "matrix/kaldi-matrix.h"
#include "matrix/cblas-wrappers.h"
namespace kaldi {
#ifndef HAVE_ATLAS
template<typename Real>
void TpMatrix<Real>::Invert() {
// these are CLAPACK types
KaldiBlasInt result;
KaldiBlasInt rows = static_cast<int>(this->num_rows_);
// clapack call
// NOTE: Even though "U" is for upper, LAPACK assumes column-wise storage
// of the data. We have row-wise storage, so the upper/lower sense is
// inverted: our row-major lower triangle is what LAPACK sees as a
// column-major upper triangle.
clapack_Xtptri(&rows, this->data_, &result);
if (result < 0) {
KALDI_ERR << "Call to CLAPACK stptri_ function failed";
} else if (result > 0) {
KALDI_ERR << "Matrix is singular";
}
}
#else
template<typename Real>
void TpMatrix<Real>::Invert() {
// ATLAS doesn't implement triangular matrix inversion in packed
// format, so we temporarily put in non-packed format.
Matrix<Real> tmp(*this);
int rows = static_cast<int>(this->num_rows_);
// ATLAS call. It's really row-major ordering and a lower triangular matrix,
// but there is some weirdness with Fortran-style indexing that we need to
// take account of, so everything gets swapped.
int result = clapack_Xtrtri( rows, tmp.Data(), tmp.Stride());
// Let's hope ATLAS has the same return value conventions as clapack.
// I couldn't find any documentation online.
if (result < 0) {
KALDI_ERR << "Call to ATLAS strtri function failed";
} else if (result > 0) {
KALDI_ERR << "Matrix is singular";
}
(*this).CopyFromMat(tmp);
}
#endif
template<typename Real>
Real TpMatrix<Real>::Determinant() {
double det = 1.0;
for (MatrixIndexT i = 0; i<this->NumRows(); i++) {
det *= (*this)(i, i);
}
return static_cast<Real>(det);
}
template<typename Real>
void TpMatrix<Real>::Swap(TpMatrix<Real> *other) {
std::swap(this->data_, other->data_);
std::swap(this->num_rows_, other->num_rows_);
}
template<typename Real>
void TpMatrix<Real>::Cholesky(const SpMatrix<Real> &orig) {
KALDI_ASSERT(orig.NumRows() == this->NumRows());
MatrixIndexT n = this->NumRows();
this->SetZero();
Real *data = this->data_, *jdata = data; // start of j'th row of matrix.
const Real *orig_jdata = orig.Data(); // start of j'th row of matrix.
for (MatrixIndexT j = 0; j < n; j++, jdata += j, orig_jdata += j) {
Real *kdata = data; // start of k'th row of matrix.
Real d(0.0);
for (MatrixIndexT k = 0; k < j; k++, kdata += k) {
Real s = cblas_Xdot(k, kdata, 1, jdata, 1);
// (*this)(j, k) = s = (orig(j, k) - s)/(*this)(k, k);
jdata[k] = s = (orig_jdata[k] - s)/kdata[k];
d = d + s*s;
}
// d = orig(j, j) - d;
d = orig_jdata[j] - d;
if (d >= 0.0) {
// (*this)(j, j) = std::sqrt(d);
jdata[j] = std::sqrt(d);
} else {
KALDI_ERR << "Cholesky decomposition failed. Maybe matrix "
"is not positive definite.";
}
}
}
template<typename Real>
void TpMatrix<Real>::CopyFromMat(const MatrixBase<Real> &M,
MatrixTransposeType Trans) {
if (Trans == kNoTrans) {
KALDI_ASSERT(this->NumRows() == M.NumRows() && M.NumRows() == M.NumCols());
MatrixIndexT D = this->NumRows();
const Real *in_i = M.Data();
MatrixIndexT stride = M.Stride();
Real *out_i = this->data_;
for (MatrixIndexT i = 0; i < D; i++, in_i += stride, out_i += i)
for (MatrixIndexT j = 0; j <= i; j++)
out_i[j] = in_i[j];
} else {
KALDI_ASSERT(this->NumRows() == M.NumRows() && M.NumRows() == M.NumCols());
MatrixIndexT D = this->NumRows();
const Real *in_i = M.Data();
MatrixIndexT stride = M.Stride();
Real *out_i = this->data_;
for (MatrixIndexT i = 0; i < D; i++, in_i++, out_i += i) {
for (MatrixIndexT j = 0; j <= i; j++)
out_i[j] = in_i[stride*j];
}
}
}
template class TpMatrix<float>;
template class TpMatrix<double>;
} // namespace kaldi
// matrix/tp-matrix.h
// Copyright 2009-2011 Ondrej Glembek; Lukas Burget; Microsoft Corporation;
// Saarland University; Yanmin Qian; Haihua Xu
// 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_MATRIX_TP_MATRIX_H_
#define KALDI_MATRIX_TP_MATRIX_H_
#include "matrix/packed-matrix.h"
namespace kaldi {
/// \addtogroup matrix_group
/// @{
template<typename Real> class TpMatrix;
/// @brief Packed triangular matrix class
template<typename Real>
class TpMatrix : public PackedMatrix<Real> {
friend class CuTpMatrix<float>;
friend class CuTpMatrix<double>;
public:
TpMatrix() : PackedMatrix<Real>() {}
explicit TpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
: PackedMatrix<Real>(r, resize_type) {}
TpMatrix(const TpMatrix<Real>& orig) : PackedMatrix<Real>(orig) {}
/// Copy constructor from CUDA TpMatrix
/// This is defined in ../cudamatrix/cu-tp-matrix.cc
explicit TpMatrix(const CuTpMatrix<Real> &cu);
template<typename OtherReal> explicit TpMatrix(const TpMatrix<OtherReal>& orig)
: PackedMatrix<Real>(orig) {}
Real operator() (MatrixIndexT r, MatrixIndexT c) const {
if (static_cast<UnsignedMatrixIndexT>(c) >
static_cast<UnsignedMatrixIndexT>(r)) {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(c) <
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
return 0;
}
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
// c<=r now so don't have to check c.
return *(this->data_ + (r*(r+1)) / 2 + c);
// Duplicating code from PackedMatrix.h
}
Real &operator() (MatrixIndexT r, MatrixIndexT c) {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
static_cast<UnsignedMatrixIndexT>(this->num_rows_));
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(c) <=
static_cast<UnsignedMatrixIndexT>(r) &&
"you cannot access the upper triangle of TpMatrix using "
"a non-const matrix object.");
return *(this->data_ + (r*(r+1)) / 2 + c);
// Duplicating code from PackedMatrix.h
}
// Note: Cholesky may throw KaldiFatalError.
void Cholesky(const SpMatrix<Real>& orig);
void Invert();
// Inverts in double precision.
void InvertDouble() {
TpMatrix<double> dmat(*this);
dmat.Invert();
(*this).CopyFromTp(dmat);
}
/// Shallow swap
void Swap(TpMatrix<Real> *other);
/// Returns the determinant of the matrix (product of diagonals)
Real Determinant();
/// CopyFromMat copies the lower triangle of M into *this
/// (or the upper triangle, if Trans == kTrans).
void CopyFromMat(const MatrixBase<Real> &M,
MatrixTransposeType Trans = kNoTrans);
/// This is implemented in ../cudamatrix/cu-tp-matrix.cc
void CopyFromMat(const CuTpMatrix<Real> &other);
/// CopyFromTp copies another triangular matrix into this one.
void CopyFromTp(const TpMatrix<Real> &other) {
PackedMatrix<Real>::CopyFromPacked(other);
}
template<typename OtherReal> void CopyFromTp(const TpMatrix<OtherReal> &other) {
PackedMatrix<Real>::CopyFromPacked(other);
}
/// AddTp does *this += alpha * M.
void AddTp(const Real alpha, const TpMatrix<Real> &M) {
this->AddPacked(alpha, M);
}
TpMatrix<Real>& operator=(const TpMatrix<Real> &other) {
PackedMatrix<Real>::operator=(other);
return *this;
}
using PackedMatrix<Real>::Scale;
void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero) {
PackedMatrix<Real>::Resize(nRows, resize_type);
}
};
/// @} end of "addtogroup matrix_group".
} // namespace kaldi
#endif
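// Editor's sketch (illustrative, not part of the original source): Cholesky
// factorization of a symmetric positive-definite SpMatrix via TpMatrix, plus
// the determinant identity det(S) == det(L)^2. Assumes matrix/sp-matrix.h is
// available (as in tp-matrix.cc above); the function name is hypothetical.
static float SpdDeterminantSketch(const kaldi::SpMatrix<float> &sp) {
using namespace kaldi;
TpMatrix<float> chol(sp.NumRows());
chol.Cholesky(sp); // sp == chol * chol^T, with chol lower-triangular
// The determinant of a triangular matrix is the product of its diagonal,
// so det(sp) == chol.Determinant() squared.
float det_chol = chol.Determinant();
return det_chol * det_chol;
}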
...@@ -754,53 +754,53 @@ class TokenVectorHolder { ...@@ -754,53 +754,53 @@ class TokenVectorHolder {
}; };
class HtkMatrixHolder { //class HtkMatrixHolder {
public: //public:
typedef std::pair<Matrix<BaseFloat>, HtkHeader> T; //typedef std::pair<Matrix<BaseFloat>, HtkHeader> T;
HtkMatrixHolder() {} //HtkMatrixHolder() {}
static bool Write(std::ostream &os, bool binary, const T &t) { //static bool Write(std::ostream &os, bool binary, const T &t) {
if (!binary) //if (!binary)
KALDI_ERR << "Non-binary HTK-format write not supported."; //KALDI_ERR << "Non-binary HTK-format write not supported.";
bool ans = WriteHtk(os, t.first, t.second); //bool ans = WriteHtk(os, t.first, t.second);
if (!ans) //if (!ans)
KALDI_WARN << "Error detected writing HTK-format matrix."; //KALDI_WARN << "Error detected writing HTK-format matrix.";
return ans; //return ans;
} //}
void Clear() { t_.first.Resize(0, 0); } //void Clear() { t_.first.Resize(0, 0); }
// Reads into the holder. //// Reads into the holder.
bool Read(std::istream &is) { //bool Read(std::istream &is) {
bool ans = ReadHtk(is, &t_.first, &t_.second); //bool ans = ReadHtk(is, &t_.first, &t_.second);
if (!ans) { //if (!ans) {
KALDI_WARN << "Error detected reading HTK-format matrix."; //KALDI_WARN << "Error detected reading HTK-format matrix.";
return false; //return false;
} //}
return ans; //return ans;
} //}
// HTK-format matrices only read in binary. //// HTK-format matrices only read in binary.
static bool IsReadInBinary() { return true; } //static bool IsReadInBinary() { return true; }
T &Value() { return t_; } //T &Value() { return t_; }
void Swap(HtkMatrixHolder *other) { //void Swap(HtkMatrixHolder *other) {
t_.first.Swap(&(other->t_.first)); //t_.first.Swap(&(other->t_.first));
std::swap(t_.second, other->t_.second); //std::swap(t_.second, other->t_.second);
} //}
bool ExtractRange(const HtkMatrixHolder &other, //bool ExtractRange(const HtkMatrixHolder &other,
const std::string &range) { //const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder."; //KALDI_ERR << "ExtractRange is not defined for this type of holder.";
return false; //return false;
} //}
// Default destructor. //// Default destructor.
private: //private:
KALDI_DISALLOW_COPY_AND_ASSIGN(HtkMatrixHolder); //KALDI_DISALLOW_COPY_AND_ASSIGN(HtkMatrixHolder);
T t_; //T t_;
}; //};
// SphinxMatrixHolder can be used to read and write feature files in // SphinxMatrixHolder can be used to read and write feature files in
// CMU Sphinx format. 13-dimensional big-endian features are assumed. // CMU Sphinx format. 13-dimensional big-endian features are assumed.
...@@ -813,104 +813,104 @@ class HtkMatrixHolder { ...@@ -813,104 +813,104 @@ class HtkMatrixHolder {
// be no problem, because the usage help of Sphinx' "wave2feat" for example // be no problem, because the usage help of Sphinx' "wave2feat" for example
// says that Sphinx features are always big endian. // says that Sphinx features are always big endian.
// Note: the kFeatDim defaults to 13, see forward declaration in kaldi-holder.h // Note: the kFeatDim defaults to 13, see forward declaration in kaldi-holder.h
template<int kFeatDim> class SphinxMatrixHolder { //template<int kFeatDim> class SphinxMatrixHolder {
public: //public:
typedef Matrix<BaseFloat> T; //typedef Matrix<BaseFloat> T;
SphinxMatrixHolder() {} //SphinxMatrixHolder() {}
void Clear() { feats_.Resize(0, 0); } //void Clear() { feats_.Resize(0, 0); }
// Writes Sphinx-format features //// Writes Sphinx-format features
static bool Write(std::ostream &os, bool binary, const T &m) { //static bool Write(std::ostream &os, bool binary, const T &m) {
if (!binary) { //if (!binary) {
KALDI_WARN << "SphinxMatrixHolder can't write Sphinx features in text "; //KALDI_WARN << "SphinxMatrixHolder can't write Sphinx features in text ";
return false; //return false;
} //}
int32 size = m.NumRows() * m.NumCols(); //int32 size = m.NumRows() * m.NumCols();
if (MachineIsLittleEndian()) //if (MachineIsLittleEndian())
KALDI_SWAP4(size); //KALDI_SWAP4(size);
// write the header //// write the header
os.write(reinterpret_cast<char*> (&size), sizeof(size)); //os.write(reinterpret_cast<char*> (&size), sizeof(size));
for (MatrixIndexT i = 0; i < m.NumRows(); i++) { //for (MatrixIndexT i = 0; i < m.NumRows(); i++) {
std::vector<float32> tmp(m.NumCols()); //std::vector<float32> tmp(m.NumCols());
for (MatrixIndexT j = 0; j < m.NumCols(); j++) { //for (MatrixIndexT j = 0; j < m.NumCols(); j++) {
tmp[j] = static_cast<float32>(m(i, j)); //tmp[j] = static_cast<float32>(m(i, j));
if (MachineIsLittleEndian()) //if (MachineIsLittleEndian())
KALDI_SWAP4(tmp[j]); //KALDI_SWAP4(tmp[j]);
} //}
os.write(reinterpret_cast<char*>(&(tmp[0])), //os.write(reinterpret_cast<char*>(&(tmp[0])),
tmp.size() * 4); //tmp.size() * 4);
} //}
return true; //return true;
} //}
// Reads the features into a Kaldi Matrix //// Reads the features into a Kaldi Matrix
bool Read(std::istream &is) { //bool Read(std::istream &is) {
int32 nmfcc; //int32 nmfcc;
is.read(reinterpret_cast<char*> (&nmfcc), sizeof(nmfcc)); //is.read(reinterpret_cast<char*> (&nmfcc), sizeof(nmfcc));
if (MachineIsLittleEndian()) //if (MachineIsLittleEndian())
KALDI_SWAP4(nmfcc); //KALDI_SWAP4(nmfcc);
KALDI_VLOG(2) << "#feats: " << nmfcc; //KALDI_VLOG(2) << "#feats: " << nmfcc;
int32 nfvec = nmfcc / kFeatDim; //int32 nfvec = nmfcc / kFeatDim;
if ((nmfcc % kFeatDim) != 0) { //if ((nmfcc % kFeatDim) != 0) {
KALDI_WARN << "Sphinx feature count is inconsistent with vector length "; //KALDI_WARN << "Sphinx feature count is inconsistent with vector length ";
return false; //return false;
} //}
feats_.Resize(nfvec, kFeatDim); //feats_.Resize(nfvec, kFeatDim);
for (MatrixIndexT i = 0; i < feats_.NumRows(); i++) { //for (MatrixIndexT i = 0; i < feats_.NumRows(); i++) {
if (sizeof(BaseFloat) == sizeof(float32)) { //if (sizeof(BaseFloat) == sizeof(float32)) {
is.read(reinterpret_cast<char*> (feats_.RowData(i)), //is.read(reinterpret_cast<char*> (feats_.RowData(i)),
kFeatDim * sizeof(float32)); //kFeatDim * sizeof(float32));
if (!is.good()) { //if (!is.good()) {
KALDI_WARN << "Unexpected error/EOF while reading Sphinx features "; //KALDI_WARN << "Unexpected error/EOF while reading Sphinx features ";
return false; //return false;
} //}
if (MachineIsLittleEndian()) { //if (MachineIsLittleEndian()) {
for (MatrixIndexT j = 0; j < kFeatDim; j++) //for (MatrixIndexT j = 0; j < kFeatDim; j++)
KALDI_SWAP4(feats_(i, j)); //KALDI_SWAP4(feats_(i, j));
} //}
} else { // KALDI_DOUBLEPRECISION=1 //} else { // KALDI_DOUBLEPRECISION=1
float32 tmp[kFeatDim]; //float32 tmp[kFeatDim];
is.read(reinterpret_cast<char*> (tmp), sizeof(tmp)); //is.read(reinterpret_cast<char*> (tmp), sizeof(tmp));
if (!is.good()) { //if (!is.good()) {
KALDI_WARN << "Unexpected error/EOF while reading Sphinx features "; //KALDI_WARN << "Unexpected error/EOF while reading Sphinx features ";
return false; //return false;
} //}
for (MatrixIndexT j = 0; j < kFeatDim; j++) { //for (MatrixIndexT j = 0; j < kFeatDim; j++) {
if (MachineIsLittleEndian()) //if (MachineIsLittleEndian())
KALDI_SWAP4(tmp[j]); //KALDI_SWAP4(tmp[j]);
feats_(i, j) = static_cast<BaseFloat>(tmp[j]); //feats_(i, j) = static_cast<BaseFloat>(tmp[j]);
} //}
} //}
} //}
return true; //return true;
} //}
// Only read in binary //// Only read in binary
static bool IsReadInBinary() { return true; } //static bool IsReadInBinary() { return true; }
T &Value() { return feats_; } //T &Value() { return feats_; }
void Swap(SphinxMatrixHolder *other) { //void Swap(SphinxMatrixHolder *other) {
feats_.Swap(&(other->feats_)); //feats_.Swap(&(other->feats_));
} //}
bool ExtractRange(const SphinxMatrixHolder &other, //bool ExtractRange(const SphinxMatrixHolder &other,
const std::string &range) { //const std::string &range) {
KALDI_ERR << "ExtractRange is not defined for this type of holder."; //KALDI_ERR << "ExtractRange is not defined for this type of holder.";
return false; //return false;
} //}
private: //private:
KALDI_DISALLOW_COPY_AND_ASSIGN(SphinxMatrixHolder); //KALDI_DISALLOW_COPY_AND_ASSIGN(SphinxMatrixHolder);
T feats_; //T feats_;
}; //};
/// @} end "addtogroup holders" /// @} end "addtogroup holders"
......
...@@ -85,7 +85,7 @@ bool ParseMatrixRangeSpecifier(const std::string &range, ...@@ -85,7 +85,7 @@ bool ParseMatrixRangeSpecifier(const std::string &range,
return status; return status;
} }
bool ExtractObjectRange(const GeneralMatrix &input, const std::string &range, /*bool ExtractObjectRange(const GeneralMatrix &input, const std::string &range,
GeneralMatrix *output) { GeneralMatrix *output) {
// We just inspect input's type and forward to the correct implementation // We just inspect input's type and forward to the correct implementation
// if available. For kSparseMatrix, we do just fairly inefficient conversion // if available. For kSparseMatrix, we do just fairly inefficient conversion
...@@ -135,6 +135,7 @@ template bool ExtractObjectRange(const CompressedMatrix &, const std::string &, ...@@ -135,6 +135,7 @@ template bool ExtractObjectRange(const CompressedMatrix &, const std::string &,
template bool ExtractObjectRange(const CompressedMatrix &, const std::string &, template bool ExtractObjectRange(const CompressedMatrix &, const std::string &,
Matrix<double> *); Matrix<double> *);
*/
template<class Real> template<class Real>
bool ExtractObjectRange(const Matrix<Real> &input, const std::string &range, bool ExtractObjectRange(const Matrix<Real> &input, const std::string &range,
Matrix<Real> *output) { Matrix<Real> *output) {
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
#include "util/kaldi-io.h" #include "util/kaldi-io.h"
#include "util/text-utils.h" #include "util/text-utils.h"
#include "matrix/kaldi-vector.h" #include "matrix/kaldi-vector.h"
#include "matrix/sparse-matrix.h"
namespace kaldi { namespace kaldi {
...@@ -214,10 +213,10 @@ class TokenVectorHolder; ...@@ -214,10 +213,10 @@ class TokenVectorHolder;
/// A class for reading/writing HTK-format matrices. /// A class for reading/writing HTK-format matrices.
/// T == std::pair<Matrix<BaseFloat>, HtkHeader> /// T == std::pair<Matrix<BaseFloat>, HtkHeader>
class HtkMatrixHolder; //class HtkMatrixHolder;
/// A class for reading/writing Sphinx format matrices. /// A class for reading/writing Sphinx format matrices.
template<int kFeatDim = 13> class SphinxMatrixHolder; //template<int kFeatDim = 13> class SphinxMatrixHolder;
/// This templated function exists so that we can write .scp files with /// This templated function exists so that we can write .scp files with
/// 'object ranges' specified: the canonical example is a [first:last] range /// 'object ranges' specified: the canonical example is a [first:last] range
...@@ -249,15 +248,15 @@ bool ExtractObjectRange(const Vector<Real> &input, const std::string &range, ...@@ -249,15 +248,15 @@ bool ExtractObjectRange(const Vector<Real> &input, const std::string &range,
Vector<Real> *output); Vector<Real> *output);
/// GeneralMatrix is always of type BaseFloat /// GeneralMatrix is always of type BaseFloat
bool ExtractObjectRange(const GeneralMatrix &input, const std::string &range, //bool ExtractObjectRange(const GeneralMatrix &input, const std::string &range,
GeneralMatrix *output); // GeneralMatrix *output);
/// CompressedMatrix is always of the type BaseFloat but it is more /// CompressedMatrix is always of the type BaseFloat but it is more
/// efficient to provide template as it uses CompressedMatrix's own /// efficient to provide template as it uses CompressedMatrix's own
/// conversion to Matrix<Real> /// conversion to Matrix<Real>
template <class Real> //template <class Real>
bool ExtractObjectRange(const CompressedMatrix &input, const std::string &range, //bool ExtractObjectRange(const CompressedMatrix &input, const std::string &range,
Matrix<Real> *output); // Matrix<Real> *output);
// In SequentialTableReaderScriptImpl and RandomAccessTableReaderScriptImpl, for // In SequentialTableReaderScriptImpl and RandomAccessTableReaderScriptImpl, for
// cases where the scp contained 'range specifiers' (things in square brackets // cases where the scp contained 'range specifiers' (things in square brackets
......
@@ -23,7 +23,8 @@
 #include "base/kaldi-common.h"
 #include "util/kaldi-table.h"
 #include "util/kaldi-holder.h"
-#include "matrix/matrix-lib.h"
+#include "matrix/kaldi-matrix.h"
+#include "matrix/kaldi-vector.h"
 namespace kaldi {
@@ -51,8 +52,8 @@ typedef RandomAccessTableReader<KaldiObjectHolder<Matrix<double> > >
 typedef RandomAccessTableReaderMapped<KaldiObjectHolder<Matrix<double> > >
     RandomAccessDoubleMatrixReaderMapped;
-typedef TableWriter<KaldiObjectHolder<CompressedMatrix> >
-    CompressedMatrixWriter;
+//typedef TableWriter<KaldiObjectHolder<CompressedMatrix> >
+//    CompressedMatrixWriter;
 typedef TableWriter<KaldiObjectHolder<Vector<BaseFloat> > >
     BaseFloatVectorWriter;
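With CompressedMatrixWriter commented out here, any caller that previously wrote compressed features would fall back to the uncompressed writer this header keeps. A hypothetical migration sketch (the wspecifier and utterance key are placeholders, not taken from the diff):

```cpp
// Sketch: write plain BaseFloat matrices via the surviving typedef.
#include "kaldi/util/table-types.h"
#include "matrix/kaldi-matrix.h"

void WriteFeats(const kaldi::Matrix<kaldi::BaseFloat> &feats) {
    // "ark,scp:..." writes an archive plus an index; both paths and the
    // key "utt_0001" are invented for illustration.
    kaldi::BaseFloatMatrixWriter writer("ark,scp:feats.ark,feats.scp");
    writer.Write("utt_0001", feats);
}
```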
@@ -70,39 +71,39 @@ typedef SequentialTableReader<KaldiObjectHolder<Vector<double> > >
 typedef RandomAccessTableReader<KaldiObjectHolder<Vector<double> > >
     RandomAccessDoubleVectorReader;
-typedef TableWriter<KaldiObjectHolder<CuMatrix<BaseFloat> > >
-    BaseFloatCuMatrixWriter;
+//typedef TableWriter<KaldiObjectHolder<CuMatrix<BaseFloat> > >
+//    BaseFloatCuMatrixWriter;
-typedef SequentialTableReader<KaldiObjectHolder<CuMatrix<BaseFloat> > >
-    SequentialBaseFloatCuMatrixReader;
+//typedef SequentialTableReader<KaldiObjectHolder<CuMatrix<BaseFloat> > >
+//    SequentialBaseFloatCuMatrixReader;
-typedef RandomAccessTableReader<KaldiObjectHolder<CuMatrix<BaseFloat> > >
-    RandomAccessBaseFloatCuMatrixReader;
+//typedef RandomAccessTableReader<KaldiObjectHolder<CuMatrix<BaseFloat> > >
+//    RandomAccessBaseFloatCuMatrixReader;
-typedef RandomAccessTableReaderMapped<KaldiObjectHolder<CuMatrix<BaseFloat> > >
-    RandomAccessBaseFloatCuMatrixReaderMapped;
+//typedef RandomAccessTableReaderMapped<KaldiObjectHolder<CuMatrix<BaseFloat> > >
+//    RandomAccessBaseFloatCuMatrixReaderMapped;
-typedef TableWriter<KaldiObjectHolder<CuMatrix<double> > >
-    DoubleCuMatrixWriter;
+//typedef TableWriter<KaldiObjectHolder<CuMatrix<double> > >
+//    DoubleCuMatrixWriter;
-typedef SequentialTableReader<KaldiObjectHolder<CuMatrix<double> > >
-    SequentialDoubleCuMatrixReader;
+//typedef SequentialTableReader<KaldiObjectHolder<CuMatrix<double> > >
+//    SequentialDoubleCuMatrixReader;
-typedef RandomAccessTableReader<KaldiObjectHolder<CuMatrix<double> > >
-    RandomAccessDoubleCuMatrixReader;
+//typedef RandomAccessTableReader<KaldiObjectHolder<CuMatrix<double> > >
+//    RandomAccessDoubleCuMatrixReader;
-typedef RandomAccessTableReaderMapped<KaldiObjectHolder<CuMatrix<double> > >
-    RandomAccessDoubleCuMatrixReaderMapped;
+//typedef RandomAccessTableReaderMapped<KaldiObjectHolder<CuMatrix<double> > >
+//    RandomAccessDoubleCuMatrixReaderMapped;
-typedef TableWriter<KaldiObjectHolder<CuVector<BaseFloat> > >
-    BaseFloatCuVectorWriter;
+//typedef TableWriter<KaldiObjectHolder<CuVector<BaseFloat> > >
+//    BaseFloatCuVectorWriter;
-typedef SequentialTableReader<KaldiObjectHolder<CuVector<BaseFloat> > >
-    SequentialBaseFloatCuVectorReader;
+//typedef SequentialTableReader<KaldiObjectHolder<CuVector<BaseFloat> > >
+//    SequentialBaseFloatCuVectorReader;
-typedef RandomAccessTableReader<KaldiObjectHolder<CuVector<BaseFloat> > >
-    RandomAccessBaseFloatCuVectorReader;
+//typedef RandomAccessTableReader<KaldiObjectHolder<CuVector<BaseFloat> > >
+//    RandomAccessBaseFloatCuVectorReader;
-typedef RandomAccessTableReaderMapped<KaldiObjectHolder<CuVector<BaseFloat> > >
-    RandomAccessBaseFloatCuVectorReaderMapped;
+//typedef RandomAccessTableReaderMapped<KaldiObjectHolder<CuVector<BaseFloat> > >
+//    RandomAccessBaseFloatCuVectorReaderMapped;
-typedef TableWriter<KaldiObjectHolder<CuVector<double> > >
-    DoubleCuVectorWriter;
+//typedef TableWriter<KaldiObjectHolder<CuVector<double> > >
+//    DoubleCuVectorWriter;
-typedef SequentialTableReader<KaldiObjectHolder<CuVector<double> > >
-    SequentialDoubleCuVectorReader;
+//typedef SequentialTableReader<KaldiObjectHolder<CuVector<double> > >
+//    SequentialDoubleCuVectorReader;
-typedef RandomAccessTableReader<KaldiObjectHolder<CuVector<double> > >
-    RandomAccessDoubleCuVectorReader;
+//typedef RandomAccessTableReader<KaldiObjectHolder<CuVector<double> > >
+//    RandomAccessDoubleCuVectorReader;
 typedef TableWriter<BasicHolder<int32> > Int32Writer;
@@ -150,8 +151,6 @@ typedef TableWriter<BasicHolder<bool> > BoolWriter;
 typedef SequentialTableReader<BasicHolder<bool> > SequentialBoolReader;
 typedef RandomAccessTableReader<BasicHolder<bool> > RandomAccessBoolReader;
 /// TokenWriter is a writer specialized for std::string where the strings
 /// are nonempty and whitespace-free. T == std::string
 typedef TableWriter<TokenHolder> TokenWriter;
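As the comment above notes, TokenWriter is restricted to nonempty, whitespace-free strings. A small usage sketch (the wspecifier, keys, and labels are invented for illustration):

```cpp
// Sketch: write one token per utterance with the TokenWriter typedef.
#include "kaldi/util/table-types.h"

void WriteLabels() {
    kaldi::TokenWriter writer("ark,t:labels.ark");  // text-mode archive
    writer.Write("utt_0001", "speech");  // tokens may not contain whitespace
    writer.Write("utt_0002", "noise");
}
```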
@@ -169,14 +168,14 @@ typedef RandomAccessTableReader<TokenVectorHolder>
     RandomAccessTokenVectorReader;
-typedef TableWriter<KaldiObjectHolder<GeneralMatrix> >
-    GeneralMatrixWriter;
+//typedef TableWriter<KaldiObjectHolder<GeneralMatrix> >
+//    GeneralMatrixWriter;
-typedef SequentialTableReader<KaldiObjectHolder<GeneralMatrix> >
-    SequentialGeneralMatrixReader;
+//typedef SequentialTableReader<KaldiObjectHolder<GeneralMatrix> >
+//    SequentialGeneralMatrixReader;
-typedef RandomAccessTableReader<KaldiObjectHolder<GeneralMatrix> >
-    RandomAccessGeneralMatrixReader;
+//typedef RandomAccessTableReader<KaldiObjectHolder<GeneralMatrix> >
+//    RandomAccessGeneralMatrixReader;
-typedef RandomAccessTableReaderMapped<KaldiObjectHolder<GeneralMatrix> >
-    RandomAccessGeneralMatrixReaderMapped;
+//typedef RandomAccessTableReaderMapped<KaldiObjectHolder<GeneralMatrix> >
+//    RandomAccessGeneralMatrixReaderMapped;
...
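Taken together, these hunks leave only the CPU-side Matrix and Vector table I/O in place; the CuMatrix, CuVector, CompressedMatrix, and GeneralMatrix typedefs are all disabled. For reference, a minimal read loop over the surviving typedefs (the rspecifier is a placeholder) might look like:

```cpp
// Sketch: sequentially read a feature archive with the typedefs kept by
// this change; no CuMatrix/CompressedMatrix/GeneralMatrix involved.
#include "kaldi/base/kaldi-common.h"
#include "kaldi/util/table-types.h"

int main() {
    kaldi::SequentialBaseFloatMatrixReader reader("ark:feats.ark");
    for (; !reader.Done(); reader.Next()) {
        const kaldi::Matrix<kaldi::BaseFloat> &feats = reader.Value();
        KALDI_LOG << reader.Key() << ": " << feats.NumRows() << " frames";
    }
    return 0;
}
```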