Merge pull request #1631 from zh794390558/spx

[speechx] openfst patch and glog

Merge pull request #1631 from zh794390558/spx
[speechx] openfst patch and glog
94e5e37b · YangZhou · GitHub · 602b0b0d · cb66b742 · 94e5e37b
11 changed files
--- a/demos/audio_searching/src/encode.py
+++ b/demos/audio_searching/src/encode.py
@@ -11,11 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-
-import librosa
 import numpy as np
-from config import DEFAULT_TABLE
 from logs import LOGGER

 from paddlespeech.cli import VectorExecutor

--- a/speechx/cmake/external/openfst.cmake
+++ b/speechx/cmake/external/openfst.cmake
@@ -13,7 +13,7 @@ ExternalProject_Add(openfst
                      "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}"
                      "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}"
                      "LIBS=-lgflags_nothreads -lglog -lpthread"
-  COMMAND           ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR}
+  COMMAND           ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR}
  BUILD_COMMAND     make -j 4
 )
 link_directories(${openfst_PREFIX_DIR}/lib)

--- a/speechx/examples/CMakeLists.txt
+++ b/speechx/examples/CMakeLists.txt
@@ -3,3 +3,5 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
 add_subdirectory(feat)
 add_subdirectory(nnet)
 add_subdirectory(decoder)
+
+add_subdirectory(glog)
\ No newline at end of file
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
 # Examples

-* decoder - online decoder to work as offline
+* glog - glog usage
 * feat - mfcc, linear 
 * nnet - ds2 nn
+* decoder - online decoder to work as offline

 ## How to run


--- a/speechx/examples/decoder/offline_decoder_main.cc
+++ b/speechx/examples/decoder/offline_decoder_main.cc
@@ -22,11 +22,12 @@
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"

-DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(feature_respecifier, "", "feature matrix rspecifier");
 DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
 DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
 DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
 DEFINE_string(lm_path, "lm.klm", "language model");
+DEFINE_int32(chunk_size, 35, "feat chunk size");


 using kaldi::BaseFloat;
@@ -43,14 +44,16 @@ int main(int argc, char* argv[]) {
    std::string model_params = FLAGS_param_path;
    std::string dict_file = FLAGS_dict_file;
    std::string lm_path = FLAGS_lm_path;
+    int32 chunk_size = FLAGS_chunk_size;
+    LOG(INFO) << "model path: " << model_graph;
+    LOG(INFO) << "model param: " << model_params;
+    LOG(INFO) << "dict path: " << dict_file;
+    LOG(INFO) << "lm path: " << lm_path;
+    LOG(INFO) << "chunk size (frame): " << chunk_size;

    int32 num_done = 0, num_err = 0;

-    ppspeech::CTCBeamSearchOptions opts;
-    opts.dict_file = dict_file;
-    opts.lm_path = lm_path;
-    ppspeech::CTCBeamSearch decoder(opts);
-
+    // frontend + nnet is decodable
    ppspeech::ModelOptions model_opts;
    model_opts.model_path = model_graph;
    model_opts.params_path = model_params;
@@ -60,33 +63,50 @@ int main(int argc, char* argv[]) {
        new ppspeech::RawDataCache());
    std::shared_ptr<ppspeech::Decodable> decodable(
        new ppspeech::Decodable(nnet, raw_data));
+    LOG(INFO) << "Init decodeable.";

-    int32 chunk_size = 35;
-    decoder.InitDecoder();
+    // init decoder
+    ppspeech::CTCBeamSearchOptions opts;
+    opts.dict_file = dict_file;
+    opts.lm_path = lm_path;
+    ppspeech::CTCBeamSearch decoder(opts);
+    LOG(INFO) << "Init decoder.";

+    decoder.InitDecoder();
    for (; !feature_reader.Done(); feature_reader.Next()) {
        string utt = feature_reader.Key();
        const kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
+        LOG(INFO) << "utt: " << utt;
+
+        // feat dim
        raw_data->SetDim(feature.NumCols());
+        LOG(INFO) << "dim: " << raw_data->Dim();
+
        int32 row_idx = 0;
        int32 num_chunks = feature.NumRows() / chunk_size;
+        LOG(INFO) << "n chunks: " << num_chunks;
        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+            // feat chunk
            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
                                                          feature.NumCols());
            for (int row_id = 0; row_id < chunk_size; ++row_id) {
-                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, row_idx);
+                kaldi::SubVector<kaldi::BaseFloat> feat_one_row(feature,
+                                                                row_idx);
                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
                    feature_chunk.Data() + row_id * feature.NumCols(),
                    feature.NumCols());
-                f_chunk_tmp.CopyFromVec(tmp);
+                f_chunk_tmp.CopyFromVec(feat_one_row);
                row_idx++;
            }
+            // feed to raw cache
            raw_data->Accept(feature_chunk);
            if (chunk_idx == num_chunks - 1) {
                raw_data->SetFinished();
            }
+            // decode step
            decoder.AdvanceDecode(decodable);
        }
+
        std::string result;
        result = decoder.GetFinalBestPath();
        KALDI_LOG << " the result of " << utt << " is " << result;

--- a/speechx/examples/decoder/run.sh
+++ b/speechx/examples/decoder/run.sh
@@ -25,7 +25,10 @@ model_dir=../paddle_asr_model
 feat_wspecifier=./feats.ark
 cmvn=./cmvn.ark

-# 3. run feat
+
+export GLOG_logtostderr=1
+
+# 3. gen linear feat
 linear_spectrogram_main \
    --wav_rspecifier=scp:$model_dir/wav.scp \
    --feature_wspecifier=ark,t:$feat_wspecifier \
@@ -37,4 +40,4 @@ offline_decoder_main \
    --model_path=$model_dir/avg_1.jit.pdmodel \
    --param_path=$model_dir/avg_1.jit.pdparams \
    --dict_file=$model_dir/vocab.txt \
-    --lm_path=$model_dir/avg_1.jit.klm
\ No newline at end of file
+    --lm_path=$model_dir/avg_1.jit.klm
--- a/speechx/examples/feat/feature-mfcc-test.cc
+++ b/speechx/examples/feat/feature-mfcc-test.cc
@@ -41,7 +41,6 @@

 using namespace kaldi;

-
 static void UnitTestReadWave() {
    std::cout << "=== UnitTestReadWave() ===\n";


--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -25,6 +25,8 @@
 #include "kaldi/util/kaldi-io.h"
 #include "kaldi/util/table-types.h"

+#include <glog/logging.h>
+
 DEFINE_string(wav_rspecifier, "", "test wav scp path");
 DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
 DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
@@ -149,7 +151,7 @@ void WriteMatrix() {
        cmvn_stats(1, idx) = variance_[idx];
    }
    cmvn_stats(0, mean_.size()) = count_;
-    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);
+    kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false);
 }

 int main(int argc, char* argv[]) {
@@ -161,43 +163,56 @@ int main(int argc, char* argv[]) {
    kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
    WriteMatrix();

-    // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
-    // window -->linear_spectrogram --> cmvn
+
    int32 num_done = 0, num_err = 0;
+
+    // feature pipeline: wave cache --> decibel_normalizer --> hanning
+    // window -->linear_spectrogram --> global cmvn -> feat cache
+
    // std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(new
    // ppspeech::RawDataCache());
    std::unique_ptr<ppspeech::FeatureExtractorInterface> data_source(
        new ppspeech::RawAudioCache());

+    ppspeech::DecibelNormalizerOptions db_norm_opt;
+    std::unique_ptr<ppspeech::FeatureExtractorInterface> db_norm(
+        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+
    ppspeech::LinearSpectrogramOptions opt;
    opt.frame_opts.frame_length_ms = 20;
    opt.frame_opts.frame_shift_ms = 10;
-    ppspeech::DecibelNormalizerOptions db_norm_opt;
-    std::unique_ptr<ppspeech::FeatureExtractorInterface> base_feature_extractor(
-        new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+    LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
+    LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;

    std::unique_ptr<ppspeech::FeatureExtractorInterface> linear_spectrogram(
-        new ppspeech::LinearSpectrogram(opt,
-                                        std::move(base_feature_extractor)));
+        new ppspeech::LinearSpectrogram(opt, std::move(db_norm)));

    std::unique_ptr<ppspeech::FeatureExtractorInterface> cmvn(
        new ppspeech::CMVN(FLAGS_cmvn_write_path,
                           std::move(linear_spectrogram)));

    ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn));
+    LOG(INFO) << "feat dim: " << feature_cache.Dim();

-    float streaming_chunk = 0.36;
    int sample_rate = 16000;
+    float streaming_chunk = 0.36;
    int chunk_sample_size = streaming_chunk * sample_rate;
+    LOG(INFO) << "sr: " << sample_rate;
+    LOG(INFO) << "chunk size (s): " << streaming_chunk;
+    LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
+

    for (; !wav_reader.Done(); wav_reader.Next()) {
        std::string utt = wav_reader.Key();
        const kaldi::WaveData& wave_data = wav_reader.Value();
+        LOG(INFO) << "process utt: " << utt;

        int32 this_channel = 0;
        kaldi::SubVector<kaldi::BaseFloat> waveform(wave_data.Data(),
                                                    this_channel);
        int tot_samples = waveform.Dim();
+        LOG(INFO) << "wav len (sample): " << tot_samples;
+
        int sample_offset = 0;
        std::vector<kaldi::Vector<BaseFloat>> feats;
        int feature_rows = 0;
@@ -209,6 +224,7 @@ int main(int argc, char* argv[]) {
            for (int i = 0; i < cur_chunk_size; ++i) {
                wav_chunk(i) = waveform(sample_offset + i);
            }
+
            kaldi::Vector<BaseFloat> features;
            feature_cache.Accept(wav_chunk);
            if (cur_chunk_size < chunk_sample_size) {

--- a/speechx/examples/feat/run.sh
+++ b/speechx/examples/feat/run.sh
@@ -25,6 +25,7 @@ feat_wspecifier=./feats.ark
 cmvn=./cmvn.ark

 # 3. run feat
+export GLOG_logtostderr=1
 linear_spectrogram_main \
    --wav_rspecifier=scp:$model_dir/wav.scp \
    --feature_wspecifier=ark,t:$feat_wspecifier \

--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@@ -28,10 +28,10 @@ class FeatureCache : public FeatureExtractorInterface {
    // Feed feats or waves
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);

-    // feats dim = num_frames * feature_dim
+    // feats size = num_frames * feat_dim
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

-    // feature cache only cache feature which from base extractor
+    // feat dim
    virtual size_t Dim() const { return base_extractor_->Dim(); }

    virtual void SetFinished() {

--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/raw_audio.h
@@ -68,9 +68,10 @@ class RawDataCache : public FeatureExtractorInterface {
        data_.Resize(0);
        return true;
    }
-    virtual size_t Dim() const { return dim_; }
+
    virtual void SetFinished() { finished_ = true; }
    virtual bool IsFinished() const { return finished_; }
+    virtual size_t Dim() const { return dim_; }
    void SetDim(int32 dim) { dim_ = dim; }
    virtual void Reset() { finished_ = true; }