PaddlePaddle / DeepSpeech
Commit 87ef68f1 (unverified)
Authored by Hui Zhang on Apr 24, 2022; committed via GitHub on Apr 24, 2022
Merge pull request #1769 from zh794390558/nnet
[speechx] set nnet param by flags
Parents: 312fc4e1, d4ffa161
Showing 10 changed files with 67 additions and 38 deletions (+67, -38)
speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc  +6 -2
speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc  +6 -2
speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc  +2 -2
speechx/speechx/common/CMakeLists.txt  +0 -0
speechx/speechx/decoder/param.h  +7 -2
speechx/speechx/frontend/audio/audio_cache.h  +2 -2
speechx/speechx/frontend/audio/feature_cache.cc  +8 -3
speechx/speechx/frontend/audio/mfcc.cc  +1 -1
speechx/speechx/nnet/paddle_nnet.cc  +19 -7
speechx/speechx/nnet/paddle_nnet.h  +16 -17
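The thread running through these changes: nnet parameters (model and parameter paths, input/output names, cache names, cache shapes) are now supplied through gflags and copied into ppspeech::ModelOptions at startup, instead of being hard-coded as ModelOptions defaults. Below is a minimal, self-contained sketch of that pattern, assuming the gflags library; the trimmed ModelOptions stand-in and the main() are illustrative only, while the cache flag names and defaults follow the diff.

#include <iostream>
#include <string>

#include "gflags/gflags.h"

// Cache flag names and defaults follow the diff; the model_path/param_path
// defaults here are placeholders.
DEFINE_string(model_path, "", "nnet model path");
DEFINE_string(param_path, "", "nnet parameter path");
DEFINE_string(model_cache_names,
              "chunk_state_h_box,chunk_state_c_box",
              "model cache names");
DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");

// Trimmed stand-in for ppspeech::ModelOptions (illustrative only).
struct ModelOptions {
    std::string model_path;
    std::string param_path;
    std::string cache_names;
    std::string cache_shape;
};

int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, false);

    // Same wiring as InitModelOptions() in speechx/speechx/decoder/param.h:
    // every option comes from a flag, so example binaries can override it
    // on the command line instead of editing defaults in a header.
    ModelOptions opts;
    opts.model_path = FLAGS_model_path;
    opts.param_path = FLAGS_param_path;
    opts.cache_names = FLAGS_model_cache_names;
    opts.cache_shape = FLAGS_model_cache_shapes;

    std::cout << "cache names:  " << opts.cache_names << "\n"
              << "cache shapes: " << opts.cache_shape << std::endl;
    return 0;
}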
speechx/examples/ds2_ol/decoder/ctc-prefix-beam-search-decoder-ol.cc

@@ -41,7 +41,10 @@ DEFINE_string(
 DEFINE_string(model_output_names,
               "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
               "model output names");
-DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
+DEFINE_string(model_cache_names,
+              "chunk_state_h_box,chunk_state_c_box",
+              "model cache names");
+DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
 using kaldi::BaseFloat;
 using kaldi::Matrix;

@@ -77,7 +80,8 @@ int main(int argc, char* argv[]) {
     ppspeech::ModelOptions model_opts;
     model_opts.model_path = model_path;
     model_opts.param_path = model_params;
-    model_opts.cache_shape = FLAGS_model_cache_names;
+    model_opts.cache_names = FLAGS_model_cache_names;
+    model_opts.cache_shape = FLAGS_model_cache_shapes;
     model_opts.input_names = FLAGS_model_input_names;
     model_opts.output_names = FLAGS_model_output_names;
     std::shared_ptr<ppspeech::PaddleNnet> nnet(
speechx/examples/ds2_ol/decoder/wfst-decoder-ol.cc

@@ -44,7 +44,10 @@ DEFINE_string(
 DEFINE_string(model_output_names,
               "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
               "model output names");
-DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
+DEFINE_string(model_cache_names,
+              "chunk_state_h_box,chunk_state_c_box",
+              "model cache names");
+DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
 using kaldi::BaseFloat;
 using kaldi::Matrix;

@@ -80,7 +83,8 @@ int main(int argc, char* argv[]) {
     ppspeech::ModelOptions model_opts;
     model_opts.model_path = model_graph;
     model_opts.param_path = model_params;
-    model_opts.cache_shape = FLAGS_model_cache_names;
+    model_opts.cache_names = FLAGS_model_cache_names;
+    model_opts.cache_shape = FLAGS_model_cache_shapes;
     model_opts.input_names = FLAGS_model_input_names;
     model_opts.output_names = FLAGS_model_output_names;
     std::shared_ptr<ppspeech::PaddleNnet> nnet(
speechx/examples/ds2_ol/feat/linear-spectrogram-wo-db-norm-ol.cc

@@ -42,8 +42,8 @@ int main(int argc, char* argv[]) {
     int32 num_done = 0, num_err = 0;
-    // feature pipeline: wave cache --> hanning
-    // window -->linear_spectrogram --> global cmvn -> feat cache
+    // feature pipeline: wave cache --> hanning window
+    // -->linear_spectrogram --> global cmvn -> feat cache
     std::unique_ptr<ppspeech::FrontendInterface> data_source(
         new ppspeech::AudioCache(3600 * 1600, true));
speechx/speechx/common/CMakeLists.txt

deleted file (empty), mode 100644 → 0
speechx/speechx/decoder/param.h

@@ -43,7 +43,10 @@ DEFINE_string(
 DEFINE_string(model_output_names,
               "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0",
               "model output names");
-DEFINE_string(model_cache_names, "5-1-1024,5-1-1024", "model cache names");
+DEFINE_string(model_cache_names,
+              "chunk_state_h_box,chunk_state_c_box",
+              "model cache names");
+DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes");
 namespace ppspeech {

@@ -70,7 +73,9 @@ ModelOptions InitModelOptions() {
     ModelOptions model_opts;
     model_opts.model_path = FLAGS_model_path;
     model_opts.param_path = FLAGS_param_path;
-    model_opts.cache_shape = FLAGS_model_cache_names;
+    model_opts.cache_names = FLAGS_model_cache_names;
+    model_opts.cache_shape = FLAGS_model_cache_shapes;
     model_opts.input_names = FLAGS_model_input_names;
     model_opts.output_names = FLAGS_model_output_names;
     return model_opts;
 }
speechx/speechx/frontend/audio/audio_cache.h

@@ -24,7 +24,7 @@ namespace ppspeech {
 class AudioCache : public FrontendInterface {
   public:
     explicit AudioCache(int buffer_size = 1000 * kint16max,
-                        bool to_float32 = true);
+                        bool to_float32 = false);
     virtual void Accept(const kaldi::VectorBase<BaseFloat>& waves);

@@ -58,7 +58,7 @@ class AudioCache : public FrontendInterface {
     std::mutex mutex_;
     std::condition_variable ready_feed_condition_;
     kaldi::int32 timeout_;  // millisecond
-    bool to_float32_;
+    bool to_float32_;  // int16 -> float32. used in linear_spectrogram
     DISALLOW_COPY_AND_ASSIGN(AudioCache);
 };
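Note that the AudioCache default for to_float32 flips from true to false, so int16-to-float32 conversion becomes opt-in; the linear-spectrogram example above now passes true explicitly for that reason. A small self-contained sketch of the effect follows, using a stand-in class rather than the real ppspeech::AudioCache; only the constructor signature mirrors this header.

#include <iostream>

namespace sketch {
constexpr int kint16max = 32767;

class AudioCache {
  public:
    explicit AudioCache(int buffer_size = 1000 * kint16max,
                        bool to_float32 = false)  // new default from this diff
        : buffer_size_(buffer_size), to_float32_(to_float32) {}
    int buffer_size() const { return buffer_size_; }
    bool to_float32() const { return to_float32_; }

  private:
    int buffer_size_;
    bool to_float32_;  // int16 -> float32 conversion requested by the caller
};
}  // namespace sketch

int main() {
    // Callers that want raw int16-scaled samples just take the default.
    sketch::AudioCache default_source;
    // Callers that need float32 (e.g. the linear-spectrogram example above)
    // must now opt in explicitly.
    sketch::AudioCache spectrogram_source(3600 * 1600, true);
    std::cout << default_source.to_float32() << " "
              << spectrogram_source.to_float32() << std::endl;  // prints: 0 1
    return 0;
}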
speechx/speechx/frontend/audio/feature_cache.cc

@@ -58,6 +58,8 @@ bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
         usleep(100);  // sleep 0.1 ms
     }
     if (cache_.empty()) return false;
+    // read from cache
     feats->Resize(cache_.front().Dim());
     feats->CopyFromVec(cache_.front());
     cache_.pop();

@@ -74,15 +76,16 @@ bool FeatureCache::Compute() {
     // join with remained
     int32 joint_len = feature.Dim() + remained_feature_.Dim();
-    int32 num_chunk =
-        ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
     Vector<BaseFloat> joint_feature(joint_len);
     joint_feature.Range(0, remained_feature_.Dim())
         .CopyFromVec(remained_feature_);
     joint_feature.Range(remained_feature_.Dim(), feature.Dim())
         .CopyFromVec(feature);
+    // one by one, or stride with window
+    // controlled by frame_chunk_stride_ and frame_chunk_size_
+    int32 num_chunk =
+        ((joint_len / dim_) - frame_chunk_size_) / frame_chunk_stride_ + 1;
     for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
         int32 start = chunk_idx * frame_chunk_stride_ * dim_;

@@ -101,6 +104,8 @@ bool FeatureCache::Compute() {
         cache_.push(feature_chunk);
         ready_read_condition_.notify_one();
     }
+    // cache remained feats
     int32 remained_feature_len =
         joint_len - num_chunk * frame_chunk_stride_ * dim_;
     remained_feature_.Resize(remained_feature_len);
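With num_chunk computed after the features are joined, the chunking logic reads in one pass: chunks of frame_chunk_size_ frames are taken every frame_chunk_stride_ frames from the joined buffer, and the tail past the last full stride is carried into remained_feature_ for the next call. The following self-contained check walks through that arithmetic with illustrative values (not the real defaults):

#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
    // Illustrative values, not the speechx defaults.
    const int32_t dim = 80;                // feature dimension per frame
    const int32_t frame_chunk_size = 5;    // frames in each emitted chunk
    const int32_t frame_chunk_stride = 3;  // frames between chunk starts
    const int32_t joint_len = 14 * dim;    // joined buffer: 14 frames

    // Same formula the diff moves below the join: how many full chunks fit.
    const int32_t num_chunk =
        ((joint_len / dim) - frame_chunk_size) / frame_chunk_stride + 1;
    assert(num_chunk == 4);  // chunks start at frames 0, 3, 6, 9

    // Tail past the last full stride is kept as "remained" features.
    const int32_t remained_len =
        joint_len - num_chunk * frame_chunk_stride * dim;
    assert(remained_len == 2 * dim);  // frames 12 and 13 carry over

    std::cout << "num_chunk=" << num_chunk
              << " remained_frames=" << remained_len / dim << std::endl;
    return 0;
}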
speechx/speechx/frontend/audio/mfcc.cc

@@ -30,7 +30,7 @@ using kaldi::Matrix;
 using std::vector;
 Mfcc::Mfcc(const MfccOptions& opts,
            std::unique_ptr<FrontendInterface> base_extractor)
     : opts_(opts),
       computer_(opts.mfcc_opts),
       window_function_(computer_.GetFrameOptions()) {
speechx/speechx/nnet/paddle_nnet.cc

@@ -74,6 +74,7 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) {
     LOG(INFO) << "output names: " << opts.output_names;
     vector<string> input_names_vec = absl::StrSplit(opts.input_names, ",");
     vector<string> output_names_vec = absl::StrSplit(opts.output_names, ",");
     paddle_infer::Predictor* predictor = GetPredictor();
     std::vector<std::string> model_input_names = predictor->GetInputNames();

@@ -87,6 +88,7 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) {
     for (size_t i = 0; i < output_names_vec.size(); i++) {
         assert(output_names_vec[i] == model_output_names[i]);
     }
     ReleasePredictor(predictor);
     InitCacheEncouts(opts);
 }

@@ -95,6 +97,7 @@ void PaddleNnet::Reset() { InitCacheEncouts(opts_); }
 paddle_infer::Predictor* PaddleNnet::GetPredictor() {
     paddle_infer::Predictor* predictor = nullptr;
     std::lock_guard<std::mutex> guard(pool_mutex);
     int pred_id = 0;

@@ -144,15 +147,19 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
                              Vector<BaseFloat>* inferences,
                              int32* inference_dim) {
     paddle_infer::Predictor* predictor = GetPredictor();
     int feat_row = features.Dim() / feature_dim;
     std::vector<std::string> input_names = predictor->GetInputNames();
     std::vector<std::string> output_names = predictor->GetOutputNames();
+    // feed inputs
     std::unique_ptr<paddle_infer::Tensor> input_tensor =
         predictor->GetInputHandle(input_names[0]);
     std::vector<int> INPUT_SHAPE = {1, feat_row, feature_dim};
     input_tensor->Reshape(INPUT_SHAPE);
     input_tensor->CopyFromCpu(features.Data());
     std::unique_ptr<paddle_infer::Tensor> input_len =
         predictor->GetInputHandle(input_names[1]);
     std::vector<int> input_len_size = {1};

@@ -161,32 +168,36 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
     audio_len.push_back(feat_row);
     input_len->CopyFromCpu(audio_len.data());
-    std::unique_ptr<paddle_infer::Tensor> h_box =
+    std::unique_ptr<paddle_infer::Tensor> state_h =
         predictor->GetInputHandle(input_names[2]);
     shared_ptr<Tensor<BaseFloat>> h_cache = GetCacheEncoder(input_names[2]);
-    h_box->Reshape(h_cache->get_shape());
-    h_box->CopyFromCpu(h_cache->get_data().data());
+    state_h->Reshape(h_cache->get_shape());
+    state_h->CopyFromCpu(h_cache->get_data().data());
-    std::unique_ptr<paddle_infer::Tensor> c_box =
+    std::unique_ptr<paddle_infer::Tensor> state_c =
         predictor->GetInputHandle(input_names[3]);
     shared_ptr<Tensor<float>> c_cache = GetCacheEncoder(input_names[3]);
-    c_box->Reshape(c_cache->get_shape());
-    c_box->CopyFromCpu(c_cache->get_data().data());
+    state_c->Reshape(c_cache->get_shape());
+    state_c->CopyFromCpu(c_cache->get_data().data());
+    // forward
     bool success = predictor->Run();
     if (success == false) {
         LOG(INFO) << "predictor run occurs error";
     }
+    // fetch outpus
     std::unique_ptr<paddle_infer::Tensor> h_out =
         predictor->GetOutputHandle(output_names[2]);
     assert(h_cache->get_shape() == h_out->shape());
     h_out->CopyToCpu(h_cache->get_data().data());
     std::unique_ptr<paddle_infer::Tensor> c_out =
         predictor->GetOutputHandle(output_names[3]);
     assert(c_cache->get_shape() == c_out->shape());
     c_out->CopyToCpu(c_cache->get_data().data());
+    // get result
     std::unique_ptr<paddle_infer::Tensor> output_tensor =
         predictor->GetOutputHandle(output_names[0]);
     std::vector<int> output_shape = output_tensor->shape();

@@ -195,6 +206,7 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
     inferences->Resize(row * col);
     *inference_dim = col;
     output_tensor->CopyToCpu(inferences->Data());
     ReleasePredictor(predictor);
 }
speechx/speechx/nnet/paddle_nnet.h

@@ -24,7 +24,7 @@ namespace ppspeech {
 struct ModelOptions {
     std::string model_path;
     std::string param_path;
-    int thread_num;
+    int thread_num;  // predictor thread pool size
     bool use_gpu;
     bool switch_ir_optim;
     std::string input_names;

@@ -34,19 +34,14 @@ struct ModelOptions {
     bool enable_fc_padding;
     bool enable_profile;
     ModelOptions()
-        : model_path("avg_1.jit.pdmodel"),
-          param_path("avg_1.jit.pdiparams"),
+        : model_path(""),
+          param_path(""),
           thread_num(2),
           use_gpu(false),
-          input_names(
-              "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_"
-              "box"),
-          output_names(
-              "save_infer_model/scale_0.tmp_1,save_infer_model/"
-              "scale_1.tmp_1,save_infer_model/scale_2.tmp_1,save_infer_model/"
-              "scale_3.tmp_1"),
-          cache_names("chunk_state_h_box,chunk_state_c_box"),
-          cache_shape("3-1-1024,3-1-1024"),
+          input_names(""),
+          output_names(""),
+          cache_names(""),
+          cache_shape(""),
           switch_ir_optim(false),
           enable_fc_padding(false),
           enable_profile(false) {}

@@ -76,17 +71,19 @@ class Tensor {
   public:
     Tensor() {}
     Tensor(const std::vector<int>& shape) : _shape(shape) {
-        int data_size = std::accumulate(
+        int neml = std::accumulate(
             _shape.begin(), _shape.end(), 1, std::multiplies<int>());
-        LOG(INFO) << "data size: " << data_size;
-        _data.resize(data_size, 0);
+        LOG(INFO) << "Tensor neml: " << neml;
+        _data.resize(neml, 0);
     }
     void reshape(const std::vector<int>& shape) {
         _shape = shape;
-        int data_size = std::accumulate(
+        int neml = std::accumulate(
             _shape.begin(), _shape.end(), 1, std::multiplies<int>());
-        _data.resize(data_size, 0);
+        _data.resize(neml, 0);
     }
     const std::vector<int>& get_shape() const { return _shape; }
     std::vector<T>& get_data() { return _data; }

@@ -98,10 +95,12 @@ class Tensor {
 class PaddleNnet : public NnetInterface {
   public:
     PaddleNnet(const ModelOptions& opts);
     virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
                              int32 feature_dim,
                              kaldi::Vector<kaldi::BaseFloat>* inferences,
                              int32* inference_dim);
     void Dim();
     virtual void Reset();
     std::shared_ptr<Tensor<kaldi::BaseFloat>> GetCacheEncoder(
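The Tensor helper above sizes its flat buffer as the product of the shape dimensions; this commit renames the local from data_size to neml (number of elements) and adjusts the log message accordingly. A self-contained illustration of that computation, using the 5-1-1024 shape from the cache-shape flag default:

#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    // One of the "5-1-1024" cache shapes from the model_cache_shapes flag.
    const std::vector<int> shape = {5, 1, 1024};

    // Number of elements = product of the dimensions, as in Tensor::reshape().
    const int neml = std::accumulate(
        shape.begin(), shape.end(), 1, std::multiplies<int>());

    std::vector<float> data(neml, 0.0f);  // zero-initialized flat storage
    std::cout << "Tensor neml: " << neml << std::endl;  // prints 5120
    return 0;
}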