add nnetout struct

cd1ced4e · Hui Zhang · 290c23b9 · cd1ced4e · cd1ced4e · cd1ced4e
9 changed files
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
 #!/bin/bash
-set +x
+set -x
 set -e
 . path.sh
@@ -11,7 +11,7 @@ stop_stage=100
 . utils/parse_options.sh
 # 1. compile
-if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+if [ ! -d ${SPEECHX_BUILD} ]; then
    pushd ${SPEECHX_ROOT} 
    bash build.sh
    popd

--- a/speechx/speechx/nnet/CMakeLists.txt
+++ b/speechx/speechx/nnet/CMakeLists.txt
@@ -14,7 +14,6 @@ target_link_libraries(nnet absl::strings)
 if(USING_U2)
  target_compile_options(nnet  PUBLIC ${PADDLE_COMPILE_FLAGS})
  target_include_directories(nnet  PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR})
-  # target_link_libraries(nnet  ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
 endif()

--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@@ -32,7 +32,7 @@ Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
 // for debug
 void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
-    nnet_cache_ = likelihood;
+    nnet_out_cache_ = likelihood;
    frames_ready_ += likelihood.NumRows();
 }
@@ -56,13 +56,13 @@ int32 Decodable::NumIndices() const { return 0; }
 int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; }
 BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) {
-    CHECK_LE(index, nnet_cache_.NumCols());
+    CHECK_LE(index, nnet_out_cache_.NumCols());
    CHECK_LE(frame, frames_ready_);
    int32 frame_idx = frame - frame_offset_;
    // the nnet output is prob rather than log prob
    // the ilabel `index` maps to nnet output id index - 1 (see TokenId2NnetId)
    return acoustic_scale_ *
-           std::log(nnet_cache_(frame_idx, TokenId2NnetId(index)) +
+           std::log(nnet_out_cache_(frame_idx, TokenId2NnetId(index)) +
                    std::numeric_limits<float>::min());
 }
@@ -82,17 +82,18 @@ bool Decodable::AdvanceChunk() {
    }
    // forward feats
-    int32 vocab_dim = 0;
+    NnetOut out;
-    Vector<BaseFloat> probs;
+    nnet_->FeedForward(features, frontend_->Dim(), &out);
-    nnet_->FeedForward(features, frontend_->Dim(), &probs, &vocab_dim);
+    int32& vocab_dim = out.vocab_dim;
+    Vector<BaseFloat>& probs = out.logprobs;
    // cache nnet outputs
-    nnet_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim);
+    nnet_out_cache_.Resize(probs.Dim() / vocab_dim, vocab_dim);
-    nnet_cache_.CopyRowsFromVec(probs);
+    nnet_out_cache_.CopyRowsFromVec(probs);
    // update state
    frame_offset_ = frames_ready_;
-    frames_ready_ += nnet_cache_.NumRows();
+    frames_ready_ += nnet_out_cache_.NumRows();
    return true;
 }
@@ -102,12 +103,12 @@ bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
        return false;
    }
-    int vocab_size = nnet_cache_.NumCols();
+    int vocab_size = nnet_out_cache_.NumCols();
    likelihood->resize(vocab_size);
    for (int32 idx = 0; idx < vocab_size; ++idx) {
        (*likelihood)[idx] =
-            nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_;
+            nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_;
    }
    return true;
 }
@@ -117,7 +118,7 @@ void Decodable::Reset() {
    if (nnet_ != nullptr) nnet_->Reset();
    frame_offset_ = 0;
    frames_ready_ = 0;
-    nnet_cache_.Resize(0, 0);
+    nnet_out_cache_.Resize(0, 0);
 }
 }  // namespace ppspeech
\ No newline at end of file
--- a/speechx/speechx/nnet/decodable.h
+++ b/speechx/speechx/nnet/decodable.h
@@ -62,7 +62,7 @@ class Decodable : public kaldi::DecodableInterface {
    std::shared_ptr<NnetInterface> nnet_;
    // nnet outputs' cache
-    kaldi::Matrix<kaldi::BaseFloat> nnet_cache_;
+    kaldi::Matrix<kaldi::BaseFloat> nnet_out_cache_;
    // the frame is nnet prob frame rather than audio feature frame
    // nnet frame subsample the feature frame

--- a/speechx/speechx/nnet/ds2_nnet.cc
+++ b/speechx/speechx/nnet/ds2_nnet.cc
@@ -143,9 +143,8 @@ shared_ptr<Tensor<BaseFloat>> PaddleNnet::GetCacheEncoder(const string& name) {
 }
 void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
-                             int32 feature_dim,
+                             const int32& feature_dim,
-                             Vector<BaseFloat>* inferences,
+                             NnetOut* out) {
-                             int32* inference_dim) {
    paddle_infer::Predictor* predictor = GetPredictor();
    int feat_row = features.Dim() / feature_dim;
@@ -203,9 +202,13 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
    std::vector<int> output_shape = output_tensor->shape();
    int32 row = output_shape[1];
    int32 col = output_shape[2];
-    inferences->Resize(row * col);
-    *inference_dim = col;
-    output_tensor->CopyToCpu(inferences->Data());
+    // inferences->Resize(row * col);
+    // *inference_dim = col;
+    out->logprobs.Resize(row*col);
+    out->vocab_dim = col;
+    output_tensor->CopyToCpu(out->logprobs.Data());
    ReleasePredictor(predictor);
 }

--- a/speechx/speechx/nnet/ds2_nnet.h
+++ b/speechx/speechx/nnet/ds2_nnet.h
@@ -97,9 +97,8 @@ class PaddleNnet : public NnetInterface {
    PaddleNnet(const ModelOptions& opts);
    virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
-                             int32 feature_dim,
+                             const int32& feature_dim,
-                             kaldi::Vector<kaldi::BaseFloat>* inferences,
+                             NnetOut* out);
-                             int32* inference_dim);
    void Dim();
    virtual void Reset();

--- a/speechx/speechx/nnet/nnet_itf.h
+++ b/speechx/speechx/nnet/nnet_itf.h
@@ -21,12 +21,23 @@
 namespace ppspeech {
+struct NnetOut{
+  // nnet out, maybe logprob or prob
+  kaldi::Vector<kaldi::BaseFloat> logprobs;
+  int32 vocab_dim;
+  // nnet state. Only used in the Attention model.
+  std::vector<std::vector<kaldi::BaseFloat>> encoder_outs;
+  NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} 
+};
 class NnetInterface {
  public:
    virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
-                             int32 feature_dim,
+                             const int32& feature_dim,
-                             kaldi::Vector<kaldi::BaseFloat>* inferences,
+                             NnetOut* out) = 0;
-                             int32* inference_dim) = 0;
    virtual void Reset() = 0;
    virtual ~NnetInterface() {}
 };

--- a/speechx/speechx/nnet/u2_nnet.cc
+++ b/speechx/speechx/nnet/u2_nnet.cc
@@ -64,7 +64,7 @@ void U2NnetBase::CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
 void U2NnetBase::ForwardEncoderChunk(
    const std::vector<kaldi::BaseFloat>& chunk_feats,
-    int32 feat_dim,
+    const int32& feat_dim,
    std::vector<kaldi::BaseFloat>* ctc_probs,
    int32* vocab_dim) {
    ctc_probs->clear();
@@ -221,16 +221,17 @@ void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) {
 void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
-                         int32 feature_dim,
+                         const int32& feature_dim,
-                         kaldi::Vector<BaseFloat>* inferences,
+                         NnetOut* out) {
-                         int32* inference_dim) {
    std::vector<kaldi::BaseFloat> chunk_feats(features.Data(),
                                              features.Data() + features.Dim());
    std::vector<kaldi::BaseFloat> ctc_probs;
    ForwardEncoderChunkImpl(
-        chunk_feats, feature_dim, &ctc_probs, inference_dim);
+        chunk_feats, feature_dim, &ctc_probs, &out->vocab_dim);
-    inferences->Resize(ctc_probs.size(), kaldi::kSetZero);
-    std::memcpy(inferences->Data(),
+    out->logprobs.Resize(ctc_probs.size(), kaldi::kSetZero);
+    std::memcpy(out->logprobs.Data(),
                ctc_probs.data(),
                ctc_probs.size() * sizeof(kaldi::BaseFloat));
 }
@@ -238,9 +239,10 @@ void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
 void U2Nnet::ForwardEncoderChunkImpl(
    const std::vector<kaldi::BaseFloat>& chunk_feats,
-    int32 feat_dim,
+    const int32& feat_dim,
    std::vector<kaldi::BaseFloat>* out_prob,
    int32* vocab_dim) {
 #ifdef USE_PROFILING
    RecordEvent event(
        "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1);

--- a/speechx/speechx/nnet/u2_nnet.h
+++ b/speechx/speechx/nnet/u2_nnet.h
@@ -61,7 +61,7 @@ class U2NnetBase : public NnetInterface {
    virtual void ForwardEncoderChunk(
        const std::vector<kaldi::BaseFloat>& chunk_feats,
-        int32 feat_dim,
+        const int32& feat_dim,
        std::vector<kaldi::BaseFloat>* ctc_probs,
        int32* vocab_dim);
@@ -72,7 +72,7 @@ class U2NnetBase : public NnetInterface {
  protected:
    virtual void ForwardEncoderChunkImpl(
        const std::vector<kaldi::BaseFloat>& chunk_feats,
-        int32 feat_dim,
+        const int32& feat_dim,
        std::vector<kaldi::BaseFloat>* ctc_probs,
        int32* vocab_dim) = 0;
@@ -93,7 +93,7 @@ class U2NnetBase : public NnetInterface {
                          // case. Otherwise, none streaming case
    int num_left_chunks_{-1};  // -1 means all left chunks
-    // asr decoder state
+    // asr decoder state, not used in nnet
    int offset_{0};  // current offset in encoder output time stamp. Used by
                     // position embedding.
    std::vector<std::vector<float>> cached_feats_{};  // features cache
@@ -106,9 +106,8 @@ class U2Nnet : public U2NnetBase {
    U2Nnet(const U2Nnet& other);
    void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
-                     int32 feature_dim,
+                     const int32& feature_dim,
-                     kaldi::Vector<kaldi::BaseFloat>* inferences,
+                     NnetOut* out) override;
-                     int32* inference_dim) override;
    void Reset() override;
@@ -123,7 +122,7 @@ class U2Nnet : public U2NnetBase {
    void ForwardEncoderChunkImpl(
        const std::vector<kaldi::BaseFloat>& chunk_feats,
-        int32 feat_dim,
+        const int32& feat_dim,
        std::vector<kaldi::BaseFloat>* ctc_probs,
        int32* vocab_dim) override;
@@ -138,6 +137,8 @@ class U2Nnet : public U2NnetBase {
    // debug
    void FeedEncoderOuts(paddle::Tensor& encoder_out);
+    const std::vector<paddle::Tensor>& EncoderOuts() const {return encoder_outs_; }
  private:
    U2ModelOptions opts_;