提交 fc72ab1e 编写于 作者: H Hui Zhang

more debug info

上级 48271260
......@@ -20,4 +20,4 @@ fi
mkdir -p build
cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
cmake --build build
cmake --build build -j
......@@ -76,11 +76,15 @@ void CTCPrefixBeamSearch::AdvanceDecode(
// forward frame by frame
std::vector<kaldi::BaseFloat> frame_prob;
bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob);
if (flag == false) break;
if (flag == false) {
LOG(INFO) << "decoder advance decode exit." << frame_prob.size();
break;
}
std::vector<std::vector<kaldi::BaseFloat>> likelihood;
likelihood.push_back(frame_prob);
AdvanceDecoding(likelihood);
VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_;
}
}
......@@ -114,7 +118,11 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
std::vector<float> topk_score;
std::vector<int32_t> topk_index;
TopK(logp_t, first_beam_size, &topk_score, &topk_index);
VLOG(2) << "topk: " << num_frame_decoded_ << " " << *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0];
for (int i = 0; i < topk_score.size(); i++){
VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i];
}
// 2. token passing
for (int i = 0; i < topk_index.size(); ++i) {
int id = topk_index[i];
......@@ -295,7 +303,18 @@ void CTCPrefixBeamSearch::UpdateOutputs(
outputs_.emplace_back(output);
}
// NOTE(review): diff fragment — the next line is the pre-change (removed)
// one-line definition; the block after it is the post-change (added) version.
void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); }
// Finalize the beam search: apply the final context bias, then dump each
// hypothesis (length, CTC score, and every token id) at verbose level 2.
void CTCPrefixBeamSearch::FinalizeSearch() {
UpdateFinalContext();
VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_;
// NOTE(review): cnt is never incremented, so every hypothesis logs as
// "hyp 0" — presumably the loop index i was intended; confirm upstream.
int cnt = 0;
for (int i = 0; i < hypotheses_.size(); i ++){
// One summary line per hypothesis: token count and its CTC likelihood.
VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i];
for (int j = 0; j < hypotheses_[i].size(); j ++){
// One line per token id in the hypothesis.
VLOG(2) << hypotheses_[i][j];
}
}
}
void CTCPrefixBeamSearch::UpdateFinalContext() {
if (context_graph_ == nullptr) return;
......
......@@ -52,15 +52,21 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
Vector<BaseFloat> feature;
result = base_extractor_->Read(&feature);
if (result == false || feature.Dim() == 0) {
if (IsFinished() == false) return false;
break;
VLOG(1) << "result: " << result << "feature dim: " << feature.Dim();
if (IsFinished() == false) {
LOG(INFO) << "finished reading feature. cache size: " << feature_cache_.size();
return false;
} else {
LOG(INFO) << "break";
break;
}
}
CHECK(feature.Dim() == dim_);
feature_cache_.push(feature);
nframes_ += 1;
VLOG(1) << "nframes: " << nframes_;
feature_cache_.push(feature);
}
if (feature_cache_.size() < receptive_filed_length_) {
......@@ -68,8 +74,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
return false;
}
if (fill_zero_){
if (fill_zero_) {
while (feature_cache_.size() < frame_chunk_size_) {
Vector<BaseFloat> feature(dim_, kaldi::kSetZero);
nframes_ += 1;
......@@ -79,6 +84,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
int32 this_chunk_size = std::min(static_cast<int32>(feature_cache_.size()), frame_chunk_size_);
feats->Resize(dim_ * this_chunk_size);
VLOG(1) << "read " << this_chunk_size << " feat.";
int32 counter = 0;
while (counter < this_chunk_size) {
......@@ -97,6 +103,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
counter++;
}
CHECK(feature_cache_.size() == cache_size_ );
return result;
}
......
......@@ -41,12 +41,14 @@ class FeatureCache : public FrontendInterface {
// Returns the feature dimension this cache was configured with.
virtual size_t Dim() const { return dim_; }
// Marks the stream as finished: propagates the flag to the upstream
// extractor, then runs Compute() once more to flush the last chunk of data.
virtual void SetFinished() {
// NOTE(review): diff fragment — the two identical "set finished" logs are
// the removed (old-position) and added (new-position) lines of the commit.
LOG(INFO) << "set finished";
// std::unique_lock<std::mutex> lock(mutex_);
base_extractor_->SetFinished();
LOG(INFO) << "set finished";
// read the last chunk data
Compute();
// ready_feed_condition_.notify_one();
LOG(INFO) << "compute last feats done.";
}
// Finished iff the upstream extractor reports finished.
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
......
......@@ -36,8 +36,6 @@ void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
frames_ready_ += likelihood.NumRows();
}
// Decodable::Init(DecodableConfig config) {
//}
// Returns the number of frames whose likelihoods have been computed so far.
int32 Decodable::NumFramesReady() const { return frames_ready_; }
......@@ -70,9 +68,10 @@ bool Decodable::AdvanceChunk() {
Vector<BaseFloat> features;
if (frontend_ == NULL || frontend_->Read(&features) == false) {
// no feat or frontend_ not init.
VLOG(1) << "decodable exit;";
return false;
}
VLOG(2) << "Forward with " << features.Dim() << " frames.";
VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats.";
// forward feats
NnetOut out;
......@@ -80,6 +79,7 @@ bool Decodable::AdvanceChunk() {
int32& vocab_dim = out.vocab_dim;
Vector<BaseFloat>& logprobs = out.logprobs;
VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim << " decoder frames.";
// cache nnet outupts
nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim);
nnet_out_cache_.CopyRowsFromVec(logprobs);
......@@ -114,15 +114,20 @@ bool Decodable::AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
// Reads the scaled log-likelihood row for one frame out of the nnet output
// cache into *likelihood (resized to vocab size). Returns false when the
// frame has not been computed yet (e.g. no more input); true on success.
bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
if (EnsureFrameHaveComputed(frame) == false) {
LOG(INFO) << "framelikehood exit.";
return false;
}
// The cache holds rows [frame_offset_, frames_ready_); sanity-check that.
int nrows = nnet_out_cache_.NumRows();
CHECK(nrows == (frames_ready_ - frame_offset_));
int vocab_size = nnet_out_cache_.NumCols();
likelihood->resize(vocab_size);
for (int32 idx = 0; idx < vocab_size; ++idx) {
// Index into the cache relative to the cache's first frame, and apply
// the acoustic scale to each log-probability.
(*likelihood)[idx] =
nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_;
VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " << nnet_out_cache_.NumRows() << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx);
}
return true;
}
......
......@@ -440,6 +440,7 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
max_hyps_len = std::max(max_hyps_len, len);
hyps_len_ptr[i] = static_cast<int64_t>(len);
}
VLOG(2) << "max_hyps_len: " << max_hyps_len;
paddle::Tensor hyps_tensor =
paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64);
......@@ -625,8 +626,8 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
// combinded left-to-right and right-to-lfet score
(*rescoring_score)[i] =
score * (1 - reverse_weight) + r_score * reverse_weight;
VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score
<< " reverse_weight: " << reverse_weight;
VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score
<< " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i];
}
}
......
......@@ -52,7 +52,6 @@ void U2Recognizer::Reset() {
num_frames_ = 0;
result_.clear();
feature_pipeline_->Reset();
decodable_->Reset();
decoder_->Reset();
}
......@@ -62,7 +61,6 @@ void U2Recognizer::ResetContinuousDecoding() {
num_frames_ = 0;
result_.clear();
feature_pipeline_->Reset();
decodable_->Reset();
decoder_->Reset();
}
......@@ -192,10 +190,12 @@ void U2Recognizer::AttentionRescoring() {
// combine ctc score and rescoring score
for (size_t i = 0; i < num_hyps; i++) {
VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i]
<< " ctc_score: " << result_[i].score;
<< " ctc_score: " << result_[i].score << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight << " ctc_weight: " << opts_.decoder_opts.ctc_weight;
result_[i].score =
opts_.decoder_opts.rescoring_weight * rescoring_score[i] +
opts_.decoder_opts.ctc_weight * result_[i].score;
VLOG(1) << "hyp: " << result_[0].sentence << " score: " << result_[0].score;
}
std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc);
......
......@@ -62,6 +62,7 @@ int main(int argc, char* argv[]) {
LOG(INFO) << "wav len (sample): " << tot_samples;
int sample_offset = 0;
int cnt = 0;
while (sample_offset < tot_samples) {
int cur_chunk_size =
std::min(chunk_sample_size, tot_samples - sample_offset);
......@@ -77,12 +78,14 @@ int main(int argc, char* argv[]) {
recognizer.SetFinished();
}
recognizer.Decode();
LOG(INFO) << "Pratial result: " << recognizer.GetPartialResult();
LOG(INFO) << "Pratial result: " << cnt << " " << recognizer.GetPartialResult();
// no overlap
sample_offset += cur_chunk_size;
cnt++;
}
CHECK(sample_offset == tot_samples);
VLOG(1) << "num decode: " << cnt;
// recognizer.SetFinished();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册