Commit fc72ab1e authored by Hui Zhang

more debug info

Parent 48271260
@@ -20,4 +20,4 @@ fi
 mkdir -p build
 cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
-cmake --build build
+cmake --build build -j
@@ -76,11 +76,15 @@ void CTCPrefixBeamSearch::AdvanceDecode(
         // forward frame by frame
         std::vector<kaldi::BaseFloat> frame_prob;
         bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob);
-        if (flag == false) break;
+        if (flag == false) {
+            LOG(INFO) << "decoder advance decode exit." << frame_prob.size();
+            break;
+        }
 
         std::vector<std::vector<kaldi::BaseFloat>> likelihood;
         likelihood.push_back(frame_prob);
         AdvanceDecoding(likelihood);
+        VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_;
     }
 }
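Nearly every line this commit adds is a glog statement, so it helps to recall how glog gates them (standard glog behavior, not specific to this repo): LOG(INFO) is always emitted, while VLOG(n) is emitted only when the active verbosity level is at least n, set via the --v flag or FLAGS_v. A minimal standalone sketch:

    #include <glog/logging.h>

    int main(int argc, char* argv[]) {
        google::InitGoogleLogging(argv[0]);
        FLAGS_logtostderr = true;  // log to stderr instead of files
        FLAGS_v = 2;               // equivalent to passing --v=2

        LOG(INFO) << "always emitted";
        VLOG(1) << "emitted: FLAGS_v >= 1";
        VLOG(2) << "emitted: FLAGS_v >= 2";
        VLOG(4) << "suppressed: FLAGS_v < 4";
        return 0;
    }

This is why the new VLOG(2) and VLOG(4) lines below cost nothing in a default run: they only fire when a high enough --v is requested.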
@@ -114,7 +118,11 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
         std::vector<float> topk_score;
         std::vector<int32_t> topk_index;
         TopK(logp_t, first_beam_size, &topk_score, &topk_index);
+        VLOG(2) << "topk: " << num_frame_decoded_ << " " << *std::max_element(logp_t.begin(), logp_t.end()) << " " << topk_score[0];
+        for (int i = 0; i < topk_score.size(); i++) {
+            VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i];
+        }
 
         // 2. token passing
         for (int i = 0; i < topk_index.size(); ++i) {
             int id = topk_index[i];
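TopK itself is not part of this diff. The logging above assumes a contract like the following hypothetical stand-in: topk_score comes back ordered best-first (so topk_score[0] should match the max element of logp_t), with topk_index holding the matching vocabulary ids. A sketch under that assumption:

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // Hypothetical stand-in for the TopK helper called above: pick the k
    // largest scores together with their indices, ordered best-first, so that
    // topk_score[0] equals *std::max_element(scores.begin(), scores.end()).
    void TopK(const std::vector<float>& scores, int k,
              std::vector<float>* topk_score, std::vector<int32_t>* topk_index) {
        std::vector<int32_t> idx(scores.size());
        std::iota(idx.begin(), idx.end(), 0);
        k = std::min<int>(k, static_cast<int>(scores.size()));
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [&](int32_t a, int32_t b) { return scores[a] > scores[b]; });
        topk_index->assign(idx.begin(), idx.begin() + k);
        topk_score->clear();
        for (int32_t i : *topk_index) topk_score->push_back(scores[i]);
    }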
@@ -295,7 +303,18 @@ void CTCPrefixBeamSearch::UpdateOutputs(
     outputs_.emplace_back(output);
 }
 
-void CTCPrefixBeamSearch::FinalizeSearch() { UpdateFinalContext(); }
+void CTCPrefixBeamSearch::FinalizeSearch() {
+    UpdateFinalContext();
+
+    VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_;
+    // dump every kept hypothesis with its length and ctc score
+    for (int i = 0; i < hypotheses_.size(); i++) {
+        VLOG(2) << "hyp " << i << " len: " << hypotheses_[i].size() << " ctc score: " << likelihood_[i];
+        for (int j = 0; j < hypotheses_[i].size(); j++) {
+            VLOG(2) << hypotheses_[i][j];
+        }
+    }
+}
 
 void CTCPrefixBeamSearch::UpdateFinalContext() {
     if (context_graph_ == nullptr) return;
......
@@ -52,15 +52,21 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
         Vector<BaseFloat> feature;
         result = base_extractor_->Read(&feature);
         if (result == false || feature.Dim() == 0) {
-            if (IsFinished() == false) return false;
-            break;
+            VLOG(1) << "result: " << result << " feature dim: " << feature.Dim();
+            if (IsFinished() == false) {
+                LOG(INFO) << "not finished reading feature. cache size: " << feature_cache_.size();
+                return false;
+            } else {
+                LOG(INFO) << "break";
+                break;
+            }
         }
 
         CHECK(feature.Dim() == dim_);
-        feature_cache_.push(feature);
         nframes_ += 1;
         VLOG(1) << "nframes: " << nframes_;
+        feature_cache_.push(feature);
     }
 
     if (feature_cache_.size() < receptive_filed_length_) {
@@ -68,8 +74,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
         return false;
     }
 
-    if (fill_zero_){
+    if (fill_zero_) {
         while (feature_cache_.size() < frame_chunk_size_) {
             Vector<BaseFloat> feature(dim_, kaldi::kSetZero);
             nframes_ += 1;
@@ -79,6 +84,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
     int32 this_chunk_size = std::min(static_cast<int32>(feature_cache_.size()), frame_chunk_size_);
     feats->Resize(dim_ * this_chunk_size);
+    VLOG(1) << "read " << this_chunk_size << " feat.";
 
     int32 counter = 0;
     while (counter < this_chunk_size) {
@@ -97,6 +103,7 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
         counter++;
     }
+    CHECK(feature_cache_.size() == cache_size_);
 
     return result;
 }
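For orientation, the assembler's job across these hunks is: pull frames from the base extractor into feature_cache_, optionally zero-pad a short tail chunk when fill_zero_ is set, then flatten this_chunk_size frames of dim_ floats into the output buffer. A simplified sketch of that last step, with plain std::vector<float> standing in for kaldi::Vector<BaseFloat> and the receptive-field / cache_size_ bookkeeping omitted (AssembleChunk is a hypothetical name, not the repo's API):

    #include <algorithm>
    #include <queue>
    #include <vector>

    // Simplified, hypothetical sketch of the chunk-assembly step above.
    std::vector<float> AssembleChunk(std::queue<std::vector<float>>* cache, int dim,
                                     int frame_chunk_size, bool fill_zero) {
        if (fill_zero) {
            // pad a short tail chunk with all-zero frames up to the chunk size
            while (static_cast<int>(cache->size()) < frame_chunk_size) {
                cache->push(std::vector<float>(dim, 0.0f));
            }
        }
        int this_chunk_size =
            std::min(static_cast<int>(cache->size()), frame_chunk_size);
        std::vector<float> feats;
        feats.reserve(dim * this_chunk_size);
        for (int i = 0; i < this_chunk_size; ++i) {
            const std::vector<float>& frame = cache->front();
            feats.insert(feats.end(), frame.begin(), frame.end());
            cache->pop();
        }
        return feats;  // dim * this_chunk_size values, frame-major
    }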
......
@@ -41,12 +41,14 @@ class FeatureCache : public FrontendInterface {
     virtual size_t Dim() const { return dim_; }
 
     virtual void SetFinished() {
+        LOG(INFO) << "set finished";
         // std::unique_lock<std::mutex> lock(mutex_);
         base_extractor_->SetFinished();
-        LOG(INFO) << "set finished";
         // read the last chunk data
         Compute();
         // ready_feed_condition_.notify_one();
+        LOG(INFO) << "compute last feats done.";
     }
 
     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
......
@@ -36,8 +36,6 @@ void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
     frames_ready_ += likelihood.NumRows();
 }
 
-// Decodable::Init(DecodableConfig config) {
-//}
-
 // return the number of frames computed.
 int32 Decodable::NumFramesReady() const { return frames_ready_; }
@@ -70,9 +68,10 @@ bool Decodable::AdvanceChunk() {
     Vector<BaseFloat> features;
     if (frontend_ == NULL || frontend_->Read(&features) == false) {
         // no feats, or frontend_ is not initialized.
+        VLOG(1) << "decodable exit;";
         return false;
     }
-    VLOG(2) << "Forward with " << features.Dim() << " frames.";
+    VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats.";
 
     // forward feats
     NnetOut out;
@@ -80,6 +79,7 @@ bool Decodable::AdvanceChunk() {
     int32& vocab_dim = out.vocab_dim;
     Vector<BaseFloat>& logprobs = out.logprobs;
+    VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim << " decoder frames.";
 
     // cache nnet outputs
     nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim);
     nnet_out_cache_.CopyRowsFromVec(logprobs);
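Both new VLOG lines recover a frame count by dividing a flattened buffer length by the per-frame width: features.Dim() / frontend_->Dim() gives the number of input feature frames, and logprobs.Dim() / vocab_dim gives the number of decoder output frames. A self-contained illustration of the arithmetic (NumFrames and the example sizes are hypothetical):

    #include <cassert>

    // A flattened buffer of num_frames frames, each per_frame_dim wide, holds
    // num_frames * per_frame_dim values, so the count falls out by division.
    int NumFrames(int flattened_dim, int per_frame_dim) {
        assert(per_frame_dim > 0 && flattened_dim % per_frame_dim == 0);
        return flattened_dim / per_frame_dim;
    }

    int main() {
        // e.g. 67 frames of 80-dim fbank flattened into 5360 floats
        assert(NumFrames(5360, 80) == 67);
        return 0;
    }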
@@ -114,15 +114,20 @@ bool Decodable::AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs,
 
 // read one frame's likelihood
 bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
     if (EnsureFrameHaveComputed(frame) == false) {
+        LOG(INFO) << "frame likelihood exit.";
         return false;
     }
 
+    int nrows = nnet_out_cache_.NumRows();
+    CHECK(nrows == (frames_ready_ - frame_offset_));
     int vocab_size = nnet_out_cache_.NumCols();
     likelihood->resize(vocab_size);
     for (int32 idx = 0; idx < vocab_size; ++idx) {
         (*likelihood)[idx] =
             nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_;
+        VLOG(4) << "nnet out: " << frame << " offset: " << frame_offset_ << " " << nnet_out_cache_.NumRows() << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx);
     }
     return true;
 }
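The new CHECK pins down the cache invariant the indexing relies on: nnet_out_cache_ holds one row per frame in [frame_offset_, frames_ready_), so a global frame index maps to row frame - frame_offset_. A toy illustration (CacheRow and the example numbers are hypothetical):

    #include <cassert>

    // Map a global frame index into a cache that holds only the rows for the
    // frames of the current chunk, i.e. [frame_offset, frames_ready).
    int CacheRow(int frame, int frame_offset, int frames_ready, int cache_rows) {
        assert(cache_rows == frames_ready - frame_offset);  // the invariant CHECKed above
        assert(frame >= frame_offset && frame < frames_ready);
        return frame - frame_offset;
    }

    int main() {
        // hypothetical chunk: global frames 96..127 cached as rows 0..31
        assert(CacheRow(100, 96, 128, 32) == 4);
        return 0;
    }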
......
@@ -440,6 +440,7 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
         max_hyps_len = std::max(max_hyps_len, len);
         hyps_len_ptr[i] = static_cast<int64_t>(len);
     }
+    VLOG(2) << "max_hyps_len: " << max_hyps_len;
 
     paddle::Tensor hyps_tensor =
         paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64);
@@ -625,8 +626,8 @@ void U2Nnet::AttentionRescoring(const std::vector<std::vector<int>>& hyps,
         // combined left-to-right and right-to-left score
         (*rescoring_score)[i] =
             score * (1 - reverse_weight) + r_score * reverse_weight;
-        VLOG(1) << "hyp " << i << " score: " << score << " r_score: " << r_score
-                << " reverse_weight: " << reverse_weight;
+        VLOG(1) << "hyp " << i << " " << hyp.size() << " score: " << score << " r_score: " << r_score
+                << " reverse_weight: " << reverse_weight << " final score: " << (*rescoring_score)[i];
     }
 }
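The "final score" logged above is a plain linear interpolation between the left-to-right and right-to-left attention-decoder scores: final = score * (1 - reverse_weight) + r_score * reverse_weight. A self-contained illustration with made-up numbers (CombineBidirectionalScore is a hypothetical helper, not the repo's API):

    #include <cstdio>

    // Linear interpolation of the forward (left-to-right) and reverse
    // (right-to-left) attention-decoder scores, as in the line above.
    float CombineBidirectionalScore(float score, float r_score,
                                    float reverse_weight) {
        return score * (1 - reverse_weight) + r_score * reverse_weight;
    }

    int main() {
        // hypothetical values with reverse_weight = 0.3:
        // -4.2 * 0.7 + -5.0 * 0.3 = -4.44
        std::printf("final: %f\n", CombineBidirectionalScore(-4.2f, -5.0f, 0.3f));
        return 0;
    }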
......
@@ -52,7 +52,6 @@ void U2Recognizer::Reset() {
     num_frames_ = 0;
     result_.clear();
 
-    feature_pipeline_->Reset();
     decodable_->Reset();
     decoder_->Reset();
 }
@@ -62,7 +61,6 @@ void U2Recognizer::ResetContinuousDecoding() {
     num_frames_ = 0;
     result_.clear();
 
-    feature_pipeline_->Reset();
     decodable_->Reset();
     decoder_->Reset();
 }
@@ -192,10 +190,12 @@ void U2Recognizer::AttentionRescoring() {
     // combine ctc score and rescoring score
     for (size_t i = 0; i < num_hyps; i++) {
-        VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i]
-                << " ctc_score: " << result_[i].score;
+        VLOG(1) << "hyp " << i << " rescoring_score: " << rescoring_score[i]
+                << " ctc_score: " << result_[i].score
+                << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight
+                << " ctc_weight: " << opts_.decoder_opts.ctc_weight;
         result_[i].score =
             opts_.decoder_opts.rescoring_weight * rescoring_score[i] +
             opts_.decoder_opts.ctc_weight * result_[i].score;
+        VLOG(1) << "hyp: " << result_[i].sentence << " score: " << result_[i].score;
     }
 
     std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc);
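At this outer level the same interpolation idea appears again: the attention rescoring score is mixed into each hypothesis' CTC prefix score with the two configured weights, then the n-best list is re-ranked. A minimal sketch; DecodeResult here is a hypothetical mirror of the recognizer's result entry, and the comparator assumes DecodeResult::CompareFunc orders by descending combined score:

    #include <algorithm>
    #include <string>
    #include <vector>

    // Hypothetical minimal mirror of the recognizer's result entry.
    struct DecodeResult {
        std::string sentence;
        float score = 0.0f;
    };

    // Mix the attention rescoring score into each hypothesis' ctc score with
    // the configured weights, then re-rank, mirroring the loop above.
    void Rescore(std::vector<DecodeResult>* results,
                 const std::vector<float>& rescoring_score,
                 float rescoring_weight, float ctc_weight) {
        for (size_t i = 0; i < results->size(); ++i) {
            (*results)[i].score = rescoring_weight * rescoring_score[i] +
                                  ctc_weight * (*results)[i].score;
        }
        std::sort(results->begin(), results->end(),
                  [](const DecodeResult& a, const DecodeResult& b) {
                      return a.score > b.score;  // best combined score first
                  });
    }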
......
@@ -62,6 +62,7 @@ int main(int argc, char* argv[]) {
     LOG(INFO) << "wav len (sample): " << tot_samples;
 
     int sample_offset = 0;
+    int cnt = 0;
     while (sample_offset < tot_samples) {
         int cur_chunk_size =
             std::min(chunk_sample_size, tot_samples - sample_offset);
@@ -77,12 +78,14 @@ int main(int argc, char* argv[]) {
             recognizer.SetFinished();
         }
         recognizer.Decode();
-        LOG(INFO) << "Partial result: " << recognizer.GetPartialResult();
+        LOG(INFO) << "Partial result: " << cnt << " " << recognizer.GetPartialResult();
 
         // no overlap
         sample_offset += cur_chunk_size;
+        cnt++;
     }
     CHECK(sample_offset == tot_samples);
+    VLOG(1) << "num decode: " << cnt;
 
     // recognizer.SetFinished();
......