From 862150b597edb63efe4f9bf578f6259b8b8ea3a9 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Fri, 8 Oct 2021 07:32:18 +0000
Subject: [PATCH] u2 kaldi can train, but ctc loss high

---
 deepspeech/exps/u2_kaldi/model.py             |  6 ++++--
 deepspeech/models/u2.py                       | 11 +++++++++--
 deepspeech/modules/loss.py                    |  8 ++++----
 deepspeech/training/trainer.py                |  4 ++--
 examples/aishell/s0/run.sh                    |  2 +-
 examples/librispeech/s1/conf/transformer.yaml | 10 +++++-----
 examples/librispeech/s2/README.md             | 14 +++-----------
 examples/librispeech/s2/conf/transformer.yaml |  6 +++---
 examples/librispeech/s2/local/test.sh         |  4 +++-
 examples/librispeech/s2/local/train.sh        |  6 +++---
 examples/librispeech/s2/run.sh                |  6 +++---
 utils/avg_model.py                            |  4 ++--
 12 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/deepspeech/exps/u2_kaldi/model.py b/deepspeech/exps/u2_kaldi/model.py
index 6a932d75..759ad608 100644
--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
@@ -393,6 +393,7 @@ class U2Tester(U2Trainer):
                         texts,
                         texts_len,
                         fout=None):
+        logger.info(f"Input: {audio.shape} {audio_len}, {texts} {texts_len}")
         cfg = self.config.decoding
         errors_sum, len_refs, num_ins = 0.0, 0, 0
         errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
@@ -430,8 +431,9 @@ class U2Tester(U2Trainer):
             num_ins += 1
             if fout:
                 fout.write(utt + " " + result + "\n")
-            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
-                        (target, result))
+            logger.info(f"Utt: {utt}")
+            logger.info(f"Ref: {target}")
+            logger.info(f"Hyp: {result}")
             logger.info("One example error rate [%s] = %f" %
                         (cfg.error_rate_type, error_rate_func(target, result)))
 
diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py
index 1ca6a4fe..c2e6e3f0 100644
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -297,10 +297,12 @@ class U2BaseModel(nn.Layer):
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
         maxlen = encoder_out.size(1)
+        # logger.info(f"att:maxlen {maxlen}")
         encoder_dim = encoder_out.size(2)
         running_size = batch_size * beam_size
         encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
             running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
+        # logger.info(f"att: encoder_mask {encoder_mask}")
         encoder_mask = encoder_mask.unsqueeze(1).repeat(
             1, beam_size, 1, 1).view(running_size, 1,
                                      maxlen)  # (B*N, 1, max_len)
@@ -314,6 +316,7 @@ class U2BaseModel(nn.Layer):
             device)  # (B*N, 1)
         end_flag = paddle.zeros_like(scores, dtype=paddle.bool)  # (B*N, 1)
         cache: Optional[List[paddle.Tensor]] = None
+        # logger.info(f"att: hyps {hyps} eos: {self.eos}")
         # 2. Decoder forward step by step
         for i in range(1, maxlen + 1):
             # Stop if all batch and all beam produce eos
@@ -323,6 +326,7 @@ class U2BaseModel(nn.Layer):
             # 2.1 Forward decoder step
             hyps_mask = subsequent_mask(i).unsqueeze(0).repeat(
                 running_size, 1, 1).to(device)  # (B*N, i, i)
+            # logger.info(f"att: {i} {hyps_mask}")
             # logp: (B*N, vocab)
             logp, cache = self.decoder.forward_one_step(
                 encoder_out, encoder_mask, hyps, hyps_mask, cache)
@@ -332,7 +336,7 @@ class U2BaseModel(nn.Layer):
             top_k_logp = mask_finished_scores(top_k_logp, end_flag)
             top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos)
 
-            # 2.3 Seconde beam prune: select topk score with history
+            # 2.3 Second beam prune: select topk score with history
             scores = scores + top_k_logp  # (B*N, N), broadcast add
             scores = scores.view(batch_size, beam_size * beam_size)  # (B, N*N)
             scores, offset_k_index = scores.topk(k=beam_size)  # (B, N)
@@ -356,9 +360,10 @@ class U2BaseModel(nn.Layer):
             hyps = paddle.cat(
                 (last_best_k_hyps, best_k_pred.view(-1, 1)),
                 dim=1)  # (B*N, i+1)
-
+            # logger.info(f"att: hyps {hyps}")
             # 2.6 Update end flag
             end_flag = paddle.eq(hyps[:, -1], self.eos).view(-1, 1)
+            # logger.info(f"att: end_flag {end_flag}")
 
         # 3. Select best of best
         scores = scores.view(batch_size, beam_size)
@@ -368,6 +373,7 @@ class U2BaseModel(nn.Layer):
             batch_size, dtype=paddle.long) * beam_size
         best_hyps = paddle.index_select(hyps, index=best_hyps_index, axis=0)
         best_hyps = best_hyps[:, 1:]
+        # logger.info(f"att: best_hyps {best_hyps}")
         return best_hyps
 
     def ctc_greedy_search(
@@ -802,6 +808,7 @@ class U2BaseModel(nn.Layer):
         else:
             raise ValueError(f"Not support decoding method: {decoding_method}")
 
+        logger.info(f"hyps: {hyps}")
         res = [text_feature.defeaturize(hyp) for hyp in hyps]
         return res
 
diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py
index f692a818..182d1644 100644
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@@ -49,7 +49,7 @@ class CTCLoss(nn.Layer):
         # (TODO:Hui Zhang) ctc loss does not support int64 labels
         ys_pad = ys_pad.astype(paddle.int32)
         loss = self.loss(
-            logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
+            logits, ys_pad, hlens, ys_lens, norm_by_batchsize=self.batch_average)
         if self.batch_average:
             # Batch-size average
             loss = loss / B
@@ -90,8 +90,8 @@ class LabelSmoothingLoss(nn.Layer):
             size (int): the number of class
             padding_idx (int): padding class id which will be ignored for loss
             smoothing (float): smoothing rate (0.0 means the conventional CE)
-            normalize_length (bool): 
-                True, normalize loss by sequence length; 
+            normalize_length (bool):
+                True, normalize loss by sequence length;
                 False, normalize loss by batch size.
                 Defaults to False.
         """
@@ -108,7 +108,7 @@ class LabelSmoothingLoss(nn.Layer):
         The model outputs and data labels tensors are flatten to
         (batch*seqlen, class) shape and a mask is applied to the
         padding part which should not be calculated for loss.
-        
+
         Args:
             x (paddle.Tensor): prediction (batch, seqlen, class)
             target (paddle.Tensor):
diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index 3a922c6f..21768883 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -163,8 +163,8 @@ class Trainer():
             checkpoint_path=self.args.checkpoint_path)
         if infos:
             # restore from ckpt
-            self.iteration = infos["step"]
-            self.epoch = infos["epoch"]
+            self.iteration = infos["step"] + 1
+            self.epoch = infos["epoch"] + 1
             scratch = False
         else:
             self.iteration = 0
diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh
index e5ab12a5..f461df75 100755
--- a/examples/aishell/s0/run.sh
+++ b/examples/aishell/s0/run.sh
@@ -6,7 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
-avg_num=1
+avg_num=5
 model_type=offline
 
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml
index bc2ec606..0cf59b38 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -16,7 +16,7 @@ collator:
   spm_model_prefix: 'data/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/augmentation.json
-  batch_size: 64
+  batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
   specgram_type: fbank #linear, mfcc, fbank
   feat_dim: 80
@@ -73,13 +73,13 @@ model:
 
 training:
   n_epoch: 120
-  accum_grad: 2
+  accum_grad: 4
   global_grad_clip: 5.0
-  optim: adam
+  optim: noam 
   optim_conf:
-    lr: 0.004
+    lr: 10.0
     weight_decay: 1e-06
-  scheduler: warmuplr     # pytorch v1.1.0+ required
+  scheduler: noam     # pytorch v1.1.0+ required
   scheduler_conf:
     warmup_steps: 25000
     lr_decay: 1.0
diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/s2/README.md
index e4022f01..c9392626 100644
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
@@ -14,11 +14,6 @@
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search |  | |  
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring |  |  |  
 
-### Test w/o length filter
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention |  |  |  
-
 
 ## Chunk Conformer
 
@@ -33,9 +28,6 @@
 ## Transformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention |  |  |  
-
-### Test w/o length filter
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |  
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | | |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 9.27137279510498, | 0.038421 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 9.27137279510498, | 0.120112 |  
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 9.27137279510498, | 0.116441 |  
diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml
index f7c27d1f..7c5da8f6 100644
--- a/examples/librispeech/s2/conf/transformer.yaml
+++ b/examples/librispeech/s2/conf/transformer.yaml
@@ -12,7 +12,7 @@ collator:
   stride_ms: 10.0
   window_ms: 25.0
   sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  batch_size: 32 
+  batch_size: 30 
   maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
   maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
   minibatches: 0 # for debug
@@ -22,7 +22,7 @@ collator:
   batch_frames_out: 0
   batch_frames_inout: 0
   augmentation_config: conf/augmentation.json
-  num_workers: 2
+  num_workers: 0
   subsampling_factor: 1
   num_encs: 1
 
@@ -81,7 +81,7 @@ scheduler_conf:
   lr_decay: 1.0
 
 decoding:
-  batch_size: 64
+  batch_size: 1
   error_rate_type: wer
   decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
   lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/s2/local/test.sh
index efd06f35..cbea85bd 100755
--- a/examples/librispeech/s2/local/test.sh
+++ b/examples/librispeech/s2/local/test.sh
@@ -30,13 +30,15 @@ echo "chunk mode ${chunk_mode}"
 #    exit 1
 #fi
 
+#for type in attention ctc_greedy_search; do
 for type in attention ctc_greedy_search; do
     echo "decoding ${type}"
     if [ ${chunk_mode} == true ];then
         # stream decoding only support batchsize=1
         batch_size=1
     else
-        batch_size=64
+        #batch_size=64
+        batch_size=1
     fi
     python3 -u ${BIN_DIR}/test.py \
     --model-name u2_kaldi \
diff --git a/examples/librispeech/s2/local/train.sh b/examples/librispeech/s2/local/train.sh
index c7525259..a9579098 100755
--- a/examples/librispeech/s2/local/train.sh
+++ b/examples/librispeech/s2/local/train.sh
@@ -19,8 +19,8 @@ echo "using ${device}..."
 
 mkdir -p exp
 
-seed=1024
-if [ ${seed} ]; then
+seed=0
+if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi
 
@@ -32,7 +32,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 
-if [ ${seed} ]; then
+if [ ${seed} != 0 ]; then
     unset FLAGS_cudnn_deterministic
 fi
 
diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/s2/run.sh
index 26398dd1..8dd93736 100755
--- a/examples/librispeech/s2/run.sh
+++ b/examples/librispeech/s2/run.sh
@@ -6,7 +6,7 @@ stage=0
 stop_stage=100
 conf_path=conf/transformer.yaml
 dict_path=data/train_960_unigram5000_units.txt
-avg_num=5
+avg_num=10
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
 avg_ckpt=avg_${avg_num}
@@ -20,12 +20,12 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path}  ${ckpt}
+    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path}  ${ckpt}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # avg n best model
-    avg.sh exp/${ckpt}/checkpoints ${avg_num}
+    avg.sh latest exp/${ckpt}/checkpoints ${avg_num}
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
diff --git a/utils/avg_model.py b/utils/avg_model.py
index 8ec792f5..3a0739c9 100755
--- a/utils/avg_model.py
+++ b/utils/avg_model.py
@@ -80,8 +80,8 @@ def main(args):
         data = json.dumps({
             "avg_ckpt": args.dst_model,
             "ckpt": path_list,
-            "epoch": selected_epochs.tolist(),
-            "val_loss": beat_val_scores.tolist(),
+            "epoch": selected_epochs,
+            "val_loss": beat_val_scores,
         })
         f.write(data + "\n")
 
-- 
GitLab