PaddlePaddle / DeepSpeech
Commit b10512eb
Authored Sep 21, 2022 by Hui Zhang

more config or u2pp

Parent: 00b2c1c8

5 changed files with 140 additions and 5 deletions (+140 -5)
- examples/wenetspeech/asr1/README.md (+31 -0)
- examples/wenetspeech/asr1/conf/chunk_conformer.yaml (+3 -1)
- examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml (+100 -0)
- examples/wenetspeech/asr1/local/export.sh (+2 -0)
- paddlespeech/s2t/models/u2/u2.py (+4 -4)
## examples/wenetspeech/asr1/README.md

@@ -12,3 +12,34 @@ show model.tar.gz

```
tar tf model.tar.gz
```

other way is:

```bash
tar cvzf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz model.yaml conf/tuning/ conf/chunk_conformer.yaml conf/preprocess.yaml data/mean_std.json exp/chunk_conformer/checkpoints/
```

## Export Static Model

>> `data/test_meeting/data.list`
>> {"input": [{"name": "input1", "shape": [3.2230625, 80], "feat": "/home/PaddleSpeech/dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0163.wav", "filetype": "sound"}], "output": [{"name": "target1", "shape": [9, 5538], "text": "\u697c\u5e02\u8c03\u63a7\u5c06\u53bb\u5411\u4f55\u65b9", "token": "\u697c \u5e02 \u8c03 \u63a7 \u5c06 \u53bb \u5411 \u4f55 \u65b9", "tokenid": "1891 1121 3502 1543 1018 477 528 163 1657"}], "utt": "BAC009S0764W0163", "utt2spk": "S0764"}

>> Test Wav:
>> wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav

### U2 chunk conformer

>> UiDecoder
>> Make sure `reverse_weight` in config is `0.0`
>> https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz

```
tar zxvf asr1_chunk_conformer_u2_wenetspeech_ckpt_1.1.0.model.tar.gz
./local/export.sh conf/chunk_conformer.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji
```

### U2++ chunk conformer

>> BiDecoder
>> https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.0.model.tar.gz
>> Make sure `reverse_weight` in config is not `0.0`

```
./local/export.sh conf/chunk_conformer_u2pp.yaml exp/chunk_conformer/checkpoints/avg_10 ./export.ji
```
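The `tar tf model.tar.gz` listing above can also be reproduced from Python with the standard library. A minimal sketch, using an in-memory archive with illustrative member names (not the actual release contents):

```python
import io
import tarfile

# Build a tiny in-memory archive standing in for model.tar.gz.
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w:gz") as tar:
    for name in ["model.yaml", "conf/chunk_conformer.yaml"]:
        data = b"# placeholder\n"
        info = tarfile.TarInfo(name=name)
        info.size = len(data)
        tar.addfile(info, io.BytesIO(data))

# List the members, as `tar tf` would.
buf.seek(0)
with tarfile.open(fileobj=buf, mode="r:gz") as tar:
    members = tar.getnames()
print(members)
```

This is handy for scripting a sanity check that a downloaded checkpoint archive actually contains the config and checkpoint paths the export step expects.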
## examples/wenetspeech/asr1/conf/chunk_conformer.yaml

```diff
@@ -39,6 +39,7 @@ decoder_conf:
 model_conf:
   ctc_weight: 0.3
   lsm_weight: 0.1     # label smoothing option
+  reverse_weight: 0.0 # unidecoder
   length_normalized_loss: false
   init_type: 'kaiming_uniform'
@@ -53,8 +54,9 @@ test_manifest: data/test_meeting/data.list
 ###########################################
 # Dataloader                              #
 ###########################################
-vocab_filepath: data/lang_char/vocab.txt
+use_streaming_data: True
 unit_type: 'char'
+vocab_filepath: data/lang_char/vocab.txt
 preprocess_config: conf/preprocess.yaml
 spm_model_prefix: ''
 feat_dim: 80
```
## examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml (new file, mode 100644)

```yaml
############################################
# Network Architecture                     #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
  output_size: 512    # dimension of attention
  attention_heads: 8
  linear_units: 2048  # the number of units of position-wise feed forward
  num_blocks: 12      # the number of encoder blocks
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.1
  input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
  normalize_before: True
  use_cnn_module: True
  cnn_module_kernel: 15
  activation_type: swish
  pos_enc_layer_type: rel_pos
  selfattention_layer_type: rel_selfattn
  causal: true
  use_dynamic_chunk: true
  cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
  use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
  attention_heads: 8
  linear_units: 2048
  num_blocks: 3   # the number of encoder blocks
  r_num_blocks: 3 # only for bitransformer
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.1
  src_attention_dropout_rate: 0.1

# hybrid CTC/attention
model_conf:
  ctc_weight: 0.3
  lsm_weight: 0.1     # label smoothing option
  length_normalized_loss: false
  reverse_weight: 0.3 # only for bitransformer decoder
  init_type: 'kaiming_uniform' # !Warning: need to convergence

###########################################
# Data                                    #
###########################################
train_manifest: data/train_l/data.list
dev_manifest: data/dev/data.list
test_manifest: data/test_meeting/data.list

###########################################
# Dataloader                              #
###########################################
use_stream_data: True
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
spm_model_prefix: ''
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
do_filter: True
maxlen_in: 1200  # if do_filter == False && input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 100  # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced
minlen_in: 10
minlen_out: 0
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1

###########################################
# Training                                #
###########################################
n_epoch: 150
accum_grad: 8
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
  lr: 0.002
  weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
log_interval: 100
checkpoint:
  kbest_n: 50
  latest_n: 5
```
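The training section names a `warmuplr` scheduler with `lr: 0.002` and `warmup_steps: 25000`. A minimal sketch of that schedule, assuming the usual Noam-style rule (linear ramp-up for `warmup_steps`, then inverse-square-root decay); the exact PaddleSpeech implementation may differ:

```python
def warmup_lr(base_lr: float, step: int, warmup_steps: int) -> float:
    """Noam-style warmup: ramp up to base_lr, then decay as step**-0.5."""
    step = max(step, 1)  # avoid division by zero at step 0
    scale = warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)
    return base_lr * scale

# With lr: 0.002 and warmup_steps: 25000 from the config:
early = warmup_lr(0.002, 2500, 25000)  # still warming up (10% of the way)
peak = warmup_lr(0.002, 25000, 25000)  # maximum, reached at warmup end
print(early, peak)
```

Under this rule the configured `lr` is the peak value reached exactly at `warmup_steps`, which is why a long warmup (25k steps here) pairs naturally with the large `accum_grad: 8` effective batch.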
## examples/wenetspeech/asr1/local/export.sh

```diff
@@ -14,6 +14,8 @@ jit_model_export_path=$3
 # export can not using StreamdataDataloader, set use_stream_dta False
+# u2: reverse_weight should be 0.0
+# u2pp: reverse_weight should be same with config file. e.g. 0.3
 python3 -u ${BIN_DIR}/export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
```
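The rule stated in the new comments above can be expressed as a small check before export: U2 (unidirectional decoder) must use `reverse_weight` 0.0, while U2++ (bitransformer decoder) keeps the value from its config. A hedged sketch; the helper name and the plain-dict config shape are illustrative, not part of the PaddleSpeech API:

```python
def export_reverse_weight(config: dict) -> float:
    """Pick the reverse_weight to use at export time."""
    if config.get("decoder") == "bitransformer":
        # U2++: keep the trained value from the config file, e.g. 0.3.
        return config["model_conf"]["reverse_weight"]
    # U2: there is no right-to-left decoder, so force 0.0.
    return 0.0

u2_cfg = {"decoder": "transformer", "model_conf": {"reverse_weight": 0.0}}
u2pp_cfg = {"decoder": "bitransformer", "model_conf": {"reverse_weight": 0.3}}
print(export_reverse_weight(u2_cfg), export_reverse_weight(u2pp_cfg))
```

This mirrors why `local/export.sh` is called with `conf/chunk_conformer.yaml` for U2 but `conf/chunk_conformer_u2pp.yaml` for U2++ in the README changes above.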
## paddlespeech/s2t/models/u2/u2.py

```diff
@@ -565,7 +565,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
             [len(hyp[0]) for hyp in hyps], place=device,
             dtype=paddle.long)  # (beam_size,)
         hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
-        logger.info(
+        logger.debug(
             f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
         hyps_lens = hyps_lens + 1  # Add <sos> at begining
@@ -590,7 +590,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
             # last decoder output token is `eos`, for laste decoder input token.
             score += decoder_out[i][len(hyp[0])][self.eos]
-            logger.info(
+            logger.debug(
                 f"hyp {i} len {len(hyp[0])} l2r score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
             if reverse_weight > 0:
                 r_score = 0.0
@@ -598,7 +598,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
                     r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w]
                 r_score += r_decoder_out[i][len(hyp[0])][self.eos]
-                logger.info(
-                    f"hyp {i} len {len(hyp[0])} r2l score: {score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
+                logger.info(
+                    f"hyp {i} len {len(hyp[0])} r2l score: {r_score} ctc_score: {hyp[1]} reverse_weight: {reverse_weight}")
                 score = score * (1 - reverse_weight) + r_score * reverse_weight
@@ -608,7 +608,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
                 best_score = score
                 best_index = i
-        logger.info(f"result: {hyps[best_index]}")
+        logger.debug(f"result: {hyps[best_index]}")
         return hyps[best_index][0]

     @jit.to_static(property=True)
```
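The `score = score * (1 - reverse_weight) + r_score * reverse_weight` line in the hunk above is the core of attention rescoring: the left-to-right decoder score and the right-to-left decoder score are blended by `reverse_weight`. A minimal standalone sketch with made-up scores:

```python
def blend(l2r_score: float, r2l_score: float, reverse_weight: float) -> float:
    """Mix forward and reversed decoder log-scores, as in attention rescoring."""
    return l2r_score * (1 - reverse_weight) + r2l_score * reverse_weight

# reverse_weight 0.0 (U2) ignores the reversed decoder entirely;
# 0.3 (U2++) gives it a 30% share, matching the config files above.
u2 = blend(-4.0, -6.0, 0.0)    # unidirectional: only l2r counts
u2pp = blend(-4.0, -6.0, 0.3)  # bidirectional: weighted mix
print(u2, u2pp)
```

This also explains the `r2l score: {r_score}` fix in the hunk: the log line previously printed the still-unblended forward `score` where the reversed-decoder `r_score` was intended.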