add librispeech asr1

41eeed04 · huangyuxin · fb6d1e2c · 41eeed04 · 41eeed04 · 41eeed04
20 changed files
--- a/examples/aishell/asr1/local/align.sh
+++ b/examples/aishell/asr1/local/align.sh
@@ -24,7 +24,7 @@ python3 -u ${BIN_DIR}/alignment.py \
 --decode_config ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
---opts decoding.decode_batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}

 if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"

--- a/examples/aishell/asr1/local/test.sh
+++ b/examples/aishell/asr1/local/test.sh
@@ -30,7 +30,7 @@ for type in attention ctc_greedy_search; do
        # stream decoding only support batchsize=1
        batch_size=1
    else
-        batch_size=64
+        batch_size=64
    fi
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
@@ -40,8 +40,8 @@ for type in attention ctc_greedy_search; do
    --decode_config ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.decode_batch_size ${batch_size}
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
@@ -60,8 +60,8 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    --decode_config ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size}
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"

--- a/examples/aishell/asr1/local/test_wav.sh
+++ b/examples/aishell/asr1/local/test_wav.sh
@@ -46,8 +46,8 @@ for type in  attention_rescoring; do
    --decode_config ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.decode_batch_size ${batch_size} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size} \
    --audio_file ${audio_file}

    if [ $? -ne 0 ]; then

--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -125,8 +125,8 @@ HiFiGAN checkpoint contains files listed below.
 ```text
 hifigan_csmsc_ckpt_0.1.1
 ├── default.yaml                  # default config used to train hifigan
-├── feats_stats.npy                  # statistics used to normalize spectrogram when training hifigan
-└── snapshot_iter_2500000.pdz     # generator parameters of hifigan
+├── feats_stats.npy                  # statistics used to normalize spectrogram when training hifigan
+└── snapshot_iter_2500000.pdz     # generator parameters of hifigan
 ```

 ## Acknowledgement

--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
-# network architecture
-model:
-    cmvn_file: 
-    cmvn_file_type: "json"
-    # encoder related
-    encoder: conformer
-    encoder_conf:
-        output_size: 256    # dimension of attention
-        attention_heads: 4
-        linear_units: 2048  # the number of units of position-wise feed forward
-        num_blocks: 12      # the number of encoder blocks
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-        normalize_before: True
-        use_cnn_module: True
-        cnn_module_kernel: 15
-        activation_type: 'swish'
-        pos_enc_layer_type: 'rel_pos'
-        selfattention_layer_type: 'rel_selfattn'
-        causal: True
-        use_dynamic_chunk: true
-        cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
-        use_dynamic_left_chunk: false
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: 
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    use_cnn_module: True
+    cnn_module_kernel: 15
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'
+    causal: True
+    use_dynamic_chunk: true
+    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+    use_dynamic_left_chunk: false

-    # decoder related
-    decoder: transformer
-    decoder_conf:
-        attention_heads: 4
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        self_attention_dropout_rate: 0.0
-        src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0

-    # hybrid CTC/attention
-    model_conf:
-        ctc_weight: 0.3
-        lsm_weight: 0.1     # label smoothing option
-        length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false


-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test

-collator:
-  vocab_filepath: data/lang_char/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  feat_dim: 80
-  stride_ms: 10.0
-  window_ms: 25.0
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  batch_size: 16
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  minibatches: 0 # for debug
-  batch_count: auto
-  batch_bins: 0 
-  batch_frames_in: 0
-  batch_frames_out: 0
-  batch_frames_inout: 0
-  augmentation_config: conf/preprocess.yaml 
-  num_workers: 0
-  subsampling_factor: 1
-  num_encs: 1
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt 
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+augmentation_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+batch_size: 16
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+augmentation_config: conf/preprocess.yaml 
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+#                 Training                #
+###########################################
+n_epoch: 120
+accum_grad: 8
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.001
+  weight_decay: 1e-06 
+scheduler: warmuplr     
+scheduler_conf:
+  warmup_steps: 25000
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5

-training:
-  n_epoch: 120
-  accum_grad: 8
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.001
-    weight_decay: 1e-06 
-  scheduler: warmuplr     
-  scheduler_conf:
-    warmup_steps: 25000
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5

-decoding:
-  batch_size: 128
-  error_rate_type: wer
-  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  beam_size: 10
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-      # <0: for decoding, use full chunk.
-      # >0: for decoding, use fixed chunk size as set.
-      # 0: used for training, it's prohibited here. 
-  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true  # simulate streaming inference. Defaults to False.


--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
-# network architecture
-model:
-    cmvn_file: 
-    cmvn_file_type: "json"
-    # encoder related
-    encoder: transformer
-    encoder_conf:
-        output_size: 256    # dimension of attention
-        attention_heads: 4
-        linear_units: 2048  # the number of units of position-wise feed forward
-        num_blocks: 12      # the number of encoder blocks
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-        normalize_before: true
-        use_dynamic_chunk: true
-        use_dynamic_left_chunk: false
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: 
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+    use_dynamic_chunk: true
+    use_dynamic_left_chunk: false

-    # decoder related
-    decoder: transformer
-    decoder_conf:
-        attention_heads: 4
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        self_attention_dropout_rate: 0.0
-        src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0

-    # hybrid CTC/attention
-    model_conf:
-        ctc_weight: 0.3
-        lsm_weight: 0.1     # label smoothing option
-        length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false

-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test


-collator:
-  vocab_filepath: data/lang_char/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  feat_dim: 80
-  stride_ms: 10.0
-  window_ms: 25.0
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  batch_size: 64
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  minibatches: 0 # for debug
-  batch_count: auto
-  batch_bins: 0 
-  batch_frames_in: 0
-  batch_frames_out: 0
-  batch_frames_inout: 0
-  augmentation_config: conf/preprocess.yaml 
-  num_workers: 0
-  subsampling_factor: 1
-  num_encs: 1
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt 
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+augmentation_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+batch_size: 64
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+augmentation_config: conf/preprocess.yaml 
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1


-training:
-  n_epoch: 120
-  accum_grad: 1
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.001
-    weight_decay: 1e-06
-  scheduler: warmuplr     
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-decoding:
-  batch_size: 64
-  error_rate_type: wer
-  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-      # <0: for decoding, use full chunk.
-      # >0: for decoding, use fixed chunk size as set.
-      # 0: used for training, it's prohibited here. 
-  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true  # simulate streaming inference. Defaults to False.
\ No newline at end of file
+###########################################
+#                 Training                #
+###########################################
+n_epoch: 120
+accum_grad: 1
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.001
+  weight_decay: 1e-06
+scheduler: warmuplr     
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
\ No newline at end of file
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
-# network architecture
-model:
-    cmvn_file: 
-    cmvn_file_type: "json"
-    # encoder related
-    encoder: conformer
-    encoder_conf:
-        output_size: 256    # dimension of attention
-        attention_heads: 4
-        linear_units: 2048  # the number of units of position-wise feed forward
-        num_blocks: 12      # the number of encoder blocks
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-        normalize_before: True
-        use_cnn_module: True
-        cnn_module_kernel: 15
-        activation_type: 'swish'
-        pos_enc_layer_type: 'rel_pos'
-        selfattention_layer_type: 'rel_selfattn'
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: 
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    use_cnn_module: True
+    cnn_module_kernel: 15
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'

-    # decoder related
-    decoder: transformer
-    decoder_conf:
-        attention_heads: 4
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        self_attention_dropout_rate: 0.0
-        src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0

-    # hybrid CTC/attention
-    model_conf:
-        ctc_weight: 0.3
-        ctc_grad_norm_type: null 
-        lsm_weight: 0.1     # label smoothing option
-        length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    ctc_grad_norm_type: null 
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false


-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean


-collator:
-  vocab_filepath: data/lang_char/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  feat_dim: 80
-  stride_ms: 10.0
-  window_ms: 25.0
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  batch_size: 16
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  minibatches: 0 # for debug
-  batch_count: auto
-  batch_bins: 0 
-  batch_frames_in: 0
-  batch_frames_out: 0
-  batch_frames_inout: 0
-  augmentation_config: conf/preprocess.yaml 
-  num_workers: 0
-  subsampling_factor: 1
-  num_encs: 1
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt 
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+augmentation_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+batch_size: 16
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+augmentation_config: conf/preprocess.yaml 
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
  

-training:
-  n_epoch: 70
-  accum_grad: 8
-  global_grad_clip: 3.0
-  optim: adam
-  optim_conf:
-    lr: 0.004
-    weight_decay: 1e-06
-  scheduler: warmuplr     
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-decoding:
-  batch_size: 64
-  error_rate_type: wer
-  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  beam_size: 10
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-      # <0: for decoding, use full chunk.
-      # >0: for decoding, use fixed chunk size as set.
-      # 0: used for training, it's prohibited here. 
-  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False  # simulate streaming inference. Defaults to False.
+###########################################
+#                 Training                #
+###########################################
+n_epoch: 70
+accum_grad: 8
+global_grad_clip: 3.0
+optim: adam
+optim_conf:
+  lr: 0.004
+  weight_decay: 1e-06
+scheduler: warmuplr     
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5


--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
-# network architecture
-model:
-    cmvn_file: 
-    cmvn_file_type: "json"
-    # encoder related
-    encoder: transformer
-    encoder_conf:
-        output_size: 256    # dimension of attention
-        attention_heads: 4
-        linear_units: 2048  # the number of units of position-wise feed forward
-        num_blocks: 12      # the number of encoder blocks
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-        normalize_before: true
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: 
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true

-    # decoder related
-    decoder: transformer
-    decoder_conf:
-        attention_heads: 4
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        self_attention_dropout_rate: 0.0
-        src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0

-    # hybrid CTC/attention
-    model_conf:
-        ctc_weight: 0.3
-        lsm_weight: 0.1     # label smoothing option
-        length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false


 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
-  min_input_len: 0.5  # second
-  max_input_len: 30.0 # second
-  min_output_len: 0.0 # tokens
-  max_output_len: 400.0 # tokens
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 100.0
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean

-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
-  mean_std_filepath: ""
-  augmentation_config: conf/preprocess.yaml
-  feat_dim: 80
-  stride_ms: 10.0
-  window_ms: 25.0
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
-  batch_size: 32 
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
-  minibatches: 0 # for debug
-  batch_count: auto
-  batch_bins: 0 
-  batch_frames_in: 0
-  batch_frames_out: 0
-  batch_frames_inout: 0
-  augmentation_config: conf/preprocess.yaml 
-  num_workers: 0
-  subsampling_factor: 1
-  num_encs: 1
-
-
-training:
-  n_epoch: 120 
-  accum_grad: 4
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.004
-    weight_decay: 1e-06
-  scheduler: warmuplr     
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-decoding:
-  batch_size: 64
-  error_rate_type: wer
-  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-      # <0: for decoding, use full chunk.
-      # >0: for decoding, use fixed chunk size as set.
-      # 0: used for training, it's prohibited here. 
-  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False  # simulate streaming inference. Defaults to False.
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
+mean_std_filepath: ""
+augmentation_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+batch_size: 32 
+maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+augmentation_config: conf/preprocess.yaml 
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1


+###########################################
+#                 Training                #
+###########################################
+n_epoch: 120 
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.004
+  weight_decay: 1e-06
+scheduler: warmuplr     
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
--- a/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
+++ b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml
+decode_batch_size: 128
+error_rate_type: wer
+decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+    # <0: for decoding, use full chunk.
+    # >0: for decoding, use fixed chunk size as set.
+    # 0: used for training, it's prohibited here. 
+num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: true  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr1/conf/tuning/decode.yaml
+++ b/examples/librispeech/asr1/conf/tuning/decode.yaml
+decode_batch_size: 64
+error_rate_type: wer
+decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+    # <0: for decoding, use full chunk.
+    # >0: for decoding, use fixed chunk size as set.
+    # 0: used for training, it's prohibited here. 
+num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/librispeech/asr1/local/align.sh
+++ b/examples/librispeech/asr1/local/align.sh
 #!/bin/bash

-if [ $# != 2 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi

@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
+ckpt_prefix=$3

 batch_size=1
 output_dir=${ckpt_prefix}
@@ -20,9 +21,10 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_config ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
---opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}

 if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"

--- a/examples/librispeech/asr1/local/test.sh
+++ b/examples/librispeech/asr1/local/test.sh
@@ -15,8 +15,8 @@ recog_set="test-clean"
 stage=0
 stop_stage=100

-if [ $# != 2 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
    exit -1
 fi

@@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
+ckpt_prefix=$3

 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        python3 -u ${BIN_DIR}/test.py \
            --ngpu ${ngpu} \
            --config ${config_path} \
+            --decode_config ${decode_config_path} \
            --result_file ${ckpt_prefix}.${type}.rsl \
            --checkpoint_path ${ckpt_prefix} \
-            --opts decoding.decoding_method ${type} \
-            --opts decoding.batch_size ${batch_size}
+            --opts decode.decoding_method ${type} \
+            --opts decode.decode_batch_size ${batch_size}

        if [ $? -ne 0 ]; then
            echo "Failed in evaluation!"
@@ -76,10 +78,11 @@ for type in ctc_greedy_search; do
    python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
+        --decode_config ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decoding_method ${type} \
+        --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
@@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
+        --decode_config ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
-        --opts decoding.decoding_method ${type} \
-        --opts decoding.batch_size ${batch_size}
+        --opts decode.decoding_method ${type} \
+        --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"

--- a/examples/librispeech/asr1/local/test_wav.sh
+++ b/examples/librispeech/asr1/local/test_wav.sh
 #!/bin/bash

-if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix audio_file"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
 fi

@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."

 config_path=$1
-ckpt_prefix=$2
-audio_file=$3
+decode_config_path=$2
+ckpt_prefix=$3
+audio_file=$4

 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
@@ -49,10 +50,11 @@ for type in attention_rescoring; do
    python3 -u ${BIN_DIR}/test_wav.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
+    --decode_config ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} \
-    --opts decoding.batch_size ${batch_size} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size} \
    --audio_file ${audio_file}

    #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}

--- a/examples/librispeech/asr1/run.sh
+++ b/examples/librispeech/asr1/run.sh
@@ -8,6 +8,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 audio_file=data/demo_002_en.wav

@@ -34,17 +35,17 @@ fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi

 if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then

--- a/examples/tiny/asr1/conf/conformer.yaml
+++ b/examples/tiny/asr1/conf/conformer.yaml
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.tiny
-  dev_manifest: data/manifest.tiny
-  test_manifest: data/manifest.tiny
-  min_input_len: 0.5  # second
-  max_input_len: 20.0 # second
-  min_output_len: 0.0 # tokens
-  max_output_len: 400.0 # tokens
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 10.0
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.tiny
+dev_manifest: data/manifest.tiny
+test_manifest: data/manifest.tiny
+min_input_len: 0.5  # second
+max_input_len: 20.0 # second
+min_output_len: 0.0 # tokens
+max_output_len: 400.0 # tokens
+min_output_input_ratio: 0.05
+max_output_input_ratio: 10.0
  
-collator:
-  mean_std_filepath: ""
-  vocab_filepath: data/lang_char/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 4
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True 
-  shuffle_method: batch_shuffle
-  num_workers: 2

+###########################################
+#              Dataloader                 #
+###########################################
+mean_std_filepath: ""
+vocab_filepath: data/lang_char/vocab.txt 
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_200'
+augmentation_config: conf/preprocess.yaml
+batch_size: 4
+raw_wav: True  # use raw_wav or kaldi feature
+spectrum_type: fbank #linear, mfcc, fbank
+feat_dim: 80
+delta_delta: False
+dither: 1.0
+target_sample_rate: 16000
+max_freq: None
+n_fft: None
+stride_ms: 10.0
+window_ms: 25.0
+use_dB_normalization: True
+target_dB: -20
+random_seed: 0
+keep_transcription_text: False
+sortagrad: True 
+shuffle_method: batch_shuffle
+num_workers: 2

-# network architecture
-model:
-    cmvn_file: "data/mean_std.json"
-    cmvn_file_type: "json"
-    # encoder related
-    encoder: conformer
-    encoder_conf:
-        output_size: 256    # dimension of attention
-        attention_heads: 4
-        linear_units: 2048  # the number of units of position-wise feed forward
-        num_blocks: 12      # the number of encoder blocks
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-        normalize_before: true
-        use_cnn_module: True
-        cnn_module_kernel: 15
-        activation_type: 'swish'
-        pos_enc_layer_type: 'rel_pos'
-        selfattention_layer_type: 'rel_selfattn'

-    # decoder related
-    decoder: transformer
-    decoder_conf:
-        attention_heads: 4
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        self_attention_dropout_rate: 0.0
-        src_attention_dropout_rate: 0.0
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: "data/mean_std.json"
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
+    normalize_before: true
+    use_cnn_module: True
+    cnn_module_kernel: 15
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'

-    # hybrid CTC/attention
-    model_conf:
-        ctc_weight: 0.3
-        lsm_weight: 0.1     # label smoothing option
-        length_normalized_loss: false
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0

+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false

-training:
-  n_epoch: 5
-  accum_grad: 4
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.002
-    weight_decay: 1e-06
-  scheduler: warmuplr     
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 1
-  checkpoint:
-    kbest_n: 10
-    latest_n: 1

+###########################################
+#                 Training                #
+###########################################
+n_epoch: 5
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.002
+  weight_decay: 1e-06
+scheduler: warmuplr     
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 1
+checkpoint:
+  kbest_n: 10
+  latest_n: 1

-decoding:
-  batch_size: 64
-  error_rate_type: wer
-  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-      # <0: for decoding, use full chunk.
-      # >0: for decoding, use fixed chunk size as set.
-      # 0: used for training, it's prohibited here. 
-  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False  # simulate streaming inference. Defaults to False.


--- a/examples/tiny/asr1/conf/transformer.yaml
+++ b/examples/tiny/asr1/conf/transformer.yaml
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.tiny
-  dev_manifest: data/manifest.tiny
-  test_manifest: data/manifest.tiny
-  min_input_len: 0.5  # second
-  max_input_len: 20.0 # second
-  min_output_len: 0.0 # tokens
-  max_output_len: 400.0 # tokens
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 10.0
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.tiny
+dev_manifest: data/manifest.tiny
+test_manifest: data/manifest.tiny
+min_input_len: 0.5  # second
+max_input_len: 20.0 # second
+min_output_len: 0.0 # tokens
+max_output_len: 400.0 # tokens
+min_output_input_ratio: 0.05
+max_output_input_ratio: 10.0
  
-collator:
-  mean_std_filepath: data/mean_std.json
-  vocab_filepath: data/lang_char/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/lang_char/bpe_unigram_200'
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 4
-  raw_wav: True  # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True 
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+#              Dataloader                 #
+###########################################
+mean_std_filepath: data/mean_std.json
+vocab_filepath: data/lang_char/vocab.txt 
+unit_type: 'spm'
+spm_model_prefix: 'data/lang_char/bpe_unigram_200'
+augmentation_config: conf/preprocess.yaml
+batch_size: 4
+raw_wav: True  # use raw_wav or kaldi feature
+spectrum_type: fbank #linear, mfcc, fbank
+feat_dim: 80
+delta_delta: False
+dither: 1.0
+target_sample_rate: 16000
+max_freq: None
+n_fft: None
+stride_ms: 10.0
+window_ms: 25.0
+use_dB_normalization: True
+target_dB: -20
+random_seed: 0
+keep_transcription_text: False
+sortagrad: True 
+shuffle_method: batch_shuffle
+num_workers: 2

-# network architecture
-model:
-    cmvn_file: 
-    cmvn_file_type: "json"
-    # encoder related
-    encoder: transformer
-    encoder_conf:
-        output_size: 256    # dimension of attention
-        attention_heads: 4
-        linear_units: 2048  # the number of units of position-wise feed forward
-        num_blocks: 12      # the number of encoder blocks
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        attention_dropout_rate: 0.0
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-        normalize_before: true
+############################################
+#           Network Architecture           #
+############################################
+cmvn_file: 
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
+    normalize_before: true

-    # decoder related
-    decoder: transformer
-    decoder_conf:
-        attention_heads: 4
-        linear_units: 2048
-        num_blocks: 6
-        dropout_rate: 0.1
-        positional_dropout_rate: 0.1
-        self_attention_dropout_rate: 0.0
-        src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0

-    # hybrid CTC/attention
-    model_conf:
-        ctc_weight: 0.3
-        lsm_weight: 0.1     # label smoothing option
-        length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false


-training:
-  n_epoch: 5
-  accum_grad: 1
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.002
-    weight_decay: 1e-06
-  scheduler: warmuplr     
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 1
-  checkpoint:
-    kbest_n: 2
-    latest_n: 1
+###########################################
+#                 Training                #
+###########################################
+n_epoch: 5
+accum_grad: 1
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.002
+  weight_decay: 1e-06
+scheduler: warmuplr     
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 1
+checkpoint:
+  kbest_n: 2
+  latest_n: 1


-decoding:
-  batch_size: 8 #64
-  error_rate_type: wer
-  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-      # <0: for decoding, use full chunk.
-      # >0: for decoding, use fixed chunk size as set.
-      # 0: used for training, it's prohibited here. 
-  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False  # simulate streaming inference. Defaults to False.
-

--- a/paddlespeech/s2t/exps/u2/bin/alignment.py
+++ b/paddlespeech/s2t/exps/u2/bin/alignment.py
@@ -46,7 +46,7 @@ if __name__ == "__main__":
    if args.decode_config:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_config)
-        config.decoding = decode_confs
+        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()

--- a/paddlespeech/s2t/exps/u2/bin/test.py
+++ b/paddlespeech/s2t/exps/u2/bin/test.py
@@ -50,7 +50,7 @@ if __name__ == "__main__":
    if args.decode_config:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_config)
-        config.decoding = decode_confs
+        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()

--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -81,7 +81,7 @@ class U2Infer():
            ilen = paddle.to_tensor(feat.shape[0])
            xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)

-            decode_config = self.config.decoding
+            decode_config = self.config.decode
            result_transcripts = self.model.decode(
                xs,
                ilen,
@@ -135,7 +135,7 @@ if __name__ == "__main__":
    if args.decode_config:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_config)
-        config.decoding = decode_confs
+        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()

--- a/paddlespeech/s2t/exps/u2/config.py
+++ b/paddlespeech/s2t/exps/u2/config.py
@@ -29,7 +29,7 @@ U2Model.params(_C)

 U2Trainer.params(_C)

-_C.decoding = U2Tester.params()
+_C.decode = U2Tester.params()


 def get_cfg_defaults():