# chunk_conformer.yaml
# Float literal syntax reference: https://yaml.org/type/float.html
# Dataset manifest paths and the length/ratio thresholds applied to samples.
# NOTE(review): the units of the *_len bounds are not stated in this file
# (presumably seconds for input, tokens for output) -- confirm with the loader.
data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  min_input_len: 0.5
  max_input_len: 30.0
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 100.0

# Batching, vocabulary, and acoustic-feature options for the collator.
collator:
  vocab_filepath: data/vocab.txt
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/augmentation.json
  batch_size: 16
  raw_wav: true  # use raw_wav or kaldi feature
  spectrum_type: fbank  # linear, mfcc, fbank
  feat_dim: 80
  delta_delta: false
  dither: 1.0
  target_sample_rate: 16000
  # NOTE(review): a plain YAML loader reads `None` as the string "None", not
  # as null; presumably the config loader converts it -- confirm before
  # switching these to `null`.
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: true
  target_dB: -20
  random_seed: 0
  keep_transcription_text: false
  sortagrad: true
  shuffle_method: batch_shuffle
  num_workers: 2


# network architecture
model:
  cmvn_file: "data/mean_std.json"
  cmvn_file_type: "json"

  # encoder related
  encoder: conformer
  encoder_conf:
    output_size: 256  # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12  # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    input_layer: conv2d
    normalize_before: true
    use_cnn_module: true
    cnn_module_kernel: 15
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
    causal: true
    use_dynamic_chunk: true
    # using nn.LayerNorm makes model converge faster
    cnn_module_norm: 'layer_norm'
    use_dynamic_left_chunk: false

  # decoder related
  decoder: transformer
  decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

  # hybrid CTC/attention
  model_conf:
    ctc_weight: 0.3
    ctc_dropoutrate: 0.0
    ctc_grad_norm_type: instance
    lsm_weight: 0.1  # label smoothing option
    length_normalized_loss: false


# Optimisation schedule, logging cadence, and checkpoint retention.
training:
  n_epoch: 240
  accum_grad: 8
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
    lr: 0.001
    # Written with an explicit decimal point: YAML 1.1 resolvers (e.g. PyYAML)
    # only recognise exponent floats that contain a '.', so a bare `1e-06`
    # would be loaded as a string rather than a number.
    weight_decay: 1.0e-06
  scheduler: warmuplr  # pytorch v1.1.0+ required
  scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
  log_interval: 100
  checkpoint:
    kbest_n: 50
    latest_n: 5


# Inference-time decoding options.
decoding:
  batch_size: 128
  error_rate_type: wer
  # one of: 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search',
  # 'attention_rescoring'
  decoding_method: attention
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5  # ctc weight for attention rescoring decode mode.
  # decoding chunk size. Defaults to -1.
  #   <0: for decoding, use full chunk.
  #   >0: for decoding, use fixed chunk size as set.
  #   0: used for training, it's prohibited here.
  decoding_chunk_size: -1
  # number of left chunks for decoding. Defaults to -1.
  num_decoding_left_chunks: -1
  simulate_streaming: true  # simulate streaming inference. Defaults to False.