PaddlePaddle / DeepSpeech
Commit 2fa6bbbe
Authored on Apr 07, 2021 by Hui Zhang
Parent: 9626e99c

add u2 config
Showing 10 changed files with 448 additions and 22 deletions (+448, -22):
- examples/tiny/s0/local/download_model.sh (+0, -21)
- examples/tiny/s1/conf/chunk_confermer.yaml (+90, -0)
- examples/tiny/s1/conf/chunk_transformer.yaml (+83, -0)
- examples/tiny/s1/conf/conformer.yaml (+86, -0)
- examples/tiny/s1/conf/transformer.yaml (+80, -0)
- examples/tiny/s1/local/data.sh (+1, -0)
- examples/tiny/s1/local/download_lm_en.sh (+1, -0)
- examples/tiny/s1/path.sh (+14, -0)
- examples/tiny/s1/run.sh (+16, -0)
- tests/u2_model_test.py (+77, -1)
examples/tiny/s0/local/download_model.sh (deleted, mode 100644 → 0)

```bash
#! /usr/bin/env bash

. ${MAIN_ROOT}/utils/utility.sh

DIR=data/pretrain
mkdir -p ${DIR}

URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_model_fluid.tar.gz'
MD5=fafb11fe57c3ecd107147056453f5348
TARGET=${DIR}/librispeech_model_fluid.tar.gz

echo "Download LibriSpeech model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
    echo "Fail to download LibriSpeech model!"
    exit 1
fi

tar -zxvf $TARGET -C ${DIR}

exit 0
```
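The deleted script relies on a `download` helper from `utils/utility.sh` that fetches a URL and verifies its MD5 checksum. For readers without the shell utilities, here is a rough Python equivalent of that pattern; the function's behavior is inferred from the call site `download $URL $MD5 $TARGET`, so treat it as a sketch rather than repo code:

```python
# Sketch of the download-and-verify pattern used above; the real
# `download` helper lives in utils/utility.sh.
import hashlib
import urllib.request

def download(url: str, md5: str, target: str) -> bool:
    """Fetch `url` to `target` and check its MD5 against `md5`."""
    urllib.request.urlretrieve(url, target)
    with open(target, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    return digest == md5
```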
examples/tiny/s1/conf/chunk_confermer.yaml (new file, mode 0 → 100644)

```yaml
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# feature extraction
collate_conf:
    # waveform level config
    wav_distortion_conf:
        wav_dither: 1.0
        wav_distortion_rate: 0.0
        distortion_methods: []
    speed_perturb: true
    feature_extraction_conf:
        feature_type: 'fbank'
        mel_bins: 80
        frame_shift: 10
        frame_length: 25
        using_pitch: false
    # spec level config
    # spec_swap: false
    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
    spec_aug: true
    spec_aug_conf:
        warp_for_time: False
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
        max_w: 80

# dataset related
dataset_conf:
    max_length: 40960
    min_length: 0
    batch_type: 'static' # static or dynamic
    # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB
    batch_size: 16
    sort: true

grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100

optim: adam
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
```

(no newline at end of file)
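These configs are plain YAML, so they can be loaded with yacs's `CfgNode`, the same pattern the updated unit test below uses. A minimal sketch, assuming you run it from the repository root with `yacs` installed (the file-object form of `load_cfg` and the printed values are illustrative):

```python
# Minimal check that a U2 config parses; mirrors the CN().load_cfg
# pattern used in tests/u2_model_test.py later in this commit.
from yacs.config import CfgNode as CN

with open('examples/tiny/s1/conf/chunk_confermer.yaml') as f:
    cfg = CN().load_cfg(f)
cfg.freeze()  # make the config immutable after loading

print(cfg.encoder)                   # 'conformer'
print(cfg.encoder_conf.output_size)  # 256
print(cfg.model_conf.ctc_weight)     # 0.3
```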
examples/tiny/s1/conf/chunk_transformer.yaml (new file, mode 0 → 100644)

```yaml
# network architecture
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true
    use_dynamic_chunk: true
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# feature extraction
collate_conf:
    # waveform level config
    wav_distortion_conf:
        wav_dither: 0.0
        wav_distortion_rate: 0.0
        distortion_methods: []
    speed_perturb: false
    feature_extraction_conf:
        feature_type: 'fbank'
        mel_bins: 80
        frame_shift: 10
        frame_length: 25
        using_pitch: false
    # spec level config
    # spec_swap: false
    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
    spec_aug: true
    spec_aug_conf:
        warp_for_time: False
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
        max_w: 80

# dataset related
dataset_conf:
    max_length: 40960
    min_length: 0
    batch_type: 'static' # static or dynamic
    # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB
    batch_size: 16
    sort: true

grad_clip: 5
accum_grad: 1
max_epoch: 180
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
```

(no newline at end of file)
examples/tiny/s1/conf/conformer.yaml (new file, mode 0 → 100644)

```yaml
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# feature extraction
collate_conf:
    # waveform level config
    wav_distortion_conf:
        wav_dither: 0.1
        wav_distortion_rate: 0.0
        distortion_methods: []
    speed_perturb: true
    feature_extraction_conf:
        feature_type: 'fbank'
        mel_bins: 80
        frame_shift: 10
        frame_length: 25
        using_pitch: false
    # spec level config
    # spec_swap: false
    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
    spec_aug: true
    spec_aug_conf:
        warp_for_time: False
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
        max_w: 80

# dataset related
dataset_conf:
    max_length: 40960
    min_length: 0
    batch_type: 'static' # static or dynamic
    # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB
    batch_size: 16
    sort: true

grad_clip: 5
accum_grad: 4
max_epoch: 240
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
```

(no newline at end of file)
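One detail worth noting in conformer.yaml: it is the only one of the four configs with `accum_grad: 4`, so the optimizer steps once every 4 batches and the effective batch size is larger than `batch_size` suggests. A quick worked check of the arithmetic (illustrative only, not repo code):

```python
# Gradient accumulation arithmetic for conformer.yaml.
batch_size = 16   # utterances per forward/backward pass
accum_grad = 4    # batches accumulated before each optimizer step
effective_batch_size = batch_size * accum_grad
print(effective_batch_size)  # 64 utterances per parameter update
```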
examples/tiny/s1/conf/transformer.yaml (new file, mode 0 → 100644)

```yaml
# network architecture
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# use raw_wav or kaldi feature
raw_wav: true

# feature extraction
collate_conf:
    # waveform level config
    wav_distortion_conf:
        wav_dither: 0.1
        wav_distortion_rate: 0.0
        distortion_methods: []
    speed_perturb: true
    feature_extraction_conf:
        feature_type: 'fbank'
        mel_bins: 80
        frame_shift: 10
        frame_length: 25
        using_pitch: false
    # spec level config
    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
    spec_aug: true
    spec_aug_conf:
        warp_for_time: False
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
        max_w: 80

# dataset related
dataset_conf:
    max_length: 40960
    min_length: 0
    batch_type: 'static' # static or dynamic
    # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB
    batch_size: 26
    sort: true

grad_clip: 5
accum_grad: 1
max_epoch: 240
log_interval: 100

optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
```

(no newline at end of file)
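All four configs schedule the learning rate with `warmuplr` and `warmup_steps: 25000`. The implementation itself is not part of this diff; the sketch below is the Noam-style warmup schedule that this name usually denotes in ESPnet/WeNet-derived code, so treat the exact formula as an assumption:

```python
# Assumed form of the 'warmuplr' schedule (Noam-style warmup):
# lr(step) = base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)
def warmup_lr(step: int, base_lr: float = 0.002, warmup_steps: int = 25000) -> float:
    step = max(step, 1)  # guard against step 0
    return base_lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)

# Rises linearly to base_lr at step == warmup_steps, then decays ~ step**-0.5.
print(warmup_lr(12500))   # mid-warmup: 0.001
print(warmup_lr(25000))   # peak: 0.002
print(warmup_lr(100000))  # decayed: 0.001
```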
examples/tiny/s1/local/data.sh (new symlink, mode 0 → 120000)

Symlink target: ../../s0/local/data.sh
examples/tiny/s1/local/download_lm_en.sh (new symlink, mode 0 → 120000)

Symlink target: ../../s0/local/download_lm_en.sh
examples/tiny/s1/path.sh (new file, mode 0 → 100644)

```bash
export MAIN_ROOT=${PWD}/../../../

export PATH=${MAIN_ROOT}:${PWD}/tools:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
```
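Prepending `MAIN_ROOT` to `PYTHONPATH` is what lets the example scripts resolve `deepspeech.*` modules from the checkout. An illustrative in-process equivalent (the shell script above is the real mechanism; the relative path assumes you start in examples/tiny/s1):

```python
# Illustrative only: Python-side equivalent of path.sh's
# PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}, assuming CWD is examples/tiny/s1.
import os
import sys

main_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..', '..'))
sys.path.insert(0, main_root)
# Now imports such as `from deepspeech.models.u2 import U2TransformerModel`
# resolve against the repository checkout.
```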
examples/tiny/s1/run.sh (new file, mode 0 → 100644)

```bash
#!/bin/bash
set -e

source path.sh

# prepare data
bash ./local/data.sh

# train model
bash ./local/train.sh

# test model
bash ./local/test.sh

# infer model
bash ./local/infer.sh
```
tests/u2_model_test.py (+77, -1)

```diff
@@ -13,8 +13,11 @@
 # limitations under the License.
 import paddle
-import numpy as np
 import unittest
+
+import numpy as np
+from yacs.config import CfgNode as CN
+
 from deepspeech.models.u2 import U2TransformerModel
 from deepspeech.models.u2 import U2ConformerModel
@@ -41,9 +44,82 @@ class TestU2Model(unittest.TestCase):
         self.text_len = paddle.to_tensor(text_len, dtype='int64')
 
+    def test_transformer(self):
+        conf_str = """
+        # network architecture
+        # encoder related
+        encoder: transformer
+        encoder_conf:
+            output_size: 256    # dimension of attention
+            attention_heads: 4
+            linear_units: 2048  # the number of units of position-wise feed forward
+            num_blocks: 12      # the number of encoder blocks
+            dropout_rate: 0.1
+            positional_dropout_rate: 0.1
+            attention_dropout_rate: 0.0
+            input_layer: conv2d # encoder architecture type
+            normalize_before: true
+        # decoder related
+        decoder: transformer
+        decoder_conf:
+            attention_heads: 4
+            linear_units: 2048
+            num_blocks: 6
+            dropout_rate: 0.1
+            positional_dropout_rate: 0.1
+            self_attention_dropout_rate: 0.0
+            src_attention_dropout_rate: 0.0
+        # hybrid CTC/attention
+        model_conf:
+            ctc_weight: 0.3
+            lsm_weight: 0.1     # label smoothing option
+            length_normalized_loss: false
+        """
+        cfg = CN().load_cfg(conf_str)
+        print(cfg)
+        model = U2TransformerModel()
+
+    def test_conformer(self):
+        conf_str = """
+        # network architecture
+        # encoder related
+        encoder: conformer
+        encoder_conf:
+            output_size: 256    # dimension of attention
+            attention_heads: 4
+            linear_units: 2048  # the number of units of position-wise feed forward
+            num_blocks: 12      # the number of encoder blocks
+            dropout_rate: 0.1
+            positional_dropout_rate: 0.1
+            attention_dropout_rate: 0.0
+            input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+            normalize_before: true
+            cnn_module_kernel: 15
+            use_cnn_module: True
+            activation_type: 'swish'
+            pos_enc_layer_type: 'rel_pos'
+            selfattention_layer_type: 'rel_selfattn'
+        # decoder related
+        decoder: transformer
+        decoder_conf:
+            attention_heads: 4
+            linear_units: 2048
+            num_blocks: 6
+            dropout_rate: 0.1
+            positional_dropout_rate: 0.1
+            self_attention_dropout_rate: 0.0
+            src_attention_dropout_rate: 0.0
+        # hybrid CTC/attention
+        model_conf:
+            ctc_weight: 0.3
+            lsm_weight: 0.1     # label smoothing option
+            length_normalized_loss: false
+        """
+        cfg = CN().load_cfg(conf_str)
+        print(cfg)
+        model = U2ConformerModel()
```
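To exercise the new tests, a suggested invocation from the repository root, assuming `paddle`, `yacs`, and the `deepspeech` package are importable (standard unittest usage, not a script shipped in this commit):

```python
# Discover and run tests/u2_model_test.py with the stock unittest runner.
import unittest

suite = unittest.defaultTestLoader.discover('tests', pattern='u2_model_test.py')
unittest.TextTestRunner(verbosity=2).run(suite)
```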