refactor data pipe; fix conf; fix u2 default params

88d82b4a · Hui Zhang · c4df6bac · 88d82b4a · 88d82b4a · 88d82b4a
10 changed files
--- a/deepspeech/exps/u2/config.py
+++ b/deepspeech/exps/u2/config.py
@@ -18,6 +18,7 @@ from deepspeech.exps.u2.model import U2Trainer
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2 import U2Model

+
 _C = CfgNode()

 _C.data = ManifestDataset.params()

--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@@ -148,6 +148,8 @@ class FeatureNormalizer(object):
                          batch_size=64,
                          eps=1e-20):
        """Compute mean and std from randomly sampled instances."""
+        paddle.set_device('cpu')
+        
        collate_func = CollateFunc(featurize_func)
        dataset = AudioDataset(manifest_path, num_samples, self._rng)
        data_loader = DataLoader(

--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -61,12 +61,14 @@ class U2BaseModel(nn.Module):
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        # network architecture
        default = CfgNode()
+        # allow add new item when merge_with_file
+        default.set_new_allowed(True)
        default.cmvn_file = ""
        default.cmvn_file_type = "npz"
        default.input_dim = 0
        default.output_dim = 0
        # encoder related
-        default.encoder = 'conformer'
+        default.encoder = 'transformer'
        default.encoder_conf = CfgNode(
            dict(
                output_size=256,  # dimension of attention
@@ -78,11 +80,12 @@ class U2BaseModel(nn.Module):
                attention_dropout_rate=0.0,
                input_layer='conv2d',  # encoder input type, you can chose conv2d, conv2d6 and conv2d8
                normalize_before=True,
-                cnn_module_kernel=15,
-                use_cnn_module=True,
-                activation_type='swish',
-                pos_enc_layer_type='rel_pos',
-                selfattention_layer_type='rel_selfattn', ))
+                # use_cnn_module=True,
+                # cnn_module_kernel=15,
+                # activation_type='swish',
+                # pos_enc_layer_type='rel_pos',
+                # selfattention_layer_type='rel_selfattn', 
+                ))
        # decoder related
        default.decoder = 'transformer'
        default.decoder_conf = CfgNode(

--- a/examples/librispeech/s1/conf/chunk_confermer.yaml
+++ b/examples/librispeech/s1/conf/chunk_confermer.yaml
 # https://yaml.org/type/float.html
 data:
-  train_manifest: data/manifest.tiny
-  dev_manifest: data/manifest.tiny
-  test_manifest: data/manifest.tiny
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
  vocab_filepath: data/vocab.txt 
  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
+  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/augmentation.json
  batch_size: 4

--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
 # https://yaml.org/type/float.html
 data:
-  train_manifest: data/manifest.tiny
-  dev_manifest: data/manifest.tiny
-  test_manifest: data/manifest.tiny
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
  vocab_filepath: data/vocab.txt 
  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
+  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/augmentation.json
  batch_size: 64

--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
 # https://yaml.org/type/float.html
 data:
-  train_manifest: data/manifest.tiny
-  dev_manifest: data/manifest.tiny
-  test_manifest: data/manifest.tiny
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
  vocab_filepath: data/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_5000'

--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
 # https://yaml.org/type/float.html
 data:
-  train_manifest: data/manifest.tiny
-  dev_manifest: data/manifest.tiny
-  test_manifest: data/manifest.tiny
-  vocab_filepath: data/vocab.txt 
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
+  vocab_filepath: data/vocab.txt
  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
+  spm_model_prefix: 'data/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/augmentation.json
  batch_size: 64

--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@@ -27,8 +27,20 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
        exit 1
    fi

+    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
+        mv data/manifest.${set} data/manifest.${set}.raw
+    done
+
    for set in train-clean-100 train-clean-360 train-other-500; do
-        cat data/manifest.${set} >> data/manifest.train.raw
+        cat data/manifest.${set}.raw >> data/manifest.train.raw
+    done
+
+    for set in dev-clean dev-other; do
+        cat data/manifest.${set}.raw >> data/manifest.dev.raw
+    done
+
+    for set in test-clean test-other; do
+        cat data/manifest.${set}.raw >> data/manifest.test.raw
    done
 fi

@@ -73,20 +85,24 @@ fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
-    python3 ${MAIN_ROOT}/utils/format_data.py \
-    --feat_type "raw" \
-    --cmvn_path "data/mean_std.json" \
-    --unit_type "spm" \
-    --spm_model_prefix ${bpeprefix} \
-    --vocab_path="data/vocab.txt" \
-    --manifest_path="data/manifest.train.raw" \
-    --output_path="data/manifest.train"
-
-
-    if [ $? -ne 0 ]; then
-        echo "Formt mnaifest failed. Terminated."
-        exit 1
-    fi
+    for set in train dev test dev-clean dev-other test-clean test-other; do
+    {
+        python3 ${MAIN_ROOT}/utils/format_data.py \
+        --feat_type "raw" \
+        --cmvn_path "data/mean_std.json" \
+        --unit_type "spm" \
+        --spm_model_prefix ${bpeprefix} \
+        --vocab_path="data/vocab.txt" \
+        --manifest_path="data/manifest.${set}.raw" \
+        --output_path="data/manifest.${set}"
+
+        if [ $? -ne 0 ]; then
+            echo "Formt mnaifest failed. Terminated."
+            exit 1
+        fi
+    }&
+    done
+    wait
 fi

 echo "LibriSpeech Data preparation done."

--- a/examples/librispeech/s1/run.sh
+++ b/examples/librispeech/s1/run.sh
@@ -4,7 +4,8 @@ source path.sh

 stage=0
 stop_stage=100
-ckpt=conformer
+conf_path=conf/transformer.yaml
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 avg_num=30
 avg_ckpt=avg_${avg_num}

@@ -17,7 +18,7 @@ fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh conf/conformer.yaml  ${ckpt}
+    CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path}  ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -27,10 +28,10 @@ fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=7 ./local/test.sh conf/conformer.yaml exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh conf/conformer.yaml exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
--- a/examples/tiny/s1/run.sh
+++ b/examples/tiny/s1/run.sh
@@ -4,7 +4,8 @@ source path.sh

 stage=0
 stop_stage=100
-ckpt=conformer
+conf_path=conf/transformer.yaml
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 avg_num=1
 avg_ckpt=avg_${avg_num}

@@ -17,7 +18,7 @@ fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=0 ./local/train.sh conf/conformer.yaml  ${ckpt}
+    CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path}  ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -27,10 +28,10 @@ fi

 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/conformer.yaml exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh conf/conformer.yaml exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi