diff --git a/examples/aishell/README.md b/examples/aishell/README.md index 82ef91da96f47e99e5589695f8de9776279aece8..a9bba074f2f6bff624976260c847fce8cc6a75f1 100644 --- a/examples/aishell/README.md +++ b/examples/aishell/README.md @@ -1,7 +1,9 @@ # ASR -* s0 for deepspeech2 -* s1 for u2/transformer/conformer +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + ## Data diff --git a/examples/aishell/s0/.gitignore b/examples/aishell/asr0/.gitignore similarity index 100% rename from examples/aishell/s0/.gitignore rename to examples/aishell/asr0/.gitignore diff --git a/examples/aishell/s0/README.md b/examples/aishell/asr0/README.md similarity index 100% rename from examples/aishell/s0/README.md rename to examples/aishell/asr0/README.md diff --git a/examples/aishell/s0/conf/augmentation.json b/examples/aishell/asr0/conf/augmentation.json similarity index 100% rename from examples/aishell/s0/conf/augmentation.json rename to examples/aishell/asr0/conf/augmentation.json diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/aishell/s0/conf/deepspeech2.yaml rename to examples/aishell/asr0/conf/deepspeech2.yaml diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/aishell/s0/conf/deepspeech2_online.yaml rename to examples/aishell/asr0/conf/deepspeech2_online.yaml diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/asr0/local/data.sh similarity index 96% rename from examples/aishell/s0/local/data.sh rename to examples/aishell/asr0/local/data.sh index f4fccbe6e034064935f1d41abd9aebdc0cc7d7ac..23f04f2a640b5cbeb0ac43fbd07877642dee38ae 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/asr0/local/data.sh @@ -32,8 +32,8 @@ if [ ${stage} 
-le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --spectrum_type="linear" \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --sample_rate=16000 \ --use_dB_normalization=True \ --num_samples=2000 \ @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/aishell/s0/local/download_lm_ch.sh b/examples/aishell/asr0/local/download_lm_ch.sh similarity index 100% rename from examples/aishell/s0/local/download_lm_ch.sh rename to examples/aishell/asr0/local/download_lm_ch.sh diff --git a/examples/aishell/s0/local/export.sh b/examples/aishell/asr0/local/export.sh similarity index 100% rename from examples/aishell/s0/local/export.sh rename to examples/aishell/asr0/local/export.sh diff --git a/examples/aishell/s0/local/test.sh b/examples/aishell/asr0/local/test.sh similarity index 100% rename from examples/aishell/s0/local/test.sh rename to examples/aishell/asr0/local/test.sh diff --git a/examples/aishell/s0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh similarity index 100% rename from examples/aishell/s0/local/test_export.sh rename to examples/aishell/asr0/local/test_export.sh diff --git a/examples/aishell/s0/local/test_hub.sh b/examples/aishell/asr0/local/test_hub.sh similarity index 100% rename from examples/aishell/s0/local/test_hub.sh rename to examples/aishell/asr0/local/test_hub.sh diff --git a/examples/aishell/s0/local/train.sh b/examples/aishell/asr0/local/train.sh similarity index 100% rename from examples/aishell/s0/local/train.sh rename to examples/aishell/asr0/local/train.sh diff --git a/examples/aishell/s0/path.sh b/examples/aishell/asr0/path.sh similarity index 100% rename from examples/aishell/s0/path.sh rename to 
examples/aishell/asr0/path.sh diff --git a/examples/aishell/s0/run.sh b/examples/aishell/asr0/run.sh similarity index 100% rename from examples/aishell/s0/run.sh rename to examples/aishell/asr0/run.sh diff --git a/examples/aishell/s1/.gitignore b/examples/aishell/asr1/.gitignore similarity index 100% rename from examples/aishell/s1/.gitignore rename to examples/aishell/asr1/.gitignore diff --git a/examples/aishell/s1/README.md b/examples/aishell/asr1/README.md similarity index 67% rename from examples/aishell/s1/README.md rename to examples/aishell/asr1/README.md index 0096c73e30ec57bb41ddf54508a9d197c31adf3a..8c53f95f67514ad8ba5f9050b4d5e1aa3651ecbc 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/asr1/README.md @@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding. | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 | | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 | + + +## Transformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 | \ No newline at end of file diff --git a/examples/aishell/s1/conf/augmentation.json b/examples/aishell/asr1/conf/augmentation.json similarity index 100% rename from 
examples/aishell/s1/conf/augmentation.json rename to examples/aishell/asr1/conf/augmentation.json diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml similarity index 97% rename from examples/aishell/s1/conf/chunk_conformer.yaml rename to examples/aishell/asr1/conf/chunk_conformer.yaml index 8682538b303e9a4c1a2b0d928a06c4a33454f9e2..336a6c46224d6be6cd0d1846a742de66973f5109 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml similarity index 97% rename from examples/aishell/s1/conf/conformer.yaml rename to examples/aishell/asr1/conf/conformer.yaml index 71cd044ed19dd91ece61673d5c0852a4a0cef4f9..0e9d79d8b25e0a19955db8d7c3f9f79c25501ae8 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c --- /dev/null +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugment + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c021f66b71513b300b98d6f50fcc39573cc85dca --- /dev/null +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -0,0 +1,112 @@ +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.5 + max_input_len: 20.0 # second + min_output_len: 0.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/preprocess.yaml + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + +# network architecture +model: + cmvn_file: + cmvn_file_type: "json" + # encoder related + encoder: transformer + encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + 
linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + ctc_dropoutrate: 0.0 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + + +training: + n_epoch: 120 + accum_grad: 2 + global_grad_clip: 5.0 + optim: adam + optim_conf: + lr: 0.002 + weight_decay: 1e-6 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + + +decoding: + batch_size: 128 + error_rate_type: cer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. 
+ + diff --git a/examples/aishell/s1/local/aishell_train_lms.sh b/examples/aishell/asr1/local/aishell_train_lms.sh similarity index 100% rename from examples/aishell/s1/local/aishell_train_lms.sh rename to examples/aishell/asr1/local/aishell_train_lms.sh diff --git a/examples/aishell/s1/local/align.sh b/examples/aishell/asr1/local/align.sh similarity index 100% rename from examples/aishell/s1/local/align.sh rename to examples/aishell/asr1/local/align.sh diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/asr1/local/data.sh similarity index 96% rename from examples/aishell/s1/local/data.sh rename to examples/aishell/asr1/local/data.sh index 2b9f69ae46c35fbaba51ba7e9629147e053ffdc5..76e28075298c4817e6b553fcb0870cf145fa06f0 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/asr1/local/data.sh @@ -33,8 +33,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=16000 \ --use_dB_normalization=False \ --num_samples=-1 \ @@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/aishell/s1/local/export.sh b/examples/aishell/asr1/local/export.sh similarity index 100% rename from examples/aishell/s1/local/export.sh rename to examples/aishell/asr1/local/export.sh diff --git a/examples/aishell/s1/local/test.sh b/examples/aishell/asr1/local/test.sh similarity index 100% rename from examples/aishell/s1/local/test.sh rename to examples/aishell/asr1/local/test.sh diff --git a/examples/aishell/s1/local/test_hub.sh b/examples/aishell/asr1/local/test_hub.sh similarity index 99% rename from examples/aishell/s1/local/test_hub.sh rename to 
examples/aishell/asr1/local/test_hub.sh index 99b141c8107ddd4d4bf118d90fd6d9b2441d69af..6e78ec784bb8cf1445ebbe83fd39f4c8a441f418 100755 --- a/examples/aishell/s1/local/test_hub.sh +++ b/examples/aishell/asr1/local/test_hub.sh @@ -23,8 +23,6 @@ fi # exit 1 #fi - - for type in attention_rescoring; do echo "decoding ${type}" batch_size=1 diff --git a/examples/aishell/s1/local/tlg.sh b/examples/aishell/asr1/local/tlg.sh similarity index 100% rename from examples/aishell/s1/local/tlg.sh rename to examples/aishell/asr1/local/tlg.sh diff --git a/examples/aishell/s1/local/train.sh b/examples/aishell/asr1/local/train.sh similarity index 100% rename from examples/aishell/s1/local/train.sh rename to examples/aishell/asr1/local/train.sh diff --git a/examples/aishell/s1/path.sh b/examples/aishell/asr1/path.sh similarity index 100% rename from examples/aishell/s1/path.sh rename to examples/aishell/asr1/path.sh diff --git a/examples/aishell/s1/run.sh b/examples/aishell/asr1/run.sh similarity index 100% rename from examples/aishell/s1/run.sh rename to examples/aishell/asr1/run.sh diff --git a/examples/aishell/s1/utils b/examples/aishell/asr1/utils similarity index 100% rename from examples/aishell/s1/utils rename to examples/aishell/asr1/utils diff --git a/examples/callcenter/s1/.gitignore b/examples/callcenter/asr1/.gitignore similarity index 100% rename from examples/callcenter/s1/.gitignore rename to examples/callcenter/asr1/.gitignore diff --git a/examples/callcenter/s1/README.md b/examples/callcenter/asr1/README.md similarity index 100% rename from examples/callcenter/s1/README.md rename to examples/callcenter/asr1/README.md diff --git a/examples/callcenter/s1/conf/augmentation.json b/examples/callcenter/asr1/conf/augmentation.json similarity index 100% rename from examples/callcenter/s1/conf/augmentation.json rename to examples/callcenter/asr1/conf/augmentation.json diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml 
b/examples/callcenter/asr1/conf/chunk_conformer.yaml similarity index 97% rename from examples/callcenter/s1/conf/chunk_conformer.yaml rename to examples/callcenter/asr1/conf/chunk_conformer.yaml index a853658a859c409cb7109e08b4a9c74d4610fe87..b18b46fe6aa0a91476f5b6fcac0c8e03d3745f42 100644 --- a/examples/callcenter/s1/conf/chunk_conformer.yaml +++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml similarity index 97% rename from examples/callcenter/s1/conf/conformer.yaml rename to examples/callcenter/asr1/conf/conformer.yaml index bd4f45788ef039e8bc302936ca4167dfd86c5585..47c438a6d1b2f453500540e102e54d48dbe8cd5f 100644 --- a/examples/callcenter/s1/conf/conformer.yaml +++ b/examples/callcenter/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'char' spm_model_prefix: '' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c --- /dev/null +++ 
b/examples/callcenter/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugment + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/callcenter/s1/local/align.sh b/examples/callcenter/asr1/local/align.sh similarity index 100% rename from examples/callcenter/s1/local/align.sh rename to examples/callcenter/asr1/local/align.sh diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/asr1/local/data.sh similarity index 96% rename from examples/callcenter/s1/local/data.sh rename to examples/callcenter/asr1/local/data.sh index 634bb8d0eb24c660fb17ba80fa78082192b33f03..c40c752abc981ee8354d32d2ad99c1326173bcf8 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/asr1/local/data.sh @@ -21,8 +21,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --sample_rate=8000 \ --use_dB_normalization=False \ --num_samples=-1 \ @@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/callcenter/s1/local/download_lm_ch.sh b/examples/callcenter/asr1/local/download_lm_ch.sh similarity index 100% rename from examples/callcenter/s1/local/download_lm_ch.sh rename to examples/callcenter/asr1/local/download_lm_ch.sh diff --git 
a/examples/callcenter/s1/local/export.sh b/examples/callcenter/asr1/local/export.sh similarity index 100% rename from examples/callcenter/s1/local/export.sh rename to examples/callcenter/asr1/local/export.sh diff --git a/examples/callcenter/s1/local/test.sh b/examples/callcenter/asr1/local/test.sh similarity index 100% rename from examples/callcenter/s1/local/test.sh rename to examples/callcenter/asr1/local/test.sh diff --git a/examples/callcenter/s1/local/train.sh b/examples/callcenter/asr1/local/train.sh similarity index 100% rename from examples/callcenter/s1/local/train.sh rename to examples/callcenter/asr1/local/train.sh diff --git a/examples/callcenter/s1/path.sh b/examples/callcenter/asr1/path.sh similarity index 100% rename from examples/callcenter/s1/path.sh rename to examples/callcenter/asr1/path.sh diff --git a/examples/callcenter/s1/run.sh b/examples/callcenter/asr1/run.sh similarity index 100% rename from examples/callcenter/s1/run.sh rename to examples/callcenter/asr1/run.sh diff --git a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py index e32f619e90eaded75aa465fa8bc2ae39b6e77486..85f478c20ddba18d40c175c724370b81c93e46d0 100644 --- a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py +++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -79,6 +80,7 @@ def create_manifest(data_dir, manifest_path_prefix): audio_path = os.path.abspath(os.path.join(subfolder, fname)) audio_id = os.path.basename(fname)[:-4] + utt2spk = Path(audio_path).parent.name audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) @@ -87,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': str(utt2spk), 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text, diff --git 
a/examples/dataset/aishell/aishell.py b/examples/dataset/aishell/aishell.py index 66e0690138a91d3fd465547a71fcde6d1922b3ef..95ed040860a452a0fcdbcf4321e6c4c94c110a4c 100644 --- a/examples/dataset/aishell/aishell.py +++ b/examples/dataset/aishell/aishell.py @@ -22,6 +22,7 @@ import argparse import codecs import json import os +from pathlib import Path import soundfile @@ -81,6 +82,8 @@ def create_manifest(data_dir, manifest_path_prefix): # if no transcription for audio then skipped if audio_id not in transcript_dict: continue + + utt2spk = Path(audio_path).parent.name audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) text = transcript_dict[audio_id] @@ -88,6 +91,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': str(utt2spk), 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py index e85bbb3aa44d0bd775698ac0c79ce5b690f83a00..69f0db599e12d0a482a8d7783eb85ce9e04c744d 100644 --- a/examples/dataset/librispeech/librispeech.py +++ b/examples/dataset/librispeech/librispeech.py @@ -78,7 +78,7 @@ def create_manifest(data_dir, manifest_path): print("Creating manifest %s ..." 
% manifest_path) json_lines = [] total_sec = 0.0 - total_text = 0.0 + total_char = 0.0 total_num = 0 for subfolder, _, filelist in sorted(os.walk(data_dir)): @@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path): text_filepath = os.path.join(subfolder, text_filelist[0]) for line in io.open(text_filepath, encoding="utf8"): segments = line.strip().split() + nchars = len(segments[1:]) text = ' '.join(segments[1:]).lower() audio_filepath = os.path.abspath( os.path.join(subfolder, segments[0] + '.flac')) audio_data, samplerate = soundfile.read(audio_filepath) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) + json_lines.append( json.dumps({ - 'utt': - os.path.splitext(os.path.basename(audio_filepath))[0], - 'feat': - audio_filepath, - 'feat_shape': (duration, ), #second - 'text': - text + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, + 'feat_shape': (duration, ), # second + 'text': text, })) total_sec += duration - total_text += len(text) + total_char += nchars total_num += 1 with codecs.open(manifest_path, 'w', 'utf-8') as out_file: @@ -122,8 +125,8 @@ def create_manifest(data_dir, manifest_path): print(f"{subset}:", file=f) print(f"{total_num} utts", file=f) print(f"{total_sec / (60*60)} h", file=f) - print(f"{total_text} text", file=f) - print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_char} char", file=f) + print(f"{total_char / total_sec} char/sec", file=f) print(f"{total_sec / total_num} sec/utt", file=f) diff --git a/examples/dataset/mini_librispeech/mini_librispeech.py b/examples/dataset/mini_librispeech/mini_librispeech.py index 65fee81a70bd96e01d0cf119c5081179230e5708..730c73a8b4dc44691351717de1bfe918f3b957ac 100644 --- a/examples/dataset/mini_librispeech/mini_librispeech.py +++ b/examples/dataset/mini_librispeech/mini_librispeech.py @@ -74,15 +74,16 @@ def create_manifest(data_dir, manifest_path): audio_filepath = 
os.path.join(subfolder, segments[0] + '.flac') audio_data, samplerate = soundfile.read(audio_filepath) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) json_lines.append( json.dumps({ - 'utt': - os.path.splitext(os.path.basename(audio_filepath))[0], - 'feat': - audio_filepath, + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, 'feat_shape': (duration, ), #second - 'text': - text + 'text': text, })) total_sec += duration diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py index 14bef01d2b129a04dc0aac21765321893c470ac8..a8cbb83793710d9971ff320d6968b743a13d5df1 100644 --- a/examples/dataset/ted_en_zh/ted_en_zh.py +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix): continue audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) + + + translation_str = " ".join(translation.split()) + trancription_str = " ".join(trancription.split()) json_lines.append( json.dumps( { 'utt': utt, 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': " ".join(translation.split()), - 'text1': " ".join(trancription.split()) + 'text': [translation_str, trancription_str], }, ensure_ascii=False)) diff --git a/examples/dataset/thchs30/thchs30.py b/examples/dataset/thchs30/thchs30.py index 77a264cbba1171a027e6548a6176d5d9822515b5..2ec4ddab29b1cca3a586269eabee4d78d4d9220e 100644 --- a/examples/dataset/thchs30/thchs30.py +++ b/examples/dataset/thchs30/thchs30.py @@ -113,6 +113,8 @@ def create_manifest(data_dir, manifest_path_prefix): assert os.path.exists(audio_path) and os.path.exists(text_path) audio_id = os.path.basename(audio_path)[:-4] + spk = audio_id.split('_')[0] + word_text, syllable_text, phone_text = read_trn(text_path) audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / 
samplerate) @@ -122,6 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': audio_id, + 'utt2spk': spk, 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': word_text, # charactor diff --git a/examples/dataset/timit/timit.py b/examples/dataset/timit/timit.py index 311d445cb3a1e5123889d50b5028ca1aeb85ca19..c4a9f06631809bd4ca1d72755576d631f8590055 100644 --- a/examples/dataset/timit/timit.py +++ b/examples/dataset/timit/timit.py @@ -180,12 +180,12 @@ def create_manifest(data_dir, manifest_path_prefix): json.dumps( { 'utt': utt_id, + 'utt2spk': spk, + 'utt2gender': gender, 'feat': str(audio_path), 'feat_shape': (duration, ), # second 'text': word_text, # word 'phone': phone_text, - 'spk': spk, - 'gender': gender, }, ensure_ascii=False)) diff --git a/examples/dataset/timit/timit_kaldi_standard_split.py b/examples/dataset/timit/timit_kaldi_standard_split.py index 2b494c06db990e8c932cc28a5a974cddc9e4e943..26aa76c72d65d29e11501ef6ca1003191e3a41d0 100644 --- a/examples/dataset/timit/timit_kaldi_standard_split.py +++ b/examples/dataset/timit/timit_kaldi_standard_split.py @@ -24,6 +24,7 @@ import json import os import soundfile +from pathlib import Path parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( @@ -67,10 +68,17 @@ def create_manifest(data_dir, manifest_path_prefix): audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) text = phn_dict[audio_id] + + gender_spk = str(Path(audio_path).parent.stem) + spk = gender_spk[1:] + gender = gender_spk[0] + utt_id = '_'.join([spk, gender, audio_id]) json_lines.append( json.dumps( { 'utt': audio_id, + 'utt2spk': spk, + 'utt2gender': gender, 'feat': audio_path, 'feat_shape': (duration, ), # second 'text': text diff --git a/examples/dataset/voxforge/voxforge.py b/examples/dataset/voxforge/voxforge.py index 36282bd609f372aec47080686c102b7f81b02286..373791bffe04114a51d89f6bf84c6dde504be84c 100644 --- 
a/examples/dataset/voxforge/voxforge.py +++ b/examples/dataset/voxforge/voxforge.py @@ -175,9 +175,12 @@ def generate_manifest(data_dir, manifest_path): audio_data, samplerate = soundfile.read(u) duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(u))[0] json_lines.append( json.dumps({ - 'utt': os.path.splitext(os.path.basename(u))[0], + 'utt': utt, + 'utt2spk': speaker, 'feat': u, 'feat_shape': (duration, ), #second 'text': trans.lower() diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md index 5943cf1d7884de4fa8bc39a04e6e8651f32b0a2c..74441fd0915d6cf91473659d2f973c0de60af34e 100644 --- a/examples/librispeech/README.md +++ b/examples/librispeech/README.md @@ -1,8 +1,9 @@ # ASR -* s0 is for deepspeech2 offline -* s1 is for transformer/conformer/U2 -* s2 is for transformer/conformer/U2 w/ kaldi feat, need install Kaldi +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + ## Data | Data Subset | Duration in Seconds | diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/asr0/README.md similarity index 100% rename from examples/librispeech/s0/README.md rename to examples/librispeech/asr0/README.md diff --git a/examples/librispeech/s0/conf/augmentation.json b/examples/librispeech/asr0/conf/augmentation.json similarity index 100% rename from examples/librispeech/s0/conf/augmentation.json rename to examples/librispeech/asr0/conf/augmentation.json diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/librispeech/s0/conf/deepspeech2.yaml rename to examples/librispeech/asr0/conf/deepspeech2.yaml diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from 
examples/librispeech/s0/conf/deepspeech2_online.yaml rename to examples/librispeech/asr0/conf/deepspeech2_online.yaml diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/asr0/local/data.sh similarity index 97% rename from examples/librispeech/s0/local/data.sh rename to examples/librispeech/asr0/local/data.sh index fd2b0c0138dceb8352344b3b7beb9b19a2f94c40..0f276cecad316bcb444fddd36df988c0618b7152 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/asr0/local/data.sh @@ -50,8 +50,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + --stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=True \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/librispeech/s0/local/download_lm_en.sh b/examples/librispeech/asr0/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s0/local/download_lm_en.sh rename to examples/librispeech/asr0/local/download_lm_en.sh diff --git a/examples/librispeech/s0/local/export.sh b/examples/librispeech/asr0/local/export.sh similarity index 100% rename from examples/librispeech/s0/local/export.sh rename to examples/librispeech/asr0/local/export.sh diff --git a/examples/librispeech/s0/local/test.sh b/examples/librispeech/asr0/local/test.sh similarity index 100% rename from examples/librispeech/s0/local/test.sh rename to examples/librispeech/asr0/local/test.sh diff --git a/examples/librispeech/s0/local/test_hub.sh b/examples/librispeech/asr0/local/test_hub.sh similarity index 100% rename from 
examples/librispeech/s0/local/test_hub.sh rename to examples/librispeech/asr0/local/test_hub.sh diff --git a/examples/librispeech/s0/local/train.sh b/examples/librispeech/asr0/local/train.sh similarity index 100% rename from examples/librispeech/s0/local/train.sh rename to examples/librispeech/asr0/local/train.sh diff --git a/examples/librispeech/s0/path.sh b/examples/librispeech/asr0/path.sh similarity index 100% rename from examples/librispeech/s0/path.sh rename to examples/librispeech/asr0/path.sh diff --git a/examples/librispeech/s0/run.sh b/examples/librispeech/asr0/run.sh similarity index 100% rename from examples/librispeech/s0/run.sh rename to examples/librispeech/asr0/run.sh diff --git a/examples/librispeech/s1/.gitignore b/examples/librispeech/asr1/.gitignore similarity index 100% rename from examples/librispeech/s1/.gitignore rename to examples/librispeech/asr1/.gitignore diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/asr1/README.md similarity index 74% rename from examples/librispeech/s1/README.md rename to examples/librispeech/asr1/README.md index b7ec93ebec92020d9089233a218dbb2143f40c37..20255db8e9be1fff1361eda8670947d853c2382b 100644 --- a/examples/librispeech/s1/README.md +++ b/examples/librispeech/asr1/README.md @@ -21,7 +21,7 @@ ## Transformer | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 | -| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 | +| transformer | 32.52 M | 
conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 | +| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 | \ No newline at end of file diff --git a/examples/librispeech/s1/cmd.sh b/examples/librispeech/asr1/cmd.sh similarity index 100% rename from examples/librispeech/s1/cmd.sh rename to examples/librispeech/asr1/cmd.sh diff --git a/examples/librispeech/s1/conf/augmentation.json b/examples/librispeech/asr1/conf/augmentation.json similarity index 100% rename from examples/librispeech/s1/conf/augmentation.json rename to examples/librispeech/asr1/conf/augmentation.json diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/chunk_conformer.yaml rename to examples/librispeech/asr1/conf/chunk_conformer.yaml index 4d0e6ceb1c3ec629383a634b727b00bb09623be4..2bfb0fb6f7b939c69372cd6be0bc676edcf92880 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml 
b/examples/librispeech/asr1/conf/chunk_transformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/chunk_transformer.yaml rename to examples/librispeech/asr1/conf/chunk_transformer.yaml index c7b53f95bd6b0a7d67acaf339972fd09f52ab2b8..fe533777630a3aa1c26577492457ff2e4c06c848 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/conformer.yaml rename to examples/librispeech/asr1/conf/conformer.yaml index 3bc942dc064ff8958280bf90ba7d6fdb8180bd94..c844baaafb3ab4c55a95baa7f4b2a43c9ec40f7c 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 16 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: conformer diff --git a/examples/librispeech/asr1/conf/preprocess.yaml b/examples/librispeech/asr1/conf/preprocess.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..021ca4c58022f696c5218dedfe20e7244f09bd7f --- /dev/null +++ b/examples/librispeech/asr1/conf/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugment + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml similarity index 97% rename from examples/librispeech/s1/conf/transformer.yaml rename to examples/librispeech/asr1/conf/transformer.yaml index 3cc17004c0ac103efd16e5d4899b910805faeda5..5a158f3ed69ee90f2936cff5016937f7b20932b7 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/asr1/conf/transformer.yaml @@ -15,7 +15,7 @@ collator: unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_5000' mean_std_filepath: "" - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 32 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -38,7 +38,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/librispeech/s1/local/align.sh b/examples/librispeech/asr1/local/align.sh similarity index 100% rename from examples/librispeech/s1/local/align.sh rename to examples/librispeech/asr1/local/align.sh diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/asr1/local/data.sh similarity index 66% rename from examples/librispeech/s1/local/data.sh rename to examples/librispeech/asr1/local/data.sh index 
56fec8463c9c27eda2853ac83d5b8d030942d8cf..35f4e635fa26b99f89498c761dc1e6906a24899a 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/asr1/local/data.sh @@ -8,6 +8,11 @@ nbpe=5000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" +stride_ms=10 +window_ms=25 +sample_rate=16000 +feat_dim=80 + source ${MAIN_ROOT}/utils/parse_options.sh @@ -27,21 +32,21 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do - mv data/manifest.${set} data/manifest.${set}.raw + for sub in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do + mv data/manifest.${sub} data/manifest.${sub}.raw done rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw - for set in train-clean-100 train-clean-360 train-other-500; do - cat data/manifest.${set}.raw >> data/manifest.train.raw + for sub in train-clean-100 train-clean-360 train-other-500; do + cat data/manifest.${sub}.raw >> data/manifest.train.raw done - for set in dev-clean dev-other; do - cat data/manifest.${set}.raw >> data/manifest.dev.raw + for sub in dev-clean dev-other; do + cat data/manifest.${sub}.raw >> data/manifest.dev.raw done - for set in test-clean test-other; do - cat data/manifest.${set}.raw >> data/manifest.test.raw + for sub in test-clean test-other; do + cat data/manifest.${sub}.raw >> data/manifest.test.raw done fi @@ -52,11 +57,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ --spectrum_type="fbank" \ - --feat_dim=80 \ + --feat_dim=${feat_dim} \ --delta_delta=false \ - --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --sample_rate=${sample_rate} \ + --stride_ms=${stride_ms} \ + --window_ms=${window_ms} \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -85,16 +90,15 @@ 
fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size - for set in train dev test dev-clean dev-other test-clean test-other; do + for sub in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ --vocab_path="data/vocab.txt" \ - --manifest_path="data/manifest.${set}.raw" \ - --output_path="data/manifest.${set}" + --manifest_path="data/manifest.${sub}.raw" \ + --output_path="data/manifest.${sub}" if [ $? -ne 0 ]; then echo "Formt mnaifest failed. Terminated." @@ -103,6 +107,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then }& done wait + + for sub in train dev; do + mv data/manifest.${sub} data/manifest.${sub}.fmt + done +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + for sub in train dev; do + remove_longshortdata.py --maxframes 3000 --maxchars 400 --stride_ms ${stride_ms} data/manifest.${sub}.fmt data/manifest.${sub} + done fi echo "LibriSpeech Data preparation done." 
diff --git a/examples/librispeech/s1/local/download_lm_en.sh b/examples/librispeech/asr1/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s1/local/download_lm_en.sh rename to examples/librispeech/asr1/local/download_lm_en.sh diff --git a/examples/librispeech/s1/local/export.sh b/examples/librispeech/asr1/local/export.sh similarity index 100% rename from examples/librispeech/s1/local/export.sh rename to examples/librispeech/asr1/local/export.sh diff --git a/examples/librispeech/s1/local/test.sh b/examples/librispeech/asr1/local/test.sh similarity index 100% rename from examples/librispeech/s1/local/test.sh rename to examples/librispeech/asr1/local/test.sh diff --git a/examples/librispeech/s1/local/test_hub.sh b/examples/librispeech/asr1/local/test_hub.sh similarity index 100% rename from examples/librispeech/s1/local/test_hub.sh rename to examples/librispeech/asr1/local/test_hub.sh diff --git a/examples/librispeech/s1/local/train.sh b/examples/librispeech/asr1/local/train.sh similarity index 100% rename from examples/librispeech/s1/local/train.sh rename to examples/librispeech/asr1/local/train.sh diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/asr1/path.sh similarity index 100% rename from examples/librispeech/s1/path.sh rename to examples/librispeech/asr1/path.sh diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/asr1/run.sh similarity index 100% rename from examples/librispeech/s1/run.sh rename to examples/librispeech/asr1/run.sh diff --git a/examples/librispeech/s1/utils b/examples/librispeech/asr1/utils similarity index 100% rename from examples/librispeech/s1/utils rename to examples/librispeech/asr1/utils diff --git a/examples/librispeech/s2/.gitignore b/examples/librispeech/asr2/.gitignore similarity index 100% rename from examples/librispeech/s2/.gitignore rename to examples/librispeech/asr2/.gitignore diff --git a/examples/librispeech/s2/README.md b/examples/librispeech/asr2/README.md 
similarity index 100% rename from examples/librispeech/s2/README.md rename to examples/librispeech/asr2/README.md diff --git a/examples/librispeech/s2/cmd.sh b/examples/librispeech/asr2/cmd.sh similarity index 100% rename from examples/librispeech/s2/cmd.sh rename to examples/librispeech/asr2/cmd.sh diff --git a/examples/librispeech/s2/conf/augmentation.json b/examples/librispeech/asr2/conf/augmentation.json similarity index 100% rename from examples/librispeech/s2/conf/augmentation.json rename to examples/librispeech/asr2/conf/augmentation.json diff --git a/examples/librispeech/s2/conf/decode/decode.yaml b/examples/librispeech/asr2/conf/decode/decode.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode.yaml rename to examples/librispeech/asr2/conf/decode/decode.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_att.yaml b/examples/librispeech/asr2/conf/decode/decode_att.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_att.yaml rename to examples/librispeech/asr2/conf/decode/decode_att.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_ctc.yaml b/examples/librispeech/asr2/conf/decode/decode_ctc.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_ctc.yaml rename to examples/librispeech/asr2/conf/decode/decode_ctc.yaml diff --git a/examples/librispeech/s2/conf/decode/decode_wo_lm.yaml b/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml similarity index 100% rename from examples/librispeech/s2/conf/decode/decode_wo_lm.yaml rename to examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml diff --git a/examples/librispeech/s2/conf/fbank.conf b/examples/librispeech/asr2/conf/fbank.conf similarity index 100% rename from examples/librispeech/s2/conf/fbank.conf rename to examples/librispeech/asr2/conf/fbank.conf diff --git a/examples/librispeech/s2/conf/lm/transformer.yaml b/examples/librispeech/asr2/conf/lm/transformer.yaml similarity index 100% rename from 
examples/librispeech/s2/conf/lm/transformer.yaml rename to examples/librispeech/asr2/conf/lm/transformer.yaml diff --git a/examples/librispeech/s2/conf/pitch.conf b/examples/librispeech/asr2/conf/pitch.conf similarity index 100% rename from examples/librispeech/s2/conf/pitch.conf rename to examples/librispeech/asr2/conf/pitch.conf diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml similarity index 100% rename from examples/librispeech/s2/conf/transformer.yaml rename to examples/librispeech/asr2/conf/transformer.yaml diff --git a/examples/librispeech/s2/local/align.sh b/examples/librispeech/asr2/local/align.sh similarity index 100% rename from examples/librispeech/s2/local/align.sh rename to examples/librispeech/asr2/local/align.sh diff --git a/examples/librispeech/s2/local/cacu_perplexity.sh b/examples/librispeech/asr2/local/cacu_perplexity.sh similarity index 100% rename from examples/librispeech/s2/local/cacu_perplexity.sh rename to examples/librispeech/asr2/local/cacu_perplexity.sh diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/asr2/local/data.sh similarity index 100% rename from examples/librispeech/s2/local/data.sh rename to examples/librispeech/asr2/local/data.sh diff --git a/examples/librispeech/s2/local/data_prep.sh b/examples/librispeech/asr2/local/data_prep.sh similarity index 100% rename from examples/librispeech/s2/local/data_prep.sh rename to examples/librispeech/asr2/local/data_prep.sh diff --git a/examples/librispeech/s2/local/download_lm_en.sh b/examples/librispeech/asr2/local/download_lm_en.sh similarity index 100% rename from examples/librispeech/s2/local/download_lm_en.sh rename to examples/librispeech/asr2/local/download_lm_en.sh diff --git a/examples/librispeech/s2/local/espnet_json_to_manifest.py b/examples/librispeech/asr2/local/espnet_json_to_manifest.py similarity index 100% rename from examples/librispeech/s2/local/espnet_json_to_manifest.py rename to 
examples/librispeech/asr2/local/espnet_json_to_manifest.py diff --git a/examples/librispeech/s2/local/export.sh b/examples/librispeech/asr2/local/export.sh similarity index 100% rename from examples/librispeech/s2/local/export.sh rename to examples/librispeech/asr2/local/export.sh diff --git a/examples/librispeech/s2/local/recog.sh b/examples/librispeech/asr2/local/recog.sh similarity index 100% rename from examples/librispeech/s2/local/recog.sh rename to examples/librispeech/asr2/local/recog.sh diff --git a/examples/librispeech/s2/local/test.sh b/examples/librispeech/asr2/local/test.sh similarity index 100% rename from examples/librispeech/s2/local/test.sh rename to examples/librispeech/asr2/local/test.sh diff --git a/examples/librispeech/s2/local/train.sh b/examples/librispeech/asr2/local/train.sh similarity index 100% rename from examples/librispeech/s2/local/train.sh rename to examples/librispeech/asr2/local/train.sh diff --git a/examples/librispeech/s2/path.sh b/examples/librispeech/asr2/path.sh similarity index 100% rename from examples/librispeech/s2/path.sh rename to examples/librispeech/asr2/path.sh diff --git a/examples/librispeech/s2/run.sh b/examples/librispeech/asr2/run.sh similarity index 100% rename from examples/librispeech/s2/run.sh rename to examples/librispeech/asr2/run.sh diff --git a/examples/librispeech/s2/steps b/examples/librispeech/asr2/steps similarity index 100% rename from examples/librispeech/s2/steps rename to examples/librispeech/asr2/steps diff --git a/examples/librispeech/s2/utils b/examples/librispeech/asr2/utils similarity index 100% rename from examples/librispeech/s2/utils rename to examples/librispeech/asr2/utils diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh index 0bf35e1f582c424c4634d395d42867b7775673ad..85574260b023f8559c16869666a34b7f42599679 100755 --- a/examples/other/1xt2x/aishell/local/data.sh +++ b/examples/other/1xt2x/aishell/local/data.sh @@ -50,7 +50,6 @@ if [ 
${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for dataset in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type "char" \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh index f0bde77fe4ec4d0abb25634a28bb4bdfae91766f..8e378ff053ba78c970a50ca5d938975f3464f50f 100755 --- a/examples/other/1xt2x/baidu_en8k/local/data.sh +++ b/examples/other/1xt2x/baidu_en8k/local/data.sh @@ -65,7 +65,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh index 6f9bc5566cbdb1d482fc00988e87f1a15c2b3647..7387472d53cd0cb6ed6a73eb9eb8e5d3ba6685aa 100755 --- a/examples/other/1xt2x/librispeech/local/data.sh +++ b/examples/other/1xt2x/librispeech/local/data.sh @@ -63,7 +63,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test dev-clean dev-other test-clean test-other; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.npz" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/ted_en_zh/README.md b/examples/ted_en_zh/README.md index 5664b06b31a299b662df6d87e07222ecae1d1d5f..6d6886daf11089168e8272cd0100b1edeb79d3ee 100644 --- a/examples/ted_en_zh/README.md +++ b/examples/ted_en_zh/README.md @@ -1,3 +1,3 @@ # TED En -> Zh -* t0 for u2 speech translation +* st0 - conformer/transformer speech translation diff --git a/examples/ted_en_zh/st0/.gitignore b/examples/ted_en_zh/st0/.gitignore new file mode 100644 index 
0000000000000000000000000000000000000000..469c61715f4dc74da526f3a4a5a5e2a6d287d716 --- /dev/null +++ b/examples/ted_en_zh/st0/.gitignore @@ -0,0 +1,3 @@ +TED-En-Zh +data +exp diff --git a/examples/ted_en_zh/t0/README.md b/examples/ted_en_zh/st0/README.md similarity index 100% rename from examples/ted_en_zh/t0/README.md rename to examples/ted_en_zh/st0/README.md diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml similarity index 100% rename from examples/ted_en_zh/t0/conf/transformer.yaml rename to examples/ted_en_zh/st0/conf/transformer.yaml diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml similarity index 100% rename from examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml rename to examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/st0/local/data.sh similarity index 91% rename from examples/ted_en_zh/t0/local/data.sh rename to examples/ted_en_zh/st0/local/data.sh index b080a5b497e703c6b2f1c2d385315f771b33a1a0..d3acbd4486b3753e70fe7d0c3f71b4f1b3576583 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/st0/local/data.sh @@ -9,7 +9,7 @@ stop_stage=100 nbpe=8000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" -data_dir=./TED_EnZh +data_dir=./TED-En-Zh source ${MAIN_ROOT}/utils/parse_options.sh @@ -21,7 +21,7 @@ mkdir -p data if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ! -e ${data_dir} ]; then - echo "Error: Dataset is not avaiable. Please download and unzip the dataset" + echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset" echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0" echo "The tree of the directory should be:" echo "." 
@@ -54,8 +54,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -88,8 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size for set in train dev test; do { - python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ - --feat_type "raw" \ + python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/ted_en_zh/t0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh similarity index 100% rename from examples/ted_en_zh/t0/local/test.sh rename to examples/ted_en_zh/st0/local/test.sh diff --git a/examples/ted_en_zh/t0/local/train.sh b/examples/ted_en_zh/st0/local/train.sh similarity index 100% rename from examples/ted_en_zh/t0/local/train.sh rename to examples/ted_en_zh/st0/local/train.sh diff --git a/examples/ted_en_zh/t0/path.sh b/examples/ted_en_zh/st0/path.sh similarity index 100% rename from examples/ted_en_zh/t0/path.sh rename to examples/ted_en_zh/st0/path.sh diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/st0/run.sh similarity index 93% rename from examples/ted_en_zh/t0/run.sh rename to examples/ted_en_zh/st0/run.sh index ed9ab5f87506aeec6b0edf9ad3f2b4299bb9647c..fb4bc33880b0013d05e19c734554a51348b6a484 100755 --- a/examples/ted_en_zh/t0/run.sh +++ b/examples/ted_en_zh/st0/run.sh @@ -22,7 +22,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then diff --git a/examples/ted_en_zh/t0/.gitignore 
b/examples/ted_en_zh/t0/.gitignore deleted file mode 100644 index 123e5174a4e676b2ac3e616673dac984e958c4b5..0000000000000000000000000000000000000000 --- a/examples/ted_en_zh/t0/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -TED_EnZh -data -exp diff --git a/examples/thchs30/README.md b/examples/thchs30/README.md index 7b3cc3d9547f011610bd823ffc1c8229a926c33f..9a0026a0f0c212b8a0b2b5cd8ec6c2cd6f6bb115 100644 --- a/examples/thchs30/README.md +++ b/examples/thchs30/README.md @@ -1,3 +1,3 @@ # thchs30 -* a0 for mfa alignment +* align0 - mfa alignment diff --git a/examples/thchs30/a0/README.md b/examples/thchs30/align0/README.md similarity index 100% rename from examples/thchs30/a0/README.md rename to examples/thchs30/align0/README.md diff --git a/examples/thchs30/a0/data/dict/syllable.lexicon b/examples/thchs30/align0/data/dict/syllable.lexicon similarity index 100% rename from examples/thchs30/a0/data/dict/syllable.lexicon rename to examples/thchs30/align0/data/dict/syllable.lexicon diff --git a/examples/thchs30/a0/local/data.sh b/examples/thchs30/align0/local/data.sh similarity index 100% rename from examples/thchs30/a0/local/data.sh rename to examples/thchs30/align0/local/data.sh diff --git a/examples/thchs30/a0/local/gen_word2phone.py b/examples/thchs30/align0/local/gen_word2phone.py similarity index 100% rename from examples/thchs30/a0/local/gen_word2phone.py rename to examples/thchs30/align0/local/gen_word2phone.py diff --git a/examples/thchs30/a0/local/reorganize_thchs30.py b/examples/thchs30/align0/local/reorganize_thchs30.py similarity index 100% rename from examples/thchs30/a0/local/reorganize_thchs30.py rename to examples/thchs30/align0/local/reorganize_thchs30.py diff --git a/examples/thchs30/a0/path.sh b/examples/thchs30/align0/path.sh similarity index 100% rename from examples/thchs30/a0/path.sh rename to examples/thchs30/align0/path.sh diff --git a/examples/thchs30/a0/run.sh b/examples/thchs30/align0/run.sh similarity index 100% rename from 
examples/thchs30/a0/run.sh rename to examples/thchs30/align0/run.sh diff --git a/examples/timit/README.md b/examples/timit/README.md index b7c8b754521a1cbc9a8535e6f5c67854c68f8ba1..778398748d6acb5dfbcb9a63ebd32c99a50a8b02 100644 --- a/examples/timit/README.md +++ b/examples/timit/README.md @@ -1,3 +1,7 @@ # TIMIT -* s1 u2 model with phone unit +asr model with phone unit + +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature \ No newline at end of file diff --git a/examples/timit/s1/.gitignore b/examples/timit/asr1/.gitignore similarity index 100% rename from examples/timit/s1/.gitignore rename to examples/timit/asr1/.gitignore diff --git a/examples/timit/s1/README.md b/examples/timit/asr1/README.md similarity index 100% rename from examples/timit/s1/README.md rename to examples/timit/asr1/README.md diff --git a/examples/timit/s1/conf/augmentation.json b/examples/timit/asr1/conf/augmentation.json similarity index 100% rename from examples/timit/s1/conf/augmentation.json rename to examples/timit/asr1/conf/augmentation.json diff --git a/examples/timit/s1/conf/dev_spk.list b/examples/timit/asr1/conf/dev_spk.list similarity index 100% rename from examples/timit/s1/conf/dev_spk.list rename to examples/timit/asr1/conf/dev_spk.list diff --git a/examples/timit/s1/conf/phones.60-48-39.map b/examples/timit/asr1/conf/phones.60-48-39.map similarity index 100% rename from examples/timit/s1/conf/phones.60-48-39.map rename to examples/timit/asr1/conf/phones.60-48-39.map diff --git a/examples/timit/asr1/conf/preprocess.yaml b/examples/timit/asr1/conf/preprocess.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c --- /dev/null +++ b/examples/timit/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + 
win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugment + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/timit/s1/conf/test_spk.list b/examples/timit/asr1/conf/test_spk.list similarity index 100% rename from examples/timit/s1/conf/test_spk.list rename to examples/timit/asr1/conf/test_spk.list diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml similarity index 97% rename from examples/timit/s1/conf/transformer.yaml rename to examples/timit/asr1/conf/transformer.yaml index d3ced898ef4b496d10c2410c6528179099c0ad14..1d18468b80025b5ced93c08db0e7f38acc2eb937 100644 --- a/examples/timit/s1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -14,7 +14,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: "word" mean_std_filepath: "" - augmentation_config: "" + augmentation_config: conf/preprocess.yaml batch_size: 64 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/timit/s1/local/align.sh b/examples/timit/asr1/local/align.sh similarity index 100% rename from examples/timit/s1/local/align.sh rename to examples/timit/asr1/local/align.sh diff --git a/examples/timit/s1/local/data.sh b/examples/timit/asr1/local/data.sh similarity index 96% rename from examples/timit/s1/local/data.sh rename to examples/timit/asr1/local/data.sh index ad4ddde3fc9fa5696f0818497126f01da9e36431..e588e48df112c604878b0128251f867ce90905b2 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/asr1/local/data.sh @@ 
-35,8 +35,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=${num_workers} \ --output_path="data/mean_std.json" @@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then for set in train dev test; do { python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/timit/s1/local/export.sh b/examples/timit/asr1/local/export.sh similarity index 100% rename from examples/timit/s1/local/export.sh rename to examples/timit/asr1/local/export.sh diff --git a/examples/timit/s1/local/test.sh b/examples/timit/asr1/local/test.sh similarity index 100% rename from examples/timit/s1/local/test.sh rename to examples/timit/asr1/local/test.sh diff --git a/examples/timit/s1/local/timit_data_prep.sh b/examples/timit/asr1/local/timit_data_prep.sh similarity index 100% rename from examples/timit/s1/local/timit_data_prep.sh rename to examples/timit/asr1/local/timit_data_prep.sh diff --git a/examples/timit/s1/local/timit_norm_trans.pl b/examples/timit/asr1/local/timit_norm_trans.pl similarity index 100% rename from examples/timit/s1/local/timit_norm_trans.pl rename to examples/timit/asr1/local/timit_norm_trans.pl diff --git a/examples/timit/s1/local/train.sh b/examples/timit/asr1/local/train.sh similarity index 100% rename from examples/timit/s1/local/train.sh rename to examples/timit/asr1/local/train.sh diff --git a/examples/timit/s1/path.sh b/examples/timit/asr1/path.sh similarity index 100% rename from examples/timit/s1/path.sh rename to examples/timit/asr1/path.sh diff --git a/examples/timit/s1/run.sh b/examples/timit/asr1/run.sh similarity index 100% rename from examples/timit/s1/run.sh rename to examples/timit/asr1/run.sh diff --git a/examples/tiny/README.md 
b/examples/tiny/README.md index 6766f59a24399d54fd0d3a7a76ba5dc82a38d0bd..f36baae6306823dcd87cd92117a886931656f22e 100644 --- a/examples/tiny/README.md +++ b/examples/tiny/README.md @@ -1,2 +1,3 @@ -* s0 for deepspeech2 -* s1 for U2 +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature diff --git a/examples/tiny/s0/.gitignore b/examples/tiny/asr0/.gitignore similarity index 100% rename from examples/tiny/s0/.gitignore rename to examples/tiny/asr0/.gitignore diff --git a/examples/tiny/s0/README.md b/examples/tiny/asr0/README.md similarity index 100% rename from examples/tiny/s0/README.md rename to examples/tiny/asr0/README.md diff --git a/examples/tiny/s0/conf/augmentation.json b/examples/tiny/asr0/conf/augmentation.json similarity index 100% rename from examples/tiny/s0/conf/augmentation.json rename to examples/tiny/asr0/conf/augmentation.json diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml similarity index 100% rename from examples/tiny/s0/conf/deepspeech2.yaml rename to examples/tiny/asr0/conf/deepspeech2.yaml diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml similarity index 100% rename from examples/tiny/s0/conf/deepspeech2_online.yaml rename to examples/tiny/asr0/conf/deepspeech2_online.yaml diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/asr0/local/data.sh similarity index 96% rename from examples/tiny/s0/local/data.sh rename to examples/tiny/asr0/local/data.sh index 711ebee406b0b101b07bd38453257128995f25bf..f1fb8cb1d093a3adde7f71cebc1ecee50bff7238 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/asr0/local/data.sh @@ -34,8 +34,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=20.0 \ + 
--stride_ms=10 \ + --window_ms=20 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" @@ -63,7 +63,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type ${unit_type} \ --vocab_path="data/vocab.txt" \ diff --git a/examples/tiny/s0/local/download_lm_en.sh b/examples/tiny/asr0/local/download_lm_en.sh similarity index 100% rename from examples/tiny/s0/local/download_lm_en.sh rename to examples/tiny/asr0/local/download_lm_en.sh diff --git a/examples/tiny/s0/local/export.sh b/examples/tiny/asr0/local/export.sh similarity index 100% rename from examples/tiny/s0/local/export.sh rename to examples/tiny/asr0/local/export.sh diff --git a/examples/tiny/s0/local/test.sh b/examples/tiny/asr0/local/test.sh similarity index 100% rename from examples/tiny/s0/local/test.sh rename to examples/tiny/asr0/local/test.sh diff --git a/examples/tiny/s0/local/train.sh b/examples/tiny/asr0/local/train.sh similarity index 100% rename from examples/tiny/s0/local/train.sh rename to examples/tiny/asr0/local/train.sh diff --git a/examples/tiny/s0/path.sh b/examples/tiny/asr0/path.sh similarity index 100% rename from examples/tiny/s0/path.sh rename to examples/tiny/asr0/path.sh diff --git a/examples/tiny/s0/run.sh b/examples/tiny/asr0/run.sh similarity index 100% rename from examples/tiny/s0/run.sh rename to examples/tiny/asr0/run.sh diff --git a/examples/tiny/s1/.gitignore b/examples/tiny/asr1/.gitignore similarity index 100% rename from examples/tiny/s1/.gitignore rename to examples/tiny/asr1/.gitignore diff --git a/examples/tiny/s1/conf/augmentation.json b/examples/tiny/asr1/conf/augmentation.json similarity index 100% rename from examples/tiny/s1/conf/augmentation.json rename to examples/tiny/asr1/conf/augmentation.json diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml 
b/examples/tiny/asr1/conf/chunk_confermer.yaml similarity index 98% rename from examples/tiny/s1/conf/chunk_confermer.yaml rename to examples/tiny/asr1/conf/chunk_confermer.yaml index c518666977faef8c0862be3e7c7f4d5b5244a5fc..6bed27f5c9caba478f127064d2fcce102eccf1f7 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml similarity index 98% rename from examples/tiny/s1/conf/chunk_transformer.yaml rename to examples/tiny/asr1/conf/chunk_transformer.yaml index 29c30b262048b46bf08d132aebbb24bd7186bf71..7aed1b1933ca1edcf34e6c45a49dbc68eed91527 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml similarity index 98% rename from examples/tiny/s1/conf/conformer.yaml rename to examples/tiny/asr1/conf/conformer.yaml index 8487da771930e6f615ac9fe0e718bab310f66970..2c09b3ae6954cff537a7f1c934b4193e56f3243f 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -15,7 +15,7 @@ collator: vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + 
augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank diff --git a/examples/tiny/asr1/conf/preprocess.yaml b/examples/tiny/asr1/conf/preprocess.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c --- /dev/null +++ b/examples/tiny/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml similarity index 96% rename from examples/tiny/s1/conf/transformer.yaml rename to examples/tiny/asr1/conf/transformer.yaml index cc9b5c5158adf2ca74ccf715e6edaf61cb320953..1378e848dceee2565e1d4de1b31d6e887ba65103 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -11,11 +11,11 @@ data: max_output_input_ratio: 10.0 collator: - mean_std_filepath: "" + mean_std_filepath: data/mean_std.json vocab_filepath: data/vocab.txt unit_type: 'spm' spm_model_prefix: 'data/bpe_unigram_200' - augmentation_config: conf/augmentation.json + augmentation_config: conf/preprocess.yaml batch_size: 4 raw_wav: True # use raw_wav or kaldi feature spectrum_type: fbank #linear, mfcc, fbank @@ -37,7 +37,7 @@ collator: # network architecture model: - cmvn_file: "data/mean_std.json" + cmvn_file: cmvn_file_type: "json" # encoder related encoder: transformer diff --git a/examples/tiny/s1/local/align.sh b/examples/tiny/asr1/local/align.sh similarity index 100% rename 
from examples/tiny/s1/local/align.sh rename to examples/tiny/asr1/local/align.sh diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/asr1/local/data.sh similarity index 96% rename from examples/tiny/s1/local/data.sh rename to examples/tiny/asr1/local/data.sh index b25f993f6107d222ec7feea63967e5bdea1b291a..87539d5ed33b3ebbb21d398dc78349ead6cd27e3 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/asr1/local/data.sh @@ -38,8 +38,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ + --stride_ms=10 \ + --window_ms=25 \ --use_dB_normalization=False \ --num_workers=2 \ --output_path="data/mean_std.json" @@ -69,7 +69,6 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size python3 ${MAIN_ROOT}/utils/format_data.py \ - --feat_type "raw" \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/examples/tiny/s1/local/export.sh b/examples/tiny/asr1/local/export.sh similarity index 100% rename from examples/tiny/s1/local/export.sh rename to examples/tiny/asr1/local/export.sh diff --git a/examples/tiny/s1/local/test.sh b/examples/tiny/asr1/local/test.sh similarity index 100% rename from examples/tiny/s1/local/test.sh rename to examples/tiny/asr1/local/test.sh diff --git a/examples/tiny/s1/local/train.sh b/examples/tiny/asr1/local/train.sh similarity index 100% rename from examples/tiny/s1/local/train.sh rename to examples/tiny/asr1/local/train.sh diff --git a/examples/tiny/s1/path.sh b/examples/tiny/asr1/path.sh similarity index 100% rename from examples/tiny/s1/path.sh rename to examples/tiny/asr1/path.sh diff --git a/examples/tiny/s1/run.sh b/examples/tiny/asr1/run.sh similarity index 100% rename from examples/tiny/s1/run.sh rename to examples/tiny/asr1/run.sh diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md new file mode 
100644 index 0000000000000000000000000000000000000000..0cb0f354c742912d317db40bfbe47c65f27a4fed --- /dev/null +++ b/examples/wenetspeech/README.md @@ -0,0 +1,58 @@ +* asr0 - deepspeech2 Streaming/Non-Streaming +* asr1 - transformer/conformer Streaming/Non-Streaming +* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature + +# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech) + +A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition + +## Description + +### Creation + +All the data are collected from YouTube and Podcast. Optical character recognition (OCR) and automatic speech recognition (ASR) techniques are adopted to label each YouTube and Podcast recording, respectively. To improve the quality of the corpus, we use a novel end-to-end label error detection method to further validate and filter the data. + +### Categories + +In summary, WenetSpeech groups all data into 3 categories, as the following table shows: + +| Set | Hours | Confidence | Usage | +|------------|-------|-------------|---------------------------------------| +| High Label | 10005 | >=0.95 | Supervised Training | +| Weak Label | 2478 | [0.6, 0.95] | Semi-supervised or noise training | +| Unlabel | 9952 | / | Unsupervised training or Pre-training | +| In Total | 22435 | / | All above | + +### High Label Data + +We classify the high label into 10 groups according to its domain, speaking style, and scenarios. 
+ +| Domain | Youtube | Podcast | Total | +|-------------|---------|---------|--------| +| audiobook | 0 | 250.9 | 250.9 | +| commentary | 112.6 | 135.7 | 248.3 | +| documentary | 386.7 | 90.5 | 477.2 | +| drama | 4338.2 | 0 | 4338.2 | +| interview | 324.2 | 614 | 938.2 | +| news | 0 | 868 | 868 | +| reading | 0 | 1110.2 | 1110.2 | +| talk | 204 | 90.7 | 294.7 | +| variety | 603.3 | 224.5 | 827.8 | +| others | 144 | 507.5 | 651.5 | +| Total | 6113 | 3892 | 10005 | + +As shown in the following table, we provide 3 training subsets, namely `S`, `M` and `L` for building ASR systems on different data scales. + +| Training Subsets | Confidence | Hours | +|------------------|-------------|-------| +| L | [0.95, 1.0] | 10005 | +| M | 1.0 | 1000 | +| S | 1.0 | 100 | + +### Evaluation Sets + +| Evaluation Sets | Hours | Source | Description | +|-----------------|-------|--------------|-----------------------------------------------------------------------------------------| +| DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training | +| TEST\_NET | 23 | Internet | Match test | +| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset | \ No newline at end of file diff --git a/examples/wenetspeech/asr1/.gitignore b/examples/wenetspeech/asr1/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..02a229225b1bd83122ea8a3945166876d7183447 --- /dev/null +++ b/examples/wenetspeech/asr1/.gitignore @@ -0,0 +1,3 @@ +data +exp +*.profile diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c08b94e29b92a80a8487e171330e5928b492ece4 --- /dev/null +++ b/examples/wenetspeech/asr1/README.md @@ -0,0 +1,14 @@ +## Pack Model + +pack model to tar.gz, e.g. 
+ +```bash +./utils/pack_model.sh --preprocess_conf conf/preprocess.yaml --dict data/vocab.txt conf/conformer.yaml '' data/mean_std.json exp/conformer/checkpoints/wenetspeec +h.pdparams + +``` + +show model.tar.gz +``` +tar tf model.tar.gz +``` diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..5aff041f80931ae13e0f275147f55d91f1759d6e --- /dev/null +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -0,0 +1,24 @@ +# WenetSpeech + + +## Conformer + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | | + + + +## Conformer Pretrain Model + +Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz + +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 | +| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 | \ No newline at end of file diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..0340dc85dcd6d4ca7ca0eedd08b70d16c3846e01 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -0,0 +1,113 @@ +# network architecture +model: + # encoder related + encoder: conformer + encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + + # decoder related + decoder: transformer + decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + + # hybrid CTC/attention + model_conf: + ctc_weight: 0.3 + ctc_dropoutrate: 0.0 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + +# https://yaml.org/type/float.html +data: + train_manifest: data/manifest.train + dev_manifest: data/manifest.dev + test_manifest: data/manifest.test + min_input_len: 0.1 # second + max_input_len: 12.0 # second + min_output_len: 1.0 + max_output_len: 400.0 + min_output_input_ratio: 0.05 + max_output_input_ratio: 10.0 + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/preprocess.yaml + batch_size: 64 + raw_wav: True # use raw_wav or kaldi feature + spectrum_type: fbank #linear, mfcc, fbank + feat_dim: 80 + delta_delta: False + dither: 1.0 + target_sample_rate: 16000 + max_freq: None + n_fft: None + stride_ms: 10.0 + window_ms: 25.0 + use_dB_normalization: True + target_dB: -20 + random_seed: 
0 + keep_transcription_text: False + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 2 + + +training: + n_epoch: 240 + accum_grad: 16 + global_grad_clip: 5.0 + log_interval: 100 + checkpoint: + kbest_n: 50 + latest_n: 5 + optim: adam + optim_conf: + lr: 0.001 + weight_decay: 1e-6 + scheduler: warmuplr # pytorch v1.1.0+ required + scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 + + +decoding: + batch_size: 128 + error_rate_type: cer + decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' + lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm + alpha: 2.5 + beta: 0.3 + beam_size: 10 + cutoff_prob: 1.0 + cutoff_top_n: 0 + num_proc_bsearch: 8 + ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. + decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. + num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. + simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd4cfd273e0f654169955d29ed76c818ded9181c --- /dev/null +++ b/examples/wenetspeech/asr1/conf/preprocess.yaml @@ -0,0 +1,29 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: true + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false + + + + diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh new file mode 100755 index 0000000000000000000000000000000000000000..67b3d5a55ff6fa8c842d540c3814fb35aafa48e1 --- /dev/null +++ b/examples/wenetspeech/asr1/local/data.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang) +# NPU, ASLP Group (Author: Qijie Shao) + +stage=-1 +stop_stage=100 + +# Use your own data path. You need to download the WenetSpeech dataset by yourself. +wenetspeech_data_dir=./wenetspeech +# Make sure you have 1.2T for ${shards_dir} +shards_dir=./wenetspeech_shards + +#wenetspeech training set +set=L +train_set=train_`echo $set | tr 'A-Z' 'a-z'` +dev_set=dev +test_sets="test_net test_meeting" + +cmvn=true +cmvn_sampling_divisor=20 # 20 means 5% of the training data to estimate cmvn + + +. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; +set -u +set -o pipefail + + +mkdir -p data +TARGET_DIR=${MAIN_ROOT}/examples/dataset +mkdir -p ${TARGET_DIR} + +if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then + # download data + echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data." + exit 0; +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + echo "Data preparation" + local/wenetspeech_data_prep.sh \ + --train-subset $set \ + $wenetspeech_data_dir \ + data || exit 1; +fi + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + # generate manifests + python3 ${TARGET_DIR}/aishell/aishell.py \ + --manifest_prefix="data/manifest" \ + --target_dir="${TARGET_DIR}/aishell" + + if [ $? -ne 0 ]; then + echo "Prepare Aishell failed. Terminated." 
+ exit 1 + fi + + for dataset in train dev test; do + mv data/manifest.${dataset} data/manifest.${dataset}.raw + done +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # compute mean and stddev for normalizer + if $cmvn; then + full_size=`cat data/${train_set}/wav.scp | wc -l` + sampling_size=$((full_size / cmvn_sampling_divisor)) + shuf -n $sampling_size data/$train_set/wav.scp \ + > data/$train_set/wav.scp.sampled + num_workers=$(nproc) + + python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ + --manifest_path="data/manifest.train.raw" \ + --spectrum_type="fbank" \ + --feat_dim=80 \ + --delta_delta=false \ + --stride_ms=10 \ + --window_ms=25 \ + --sample_rate=16000 \ + --use_dB_normalization=False \ + --num_samples=-1 \ + --num_workers=${num_workers} \ + --output_path="data/mean_std.json" + + if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." + exit 1 + fi + fi +fi + +dict=data/dict/lang_char.txt +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # download data, generate manifests + # build vocabulary + python3 ${MAIN_ROOT}/utils/build_vocab.py \ + --unit_type="char" \ + --count_threshold=0 \ + --vocab_path="data/vocab.txt" \ + --manifest_paths "data/manifest.train.raw" + + if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 + fi +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # format manifest with tokenids, vocab size + for dataset in train dev test; do + { + python3 ${MAIN_ROOT}/utils/format_data.py \ + --cmvn_path "data/mean_std.json" \ + --unit_type "char" \ + --vocab_path="data/vocab.txt" \ + --manifest_path="data/manifest.${dataset}.raw" \ + --output_path="data/manifest.${dataset}" + + if [ $? -ne 0 ]; then + echo "Formt mnaifest failed. Terminated." + exit 1 + fi + } & + done + wait +fi + +echo "Aishell data preparation done." 
+exit 0 diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py new file mode 100644 index 0000000000000000000000000000000000000000..4de0b7d45631af71ea428f4817b0db0488722daf --- /dev/null +++ b/examples/wenetspeech/asr1/local/extract_meta.py @@ -0,0 +1,102 @@ +# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) +# Mobvoi Inc(Author: Di Wu, Binbin Zhang) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os +import argparse +import json + + +def get_args(): + parser = argparse.ArgumentParser(description=""" + This script is used to process raw json dataset of WenetSpeech, + where the long wav is splitinto segments and + data of wenet format is generated. 
+ """) + parser.add_argument('input_json', help="""Input json file of WenetSpeech""") + parser.add_argument('output_dir', help="""Output dir for prepared data""") + + args = parser.parse_args() + return args + + +def meta_analysis(input_json, output_dir): + input_dir = os.path.dirname(input_json) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + try: + with open(input_json, 'r') as injson: + json_data = json.load(injson) + except Exception: + sys.exit(f'Failed to load input json file: {input_json}') + else: + if json_data['audios'] is not None: + with open(f'{output_dir}/text', 'w') as utt2text, \ + open(f'{output_dir}/segments', 'w') as segments, \ + open(f'{output_dir}/utt2dur', 'w') as utt2dur, \ + open(f'{output_dir}/wav.scp', 'w') as wavscp, \ + open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \ + open(f'{output_dir}/reco2dur', 'w') as reco2dur: + for long_audio in json_data['audios']: + try: + long_audio_path = os.path.realpath( + os.path.join(input_dir, long_audio['path'])) + aid = long_audio['aid'] + segments_lists = long_audio['segments'] + duration = long_audio['duration'] + assert (os.path.exists(long_audio_path)) + except AssertionError: + print(f'''Warning: {aid} something is wrong, + maybe AssertionError, skipped''') + continue + except Exception: + print(f'''Warning: {aid} something is wrong, maybe the + error path: {long_audio_path}, skipped''') + continue + else: + wavscp.write(f'{aid}\t{long_audio_path}\n') + reco2dur.write(f'{aid}\t{duration}\n') + for segment_file in segments_lists: + try: + sid = segment_file['sid'] + start_time = segment_file['begin_time'] + end_time = segment_file['end_time'] + dur = end_time - start_time + text = segment_file['text'] + segment_subsets = segment_file["subsets"] + except Exception: + print(f'''Warning: {segment_file} something + is wrong, skipped''') + continue + else: + utt2text.write(f'{sid}\t{text}\n') + segments.write( + f'{sid}\t{aid}\t{start_time}\t{end_time}\n' + ) + 
utt2dur.write(f'{sid}\t{dur}\n') + segment_sub_names = " ".join(segment_subsets) + utt2subsets.write( + f'{sid}\t{segment_sub_names}\n') + +def main(): + args = get_args() + + meta_analysis(args.input_json, args.output_dir) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/process_opus.py b/examples/wenetspeech/asr1/local/process_opus.py new file mode 100644 index 0000000000000000000000000000000000000000..603e0082cc80dbfd4f56cb3ad3cbeb24012a03fa --- /dev/null +++ b/examples/wenetspeech/asr1/local/process_opus.py @@ -0,0 +1,89 @@ +# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# process_opus.py: segmentation and downsampling of opus audio + +# usage: python3 process_opus.py wav.scp segments output_wav.scp + +from pydub import AudioSegment +import sys +import os + + +def read_file(wav_scp, segments): + wav_scp_dict = {} + with open(wav_scp, 'r', encoding='UTF-8') as fin: + for line_str in fin: + wav_id, path = line_str.strip().split() + wav_scp_dict[wav_id] = path + + utt_list = [] + seg_path_list = [] + start_time_list = [] + end_time_list = [] + with open(segments, 'r', encoding='UTF-8') as fin: + for line_str in fin: + arr = line_str.strip().split() + assert len(arr) == 4 + utt_list.append(arr[0]) + seg_path_list.append(wav_scp_dict[arr[1]]) + start_time_list.append(float(arr[2])) + end_time_list.append(float(arr[3])) + return utt_list, seg_path_list, start_time_list, end_time_list + + +# TODO(Qijie): Fix the process logic +def output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list): + num_utts = len(utt_list) + step = int(num_utts * 0.01) + with open(output_wav_scp, 'w', encoding='UTF-8') as fout: + previous_wav_path = "" + for i in range(num_utts): + utt_id = utt_list[i] + current_wav_path = seg_path_list[i] + output_dir = (os.path.dirname(current_wav_path)) \ + .replace("audio", 'audio_seg') + seg_wav_path = os.path.join(output_dir, utt_id + '.wav') + + # if not os.path.exists(output_dir): + # os.makedirs(output_dir) + + if current_wav_path != previous_wav_path: + source_wav = AudioSegment.from_file(current_wav_path) + previous_wav_path = current_wav_path + + start = int(start_time_list[i] * 1000) + end = int(end_time_list[i] * 1000) + target_audio = source_wav[start:end].set_frame_rate(16000) + target_audio.export(seg_wav_path, format="wav") + + fout.write("{} {}\n".format(utt_id, seg_wav_path)) + if i % step == 0: + print("seg wav finished: {}%".format(int(i / step))) + + +def main(): + wav_scp = sys.argv[1] + segments = sys.argv[2] + output_wav_scp = sys.argv[3] + + utt_list, seg_path_list, 
start_time_list, end_time_list \ + = read_file(wav_scp, segments) + output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh new file mode 100755 index 0000000000000000000000000000000000000000..47bd2f6338a7d062b094c327f39f5362fae39865 --- /dev/null +++ b/examples/wenetspeech/asr1/local/test.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +if [ $# != 2 ];then + echo "usage: ${0} config_path ckpt_path_prefix" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +config_path=$1 +ckpt_prefix=$2 + +chunk_mode=false +if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then + chunk_mode=true +fi + +# download language model +#bash local/download_lm_ch.sh +#if [ $? -ne 0 ]; then +# exit 1 +#fi + + +for type in attention ctc_greedy_search; do + echo "decoding ${type}" + if [ ${chunk_mode} == true ];then + # stream decoding only support batchsize=1 + batch_size=1 + else + batch_size=64 + fi + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 + fi +done + +for type in ctc_prefix_beam_search attention_rescoring; do + echo "decoding ${type}" + batch_size=1 + output_dir=${ckpt_prefix} + mkdir -p ${output_dir} + python3 -u ${BIN_DIR}/test.py \ + --nproc ${ngpu} \ + --config ${config_path} \ + --result_file ${output_dir}/${type}.rsl \ + --checkpoint_path ${ckpt_prefix} \ + --opts decoding.decoding_method ${type} \ + --opts decoding.batch_size ${batch_size} + + if [ $? -ne 0 ]; then + echo "Failed in evaluation!" 
+ exit 1 + fi +done + +exit 0 diff --git a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh new file mode 100755 index 0000000000000000000000000000000000000000..858530534efdaf28818ea6a6f1cc742667d6e71b --- /dev/null +++ b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash + +# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) +# Seasalt AI, Inc (Author: Guoguo Chen) +# Mobvoi Inc(Author: Di Wu, Binbin Zhang) +# NPU, ASLP Group (Author: Qijie Shao) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +set -o pipefail + +stage=1 +prefix= +train_subset=L + +. 
./tools/parse_options.sh || exit 1; + +filter_by_id () { + idlist=$1 + input=$2 + output=$3 + field=1 + if [ $# -eq 4 ]; then + field=$4 + fi + cat $input | perl -se ' + open(F, "<$idlist") || die "Could not open id-list file $idlist"; + while(<F>) { + @A = split; + @A>=1 || die "Invalid id-list file line $_"; + $seen{$A[0]} = 1; + } + while(<>) { + @A = split; + @A > 0 || die "Invalid file line $_"; + @A >= $field || die "Invalid file line $_"; + if ($seen{$A[$field-1]}) { + print $_; + } + }' -- -idlist="$idlist" -field="$field" > $output ||\ + (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1; +} + +subset_data_dir () { + utt_list=$1 + src_dir=$2 + dest_dir=$3 + mkdir -p $dest_dir || exit 1; + # wav.scp text segments utt2dur + filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\ + (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1; + filter_by_id $utt_list $src_dir/text $dest_dir/text ||\ + (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1; + filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\ + (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1; + awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco + filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\ + (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1; + rm -f $dest_dir/reco +} + +if [ $# -ne 2 ]; then + echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>" + echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/" + echo "" + echo "This script takes the WenetSpeech source directory, and prepares the" + echo "WeNet format data directory." + echo " --prefix <prefix> # Prefix for output data directory." + echo " --stage <stage> # Processing stage." + echo " --train-subset <L|M|S|W> # Train subset to be created." 
+ exit 1 +fi + +wenetspeech_dir=$1 +data_dir=$2 + +declare -A subsets +subsets=( + [L]="train_l" + [M]="train_m" + [S]="train_s" + [W]="train_w" + [DEV]="dev" + [TEST_NET]="test_net" + [TEST_MEETING]="test_meeting") + +prefix=${prefix:+${prefix}_} + +corpus_dir=$data_dir/${prefix}corpus/ +if [ $stage -le 1 ]; then + echo "$0: Extract meta into $corpus_dir" + # Sanity check. + [ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\ + echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1; + [ ! -d $wenetspeech_dir/audio ] &&\ + echo "$0: Please download $wenetspeech_dir/audio!" && exit 1; + + [ ! -d $corpus_dir ] && mkdir -p $corpus_dir + + # Files to be created: + # wav.scp text segments utt2dur + python3 local/extract_meta.py \ + $wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: Split data to train, dev, test_net, and test_meeting" + [ ! -f $corpus_dir/utt2subsets ] &&\ + echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1; + for label in $train_subset DEV TEST_NET TEST_MEETING; do + if [ ! ${subsets[$label]+set} ]; then + echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1; + fi + subset=${subsets[$label]} + [ ! 
-d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset + cat $corpus_dir/utt2subsets | \ + awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \ + > $corpus_dir/${prefix}${subset}_utt_list|| exit 1; + subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \ + $corpus_dir $data_dir/${prefix}$subset || exit 1; + done +fi + +echo "$0: Done" \ No newline at end of file diff --git a/examples/wenetspeech/asr1/path.sh b/examples/wenetspeech/asr1/path.sh new file mode 100644 index 0000000000000000000000000000000000000000..666b29bce2611ae6a23b71b4d8f460cbc58c6c1e --- /dev/null +++ b/examples/wenetspeech/asr1/path.sh @@ -0,0 +1,15 @@ +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +# model exp +MODEL=u2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..8c4a12cb46056113781e40fc475388426e0c92bf --- /dev/null +++ b/examples/wenetspeech/asr1/run.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +. path.sh || exit 1; +set -e + +gpus=0,1,2,3,4,5,6,7 +stage=0 +stop_stage=100 +conf_path=conf/conformer.yaml + +average_checkpoint=true +avg_num=10 + +. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +avg_ckpt=avg_${avg_num} +ckpt=$(basename ${conf_path} | awk -F'.' 
'{print $1}') +echo "checkpoint name ${ckpt}" + +audio_file="data/tmp.wav" + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + bash ./local/data.sh || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `exp` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # avg n best model + avg.sh best exp/${ckpt}/checkpoints ${avg_num} +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # test ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # ctc alignment of test data + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi + +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + # test a single .wav file + CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 +fi diff --git a/examples/wenetspeech/asr1/utils b/examples/wenetspeech/asr1/utils new file mode 120000 index 0000000000000000000000000000000000000000..973afe674f2c85f7a400600f963e0709767602dc --- /dev/null +++ b/examples/wenetspeech/asr1/utils @@ -0,0 +1 @@ +../../../utils \ No newline at end of file diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 22d4238ac832fa79fec3b3286ef3e64c16937441..9f5448ccf702f5cbf10db6db9b45a9a19ccdf3a9 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -27,7 +27,9 @@ from paddle import distributed as dist from paddle.io import DataLoader from yacs.config import CfgNode +from 
paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.io.sampler import SortagradBatchSampler from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler @@ -249,92 +251,103 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() - config.defrost() - config.collator.keep_transcription_text = False - # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest - train_dataset = ManifestDataset.from_config(config) - - config.data.manifest = config.data.dev_manifest - dev_dataset = ManifestDataset.from_config(config) - - collate_fn_train = SpeechCollator.from_config(config) - - config.collator.augmentation_config = "" - collate_fn_dev = SpeechCollator.from_config(config) - - if self.parallel: - batch_sampler = SortagradDistributedBatchSampler( - train_dataset, + if self.train: + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.data.train_manifest, + train_mode=True, + sortagrad=False, batch_size=config.collator.batch_size, - num_replicas=None, - rank=None, - shuffle=True, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - else: - batch_sampler = SortagradBatchSampler( - train_dataset, - shuffle=True, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. 
+ augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + + self.valid_loader = BatchDataLoader( + json_file=config.data.dev_manifest, + train_mode=False, + sortagrad=False, batch_size=config.collator.batch_size, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - self.train_loader = DataLoader( - train_dataset, - batch_sampler=batch_sampler, - collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) - self.valid_loader = DataLoader( - dev_dataset, - batch_size=config.collator.batch_size, - shuffle=False, - drop_last=False, - collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) - - # test dataset, return raw text - config.data.manifest = config.data.test_manifest - # filter test examples, will cause less examples, but no mismatch with training - # and can use large batch size , save training time, so filter test egs now. 
- config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') - - test_dataset = ManifestDataset.from_config(config) - # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" - self.test_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - # return text token id - config.collator.keep_transcription_text = False - self.align_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=SpeechCollator.from_config(config), - num_workers=config.collator.num_workers, ) - logger.info("Setup train/valid/test/align Dataloader!") + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=self.args.nprocs, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=config.collator.num_workers, + subsampling_factor=1, + num_encs=1) + logger.info("Setup train/valid Dataloader!") + else: + # test dataset, return raw text + self.test_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. 
+ augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + + self.align_loader = BatchDataLoader( + json_file=config.data.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.decoding.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config.collator. + augmentation_config, # aug will be off when train_mode=False + n_iter_processes=1, + subsampling_factor=1, + num_encs=1) + logger.info("Setup test/align Dataloader!") def setup_model(self): config = self.config model_conf = config.model with UpdateConfig(model_conf): - model_conf.input_dim = self.train_loader.collate_fn.feature_size - model_conf.output_dim = self.train_loader.collate_fn.vocab_size + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size model = U2Model.from_config(model_conf) @@ -343,6 +356,11 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) + self.model = model + logger.info("Setup model!") + + if not self.train: + return train_config = config.training optim_type = train_config.optim @@ -383,10 +401,9 @@ class U2Trainer(Trainer): optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) - self.model = model self.optimizer = optimizer self.lr_scheduler = lr_scheduler - logger.info("Setup model/optimizer/lr_scheduler!") + logger.info("Setup optimizer/lr_scheduler!") class U2Tester(U2Trainer): @@ -421,14 +438,19 @@ class U2Tester(U2Trainer): def __init__(self, config, args): super().__init__(config, args) + self.text_feature = 
TextFeaturizer( + unit_type=self.config.collator.unit_type, + vocab_filepath=self.config.collator.vocab_filepath, + spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list - def ordid2token(self, texts, texts_len): + def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ trans = [] for text, n in zip(texts, texts_len): n = n.numpy().item() ids = text[:n] - trans.append(''.join([chr(i) for i in ids])) + trans.append(text_feature.defeaturize(ids.numpy().tolist())) return trans def compute_metrics(self, @@ -444,12 +466,11 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - text_feature = self.test_loader.collate_fn.text_feature - target_transcripts = self.ordid2token(texts, texts_len) + target_transcripts = self.id2token(texts, texts_len, self.text_feature) result_transcripts, result_tokenids = self.model.decode( audio, audio_len, - text_feature=text_feature, + text_feature=self.text_feature, decoding_method=cfg.decoding_method, lang_model_path=cfg.lang_model_path, beam_alpha=cfg.alpha, @@ -499,7 +520,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.test_loader.collate_fn.stride_ms + stride_ms = self.config.collator.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -558,8 +579,7 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, + self.config.collator.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -573,7 +593,7 @@ class U2Tester(U2Trainer): infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.model.clone(), self.args.checkpoint_path) - feat_dim = 
self.test_loader.collate_fn.feature_size + feat_dim = self.test_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 0d8508c205f7f06cc5daabd6ff8c3cbf6205864b..d82034c8234df7bb621f2756dd047f40e0475e5c 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -392,6 +392,7 @@ class U2Tester(U2Trainer): unit_type=self.config.collator.unit_type, vocab_filepath=self.config.collator.vocab_filepath, spm_model_prefix=self.config.collator.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ @@ -529,8 +530,7 @@ class U2Tester(U2Trainer): def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, self.config.decoding.batch_size, - self.align_loader.collate_fn.stride_ms, - self.align_loader.collate_fn.vocab_list, + self.config.collator.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index 13dc3a44d49b1e2f98b457086190a09c16f38fd9..65dccad385c07ee59edf585a603ab822d8d38607 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -24,6 +24,8 @@ import soundfile import soxbindings as sox from scipy import signal +from .utility import convert_samples_from_float32 +from .utility import convert_samples_to_float32 from .utility import subfile_from_tar @@ -689,15 +691,7 @@ class AudioSegment(): Audio sample type is usually integer or float-point. Integers will be scaled to [-1, 1] in float32. """ - float32_samples = samples.astype('float32') - if samples.dtype in np.sctypes['int']: - bits = np.iinfo(samples.dtype).bits - float32_samples *= (1. 
/ 2**(bits - 1)) - elif samples.dtype in np.sctypes['float']: - pass - else: - raise TypeError("Unsupported sample type: %s." % samples.dtype) - return float32_samples + return convert_samples_to_float32(samples) def _convert_samples_from_float32(self, samples, dtype): """Convert sample type from float32 to dtype. @@ -708,20 +702,4 @@ class AudioSegment(): This is for writing a audio file. """ - dtype = np.dtype(dtype) - output_samples = samples.copy() - if dtype in np.sctypes['int']: - bits = np.iinfo(dtype).bits - output_samples *= (2**(bits - 1) / 1.) - min_val = np.iinfo(dtype).min - max_val = np.iinfo(dtype).max - output_samples[output_samples > max_val] = max_val - output_samples[output_samples < min_val] = min_val - elif samples.dtype in np.sctypes['float']: - min_val = np.finfo(dtype).min - max_val = np.finfo(dtype).max - output_samples[output_samples > max_val] = max_val - output_samples[output_samples < min_val] = min_val - else: - raise TypeError("Unsupported sample type: %s." 
% samples.dtype) - return output_samples.astype(dtype) + return convert_samples_from_float32(samples, dtype) diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py index 7f3bd9e1253fbcf491743ba912472b22b4d8f0e9..21f512e9b9e2827b6c6e23b53f16badf3ec8c958 100644 --- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py @@ -92,7 +92,9 @@ class TextFeaturizer(): tokens = self.tokenize(text) ids = [] for token in tokens: - token = token if token in self.vocab_dict else self.unk + if token not in self.vocab_dict: + logger.debug(f"Text Token: {token} -> {self.unk}") + token = self.unk ids.append(self.vocab_dict[token]) return ids diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 089890d2a2b5f2f5e024fcfb29667b23a0da232b..703f2127d7e71b093030658f6d88c45979770c61 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -30,7 +30,8 @@ logger = Log(__name__).getlog() __all__ = [ "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", - "EOS", "UNK", "BLANK", "MASKCTC", "SPACE" + "EOS", "UNK", "BLANK", "MASKCTC", "SPACE", "convert_samples_to_float32", + "convert_samples_from_float32" ] IGNORE_ID = -1 @@ -342,3 +343,50 @@ def load_cmvn(cmvn_file: str, filetype: str): else: raise ValueError(f"cmvn file type no support: {filetype}") return cmvn[0], cmvn[1] + + +def convert_samples_to_float32(samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + + PCM16 -> PCM32 + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. 
/ 2**(bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + +def convert_samples_from_float32(samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + PCM32 -> PCM16 + """ + dtype = np.dtype(dtype) + output_samples = samples.copy() + if dtype in np.sctypes['int']: + bits = np.iinfo(dtype).bits + output_samples *= (2**(bits - 1) / 1.) + min_val = np.iinfo(dtype).min + max_val = np.iinfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + elif samples.dtype in np.sctypes['float']: + min_val = np.finfo(dtype).min + max_val = np.finfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + else: + raise TypeError("Unsupported sample type: %s." 
% samples.dtype) + return output_samples.astype(dtype) diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index cb7349d00d3c59cf50288c4f4b5511f19bab3aaf..5f2335496c6c1cfddb8a62874d328fd51084aec9 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -199,8 +199,8 @@ class SpeechCollatorBase(): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - text = item['text'] + audio = item['input'][0]['feat'] + text = item['output'][0]['text'] audio, text = self.process_utterance(audio, text) audios.append(audio) # [T, D] @@ -343,9 +343,10 @@ class TripletSpeechCollator(SpeechCollator): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - translation = item['text'] - transcription = item['text1'] + audio = item['input'][0]['feat'] + translation = item['output'][0]['text'] + transcription = item['output'][1]['text'] + audio, translation, transcription = self.process_utterance( audio, translation, transcription) diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index c503107a01534bf5b1c05b5b6ab51aae418c6217..61eeb00f1c41b586762035fe1a097b2197a74008 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -103,7 +103,7 @@ class ManifestDataset(Dataset): min_output_len=min_output_len, max_output_input_ratio=max_output_input_ratio, min_output_input_ratio=min_output_input_ratio) - self._manifest.sort(key=lambda x: x["feat_shape"][0]) + self._manifest.sort(key=lambda x: x["input"][0]["shape"][0]) def __len__(self): return len(self._manifest) @@ -188,34 +188,16 @@ class AudioDataset(Dataset): if sort: data = sorted(data, key=lambda x: x["feat_shape"][0]) if raw_wav: - assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark', - '.scp') - data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms)) + path_suffix = data[0]['feat'].split(':')[0].splitext()[-1] + assert path_suffix not in 
('.ark', '.scp') + # m second to n frame + data = list( + map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms), + data)) self.input_dim = data[0]['feat_shape'][1] self.output_dim = data[0]['token_shape'][1] - # with open(data_file, 'r') as f: - # for line in f: - # arr = line.strip().split('\t') - # if len(arr) != 7: - # continue - # key = arr[0].split(':')[1] - # tokenid = arr[5].split(':')[1] - # output_dim = int(arr[6].split(':')[1].split(',')[1]) - # if raw_wav: - # wav_path = ':'.join(arr[1].split(':')[1:]) - # duration = int(float(arr[2].split(':')[1]) * 1000 / 10) - # data.append((key, wav_path, duration, tokenid)) - # else: - # feat_ark = ':'.join(arr[1].split(':')[1:]) - # feat_info = arr[2].split(':')[1].split(',') - # feat_dim = int(feat_info[1].strip()) - # num_frames = int(feat_info[0].strip()) - # data.append((key, feat_ark, num_frames, tokenid)) - # self.input_dim = feat_dim - # self.output_dim = output_dim - valid_data = [] for i in range(len(data)): length = data[i]['feat_shape'][0] @@ -223,17 +205,17 @@ class AudioDataset(Dataset): # remove too lang or too short utt for both input and output # to prevent from out of memory if length > max_length or length < min_length: - # logging.warn('ignore utterance {} feature {}'.format( - # data[i][0], length)) pass elif token_length > token_max_length or token_length < token_min_length: pass else: valid_data.append(data[i]) + logger.info(f"raw dataset len: {len(data)}") data = valid_data + num_data = len(data) + logger.info(f"dataset len after filter: {num_data}") self.minibatch = [] - num_data = len(data) # Dynamic batch size if batch_type == 'dynamic': assert (max_frames_in_batch > 0) @@ -258,7 +240,9 @@ class AudioDataset(Dataset): cur = end def __len__(self): + """number of example(batch)""" return len(self.minibatch) def __getitem__(self, idx): + """batch example of idx""" return self.minibatch[idx] diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index 
e810662df377af2db05e128c1cbda9d2042afa89..38ff1396389f55c5dfcd8f42656483e5981a3e54 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -18,8 +18,10 @@ import kaldiio import numpy as np import soundfile -from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation +from .utility import feat_type +from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.log import Log +# from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline as Transformation __all__ = ["LoadInputsAndTargets"] @@ -322,20 +324,7 @@ class LoadInputsAndTargets(): "Not supported: loader_type={}".format(filetype)) def file_type(self, filepath): - suffix = filepath.split(":")[0].split('.')[-1].lower() - if suffix == 'ark': - return 'mat' - elif suffix == 'scp': - return 'scp' - elif suffix == 'npy': - return 'npy' - elif suffix == 'npz': - return 'npz' - elif suffix in ['wav', 'flac']: - # PCM16 - return 'sound' - else: - raise ValueError(f"Not support filetype: {suffix}") + return feat_type(filepath) class SoundHDF5File(): diff --git a/paddlespeech/s2t/io/utility.py b/paddlespeech/s2t/io/utility.py index 392031ba81c5b6e641538891b81b36767805d2ff..1a90e3d0461ba9bb278b1c4be358449c2b2c6191 100644 --- a/paddlespeech/s2t/io/utility.py +++ b/paddlespeech/s2t/io/utility.py @@ -17,7 +17,7 @@ import numpy as np from paddlespeech.s2t.utils.log import Log -__all__ = ["pad_list", "pad_sequence"] +__all__ = ["pad_list", "pad_sequence", "feat_type"] logger = Log(__name__).getlog() @@ -85,3 +85,20 @@ def pad_sequence(sequences: List[np.ndarray], out_tensor[:length, i, ...] 
= tensor return out_tensor + + +def feat_type(filepath): + suffix = filepath.split(":")[0].split('.')[-1].lower() + if suffix == 'ark': + return 'mat' + elif suffix == 'scp': + return 'scp' + elif suffix == 'npy': + return 'npy' + elif suffix == 'npz': + return 'npz' + elif suffix in ['wav', 'flac']: + # PCM16 + return 'sound' + else: + raise ValueError(f"Not support filetype: {suffix}") diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 9977cecc4bb2ecc337f7126cd7c1a38d95424358..4f833372a9ace05d9228c5ccad537a9e627dae88 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -860,7 +860,7 @@ class U2Model(U2DecodeModel): int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ # cmvn - if 'cmvn_file' in configs and configs['cmvn_file'] is not None: + if 'cmvn_file' in configs and configs['cmvn_file']: mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_file_type']) global_cmvn = GlobalCMVN( @@ -934,8 +934,8 @@ class U2Model(U2DecodeModel): DeepSpeech2Model: The model built from pretrained result. """ with UpdateConfig(config): - config.input_dim = dataloader.collate_fn.feature_size - config.output_dim = dataloader.collate_fn.vocab_size + config.input_dim = dataloader.feat_dim + config.output_dim = dataloader.vocab_size model = cls.from_config(config) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 80eaf97542b090adec271570ce991e693b61e124..3d5f8cd1d3aaff3841a8b519bb7b3af178c700ef 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py index 6e97f82458921bf287778b0e95a869ca54950138..67f71b6678e9908613b0fe867a44453fb204297a 100644 --- a/paddlespeech/s2t/modules/cmvn.py +++ b/paddlespeech/s2t/modules/cmvn.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index 7601a5cca607df2820ba45583190516b2c80a48c..7ec92554eec73b8889335b3a16fd1a34692bb021 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index b0ab869a212b2fb3f9b9216e9f41f4b8b07b0fe0..6b4d959123b19cc23cd42bdcf68491ac6e5f61de 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 4d516068239beac686c116846d75a3dbebcd8194..520b18dea17928b6fe95bbda804bd89ef28aa904 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 9207658f99bec9d9bcec8425d7f8a74f2bc146f5..5d4e91753b38129a9c2c71d706787af9d14a903d 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 0cde5b9f2360f005565d17c68cd7384b74039a2c..5c8ba0810d00db66a3c96238cf5d243802eb9d7b 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 29d5a2d872e61a7f8aec7ace9bde05829043b08c..d39c0695a044cd9cdc5969b547be911565015672 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py index 5750f5a0f79e8b2bce0f89b7f68c4c696d9e85f8..c7d9bd45dd2bf005a575098456c435a173678d26 100644 --- a/paddlespeech/s2t/modules/loss.py +++ b/paddlespeech/s2t/modules/loss.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index 6576cb9221e027feeaee4651b2f73795fa174225..d6b63761b49b530db68a7ff0bb342675124c9fca 100644 --- a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py index 347264e9d6cd9211f2c54ca556e99ba36cd1ae71..e2619cd49dc15ef7d9ddb1fbbb991f3fe3eb1c35 100644 --- a/paddlespeech/s2t/modules/positionwise_feed_forward.py +++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 759bd540f5b0149a03ef70ab6353bdcd6817d16d..99a8300f246149e924fe741f53934259d404e4e8 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. 
class GlobalCMVN():
    """Apply global cepstral mean and variance normalization (CMVN).

    The statistics are read once, at construction time, from a JSON file
    holding the keys ``frame_num`` (number of accumulated frames),
    ``mean_stat`` (per-dimension sum of features) and ``var_stat``
    (per-dimension sum of squared features).

    Args:
        cmvn_path (str): path of the JSON statistics file.
        norm_means (bool): subtract the global mean when True.
        norm_vars (bool): divide by the global standard deviation when True.
        std_floor (float): lower bound applied to the standard deviation.
    """

    def __init__(self,
                 cmvn_path,
                 norm_means=True,
                 norm_vars=True,
                 std_floor=1.0e-20):
        self.cmvn_path = cmvn_path
        self.norm_means = norm_means
        self.norm_vars = norm_vars
        self.std_floor = std_floor

        with open(cmvn_path) as f:
            cmvn_stats = json.load(f)
            self.count = cmvn_stats['frame_num']
            self.mean = np.array(cmvn_stats['mean_stat']) / self.count
            self.square_sums = np.array(cmvn_stats['var_stat'])
            # E[x^2] - E[x]^2; may come out slightly negative through
            # floating-point cancellation (or inconsistent statistics).
            self.var = self.square_sums / self.count - self.mean**2
            # Clamp before the square root: sqrt of a negative value is NaN,
            # and np.maximum(NaN, floor) stays NaN, which would defeat the
            # floor and poison every normalized feature.
            self.std = np.maximum(
                np.sqrt(np.maximum(self.var, 0.0)), self.std_floor)

    def __repr__(self):
        return f"""{self.__class__.__name__}(
            cmvn_path={self.cmvn_path},
            norm_means={self.norm_means},
            norm_vars={self.norm_vars},)"""

    def __call__(self, x, uttid=None):
        """Normalize one utterance.

        Args:
            x (np.ndarray): features, [Time, Dim].
            uttid: unused here; accepted so the call signature matches the
                other transforms.

        Returns:
            np.ndarray: normalized features, same shape as ``x``.
        """
        # x: [Time, Dim]
        if self.norm_means:
            x = np.subtract(x, self.mean)

        if self.norm_vars:
            x = np.divide(x, self.std)
        return x
class SpeedPerturbationSox():
    """Speed perturbation based on the sox ``speed`` effect.

    The speed perturbation in kaldi uses sox-speed instead of sox-tempo,
    and sox-speed just resamples the input,
    i.e. pitch and tempo are both changed.

    To speed up or slow down the sound of a file,
    use speed to modify the pitch and the duration of the file.
    This raises the speed and reduces the time.
    The default factor is 1.0 which makes no change to the audio.
    2.0 doubles speed, thus time length is cut by a half and pitch is
    one interval higher.

    "Why use speed option instead of tempo -s in SoX for speed perturbation"
    https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8

    tempo option:
        sox -t wav input.wav -t wav output.tempo0.9.wav tempo -s 0.9

    speed option:
        sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9

    If we use the speed option like above, the pitch of the audio is also
    changed, but the tempo option does not change the pitch.

    Args:
        lower (float): lower bound of the random speed ratio.
        upper (float): upper bound of the random speed ratio.
        utt2ratio (str, optional): path of a text file with one
            ``<uttid> <ratio>`` pair per line. When given, the ratio is looked
            up per utterance and ``lower``/``upper`` are ignored.
        keep_length (bool): crop / zero-pad the output to the input length.
        sr (int): sample rate passed to sox.
        seed (int, optional): seed of the private random state.
    """

    def __init__(
            self,
            lower=0.9,
            upper=1.1,
            utt2ratio=None,
            keep_length=True,
            sr=16000,
            seed=None, ):
        self.sr = sr
        self.keep_length = keep_length
        self.state = numpy.random.RandomState(seed)
        # Always defined: __call__ branches on it. Leaving it unset in the
        # random-ratio mode raised AttributeError on the first call.
        self.accept_uttid = False

        if utt2ratio is not None:
            self.utt2ratio = {}
            # Use the scheduled ratio for each utterance
            self.utt2ratio_file = utt2ratio
            self.lower = None
            self.upper = None
            self.accept_uttid = True

            with open(utt2ratio, "r") as f:
                for line in f:
                    utt, ratio = line.rstrip().split(None, 1)
                    ratio = float(ratio)
                    self.utt2ratio[utt] = ratio
        else:
            self.utt2ratio = None
            # The ratio is drawn randomly at runtime
            self.lower = lower
            self.upper = upper

    def __repr__(self):
        if self.utt2ratio is None:
            return f"""{self.__class__.__name__}(
                lower={self.lower},
                upper={self.upper},
                keep_length={self.keep_length},
                sample_rate={self.sr})"""

        else:
            return f"""{self.__class__.__name__}(
                utt2ratio={self.utt2ratio_file},
                sample_rate={self.sr})"""

    def __call__(self, x, uttid=None, train=True):
        """Perturb the speed of one waveform.

        Args:
            x (np.ndarray): waveform, time axis first.
            uttid (str, optional): utterance id, required in ``utt2ratio``
                mode.
            train (bool): when False the input is returned untouched.

        Returns:
            np.ndarray: perturbed waveform (float32 input to sox).
        """
        if not train:
            return x

        x = x.astype(numpy.float32)
        if self.accept_uttid:
            ratio = self.utt2ratio[uttid]
        else:
            ratio = self.state.uniform(self.lower, self.upper)

        tfm = sox.Transformer()
        tfm.set_globals(multithread=False)
        tfm.speed(ratio)
        y = tfm.build_array(input_array=x, sample_rate_in=self.sr)

        if self.keep_length:
            diff = abs(len(x) - len(y))
            if len(y) > len(x):
                # Truncate symmetrically
                y = y[diff // 2:-((diff + 1) // 2)]
            elif len(y) < len(x):
                # Assume the time-axis is the first: (Time, Channel)
                pad_width = [(diff // 2, (diff + 1) // 2)] + [
                    (0, 0) for _ in range(y.ndim - 1)
                ]
                y = numpy.pad(
                    y, pad_width=pad_width, constant_values=0, mode="constant")

        if y.ndim == 2 and x.ndim == 1:
            # (T, C) -> (T); ndarray has no ``sequence`` method, the channel
            # axis is dropped with ``squeeze``.
            y = y.squeeze(1)
        return y
class LogMelSpectrogramKaldi():
    """Kaldi-style log Mel filter-bank features computed with ``logfbank``.

    Args:
        fs (int): sample rate in Hz.
        n_mels (int): number of Mel filters.
        n_fft (int): FFT size in points.
        n_shift (int): frame shift in samples (160 = 10 ms at 16 kHz).
        win_length (int): window length in samples (400 = 25 ms at 16 kHz).
        window (str): window type forwarded to ``logfbank`` as ``wintype``.
        fmin (float): lowest filter-bank frequency in Hz.
        fmax (float, optional): highest filter-bank frequency in Hz;
            defaults to ``fs / 2``.
        eps (float): stored on the instance; not used by ``__call__``.
        dither (bool): forwarded to ``logfbank``.

    Raises:
        ValueError: if ``n_shift > win_length`` or ``fmax`` exceeds half of
            the sample rate.
    """

    def __init__(
            self,
            fs=16000,
            n_mels=80,
            n_fft=512,  # fft point
            n_shift=160,  # unit:sample, 10ms
            win_length=400,  # unit:sample, 25ms
            window="povey",
            fmin=20,
            fmax=None,
            eps=1e-10,
            dither=False):
        self.fs = fs
        self.n_mels = n_mels
        self.n_fft = n_fft
        if n_shift > win_length:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # samples / (samples per second) -> seconds
        self.n_shift = n_shift / fs  # unit: second
        self.win_length = win_length / fs  # unit: second

        self.window = window
        self.fmin = fmin
        if fmax:
            if fmax > int(self.fs / 2):
                raise ValueError("fmax must not be greater than half of "
                                 "sample rate.")
            # A valid user-supplied fmax previously left ``fmax_`` unbound.
            fmax_ = fmax
        else:
            fmax_ = self.fs / 2
        self.fmax = fmax_

        self.eps = eps
        self.remove_dc_offset = True
        self.preemph = 0.97
        self.dither = dither

    def __repr__(self):
        return (
            "{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
            "n_shift={n_shift}, win_length={win_length}, preemph={preemph}, window={window}, "
            "fmin={fmin}, fmax={fmax}, eps={eps}, dither={dither})".format(
                name=self.__class__.__name__,
                fs=self.fs,
                n_mels=self.n_mels,
                n_fft=self.n_fft,
                n_shift=self.n_shift,
                preemph=self.preemph,
                win_length=self.win_length,
                window=self.window,
                fmin=self.fmin,
                fmax=self.fmax,
                eps=self.eps,
                dither=self.dither, ))

    def __call__(self, x):
        """Compute log Mel filter-bank features of one waveform.

        Args:
            x (np.ndarray): shape (Ti,)

        Raises:
            ValueError: not support (Ti, C)

        Returns:
            np.ndarray: (T, D)
        """
        if x.ndim != 1:
            raise ValueError("Not support x: [Time, Channel]")

        # np.sctypes was removed in NumPy 2.0; issubdtype covers the same
        # floating-point dtypes.
        if np.issubdtype(x.dtype, np.floating):
            # Scale float samples to the int16 value range.
            # NOTE(review): presumably the float input is normalized to
            # [-1, 1] -- confirm against the caller.
            bits = np.iinfo(np.int16).bits
            x = x * 2**(bits - 1)

        # logfbank need PCM16 input
        y = logfbank(
            signal=x,
            samplerate=self.fs,
            winlen=self.win_length,  # unit: second
            winstep=self.n_shift,  # unit: second
            nfilt=self.n_mels,
            nfft=self.n_fft,
            lowfreq=self.fmin,
            highfreq=self.fmax,
            dither=self.dither,
            remove_dc_offset=self.remove_dc_offset,
            preemph=self.preemph,
            wintype=self.window)
        return y
b/utils/compute_mean_std.py @@ -33,8 +33,8 @@ add_arg('spectrum_type', str, choices=['linear', 'mfcc', 'fbank']) add_arg('feat_dim', int, 13, "Audio feature dim.") add_arg('delta_delta', bool, False, "Audio feature with delta delta.") -add_arg('stride_ms', float, 10.0, "stride length in ms.") -add_arg('window_ms', float, 20.0, "stride length in ms.") +add_arg('stride_ms', int, 10, "stride length in ms.") +add_arg('window_ms', int, 20, "stride length in ms.") add_arg('sample_rate', int, 16000, "target sample rate.") add_arg('use_dB_normalization', bool, True, "do dB normalization.") add_arg('target_dB', int, -20, "target dB.") @@ -61,8 +61,8 @@ def main(): spectrum_type=args.spectrum_type, feat_dim=args.feat_dim, delta_delta=args.delta_delta, - stride_ms=args.stride_ms, - window_ms=args.window_ms, + stride_ms=float(args.stride_ms), + window_ms=float(args.window_ms), n_fft=None, max_freq=None, target_sample_rate=args.sample_rate, diff --git a/utils/format_data.py b/utils/format_data.py index 6fe36997a6513121e2878a00306e9d09018af47c..2fa1924a072faa67ad559f68174896846f8cbdf9 100755 --- a/utils/format_data.py +++ b/utils/format_data.py @@ -20,13 +20,13 @@ import json from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import read_manifest +from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('feat_type', str, "raw", "speech feature type, e.g. 
raw(wav, flac), mat(ark), scp") add_arg('cmvn_path', str, 'examples/librispeech/data/mean_std.json', "Filepath of cmvn.") @@ -62,27 +62,76 @@ def main(): vocab_size = text_feature.vocab_size print(f"Vocab size: {vocab_size}") + # josnline like this + # { + # "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}], + # "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}], + # "utt2spk": "111-2222", + # "utt": "111-2222-333" + # } count = 0 for manifest_path in args.manifest_paths: manifest_jsons = read_manifest(manifest_path) for line_json in manifest_jsons: + output_json = { + "input": [], + "output": [], + 'utt': line_json['utt'], + 'utt2spk': line_json.get('utt2spk', 'global'), + } + + # output line = line_json['text'] - tokens = text_feature.tokenize(line) - tokenids = text_feature.featurize(line) - line_json['token'] = tokens - line_json['token_id'] = tokenids - line_json['token_shape'] = (len(tokenids), vocab_size) - feat_shape = line_json['feat_shape'] - assert isinstance(feat_shape, (list, tuple)), type(feat_shape) - if args.feat_type == 'raw': - feat_shape.append(feat_dim) - line_json['filetype'] = 'sound' - else: # kaldi - raise NotImplementedError('no support kaldi feat now!') - fout.write(json.dumps(line_json) + '\n') + if isinstance(line, str): + # only one target + tokens = text_feature.tokenize(line) + tokenids = text_feature.featurize(line) + output_json['output'].append({ + 'name': 'target1', + 'shape': (len(tokenids), vocab_size), + 'text': line, + 'token': ' '.join(tokens), + 'tokenid': ' '.join(map(str, tokenids)), + }) + else: + # isinstance(line, list), multi target in one vocab + for i, item in enumerate(line, 1): + tokens = text_feature.tokenize(item) + tokenids = text_feature.featurize(item) + output_json['output'].append({ + 'name': f'target{i}', + 'shape': (len(tokenids), vocab_size), + 'text': item, + 'token': ' '.join(tokens), + 'tokenid': ' '.join(map(str, tokenids)), + }) + + # input + line = 
line_json['feat'] + if isinstance(line, str): + # only one input + feat_shape = line_json['feat_shape'] + assert isinstance(feat_shape, (list, tuple)), type(feat_shape) + filetype = feat_type(line) + if filetype == 'sound': + feat_shape.append(feat_dim) + else: # kaldi + raise NotImplementedError('no support kaldi feat now!') + + output_json['input'].append({ + "name": "input1", + "shape": feat_shape, + "feat": line, + "filetype": filetype, + }) + else: + # isinstance(line, list), multi input + raise NotImplementedError("not support multi input now!") + + fout.write(json.dumps(output_json) + '\n') count += 1 - print(f"Examples number: {count}") + print(f"{args.manifest_paths} Examples number: {count}") fout.close() diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py index 79b3d2cb2dd8b4cb2edea6f055b109d35dc0cae7..e0b5ece37353dc9cc592d440ce11ba486569bfaf 100755 --- a/utils/format_triplet_data.py +++ b/utils/format_triplet_data.py @@ -20,13 +20,13 @@ import json from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.frontend.utility import read_manifest +from paddlespeech.s2t.io.utility import feat_type from paddlespeech.s2t.utils.utility import add_arguments from paddlespeech.s2t.utils.utility import print_arguments parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('feat_type', str, "raw", "speech feature type, e.g. 
raw(wav, flac), kaldi") add_arg('cmvn_path', str, 'examples/librispeech/data/mean_std.json', "Filepath of cmvn.") @@ -79,9 +79,11 @@ def main(): line_json['token1'] = tokens line_json['token_id1'] = tokenids line_json['token_shape1'] = (len(tokenids), vocab_size) + feat_shape = line_json['feat_shape'] assert isinstance(feat_shape, (list, tuple)), type(feat_shape) - if args.feat_type == 'raw': + filetype = feat_type(line_json['feat']) + if filetype == 'sound': feat_shape.append(feat_dim) else: # kaldi raise NotImplementedError('no support kaldi feat now!') diff --git a/utils/pack_model.sh b/utils/pack_model.sh new file mode 100755 index 0000000000000000000000000000000000000000..8acd59a640bb6fe10e22a38fc1c2f6d6c71ca46a --- /dev/null +++ b/utils/pack_model.sh @@ -0,0 +1,164 @@ +#!/usr/bin/env bash + +# Copyright 2019 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +[ -f ./path.sh ] && . ./path.sh + +results="" +# e.g., "exp/tr_it_pytorch_train/decode_dt_it_decode/result.wrd.txt +# exp/tr_it_pytorch_train/decode_et_it_decode/result.wrd.txt"' +lm="" +dict="" +etc="" +outfile="model" +preprocess_conf="" + +help_message=$(cat < --dict , for example: +: exp/train_rnnlm/rnnlm.model.best +: data/lang_char +: conf/train.yaml +: conf/decode.yaml +: data/tr_it/cmvn.ark +: exp/tr_it_pytorch_train/results/model.last10.avg.best +EOF +) + +. 
utils/parse_options.sh + +if [ $# != 4 ]; then + echo "${help_message}" + exit 1 +fi + +tr_conf=$1 +dec_conf=$2 +cmvn=$3 +e2e=$4 + +echo " - Model files (archived to ${outfile}.tar.gz by \`\$ pack_model.sh\`)" +echo " - model link: (put the model link manually.)" + +# configs +if [ -e ${tr_conf} ]; then + tar cfh ${outfile}.tar ${tr_conf} + echo -n " - training config file: \`" + echo ${tr_conf} | sed -e "s/$/\`/" +else + echo "missing ${tr_conf}" + exit 1 +fi +if [ -e ${dec_conf} ]; then + tar rfh ${outfile}.tar ${dec_conf} + echo -n " - decoding config file: \`" + echo ${dec_conf} | sed -e "s/$/\`/" +else + echo "missing ${dec_conf}" + exit 1 +fi +# NOTE(kan-bayashi): preprocess conf is optional +if [ -n "${preprocess_conf}" ]; then + tar rfh ${outfile}.tar ${preprocess_conf} + echo -n " - preprocess config file: \`" + echo ${preprocess_conf} | sed -e "s/$/\`/" +fi + +# cmvn +if [ -e ${cmvn} ]; then + tar rfh ${outfile}.tar ${cmvn} + echo -n " - cmvn file: \`" + echo ${cmvn} | sed -e "s/$/\`/" +else + echo "missing ${cmvn}" + exit 1 +fi + +# e2e +if [ -e ${e2e} ]; then + tar rfh ${outfile}.tar ${e2e} + echo -n " - e2e file: \`" + echo ${e2e} | sed -e "s/$/\`/" + + e2e_conf=$(dirname ${e2e})/model.json + if [ ! -e ${e2e_conf} ]; then + echo missing ${e2e_conf} + #exit 1 + else + echo -n " - e2e JSON file: \`" + echo ${e2e_conf} | sed -e "s/$/\`/" + tar rfh ${outfile}.tar ${e2e_conf} + fi +else + echo "missing ${e2e}" + exit 1 +fi + +# lm +if [ -n "${lm}" ]; then + if [ -e ${lm} ]; then + tar rfh ${outfile}.tar ${lm} + echo -n " - lm file: \`" + echo ${lm} | sed -e "s/$/\`/" + + lm_conf=$(dirname ${lm})/model.json + if [ ! 
-e ${lm_conf} ]; then + echo missing ${lm_conf} + exit 1 + else + echo -n " - lm JSON file: \`" + echo ${lm_conf} | sed -e "s/$/\`/" + tar rfh ${outfile}.tar ${lm_conf} + fi + else + echo "missing ${lm}" + exit 1 + fi +fi + +# dict +if [ -n "${dict}" ]; then + if [ -e ${dict} ]; then + tar rfh ${outfile}.tar ${dict} + echo -n " - dict file: \`" + echo ${dict} | sed -e "s/$/\`/" + else + echo "missing ${dict}" + exit 1 + fi +fi + +# etc +for x in ${etc}; do + if [ -e ${x} ]; then + tar rfh ${outfile}.tar ${x} + echo -n " - etc file: \`" + echo ${x} | sed -e "s/$/\`/" + else + echo "missing ${x}" + exit 1 + fi +done + +# finally compress the tar file +gzip -f ${outfile}.tar + +# results +if [ -n "${results}" ]; then + echo " - Results (paste them by yourself or obtained by \`\$ pack_model.sh --results \`)" + echo "\`\`\`" +fi +for x in ${results}; do + if [ -e ${x} ]; then + echo "${x}" + grep -e Avg -e SPKR -m 2 ${x} + else + echo "missing ${x}" + exit 1 + fi +done +if [ -n "${results}" ]; then + echo "\`\`\`" +fi + +exit 0 diff --git a/utils/remove_longshortdata.py b/utils/remove_longshortdata.py new file mode 100755 index 0000000000000000000000000000000000000000..131b4a5828bee7dc3e2520ed1694e80230c57f56 --- /dev/null +++ b/utils/remove_longshortdata.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +"""remove longshort data from manifest""" +import argparse +import logging + +import jsonlines + +from paddlespeech.s2t.utils.cli_utils import get_commandline_args + +# manifest after format +# josnline like this +# { +# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}], +# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}], +# "utt2spk": "111-2222", +# "utt": "111-2222-333" +# } + + +def get_parser(): + parser = argparse.ArgumentParser( + description="remove longshort data from format manifest", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + parser.add_argument( + "--verbose", "-V", default=0, type=int, 
help="Verbose option") + parser.add_argument( + "--iaxis", + default=0, + type=int, + help="multi inputs index, 0 is the first") + parser.add_argument( + "--oaxis", + default=0, + type=int, + help="multi outputs index, 0 is the first") + parser.add_argument("--maxframes", default=2000, type=int, help="maxframes") + parser.add_argument("--minframes", default=10, type=int, help="minframes") + parser.add_argument("--maxchars", default=200, type=int, help="max tokens") + parser.add_argument("--minchars", default=0, type=int, help="min tokens") + parser.add_argument( + "--stride_ms", default=10, type=int, help="stride in ms unit.") + parser.add_argument( + "rspecifier", + type=str, + help="jsonl format manifest. e.g. manifest.jsonl") + parser.add_argument( + "wspecifier_or_wxfilename", + type=str, + help="Write specifier. e.g. manifest.jsonl") + return parser + + +def filter_input(args, line): + tmp = line['input'][args.iaxis] + if args.sound: + # second to frame + nframe = tmp['shape'][0] * 1000 / args.stride_ms + else: + nframe = tmp['shape'][0] + + if nframe < args.minframes or nframe > args.maxframes: + return True + else: + return False + + +def filter_output(args, line): + nchars = len(line['output'][args.iaxis]['text']) + if nchars < args.minchars or nchars > args.maxchars: + return True + else: + return False + + +def main(): + args = get_parser().parse_args() + + logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + if args.verbose > 0: + logging.basicConfig(level=logging.INFO, format=logfmt) + else: + logging.basicConfig(level=logging.WARN, format=logfmt) + logging.info(get_commandline_args()) + + with jsonlines.open(args.rspecifier, 'r') as reader: + lines = list(reader) + logging.info(f"Example: {len(lines)}") + feat = lines[0]['input'][args.iaxis]['feat'] + args.soud = False + if feat.split('.')[-1] not in 'ark, scp': + args.sound = True + + count = 0 + filter = 0 + with jsonlines.open(args.wspecifier_or_wxfilename, 'w') as writer: + 
for line in lines: + if filter_input(args, line) or filter_output(args, line): + filter += 1 + continue + writer.write(line) + count += 1 + logging.info(f"Example after filter: {count}\{filter}") + + +if __name__ == '__main__': + main() diff --git a/utils/show_results.sh b/utils/show_results.sh new file mode 100755 index 0000000000000000000000000000000000000000..42f80ee6300b26cf6116ff6017cd3de65777d115 --- /dev/null +++ b/utils/show_results.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +mindepth=0 +maxdepth=1 + +. utils/parse_options.sh + +if [ $# -gt 1 ]; then + echo "Usage: $0 --mindepth 0 --maxdepth 1 [exp]" 1>&2 + echo "" + echo "Show the system environments and the evaluation results in Markdown format." + echo 'The default of is "exp/".' + exit 1 +fi + +[ -f ./path.sh ] && . ./path.sh +set -euo pipefail +if [ $# -eq 1 ]; then + exp=$1 +else + exp=exp +fi + + +cat << EOF + +# RESULTS +## Environments +- date: \`$(LC_ALL=C date)\` +EOF + +python3 << EOF +import sys, paddle +pyversion = sys.version.replace('\n', ' ') + +print(f"""- python version: \`{pyversion}\` +- paddle version: \`paddle {paddle.__version__}\`""") +EOF + +cat << EOF +- Git hash: \`$(git rev-parse HEAD)\` + - Commit date: \`$(git log -1 --format='%cd')\` + +EOF + +while IFS= read -r expdir; do + if ls ${expdir}/decode_*/result.txt &> /dev/null; then + # 1. Show the result table + cat << EOF +## $(basename ${expdir}) +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +EOF + grep -e Avg ${expdir}/decode_*/result.txt \ + | sed -e "s#${expdir}/\([^/]*\)/result.txt:#|\1#g" \ + | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|' + echo + + # 2. 
Show the result table for WER + if ls ${expdir}/decode_*/result.wrd.txt &> /dev/null; then + cat << EOF +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +EOF + grep -e Avg ${expdir}/decode_*/result.wrd.txt \ + | sed -e "s#${expdir}/\([^/]*\)/result.wrd.txt:#|\1#g" \ + | sed -e 's#Sum/Avg##g' | tr '|' ' ' | tr -s ' ' '|' + echo + fi + fi +done < <(find ${exp} -mindepth ${mindepth} -maxdepth ${maxdepth} -type d)