diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md
index 0096c73e30ec57bb41ddf54508a9d197c31adf3a..8c53f95f67514ad8ba5f9050b4d5e1aa3651ecbc 100644
--- a/examples/aishell/s1/README.md
+++ b/examples/aishell/s1/README.md
@@ -19,3 +19,13 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
 | conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
+
+
+## Transformer
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
+| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
\ No newline at end of file
diff --git a/examples/aishell/s1/conf/transformer.yaml b/examples/aishell/s1/conf/transformer.yaml
index 7803097ac8437064bc18af713d68321f842ce2b6..c021f66b71513b300b98d6f50fcc39573cc85dca 100644
--- a/examples/aishell/s1/conf/transformer.yaml
+++ b/examples/aishell/s1/conf/transformer.yaml
@@ -73,7 +73,7 @@ model:
 
 
 training:
-  n_epoch: 240
+  n_epoch: 120
   accum_grad: 2
   global_grad_clip: 5.0
   optim: adam
diff --git a/examples/aishell/s1/local/test_hub.sh b/examples/aishell/s1/local/test_hub.sh
index 99b141c8107ddd4d4bf118d90fd6d9b2441d69af..6e78ec784bb8cf1445ebbe83fd39f4c8a441f418 100755
--- a/examples/aishell/s1/local/test_hub.sh
+++ b/examples/aishell/s1/local/test_hub.sh
@@ -23,8 +23,6 @@ fi
 # exit 1
 #fi
 
-
-
 for type in attention_rescoring; do
     echo "decoding ${type}"
     batch_size=1
diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md
index b7ec93ebec92020d9089233a218dbb2143f40c37..20255db8e9be1fff1361eda8670947d853c2382b 100644
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/s1/README.md
@@ -21,7 +21,7 @@
 ## Transformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 7.404532432556152 | 0.056204 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 7.404532432556152 | 0.058658 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 7.404532432556152 | 0.058278 |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 7.404532432556152 | 0.045591 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098 | 0.049795 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098 | 0.054892 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098 | 0.054531 |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098 | 0.042244 |
\ No newline at end of file
diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fbb322d6ea1d2cf72da9fc634f19ec4333f16ee2
--- /dev/null
+++ b/examples/wenetspeech/README.md
@@ -0,0 +1,68 @@
+# [WenetSpeech](https://github.com/wenet-e2e/WenetSpeech)
+
+A 10000+ Hours Multi-domain Chinese Corpus for Speech Recognition
+
+## Description
+
+### Creation
+
+All the data are collected from YouTube and Podcast. Optical character recognition (OCR) and automatic speech recognition (ASR) techniques are used to label the YouTube and Podcast recordings, respectively. To improve the quality of the corpus, we use a novel end-to-end label error detection method to further validate and filter the data.
+
+### Categories
+
+In summary, WenetSpeech groups all data into 3 categories, as the following table shows:
+
+| Set        | Hours | Confidence  | Usage                                  |
+|------------|-------|-------------|----------------------------------------|
+| High Label | 10005 | >=0.95      | Supervised training                    |
+| Weak Label | 2478  | [0.6, 0.95] | Semi-supervised or noisy training      |
+| Unlabel    | 9952  | /           | Unsupervised training or pre-training  |
+| In Total   | 22435 | /           | All of the above                       |
+
+### High Label Data
+
+We classify the high-label data into 10 groups according to domain, speaking style, and scenario.
+
+| Domain      | Youtube | Podcast | Total  |
+|-------------|---------|---------|--------|
+| audiobook   | 0       | 250.9   | 250.9  |
+| commentary  | 112.6   | 135.7   | 248.3  |
+| documentary | 386.7   | 90.5    | 477.2  |
+| drama       | 4338.2  | 0       | 4338.2 |
+| interview   | 324.2   | 614     | 938.2  |
+| news        | 0       | 868     | 868    |
+| reading     | 0       | 1110.2  | 1110.2 |
+| talk        | 204     | 90.7    | 294.7  |
+| variety     | 603.3   | 224.5   | 827.8  |
+| others      | 144     | 507.5   | 651.5  |
+| Total       | 6113    | 3892    | 10005  |
+
+As shown in the following table, we provide 3 training subsets, namely `S`, `M`, and `L`, for building ASR systems on different data scales.
+
+| Training Subsets | Confidence  | Hours |
+|------------------|-------------|-------|
+| L                | [0.95, 1.0] | 10005 |
+| M                | 1.0         | 1000  |
+| S                | 1.0         | 100   |
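+
+Below is a minimal sketch of how such confidence-based selection might look over a manifest of `(utterance_id, confidence)` pairs; the manifest format and helper are illustrative, not part of the official tooling:
+
+```python
+# Hypothetical manifest: (utterance_id, label confidence) pairs.
+manifest = [("utt_001", 1.0), ("utt_002", 0.97), ("utt_003", 0.88)]
+
+def select_by_confidence(manifest, threshold):
+    """Keep utterances whose label confidence is at least `threshold`."""
+    return [utt for utt, conf in manifest if conf >= threshold]
+
+subset_l = select_by_confidence(manifest, 0.95)  # L: confidence in [0.95, 1.0]
+pool = select_by_confidence(manifest, 1.0)  # M and S are fixed-size draws from this pool
+```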
+
+### Evaluation Sets
+
+| Evaluation Sets | Hours | Source       | Description                                                                              |
+|-----------------|-------|--------------|------------------------------------------------------------------------------------------|
+| DEV             | 20    | Internet     | Specially designed for speech tools that require a cross-validation set during training  |
+| TEST\_NET       | 23    | Internet     | Matched test set                                                                         |
+| TEST\_MEETING   | 15    | Real meeting | Mismatched test set: far-field, conversational, spontaneous meeting speech               |
\ No newline at end of file
diff --git a/examples/wenetspeech/asr1/.gitignore b/examples/wenetspeech/asr1/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..02a229225b1bd83122ea8a3945166876d7183447
--- /dev/null
+++ b/examples/wenetspeech/asr1/.gitignore
@@ -0,0 +1,3 @@
+data
+exp
+*.profile
diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5aff041f80931ae13e0f275147f55d91f1759d6e
--- /dev/null
+++ b/examples/wenetspeech/asr1/README.md
@@ -0,0 +1,37 @@
+# WenetSpeech
+
+
+## Conformer
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | dev | attention | | |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | ctc_greedy_search | | |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test meeting | ctc_prefix_beam_search | | |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | test net | attention_rescoring | | |
+
+
+
+## Conformer Pretrained Model
+
+Pretrained model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wenetspeech/20211025_conformer_exp.tar.gz
+
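+One way to fetch and unpack it (a minimal sketch; the `exp/` target directory is illustrative):
+
+```python
+import tarfile
+import urllib.request
+
+URL = ("http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/"
+       "wenetspeech/20211025_conformer_exp.tar.gz")
+urllib.request.urlretrieve(URL, "20211025_conformer_exp.tar.gz")
+with tarfile.open("20211025_conformer_exp.tar.gz", "r:gz") as tar:
+    tar.extractall("exp/")  # illustrative target directory
+```
+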
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 |
+| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
\ No newline at end of file
diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh
old mode 100644
new mode 100755
diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh
old mode 100644
new mode 100755
index e7c6434699b386dd60bf5d6db71a80ed6def620a..47bd2f6338a7d062b094c327f39f5362fae39865
--- a/examples/wenetspeech/asr1/local/test.sh
+++ b/examples/wenetspeech/asr1/local/test.sh
@@ -1 +1,69 @@
-decode_modes="attention_rescoring ctc_greedy_search"
\ No newline at end of file
+#!/bin/bash
+
+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_prefix=$2
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+    chunk_mode=true
+fi
+
+# download language model
+#bash local/download_lm_ch.sh
+#if [ $? -ne 0 ]; then
+#    exit 1
+#fi
+
+
+for type in attention ctc_greedy_search; do
+    echo "decoding ${type}"
+    if [ ${chunk_mode} == true ];then
+        # streaming decoding only supports batch_size=1
+        batch_size=1
+    else
+        batch_size=64
+    fi
+    output_dir=${ckpt_prefix}
+    mkdir -p ${output_dir}
+    python3 -u ${BIN_DIR}/test.py \
+        --nproc ${ngpu} \
+        --config ${config_path} \
+        --result_file ${output_dir}/${type}.rsl \
+        --checkpoint_path ${ckpt_prefix} \
+        --opts decoding.decoding_method ${type} \
+        --opts decoding.batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+
+for type in ctc_prefix_beam_search attention_rescoring; do
+    echo "decoding ${type}"
+    batch_size=1
+    output_dir=${ckpt_prefix}
+    mkdir -p ${output_dir}
+    python3 -u ${BIN_DIR}/test.py \
+        --nproc ${ngpu} \
+        --config ${config_path} \
+        --result_file ${output_dir}/${type}.rsl \
+        --checkpoint_path ${ckpt_prefix} \
+        --opts decoding.decoding_method ${type} \
+        --opts decoding.batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+
+exit 0
diff --git a/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh b/examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
index 7f3bd9e1253fbcf491743ba912472b22b4d8f0e9..21f512e9b9e2827b6c6e23b53f16badf3ec8c958 100644
--- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
@@ -92,7 +92,11 @@ class TextFeaturizer():
         tokens = self.tokenize(text)
         ids = []
         for token in tokens:
-            token = token if token in self.vocab_dict else self.unk
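+            # A token absent from the vocabulary falls back to the <unk> id;
+            # the debug log records each substitution so OOV text can be audited.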
+            if token not in self.vocab_dict:
+                logger.debug(f"Text Token: {token} -> {self.unk}")
+                token = self.unk
             ids.append(self.vocab_dict[token])
         return ids
 
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 916a6a0596e073dfca7f9a626bc412f3445e9cee..4f833372a9ace05d9228c5ccad537a9e627dae88 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -860,7 +860,7 @@ class U2Model(U2DecodeModel):
             int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
         """
         # cmvn
-        if configs['cmvn_file'] is not None:
+        if 'cmvn_file' in configs and configs['cmvn_file']:
             mean, istd = load_cmvn(configs['cmvn_file'],
                                    configs['cmvn_file_type'])
             global_cmvn = GlobalCMVN(